
sequence

blank_profile_entry module-attribute

blank_profile_entry = dict(zip(utils.profile_keys, repeat(0)))

aa_nan_counts_alph3 module-attribute

aa_nan_counts_alph3 = dict(zip(protein_letters_alph3, repeat(nan)))

nan_profile_entry module-attribute

nan_profile_entry = dict(zip(utils.profile_keys, repeat(numpy.nan)))

Profile

Profile(entries: Iterable[ProfileEntry], dtype: str = 'profile', **kwargs)

Bases: UserList[ProfileEntry]

Parameters:

  • entries (Iterable[ProfileEntry]) –

    The per-residue entries to create the instance

  • dtype (str, default: 'profile' ) –

    The datatype of the profile.

  • **kwargs
Source code in symdesign/structure/sequence.py
def __init__(self, entries: Iterable[ProfileEntry], dtype: str = 'profile', **kwargs):
    """Construct the instance

    Args:
        entries: The per-residue entries to create the instance
        dtype: The datatype of the profile.
        **kwargs:
    """
    super().__init__(initlist=entries, **kwargs)  # initlist sets UserList.data
    self.dtype = dtype

data instance-attribute

data: list[ProfileEntry]

[{'A': 0, 'R': 0, ..., 'lod': {'A': -5, 'R': -5, ...}, 'type': 'W', 'info': 3.20, 'weight': 0.73}, {}, ...]
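
A minimal usage sketch (the import path and the abbreviated two-key entries below are illustrative assumptions; real entries carry every amino acid key as in the example above):

from symdesign.structure.sequence import Profile  # assumed import path

entries = [
    {'A': 0.7, 'R': 0.1, 'lod': {'A': 2, 'R': -1}, 'type': 'A', 'info': 1.10, 'weight': 0.50},
    {'A': 0.2, 'R': 0.6, 'lod': {'A': -1, 'R': 2}, 'type': 'R', 'info': 0.90, 'weight': 0.40},
]
profile = Profile(entries, dtype='profile')
profile.types    # ['A', 'R'] per the types property below
profile.weights  # [0.5, 0.4] per the weights property below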

available_keys property

available_keys: tuple[Any, ...]

Returns the available ProfileEntry keys that are present in the Profile

lods property

lods: list[AminoAcidDistribution]

The log of odds values, given for each amino acid type, for each entry in the Profile

types property

types: list[protein_letters_literal]

The amino acid type, for each entry in the Profile

weights property

weights: list[float]

The weight assigned to each entry in the Profile

info property

info: list[float]

The information present for each entry in the Profile

as_array

as_array(alphabet: str = protein_letters_alph1, lod: bool = False) -> ndarray

Convert the Profile into a numeric array

Parameters:

  • alphabet (str, default: protein_letters_alph1 ) –

    The amino acid alphabet to use. Array values will be returned in this order

  • lod (bool, default: False ) –

    Whether to return the array for the log of odds values

Returns:

  • ndarray

    The numerically encoded pssm where each entry along axis 0 is the position, and the entries on axis 1 are the frequency data at every indexed amino acid. Indices are according to the specified amino acid alphabet, i.e. array([[0.1, 0.01, 0.12, ...], ...])

Source code in symdesign/structure/sequence.py
def as_array(self, alphabet: str = protein_letters_alph1, lod: bool = False) -> np.ndarray:
    """Convert the Profile into a numeric array

    Args:
        alphabet: The amino acid alphabet to use. Array values will be returned in this order
        lod: Whether to return the array for the log of odds values

    Returns:
        The numerically encoded pssm where each entry along axis 0 is the position, and the entries on axis 1 are
            the frequency data at every indexed amino acid. Indices are according to the specified amino acid
            alphabet, i.e. array([[0.1, 0.01, 0.12, ...], ...])
    """
    if lod:
        if self.lods:
            data_source = self.lods
        else:
            raise ValueError(
                f"There aren't any values available for {self.__class__.__name__}.lods")
    else:
        data_source = self

    return np.array([[position_info[aa] for aa in alphabet]
                     for position_info in data_source], dtype=np.float32)
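
A short usage sketch, assuming profile is a Profile whose entries contain a frequency for every letter of the chosen alphabet:

frequencies = profile.as_array()        # shape (len(profile), 20), columns ordered by protein_letters_alph1
lod_array = profile.as_array(lod=True)  # uses the 'lod' values; raises ValueError if none are present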

write

write(file_name: AnyStr = None, name: str = None, out_dir: AnyStr = os.getcwd()) -> AnyStr

Create a PSI-BLAST format PSSM file from a PSSM dictionary. Assumes residue numbering is 1 to last entry

Parameters:

  • file_name (AnyStr, default: None ) –

    The explicit name of the file

  • name (str, default: None ) –

    The name of the file. Will be used as the default file_name base name if file_name not provided

  • out_dir (AnyStr, default: getcwd() ) –

    The location on disk to output the file. Only used if file_name not explicitly provided

Returns:

  • AnyStr

    Disk location of newly created .pssm file

Source code in symdesign/structure/sequence.py
def write(self, file_name: AnyStr = None, name: str = None, out_dir: AnyStr = os.getcwd()) -> AnyStr:
    """Create a PSI-BLAST format PSSM file from a PSSM dictionary. Assumes residue numbering is 1 to last entry

    Args:
        file_name: The explicit name of the file
        name: The name of the file. Will be used as the default file_name base name if file_name not provided
        out_dir: The location on disk to output the file. Only used if file_name not explicitly provided

    Returns:
        Disk location of newly created .pssm file
    """
    # Format the Profile according to the write_pssm_file mechanism
    # This takes the shape of 1 to the last entry
    data: list[list[float]] = [[position_info[aa] for aa in protein_letters_alph3] for position_info in self]
    if self.dtype == 'fragment':
        # Need to convert np.nan to zeros
        data = [[0 if np.isnan(num) else num for num in entry] for entry in data]
        logger.warning(f'Converting {self.dtype} type Profile np.nan values to 0.0')

    # Find out if the pssm has values expressed as frequencies (percentages) or as counts and modify accordingly
    lods = self.lods
    if isinstance(lods[0]['A'], float):
        separation1 = " " * 4
    else:
        separation1 = " " * 3
        # lod_freq = True
    # if type(pssm[first_key]['A']) == float:
    #     counts_freq = True

    if file_name is None:
        if name is None:
            raise ValueError(
                f"Must provide argument 'file_name' or 'name' as a str to {self.write.__name__}")
        else:
            file_name = os.path.join(out_dir, name)

    if os.path.splitext(file_name)[-1] == '':  # No extension
        file_name = f'{file_name}.pssm'

    with open(file_name, 'w') as f:
        f.write(f'\n\n{" " * 12}{separation1.join(protein_letters_alph3)}'
                f'{separation1}{(" " * 3).join(protein_letters_alph3)}\n')
        for residue_number, (entry, lod, _type, info, weight) in enumerate(
                zip(data, lods, self.types, self.info, self.weights), 1):
            if isinstance(lod['A'], float):  # relevant for favor_fragment
                lod_string = \
                    ' '.join(f'{lod[aa]:>4.2f}' for aa in protein_letters_alph3) + ' '
            else:
                lod_string = \
                    ' '.join(f'{lod[aa]:>3d}' for aa in protein_letters_alph3) + ' '

            if isinstance(entry[0], float):  # relevant for freq calculations
                counts_string = ' '.join(f'{floor(value * 100):>3.0f}' for value in entry) + ' '
            else:
                counts_string = ' '.join(f'{value:>3d}' for value in entry) + ' '

            f.write(f'{residue_number:>5d} {_type:1s}   {lod_string:80s} {counts_string:80s} '
                    f'{round(info, 4):4.2f} {round(weight, 4):4.2f}\n')

    return file_name
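
A usage sketch with a hypothetical name and output directory (the directory is assumed to exist):

pssm_path = profile.write(name='design', out_dir='output')   # writes and returns 'output/design.pssm'
pssm_path = profile.write(file_name='output/design.pssm')    # or give the full path explicitly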

GeneEntity

GeneEntity(**kwargs)

Bases: ABC

Contains the sequence information for a ContainsResidues.

Parameters:

  • **kwargs
Source code in symdesign/structure/sequence.py
def __init__(self, **kwargs):
    """Construct the instance

    Args:
        **kwargs:
    """
    super().__init__(**kwargs)  # GeneEntity
    self._evolutionary_profile = {}  # position specific scoring matrix
    self.h_fields = None
    self.j_couplings = None
    self.profile = {}  # design/structure specific scoring matrix

log abstractmethod property

log: Logger

sequence abstractmethod property

sequence: str

reference_sequence abstractmethod property

reference_sequence: str

number_of_residues property

number_of_residues: int

evolutionary_profile property writable

evolutionary_profile: dict

Access the evolutionary_profile

msa property writable

msa: MultipleSequenceAlignment | None

The MultipleSequenceAlignment object for the instance

sequence_numeric property

sequence_numeric: ndarray

Return the sequence as an integer array (number_of_residues, alphabet_length) of the amino acid characters

Maps "ACDEFGHIKLMNPQRSTVWY-" to the resulting index

hydrophobic_collapse

hydrophobic_collapse(**kwargs) -> ndarray

Return the hydrophobic collapse for the Sequence

Other Parameters:

  • hydrophobicity

    str = 'standard' – The hydrophobicity scale to consider. Either 'standard' (FILV), 'expanded' (FMILYVW), or provide one with 'custom' keyword argument

  • custom

    mapping[str, float | int] = None – A user defined mapping of amino acid type, hydrophobicity value pairs

  • alphabet_type

    alphabet_types = None – The amino acid alphabet if the sequence consists of integer characters

  • lower_window

    int = 3 – The smallest window used to measure

  • upper_window

    int = 9 – The largest window used to measure

Source code in symdesign/structure/sequence.py
def hydrophobic_collapse(self, **kwargs) -> np.ndarray:
    """Return the hydrophobic collapse for the Sequence

    Keyword Args:
        hydrophobicity: str = 'standard' – The hydrophobicity scale to consider. Either 'standard' (FILV),
            'expanded' (FMILYVW), or provide one with 'custom' keyword argument
        custom: mapping[str, float | int] = None – A user defined mapping of amino acid type, hydrophobicity value pairs
        alphabet_type: alphabet_types = None – The amino acid alphabet if the sequence consists of integer characters
        lower_window: int = 3 – The smallest window used to measure
        upper_window: int = 9 – The largest window used to measure
    """
    try:
        return self._hydrophobic_collapse
    except AttributeError:
        self._hydrophobic_collapse = metrics.hydrophobic_collapse_index(self.sequence, **kwargs)
        return self._hydrophobic_collapse

add_evolutionary_profile

add_evolutionary_profile(file: AnyStr = None, out_dir: AnyStr = os.getcwd(), profile_source: alignment_programs_literal = putils.hhblits, force: bool = False, **kwargs)

Add the evolutionary profile to the GeneEntity. If the profile isn't provided, it is generated through search of homologous protein sequences using the profile_source argument

Parameters:

  • file (AnyStr, default: None ) –

    Location where profile file should be loaded from

  • out_dir (AnyStr, default: getcwd() ) –

    Location where sequence files should be written

  • profile_source (alignment_programs_literal, default: hhblits ) –

    One of 'hhblits' or 'psiblast'

  • force (bool, default: False ) –

    Whether to force generation of a new profile

Sets

self.evolutionary_profile (ProfileDict)

Source code in symdesign/structure/sequence.py
def add_evolutionary_profile(self, file: AnyStr = None, out_dir: AnyStr = os.getcwd(),
                             profile_source: alignment_programs_literal = putils.hhblits, force: bool = False,
                             **kwargs):
    """Add the evolutionary profile to the GeneEntity. If the profile isn't provided, it is generated through search
    of homologous protein sequences using the profile_source argument

    Args:
        file: Location where profile file should be loaded from
        out_dir: Location where sequence files should be written
        profile_source: One of 'hhblits' or 'psiblast'
        force: Whether to force generation of a new profile

    Sets:
        self.evolutionary_profile (ProfileDict)
    """
    if profile_source not in alignment_programs:  # [putils.hhblits, 'psiblast']:
        raise ValueError(
            f'{self.add_evolutionary_profile.__name__}: Profile generation only possible from '
            f'{", ".join(alignment_programs)}, not {profile_source}')

    if file is not None:
        pssm_file = file
    else:  # Check to see if the files of interest already exist
        # Extract/Format Sequence Information. SEQRES is prioritized if available
        name = self.name
        sequence_file = self.write_sequence_to_fasta(out_dir=out_dir)
        temp_file = Path(out_dir, f'{name}.hold')
        pssm_file = os.path.join(out_dir, f'{name}.hmm')
        if not os.path.exists(pssm_file) or force:
            if not os.path.exists(temp_file):  # No work on this pssm file has been initiated
                # Create blocking file to prevent excess work
                with open(temp_file, 'w') as f:
                    self.log.info(f"Fetching '{name}' sequence data")

                self.log.info(f'Generating Evolutionary Profile for {name}')
                getattr(self, profile_source)(sequence_file=sequence_file, out_dir=out_dir)
                temp_file.unlink(missing_ok=True)
                # if os.path.exists(temp_file):
                #     os.remove(temp_file)
            else:  # Block is in place, another process is working
                self.log.info(f"Waiting for '{name}' profile generation...")
                while not os.path.exists(pssm_file):
                    if int(time.time()) - int(os.path.getmtime(temp_file)) > 5400:  # > 1 hr 30 minutes have passed
                        # os.remove(temp_file)
                        temp_file.unlink(missing_ok=True)
                        raise StructureException(
                            f'{self.add_evolutionary_profile.__name__}: Generation of the profile for {name} '
                            'took longer than the time limit\nKilled')
                    time.sleep(20)

    # Set self.evolutionary_profile
    self.evolutionary_profile = parse_hhblits_pssm(pssm_file)

create_null_profile

create_null_profile(nan: bool = False, zero_index: bool = False, **kwargs) -> ProfileDict

Make a blank profile

Parameters:

  • nan (bool, default: False ) –

    Whether to fill the null profile with np.nan

  • zero_index (bool, default: False ) –

    If True, return the dictionary with zero indexing

Returns:

  • ProfileDict

    Dictionary containing profile information with keys as the index (zero or one-indexed), values as PSSM

  • Ex ( ProfileDict ) –

    {1: {'A': 0, 'R': 0, ..., 'lod': {'A': -5, 'R': -5, ...}, 'type': 'W', 'info': 3.20, 'weight': 0.73}, 2: {}, ...}

Source code in symdesign/structure/sequence.py
def create_null_profile(self, nan: bool = False, zero_index: bool = False, **kwargs) -> ProfileDict:
    """Make a blank profile

    Args:
        nan: Whether to fill the null profile with np.nan
        zero_index: If True, return the dictionary with zero indexing

    Returns:
        Dictionary containing profile information with keys as the index (zero or one-indexed), values as PSSM
        Ex: {1: {'A': 0, 'R': 0, ..., 'lod': {'A': -5, 'R': -5, ...}, 'type': 'W', 'info': 3.20, 'weight': 0.73},
             2: {}, ...}
    """
    offset = 0 if zero_index else ZERO_OFFSET

    if nan:
        _profile_entry = nan_profile_entry
    else:
        _profile_entry = blank_profile_entry

    profile = {residue: _profile_entry.copy()
               for residue in range(offset, offset + self.number_of_residues)}

    for residue_data, residue_type in zip(profile.values(), self.sequence):
        residue_data['type'] = residue_type

    return profile
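
A sketch of the returned mapping, assuming entity is a GeneEntity subclass instance with the 3-residue sequence 'MKV':

null = entity.create_null_profile()              # {1: {...}, 2: {...}, 3: {...}} with counts filled by 0
null[1]['type']                                  # 'M', taken from the instance sequence
nan_null = entity.create_null_profile(nan=True)  # same keys, values filled with np.nan instead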

create_null_entries staticmethod

create_null_entries(entry_numbers: Iterable[int], nan: bool = False, **kwargs) -> ProfileDict

Make a blank profile

Parameters:

  • entry_numbers (Iterable[int]) –

    The numbers to generate null entries for

  • nan (bool, default: False ) –

    Whether to fill the null profile with np.nan

Returns:

  • ProfileDict

    Dictionary containing profile information with the specified entries as the index, values as PSSM

  • Ex ( ProfileDict ) –

    {1: {'A': 0, 'R': 0, ..., 'lod': {'A': -5, 'R': -5, ...}, 'info': 3.20, 'weight': 0.73}, 2: {}, ...}

  • ProfileDict

    Importantly, there is no 'type' key. This must be added

Source code in symdesign/structure/sequence.py
@staticmethod
def create_null_entries(entry_numbers: Iterable[int], nan: bool = False, **kwargs) -> ProfileDict:
    """Make a blank profile

    Args:
        entry_numbers: The numbers to generate null entries for
        nan: Whether to fill the null profile with np.nan

    Returns:
        Dictionary containing profile information with the specified entries as the index, values as PSSM
        Ex: {1: {'A': 0, 'R': 0, ..., 'lod': {'A': -5, 'R': -5, ...}, 'info': 3.20, 'weight': 0.73},
             2: {}, ...}
        Importantly, there is no 'type' key. This must be added
    """
    # offset = 0 if zero_index else ZERO_OFFSET

    if nan:
        _profile_entry = nan_profile_entry
    else:
        _profile_entry = blank_profile_entry

    return {entry: _profile_entry.copy() for entry in entry_numbers}

hhblits

hhblits(out_dir: AnyStr = os.getcwd(), **kwargs) -> list[str] | None

Generate a position specific scoring matrix from hhblits using Hidden Markov Models

Parameters:

  • out_dir (AnyStr, default: getcwd() ) –

    Disk location where generated file should be written

Other Parameters:

  • sequence_file

    AnyStr = None - The file containing the sequence to use

  • threads

    Number of CPUs to use for the process

  • return_command

    Whether to simply return the hhblits command

Returns:

  • list[str] | None

    The command if return_command is True, otherwise None

Source code in symdesign/structure/sequence.py
def hhblits(self, out_dir: AnyStr = os.getcwd(), **kwargs) -> list[str] | None:
    """Generate a position specific scoring matrix from hhblits using Hidden Markov Models

    Args:
        out_dir: Disk location where generated file should be written

    Keyword Args:
        sequence_file: AnyStr = None - The file containing the sequence to use
        threads: Number of CPUs to use for the process
        return_command: Whether to simply return the hhblits command

    Returns:
        The command if return_command is True, otherwise None
    """
    result = hhblits(self.name, out_dir=out_dir, **kwargs)

    if result:  # return_command is True
        return result
    # Otherwise, make alignment file(s)
    name = self.name
    # Set file attributes according to logic of hhblits()
    a3m_file = os.path.join(out_dir, f'{name}.a3m')
    msa_file = os.path.join(out_dir, f'{name}.sto')
    fasta_msa = os.path.join(out_dir, f'{name}.fasta')
    # Preferred alignment type
    p = subprocess.Popen([putils.reformat_msa_exe_path, a3m_file, msa_file, '-num', '-uc'])
    p.communicate()
    p = subprocess.Popen([putils.reformat_msa_exe_path, a3m_file, fasta_msa, '-M', 'first', '-r'])
    p.communicate()

add_msa_from_file

add_msa_from_file(msa_file: AnyStr, file_format: msa_supported_types_literal = 'stockholm')

Add a multiple sequence alignment to the profile. Handles correct sizing of the MSA

Parameters:

  • msa_file (AnyStr) –

    The multiple sequence alignment file to add to the Entity

  • file_format (msa_supported_types_literal, default: 'stockholm' ) –

    The file type to read the multiple sequence alignment

Source code in symdesign/structure/sequence.py
def add_msa_from_file(self, msa_file: AnyStr, file_format: msa_supported_types_literal = 'stockholm'):
    """Add a multiple sequence alignment to the profile. Handles correct sizing of the MSA

    Args:
        msa_file: The multiple sequence alignment file to add to the Entity
        file_format: The file type to read the multiple sequence alignment
    """
    if file_format == 'stockholm':
        constructor = MultipleSequenceAlignment.from_stockholm
    elif file_format == 'fasta':
        constructor = MultipleSequenceAlignment.from_fasta
    else:
        raise ValueError(
            f"The file format '{file_format}' isn't an available format. Available formats include "
            f"{msa_supported_types}")

    self.msa = constructor(msa_file)

collapse_profile

collapse_profile(msa_file: AnyStr = None, **kwargs) -> ndarray

Make a profile out of the hydrophobic collapse index (HCI) for each sequence in a multiple sequence alignment

Takes ~5-10 seconds depending on the size of the msa

Calculate HCI for each sequence in the MSA (which are different lengths). This is the Hydro Collapse array. For each sequence, make a Gap mask, with full shape (length, number_of_residues) to account for gaps in each sequence. Apply the mask using a map between the Gap mask and the Hydro Collapse array. Finally, drop the columns from the array that are gaps in the reference sequence.

iter array    Gap mask     Hydro Collapse array    Aligned HCI          Final HCI
(iter)        (0 is gap)   (compute for each)      (account for gaps)   (drop idx 2)
it 1 2 3 4    0 | 1 | 2    0 | 1 | 2               0 | 1 | 2            0 | 1 | 3 | ... N
0  0 1 2 2    1 | 1 | 0    0.5 0.2 0.5         =   0.5 0.2 0.0    ->    0.5 0.2 0.4 ... 0.3
1  0 0 1 2    0 | 1 | 1    0.4 0.7 0.4         =   0.0 0.4 0.7    ->    0.0 0.4 0.4 ... 0.1
2  0 0 1 2    0 | 1 | 1    0.3 0.6 0.3         =   0.0 0.3 0.6    ->    0.0 0.3 0.4 ... 0.0

Where index 0 is the MSA query sequence

After a cumulative summation along each sequence, the iterator is multiplied by the gap mask. The Hydro Collapse array is then indexed by the gapped iterator, placing either the Hydro Collapse value or np.nan (wherever the index is 0, i.e. a gap). After calculation, the element at index 2 is dropped from the array when the aligned sequence gaps are removed. Finally, only the indices of the query sequence are kept in the profile, essentially giving the HCI for each sequence in its native context, adjusted to the specific context of the protein sequence at hand

Parameters:

  • msa_file (AnyStr, default: None ) –

    The multiple sequence alignment file to use for collapse. Will use .msa attribute if not provided

Other Parameters:

  • file_format

    msa_supported_types_literal = 'stockholm' - The file type to read the multiple sequence alignment

  • hydrophobicity

    str = 'standard' – The hydrophobicity scale to consider. Either 'standard' (FILV), 'expanded' (FMILYVW), or provide one with 'custom' keyword argument

  • custom

    mapping[str, float | int] = None – A user defined mapping of amino acid type, hydrophobicity value pairs

  • alphabet_type

    alphabet_types = None – The amino acid alphabet if the sequence consists of integer characters

  • lower_window

    int = 3 – The smallest window used to measure

  • upper_window

    int = 9 – The largest window used to measure

Returns:

  • ndarray

    Array with shape (length, number_of_residues) containing the per-residue hydrophobic collapse values for each sequence in the profile. The "query" sequence from the MultipleSequenceAlignment.query is located at index 0 on axis=0

Source code in symdesign/structure/sequence.py
def collapse_profile(self, msa_file: AnyStr = None, **kwargs) -> np.ndarray:
    """Make a profile out of the hydrophobic collapse index (HCI) for each sequence in a multiple sequence alignment

    Takes ~5-10 seconds depending on the size of the msa

    Calculate HCI for each sequence in the MSA (which are different lengths). This is the Hydro Collapse array. For
    each sequence, make a Gap mask, with full shape (length, number_of_residues) to account for gaps in
    each sequence. Apply the mask using a map between the Gap mask and the Hydro Collapse array. Finally, drop the
    columns from the array that are gaps in the reference sequence.

    iter array   -   Gap mask      -       Hydro Collapse array     -     Aligned HCI     - -     Final HCI

    ------------

    iter - - - - - - 0 is gap    - -     compute for each     -     account for gaps   -  (drop idx 2)

    it 1 2 3 4  - - 0 | 1 | 2 - - - - - - - 0 | 1 | 2 - - - - - - 0 | 1 | 2 - - - - - - - 0 | 1 | 3 | ... N

    0 0 1 2 2  - - 1 | 1 | 0 - - -   - - - 0.5 0.2 0.5 - -  = - - 0.5 0.2 0.0 -  ->   - - 0.5 0.2 0.4 ... 0.3

    1 0 0 1 2  - - 0 | 1 | 1 - - -   - - - 0.4 0.7 0.4 - -  = - - 0.0 0.4 0.7 -  ->   - - 0.0 0.4 0.4 ... 0.1

    2 0 0 1 2  - - 0 | 1 | 1 - - -   - - - 0.3 0.6 0.3 - -  = - - 0.0 0.3 0.6 -  ->   - - 0.0 0.3 0.4 ... 0.0

    Where index 0 is the MSA query sequence

    After a cumulative summation along each sequence, the iterator is multiplied by the gap mask. Next the Hydro
    Collapse array is indexed by the gapped iterator. This places the Hydro Collapse value or np.nan (if there is a 0 index,
    i.e. a gap). After calculation, the element at index 2 is dropped from the array when the aligned sequence gaps
    are removed. Finally, only the indices of the query sequence are left in the profile, essentially giving the HCI
    for each sequence in the native context, adjusted to the specific context of the protein sequence at hand

    Args:
        msa_file: The multiple sequence alignment file to use for collapse. Will use .msa attribute if not provided

    Keyword Args:
        file_format: msa_supported_types_literal = 'stockholm' - The file type to read the multiple sequence
            alignment
        hydrophobicity: str = 'standard' – The hydrophobicity scale to consider. Either 'standard' (FILV),
            'expanded' (FMILYVW), or provide one with 'custom' keyword argument
        custom: mapping[str, float | int] = None – A user defined mapping of amino acid type, hydrophobicity value
            pairs
        alphabet_type: alphabet_types = None – The amino acid alphabet if the sequence consists of integer
            characters
        lower_window: int = 3 – The smallest window used to measure
        upper_window: int = 9 – The largest window used to measure

    Returns:
        Array with shape (length, number_of_residues) containing the per-residue hydrophobic collapse values for
            each sequence in the profile. The "query" sequence from the MultipleSequenceAlignment.query
            is located at index 0 on axis=0
    """
    try:
        return self._collapse_profile
    except AttributeError:
        msa = self.msa
        if not msa:
            self.add_msa_from_file(msa_file)
            msa = self.msa

        # Make the output array. Use one additional length to add np.nan value at the 0 index for gaps
        evolutionary_collapse_np = np.zeros((msa.length, msa.number_of_positions + 1))
        evolutionary_collapse_np[:, 0] = np.nan  # np.nan for all missing indices
        for idx, sequence in enumerate(msa.sequences):
            non_gaped_sequence = str(sequence).replace('-', '')
            evolutionary_collapse_np[idx, 1:len(non_gaped_sequence) + 1] = \
                metrics.hydrophobic_collapse_index(non_gaped_sequence, **kwargs)
        # Todo this should be possible now metrics.hydrophobic_collapse_index(self.msa.array)

        msa_sequence_indices = msa.sequence_indices
        iterator_np = np.cumsum(msa_sequence_indices, axis=1) * msa_sequence_indices
        aligned_hci_np = np.take_along_axis(evolutionary_collapse_np, iterator_np, axis=1)
        # Select only the query sequence indices
        # sequence_hci_np = aligned_hci_np[:, self.msa.query_indices]
        # print('aligned_hci_np', aligned_hci_np.shape, aligned_hci_np)
        # print('self.msa.query_indices', self.msa.query_indices.shape, self.msa.query_indices)
        self._collapse_profile = aligned_hci_np[:, msa.query_indices]
        # self._collapse_profile = pd.DataFrame(aligned_hci_np[:, self.msa.query_indices],
        #                                       columns=list(range(1, self.msa.query_length + 1)))  # One-indexed
        # summary = pd.concat([sequence_hci_df, pd.concat([sequence_hci_df.mean(), sequence_hci_df.std()], axis=1,
        #                                                 keys=['mean', 'std']).T])

        return self._collapse_profile
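
The cumulative-sum/gap-mask indexing described above can be reproduced in isolation. The following self-contained NumPy sketch uses toy values (it is not part of the library) and mirrors the three-column example in the docstring:

import numpy as np

# Toy data: 3 aligned sequences over 4 alignment columns; 1 = residue present, 0 = gap
gap_mask = np.array([[1, 1, 0, 1],
                     [0, 1, 1, 1],
                     [0, 1, 1, 1]])
# HCI computed on each ungapped sequence (3 residues each here)
hci_per_sequence = [np.array([0.5, 0.2, 0.5]),
                    np.array([0.4, 0.7, 0.4]),
                    np.array([0.3, 0.6, 0.3])]

# Padded array with np.nan at index 0 so that gap positions map to nan
padded = np.full((3, 4), np.nan)
for idx, hci in enumerate(hci_per_sequence):
    padded[idx, 1:len(hci) + 1] = hci

iterator = np.cumsum(gap_mask, axis=1) * gap_mask           # running residue count per row, 0 at gaps
aligned_hci = np.take_along_axis(padded, iterator, axis=1)  # nan wherever the alignment has a gap
final_hci = aligned_hci[:, gap_mask[0].astype(bool)]        # keep only columns present in the query (row 0)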

direct_coupling_analysis

direct_coupling_analysis(msa_file: AnyStr = None, **kwargs) -> ndarray

Using Boltzmann machine direct coupling analysis (bmDCA), score each sequence in an alignment based on the statistical energy compared to the learned DCA model

Parameters:

  • msa_file (AnyStr, default: None ) –

    The multiple sequence alignment file to use for collapse. Will use .msa attribute if not provided

Other Parameters:

  • file_format

    msa_supported_types_literal = 'stockholm' - The file type to read the multiple sequence alignment

Returns:

  • ndarray

    Array with shape (length, number_of_residues) where the values are the energy for each residue/sequence based on direct coupling analysis parameters

Source code in symdesign/structure/sequence.py
def direct_coupling_analysis(self, msa_file: AnyStr = None, **kwargs) -> np.ndarray:
    """Using boltzmann machine direct coupling analysis (bmDCA), score each sequence in an alignment based on the
     statistical energy compared to the learned DCA model

    Args:
        msa_file: The multiple sequence alignment file to use for collapse. Will use .msa attribute if not provided

    Keyword Args:
        file_format: msa_supported_types_literal = 'stockholm' - The file type to read the multiple sequence
            alignment

    Returns:
        Array with shape (length, number_of_residues) where the values are the energy for each residue/sequence
            based on direct coupling analysis parameters
    """
    # Check if required attributes are present
    _raise = False
    missing_attrs = []
    msa = self.msa
    if not msa:
        if msa_file:
            self.add_msa_from_file(msa_file)
            msa = self.msa
        else:
            missing_attrs.append('.msa')
    h_fields = self.h_fields
    if not h_fields:
        missing_attrs.append('.h_fields')
    j_couplings = self.j_couplings
    if not j_couplings:
        missing_attrs.append('.j_couplings')

    if missing_attrs:
        raise AttributeError(
            f"The required attribute(s) {', '.join(missing_attrs)} aren't available. Add to the "
            f'Entity before {self.direct_coupling_analysis.__name__}')

    analysis_length = msa.query_length
    idx_range = np.arange(analysis_length)
    # h_fields = bmdca.load_fields(os.path.join(data_dir, '%s_bmDCA' % self.name, 'parameters_h_final.bin'))
    # h_fields = h_fields.T  # this isn't required when coming in Fortran order, i.e. (21, analysis_length)
    # sum the h_fields values for each sequence position in every sequence
    h_values = h_fields[msa.numerical_alignment, idx_range[None, :]].sum(axis=1)
    h_sum = h_values.sum(axis=1)

    # coming in as a 4 dimension (analysis_length, analysis_length, alphabet_number, alphabet_number) ndarray
    # j_couplings = bmdca.load_couplings(os.path.join(data_dir, '%s_bmDCA' % self.name, 'parameters_J_final.bin'))
    i_idx = np.repeat(idx_range, analysis_length)
    j_idx = np.tile(idx_range, analysis_length)
    i_aa = np.repeat(msa.numerical_alignment, analysis_length)
    j_aa = np.tile(msa.numerical_alignment, msa.query_length)
    j_values = np.zeros((msa.length, len(i_idx)))
    for idx in range(msa.length):
        j_values[idx] = j_couplings[i_idx, j_idx, i_aa, j_aa]
    # this mask is not necessary when the array comes in as a non-symmetry matrix. All i > j result in 0 values...
    # mask = np.triu(np.ones((analysis_length, analysis_length)), k=1).flatten()
    # j_sum = j_values[:, mask].sum(axis=1)
    # sum the j_values for every design (axis 0) at every residue position (axis 1)
    j_values = np.array(np.split(j_values, 3, axis=1)).sum(axis=2).T
    j_sum = j_values.sum(axis=1)
    # couplings_idx = np.stack((i_idx, j_idx, i_aa, j_aa), axis=1)
    # this stacks all arrays like so
    #  [[[ i_idx1, i_idx2, ..., i_idxN],
    #    [ j_idx1, j_idx2, ..., j_idxN],  <- this is for one sequence
    #    [ i_aa 1, i_aa 2, ..., i_aa N],
    #    [ j_aa 1, j_aa 2, ..., j_aa N]],
    #   [[NEXT SEQUENCE],
    #    [
    # this stacks all arrays the transpose, which would match the indexing style on j_couplings much better...
    # couplings_idx = np.stack((i_idx, j_idx, i_aa, j_aa), axis=2)
    # j_sum = np.zeros((self.msa.length, len(couplings_idx)))
    # for idx in range(self.msa.length):
    #     j_sum[idx] = j_couplings[couplings_idx[idx]]
    # return -h_sum - j_sum
    return -h_values - j_values
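
The statistical energy being evaluated is that of a Potts model, E(s) = -sum_i h_i(s_i) - sum_{i<j} J_ij(s_i, s_j). A self-contained sketch with random toy parameters (names and shapes are assumptions; it returns one energy per sequence, whereas the method above keeps the per-residue terms):

import numpy as np

rng = np.random.default_rng(0)
seq_length, alphabet_size = 4, 21
h = rng.normal(size=(seq_length, alphabet_size))                              # fields
J = rng.normal(size=(seq_length, seq_length, alphabet_size, alphabet_size))   # couplings
sequences = rng.integers(0, alphabet_size, size=(3, seq_length))              # 3 numerically encoded sequences

i_idx, j_idx = np.triu_indices(seq_length, k=1)                               # count each residue pair once
h_sum = h[np.arange(seq_length), sequences].sum(axis=1)
j_sum = J[i_idx, j_idx, sequences[:, i_idx], sequences[:, j_idx]].sum(axis=1)
energies = -h_sum - j_sum                                                     # one statistical energy per sequence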

write_sequence_to_fasta

write_sequence_to_fasta(dtype: sequence_type_literal = 'reference_sequence', file_name: AnyStr = None, name: str = None, out_dir: AnyStr = os.getcwd()) -> AnyStr

Write a sequence to a .fasta file with fasta format and return the file location. '.fasta' is appended if not specified in the name argument

Parameters:

  • dtype (sequence_type_literal, default: 'reference_sequence' ) –

    The type of sequence to write. Can be the keywords 'reference_sequence' or 'sequence'

  • file_name (AnyStr, default: None ) –

    The explicit name of the file

  • name (str, default: None ) –

    The name of the sequence record. If not provided, the instance name will be used. Will be used as the default file_name base name if file_name not provided

  • out_dir (AnyStr, default: getcwd() ) –

    The location on disk to output the file. Only used if file_name not explicitly provided

Returns:

  • AnyStr

    The path to the output file

Source code in symdesign/structure/sequence.py
def write_sequence_to_fasta(self, dtype: sequence_type_literal = 'reference_sequence', file_name: AnyStr = None,
                            name: str = None, out_dir: AnyStr = os.getcwd()) -> AnyStr:
    """Write a sequence to a .fasta file with fasta format and return the file location.
    '.fasta' is appended if not specified in the name argument

    Args:
        dtype: The type of sequence to write. Can be the keywords 'reference_sequence' or 'sequence'
        file_name: The explicit name of the file
        name: The name of the sequence record. If not provided, the instance name will be used.
            Will be used as the default file_name base name if file_name not provided
        out_dir: The location on disk to output the file. Only used if file_name not explicitly provided
    Returns:
        The path to the output file
    """
    if dtype in sequence_types:
        # Get the attribute from the instance
        sequence = getattr(self, dtype)
    else:
        raise ValueError(
            f"Couldn't find a sequence matching the {dtype=}"
        )

    if name is None:
        name = self.name
    if file_name is None:
        file_name = os.path.join(out_dir, name)
        if not file_name.endswith('.fasta'):
            file_name = f'{file_name}.fasta'

    return write_sequence_to_fasta(sequence=sequence, name=name, file_name=file_name)

sequence_to_one_hot

sequence_to_one_hot(sequence: Sequence[str], translation_table: dict[str, int] = None, alphabet_order: int = 1) -> ndarray

Convert a sequence into a numeric array

Parameters:

  • sequence (Sequence[str]) –

    The sequence to encode

  • translation_table (dict[str, int], default: None ) –

    If a translation table (in bytes) is provided, it will be used. If not, use alphabet_order

  • alphabet_order (int, default: 1 ) –

    The alphabetical order of the amino acid alphabet. Can be either 1 or 3

Returns:

  • ndarray

    The one-hot encoded sequence with shape (sequence length, translation_table length)

Source code in symdesign/structure/sequence.py
def sequence_to_one_hot(sequence: Sequence[str], translation_table: dict[str, int] = None,
                        alphabet_order: int = 1) -> np.ndarray:
    """Convert a sequence into a numeric array

    Args:
        sequence: The sequence to encode
        translation_table: If a translation table (in bytes) is provided, it will be used. If not, use alphabet_order
        alphabet_order: The alphabetical order of the amino acid alphabet. Can be either 1 or 3

    Returns:
        The one-hot encoded sequence with shape (sequence length, translation_table length)
    """
    numeric_sequence = sequence_to_numeric(sequence, translation_table, alphabet_order=alphabet_order)
    if translation_table is None:
        # Assumes that alphabet_order is used and there aren't missing letters...
        num_entries = 20
    else:
        num_entries = len(translation_table)
    one_hot = np.zeros((len(sequence), num_entries), dtype=np.int32)
    try:
        # Set a 1 at each residue position's amino acid index
        one_hot[np.arange(len(sequence)), numeric_sequence] = 1
    except IndexError:  # Our assumption above was wrong
        from symdesign import sequence as _seq
        embedding = getattr(_seq, f'numerical_translation_alph{alphabet_order}_bytes') \
            if translation_table is None else translation_table
        raise ValueError(
            f"Couldn't produce a proper one-hot encoding for the provided sequence embedding: {embedding}")
    return one_hot
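
A usage sketch, assuming the default 1-letter alphabetical ordering and a sequence of canonical residues:

one_hot = sequence_to_one_hot('ACD')               # shape (3, 20) with the default alphabet_order=1
assert one_hot.sum(axis=1).tolist() == [1, 1, 1]   # exactly one hot index per residue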

sequence_to_numeric

sequence_to_numeric(sequence: Sequence[str], translation_table: dict[str, int] = None, alphabet_order: int = 1) -> ndarray

Convert a sequence into a numeric array

Parameters:

  • sequence (Sequence[str]) –

    The sequence to encode

  • translation_table (dict[str, int], default: None ) –

    If a translation table (in bytes) is provided, it will be used. If not, use alphabet_order

  • alphabet_order (int, default: 1 ) –

    The alphabetical order of the amino acid alphabet. Can be either 1 or 3

Returns:

  • ndarray

    The numerically encoded sequence where each entry along axis=0 is the indexed amino acid. Indices are according to the 1 letter alphabetical amino acid

Source code in symdesign/structure/sequence.py
def sequence_to_numeric(sequence: Sequence[str], translation_table: dict[str, int] = None,
                        alphabet_order: int = 1) -> np.ndarray:
    """Convert a sequence into a numeric array

    Args:
        sequence: The sequence to encode
        translation_table: If a translation table (in bytes) is provided, it will be used. If not, use alphabet_order
        alphabet_order: The alphabetical order of the amino acid alphabet. Can be either 1 or 3

    Returns:
        The numerically encoded sequence where each entry along axis=0 is the indexed amino acid. Indices are according
            to the 1 letter alphabetical amino acid
    """
    _array = np.array(list(sequence), np.string_)
    if translation_table is not None:
        return np.vectorize(translation_table.__getitem__)(_array)
    else:
        if alphabet_order == 1:
            return np.vectorize(numerical_translation_alph1_bytes.__getitem__)(_array)
        elif alphabet_order == 3:
            raise NotImplementedError('Need to make the "numerical_translation_alph3_bytes" table')
            return np.vectorize(numerical_translation_alph3_bytes.__getitem__)(_array)
        else:
            raise ValueError(
                f"The 'alphabet_order' {alphabet_order} isn't valid. Choose from either 1 or 3")

sequences_to_numeric

sequences_to_numeric(sequences: Iterable[Sequence[str]], translation_table: dict[str, int] = None, alphabet_order: int = 1) -> ndarray

Convert sequences into a numeric array

Parameters:

  • sequences (Iterable[Sequence[str]]) –

    The sequences to encode

  • translation_table (dict[str, int], default: None ) –

    If a translation table (in bytes) is provided, it will be used. If not, use alphabet_order

  • alphabet_order (int, default: 1 ) –

    The alphabetical order of the amino acid alphabet. Can be either 1 or 3

Returns:

  • ndarray

    The numerically encoded sequence where each entry along axis=0 is the indexed amino acid. Indices are according to the 1 letter alphabetical amino acid

Source code in symdesign/structure/sequence.py
def sequences_to_numeric(sequences: Iterable[Sequence[str]], translation_table: dict[str, int] = None,
                         alphabet_order: int = 1) -> np.ndarray:
    """Convert sequences into a numeric array

    Args:
        sequences: The sequences to encode
        translation_table: If a translation table (in bytes) is provided, it will be used. If not, use alphabet_order
        alphabet_order: The alphabetical order of the amino acid alphabet. Can be either 1 or 3

    Returns:
        The numerically encoded sequence where each entry along axis=0 is the indexed amino acid. Indices are according
            to the 1 letter alphabetical amino acid
    """
    _array = np.array([list(sequence) for sequence in sequences], np.string_)
    if translation_table is not None:
        return np.vectorize(translation_table.__getitem__)(_array)
    else:
        if alphabet_order == 1:
            return np.vectorize(numerical_translation_alph1_bytes.__getitem__)(_array)
        elif alphabet_order == 3:
            raise NotImplementedError('Need to make the "numerical_translation_alph3_bytes" table')
            return np.vectorize(numerical_translation_alph3_bytes.__getitem__)(_array)
        else:
            raise ValueError(
                f"The 'alphabet_order' {alphabet_order} isn't valid. Choose from either 1 or 3")

pssm_as_array

pssm_as_array(pssm: ProfileDict, alphabet: str = protein_letters_alph1, lod: bool = False) -> ndarray

Convert a position specific profile matrix into a numeric array

Parameters:

  • pssm (ProfileDict) –

    {1: {'A': 0, 'R': 0, ..., 'lod': {'A': -5, 'R': -5, ...}, 'type': 'W', 'info': 3.20, 'weight': 0.73}, 2: {}, ...}

  • alphabet (str, default: protein_letters_alph1 ) –

    The amino acid alphabet to use. Array values will be returned in this order

  • lod (bool, default: False ) –

    Whether to return the array for the log of odds values

Returns:

  • ndarray

    The numerically encoded pssm where each entry along axis 0 is the position, and the entries on axis 1 are the frequency data at every indexed amino acid. Indices are according to the specified amino acid alphabet, i.e. array([[0.1, 0.01, 0.12, ...], ...])

Source code in symdesign/structure/sequence.py
def pssm_as_array(pssm: ProfileDict, alphabet: str = protein_letters_alph1, lod: bool = False) \
        -> np.ndarray:
    """Convert a position specific profile matrix into a numeric array

    Args:
        pssm: {1: {'A': 0, 'R': 0, ..., 'lod': {'A': -5, 'R': -5, ...}, 'type': 'W', 'info': 3.20, 'weight': 0.73},
                  2: {}, ...}
        alphabet: The amino acid alphabet to use. Array values will be returned in this order
        lod: Whether to return the array for the log of odds values

    Returns:
        The numerically encoded pssm where each entry along axis 0 is the position, and the entries on axis 1 are the
            frequency data at every indexed amino acid. Indices are according to the specified amino acid alphabet,
            i.e. array([[0.1, 0.01, 0.12, ...], ...])
    """
    if lod:
        return np.array([[position_info['lod'][aa] for aa in alphabet]
                         for position_info in pssm.values()], dtype=np.float32)
    else:
        return np.array([[position_info[aa] for aa in alphabet]
                         for position_info in pssm.values()], dtype=np.float32)
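
A sketch with a minimal two-position profile and a reduced alphabet (real entries cover the full alphabet):

pssm = {
    1: {'A': 0.9, 'C': 0.1, 'lod': {'A': 3, 'C': -2}, 'type': 'A', 'info': 1.0, 'weight': 0.5},
    2: {'A': 0.2, 'C': 0.8, 'lod': {'A': -1, 'C': 3}, 'type': 'C', 'info': 0.8, 'weight': 0.4},
}
pssm_as_array(pssm, alphabet='AC')            # array([[0.9, 0.1], [0.2, 0.8]], dtype=float32)
pssm_as_array(pssm, alphabet='AC', lod=True)  # array([[ 3., -2.], [-1.,  3.]], dtype=float32)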

concatenate_profile

concatenate_profile(profiles: Iterable[Any], start_at: int = 1) -> dict[int, Any]

Combine a list of profiles (parsed PSSMs) by incrementing the entry index for each additional profile

Parameters:

  • profiles (Iterable[Any]) –

    The profiles to concatenate

  • start_at (int, default: 1 ) –

    The integer to start the resulting dictionary at

Returns: The concatenated input profiles as a single PSSM: {1: {'A': 0.04, 'C': 0.12, ..., 'lod': {'A': -5, 'C': -9, ...}, 'type': 'W', 'info': 0.00, 'weight': 0.00}, ...}

Source code in symdesign/structure/sequence.py
def concatenate_profile(profiles: Iterable[Any], start_at: int = 1) -> dict[int, Any]:
    """Combine a list of profiles (parsed PSSMs) by incrementing the entry index for each additional profile

    Args:
        profiles: The profiles to concatenate
        start_at: The integer to start the resulting dictionary at

    Returns:
        The concatenated input profiles as a single PSSM
            {1: {'A': 0.04, 'C': 0.12, ..., 'lod': {'A': -5, 'C': -9, ...}, 'type': 'W', 'info': 0.00,
                 'weight': 0.00}, ...}
    """
    _count = count(start_at)
    return {next(_count): position_profile for profile in profiles for position_profile in profile.values()}
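
A sketch with abbreviated entries showing how the indices are renumbered:

profile_a = {1: {'type': 'A'}, 2: {'type': 'C'}}
profile_b = {1: {'type': 'W'}}
concatenate_profile([profile_a, profile_b])
# {1: {'type': 'A'}, 2: {'type': 'C'}, 3: {'type': 'W'}}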

write_pssm_file

write_pssm_file(pssm: ProfileDict, file_name: AnyStr = None, name: str = None, out_dir: AnyStr = os.getcwd()) -> AnyStr | None

Create a PSI-BLAST format PSSM file from a PSSM dictionary. Assumes residue numbering is correct!

Parameters:

  • pssm (ProfileDict) –

    A dictionary which has the keys: 'A', 'C', ... (all aa's), 'lod', 'type', 'info', 'weight'

  • file_name (AnyStr, default: None ) –

    The explicit name of the file

  • name (str, default: None ) –

    The name of the file. Will be used as the default file_name base name if file_name not provided

  • out_dir (AnyStr, default: getcwd() ) –

    The location on disk to output the file. Only used if file_name not explicitly provided

Returns:

  • AnyStr | None

    Disk location of newly created .pssm file

Source code in symdesign/structure/sequence.py
def write_pssm_file(pssm: ProfileDict, file_name: AnyStr = None, name: str = None,
                    out_dir: AnyStr = os.getcwd()) -> AnyStr | None:
    """Create a PSI-BLAST format PSSM file from a PSSM dictionary. Assumes residue numbering is correct!

    Args:
        pssm: A dictionary which has the keys: 'A', 'C', ... (all aa's), 'lod', 'type', 'info', 'weight'
        file_name: The explicit name of the file
        name: The name of the file. Will be used as the default file_name base name if file_name not provided
        out_dir: The location on disk to output the file. Only used if file_name not explicitly provided

    Returns:
        Disk location of newly created .pssm file
    """
    if not pssm:
        return None

    # Find out if the pssm has values expressed as frequencies (percentages) or as counts and modify accordingly
    if isinstance(list(pssm.values())[0]['lod']['A'], float):
        separation1 = " " * 4
    else:
        separation1 = " " * 3

    if file_name is None:
        if name is None:
            raise ValueError(f'Must provide argument "file_name" or "name" as a str to {write_pssm_file.__name__}')
        else:
            file_name = os.path.join(out_dir, name)

    if os.path.splitext(file_name)[-1] == '':  # No extension
        file_name = f'{file_name}.pssm'

    with open(file_name, 'w') as f:
        f.write(f'\n\n{" " * 12}{separation1.join(protein_letters_alph3)}'
                f'{separation1}{(" " * 3).join(protein_letters_alph3)}\n')
        for residue_number, profile in pssm.items():
            if isinstance(profile['lod']['A'], float):  # lod_freq:  # relevant for favor_fragment
                lod_string = ' '.join(f'{profile["lod"][aa]:>4.2f}' for aa in protein_letters_alph3) \
                    + ' '
            else:
                lod_string = ' '.join(f'{profile["lod"][aa]:>3d}' for aa in protein_letters_alph3) \
                    + ' '

            if isinstance(profile['A'], float):  # counts_freq: # relevant for freq calculations
                counts_string = ' '.join(f'{floor(profile[aa] * 100):>3.0f}' for aa in
                                         protein_letters_alph3) \
                    + ' '
            else:
                counts_string = ' '.join(f'{profile[aa]:>3d}' for aa in protein_letters_alph3) \
                    + ' '
            f.write(f'{residue_number:>5d} {profile["type"]:1s}   {lod_string:80s} {counts_string:80s} '
                    f'{round(profile.get("info", 0.), 4):4.2f} {round(profile.get("weight", 0.), 4):4.2f}\n')

    return file_name
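
A usage sketch, assuming pssm is a populated ProfileDict with the keys listed above and that the output directory exists:

path = write_pssm_file(pssm, name='design', out_dir='output')  # writes and returns 'output/design.pssm'
write_pssm_file({}, name='design')                             # an empty profile returns None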

format_frequencies

format_frequencies(frequency_list: list, flip: bool = False) -> dict[str, dict[str, float]]

Format list of paired frequency data into parsable paired format

Parameters:

  • frequency_list (list) –

    [(('D', 'A'), 0.0822), (('D', 'V'), 0.0685), ...]

  • flip (bool, default: False ) –

    Whether to invert the mapping of the internal tuple

Returns: {'A': {'S': 0.02, 'T': 0.12}, ...}

Source code in symdesign/structure/sequence.py
def format_frequencies(frequency_list: list, flip: bool = False) -> dict[str, dict[str, float]]:
    """Format list of paired frequency data into parsable paired format

    Args:
        frequency_list: [(('D', 'A'), 0.0822), (('D', 'V'), 0.0685), ...]
        flip: Whether to invert the mapping of the internal tuple
    Returns:
        {'A': {'S': 0.02, 'T': 0.12}, ...}
    """
    if flip:
        i, j = 1, 0
    else:
        i, j = 0, 1
    freq_d = {}
    for tup in frequency_list:
        aa_mapped = tup[0][i]  # 0
        aa_paired = tup[0][j]  # 1
        freq = tup[1]
        if aa_mapped in freq_d:
            freq_d[aa_mapped][aa_paired] = freq
        else:
            freq_d[aa_mapped] = {aa_paired: freq}

    return freq_d
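
A usage sketch:

pairs = [(('D', 'A'), 0.0822), (('D', 'V'), 0.0685), (('T', 'S'), 0.012)]
format_frequencies(pairs)
# {'D': {'A': 0.0822, 'V': 0.0685}, 'T': {'S': 0.012}}
format_frequencies(pairs, flip=True)
# {'A': {'D': 0.0822}, 'V': {'D': 0.0685}, 'S': {'T': 0.012}}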

overlap_consensus

overlap_consensus(issm, aa_set)

Find the overlap constrained consensus sequence

Parameters:

  • issm (dict) –

    {1: {'A': 0.1, 'C': 0.0, ...}, 14: {...}, ...}

  • aa_set (dict) –

    {residue: {'A', 'I', 'M', 'V'}, ...}

Returns: (dict): {23: 'T', 29: 'A', ...}

Source code in symdesign/structure/sequence.py
def overlap_consensus(issm, aa_set):  # UNUSED
    """Find the overlap constrained consensus sequence

    Args:
        issm (dict): {1: {'A': 0.1, 'C': 0.0, ...}, 14: {...}, ...}
        aa_set (dict): {residue: {'A', 'I', 'M', 'V'}, ...}
    Returns:
        (dict): {23: 'T', 29: 'A', ...}
    """
    consensus = {}
    for res in aa_set:
        max_freq = 0.0
        for aa in aa_set[res]:
            # if max_freq < issm[(res, partner)][]:
            if issm[res][aa] > max_freq:
                max_freq = issm[res][aa]
                consensus[res] = aa

    return consensus

get_cluster_dicts

get_cluster_dicts(db: str = putils.biological_interfaces, id_list: list[str] = None) -> dict[str, dict]

Generate an interface specific scoring matrix from the fragment library

Parameters:

  • db (str, default: biological_interfaces ) –

    The source of the fragment information

  • id_list (list[str], default: None ) –

    [1_2_24, ...]

Returns:

  • cluster_dict ( dict[str, dict] ) –

    {'1_2_45': {'size': ..., 'rmsd': ..., 'rep': ..., 'mapped': ..., 'paired': ...}, ...}

Source code in symdesign/structure/sequence.py
def get_cluster_dicts(db: str = putils.biological_interfaces, id_list: list[str] = None) -> dict[str, dict]:
    """Generate an interface specific scoring matrix from the fragment library

    Args:
        db: The source of the fragment information
        id_list: [1_2_24, ...]

    Returns:
         cluster_dict: {'1_2_45': {'size': ..., 'rmsd': ..., 'rep': ..., 'mapped': ..., 'paired': ...}, ...}
    """
    info_db = putils.frag_directory[db]
    if id_list is None:
        directory_list = utils.get_base_root_paths_recursively(info_db)
    else:
        directory_list = []
        for _id in id_list:
            c_id = _id.split('_')
            _dir = os.path.join(info_db, c_id[0], c_id[0] + '_' + c_id[1], c_id[0] + '_' + c_id[1] + '_' + c_id[2])
            directory_list.append(_dir)

    cluster_dict = {}
    for cluster in directory_list:
        filename = os.path.join(cluster, os.path.basename(cluster) + '.pkl')
        cluster_dict[os.path.basename(cluster)] = utils.unpickle(filename)

    return cluster_dict

fragment_overlap

fragment_overlap(residues, interaction_graph, freq_map)

Take fragment contact list to find the possible AA types allowed in fragment pairs from the contact list

Parameters:

  • residues (iter) –

    Iterable of residue numbers

  • interaction_graph (dict) –

    {52: [54, 56, 72, 206], ...}

  • freq_map (dict) –

    {(78, 87, ...): {'A': {'S': 0.02, 'T': 0.12}, ...}, ...}

Returns: overlap (dict): {residue: {'A', 'I', 'M', 'V'}, ...}

Source code in symdesign/structure/sequence.py
def fragment_overlap(residues, interaction_graph, freq_map):
    """Take fragment contact list to find the possible AA types allowed in fragment pairs from the contact list

    Args:
        residues (iter): Iterable of residue numbers
        interaction_graph (dict): {52: [54, 56, 72, 206], ...}
        freq_map (dict): {(78, 87, ...): {'A': {'S': 0.02, 'T': 0.12}, ...}, ...}
    Returns:
        overlap (dict): {residue: {'A', 'I', 'M', 'V'}, ...}
    """
    overlap = {}
    for res in residues:
        overlap[res] = set()
        if res in interaction_graph:  # check for existence as some fragment info is not in the interface set
            # overlap[res] = set()
            for partner in interaction_graph[res]:
                if (res, partner) in freq_map:
                    overlap[res] |= set(freq_map[(res, partner)].keys())

    for res in residues:
        if res in interaction_graph:  # check for existence as some fragment info is not in the interface set
            for partner in interaction_graph[res]:
                if (res, partner) in freq_map:
                    overlap[res] &= set(freq_map[(res, partner)].keys())

    return overlap
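
A sketch with toy inputs. Residue 52 has two fragment partners; only the amino acid types allowed by every partner survive the intersection:

residues = [52, 53]
interaction_graph = {52: [54, 56]}
freq_map = {(52, 54): {'A': 0.5, 'V': 0.5}, (52, 56): {'V': 0.7, 'I': 0.3}}
fragment_overlap(residues, interaction_graph, freq_map)
# {52: {'V'}, 53: set()}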

offset_index

offset_index(dictionary: dict[int, Any], to_zero: bool = False) -> dict[int, dict]

Modify the index of a sequence dictionary. The default shifts to one-indexed; to_zero=True gives zero-indexed

Source code in symdesign/structure/sequence.py
def offset_index(dictionary: dict[int, Any], to_zero: bool = False) -> dict[int, dict]:
    """Modify the index of a sequence dictionary. Default is to one-indexed. to_zero=True gives zero-indexed"""
    if to_zero:
        return {residue - ZERO_OFFSET: dictionary[residue] for residue in dictionary}
    else:
        return {residue + ZERO_OFFSET: dictionary[residue] for residue in dictionary}
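
A usage sketch, assuming ZERO_OFFSET == 1 as the one-indexed default implies:

zero_indexed = {0: {'A': 0.1}, 1: {'A': 0.3}}
one_indexed = offset_index(zero_indexed)     # {1: {'A': 0.1}, 2: {'A': 0.3}}
offset_index(one_indexed, to_zero=True)      # {0: {'A': 0.1}, 1: {'A': 0.3}}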

residue_object_to_number

residue_object_to_number(residue_dict: dict[str, Iterable['structure.base.Residue']]) -> dict[str, list[tuple[int, ...]]]

Convert sets of PDB.Residue objects to residue numbers

Parameters:

  • residue_dict (dict) –

    {'key1': [(residue1_ca_atom, residue2_ca_atom, ...), ...] ...}

Returns: residue_dict (dict): {'key1': [(78, 87, ...),], ...} - Entry mapped to residue sets

Source code in symdesign/structure/sequence.py
def residue_object_to_number(
    residue_dict: dict[str, Iterable['structure.base.Residue']]
) -> dict[str, list[tuple[int, ...]]]:
    """Convert sets of PDB.Residue objects to residue numbers

    Args:
        residue_dict (dict): {'key1': [(residue1_ca_atom, residue2_ca_atom, ...), ...] ...}
    Returns:
        residue_dict (dict): {'key1': [(78, 87, ...),], ...} - Entry mapped to residue sets
    """
    for entry, residue_sets in residue_dict.items():
        # Replace each set of Residue objects with a tuple of their residue numbers
        residue_dict[entry] = [tuple(residue.residue_number for residue in residue_set)
                               for residue_set in residue_sets]

    return residue_dict

convert_to_residue_cluster_map

convert_to_residue_cluster_map(residue_cluster_dict, frag_range)

Make a residue and cluster/fragment index map

Parameters:

  • residue_cluster_dict (dict) –

    {'1_2_45': [(residue1_ca_atom, residue2_ca_atom), ...] ...}

  • frag_range (tuple) –

    A range of the fragment size to search over. Ex: (-2, 3) for fragments of length 5

Returns: cluster_map (dict): {48: {'source': 'mapped', 'cluster': [(-2, 1_1_54), ...]}, ...} Where the key is the 0 indexed residue id

Source code in symdesign/structure/sequence.py
def convert_to_residue_cluster_map(residue_cluster_dict, frag_range):
    """Make a residue and cluster/fragment index map

    Args:
        residue_cluster_dict (dict): {'1_2_45': [(residue1_ca_atom, residue2_ca_atom), ...] ...}
        frag_range (tuple): A range of the fragment size to search over. Ex: (-2, 3) for fragments of length 5
    Returns:
        cluster_map (dict): {48: {'source': 'mapped', 'cluster': [(-2, 1_1_54), ...]}, ...}
            Where the key is the 0 indexed residue id
    """
    cluster_map = {}
    for cluster in residue_cluster_dict:
        for pair in range(len(residue_cluster_dict[cluster])):
            for i, residue_atom in enumerate(residue_cluster_dict[cluster][pair]):
                # for each residue in map add the same cluster to the range of fragment residue numbers
                residue_num = residue_atom.residue_number - ZERO_OFFSET  # zero index
                for j in range(*frag_range):
                    if residue_num + j not in cluster_map:
                        if i == 0:
                            cluster_map[residue_num + j] = {'source': 'mapped', 'cluster': []}
                        else:
                            cluster_map[residue_num + j] = {'source': 'paired', 'cluster': []}
                    cluster_map[residue_num + j]['cluster'].append((j, cluster))

    return cluster_map
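
A hedged sketch with stand-in residue objects; only the .residue_number attribute is accessed, so SimpleNamespace suffices for illustration, and the import path is assumed.

from types import SimpleNamespace

from symdesign.structure.sequence import convert_to_residue_cluster_map  # Assumed import path

# Stand-ins for the mapped and paired fragment residues of cluster 1_2_45
res48, res122 = SimpleNamespace(residue_number=48), SimpleNamespace(residue_number=122)
residue_cluster_dict = {'1_2_45': [(res48, res122)]}

cluster_map = convert_to_residue_cluster_map(residue_cluster_dict, frag_range=(-2, 3))
print(cluster_map[47])   # {'source': 'mapped', 'cluster': [(0, '1_2_45')]} - zero-indexed residue 47 is position 48
print(cluster_map[121])  # {'source': 'paired', 'cluster': [(0, '1_2_45')]}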

consensus_sequence

consensus_sequence(pssm)

Return the consensus sequence from a PSSM

Parameters:

  • pssm (dict) –

    pssm dictionary

Returns: consensus_identities (dict): {1: 'M', 2: 'H', ...} One-indexed

Source code in symdesign/structure/sequence.py
def consensus_sequence(pssm):
    """Return the consensus sequence from a PSSM

    Args:
        pssm (dict): pssm dictionary
    Returns:
        consensus_identities (dict): {1: 'M', 2: 'H', ...} One-indexed
    """
    consensus_identities = {}
    for residue in pssm:
        max_lod = 0
        max_res = pssm[residue]['type']
        for aa in protein_letters_alph3:
            if pssm[residue]['lod'][aa] > max_lod:
                max_lod = pssm[residue]['lod'][aa]
                max_res = aa
        consensus_identities[residue + ZERO_OFFSET] = max_res

    return consensus_identities
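
A minimal sketch, assuming the import path below; the pssm carries only the fields consensus_sequence reads. Position 1 has no positive lod, so its native type is kept, while position 2 takes the highest positive lod.

from symdesign.structure.sequence import consensus_sequence, protein_letters_alph3  # Assumed import path

flat_lods = dict.fromkeys(protein_letters_alph3, 0)
pssm = {
    0: {'type': 'M', 'lod': dict(flat_lods)},
    1: {'type': 'H', 'lod': {**flat_lods, 'K': 3}},
}
print(consensus_sequence(pssm))  # {1: 'M', 2: 'K'}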

sequence_difference

sequence_difference(seq1: Sequence, seq2: Sequence, d: dict = None, matrix: str = 'BLOSUM62') -> float

Returns the sequence difference between two sequence iterators

Parameters:

  • seq1 (Sequence) –

    Either an iterable with residue type as array, or key, with residue type as d[seq1][residue]['type']

  • seq2 (Sequence) –

    Either an iterable with residue type as array, or key, with residue type as d[seq2][residue]['type']

  • d (dict, default: None ) –

    The dictionary to look up seq1 and seq2 if they are keys in d

  • matrix (str, default: 'BLOSUM62' ) –

    The type of matrix to score the sequence differences on

Returns: The computed sequence difference between seq1 and seq2

Source code in symdesign/structure/sequence.py
def sequence_difference(seq1: Sequence, seq2: Sequence, d: dict = None, matrix: str = 'BLOSUM62') -> float:  # TODO AMS
    """Returns the sequence difference between two sequence iterators

    Args:
        seq1: Either an iterable with residue type as array, or key, with residue type as d[seq1][residue]['type']
        seq2: Either an iterable with residue type as array, or key, with residue type as d[seq2][residue]['type']
        d: The dictionary to look up seq1 and seq2 if they are keys in d
        matrix: The type of matrix to score the sequence differences on
    Returns:
        The computed sequence difference between seq1 and seq2
    """
    if d is not None:
        pairs = [(d[seq1][residue]['type'], d[seq2][residue]['type']) for residue in d[seq1]]
    else:
        pairs = zip(seq1, seq2)

    matrix_ = substitution_matrices.load(matrix)
    # Try the pair in both orientations; score characters missing from the matrix (e.g. gaps) as 0
    scores = [matrix_.get((letter1, letter2), matrix_.get((letter2, letter1), 0))
              for letter1, letter2 in pairs]

    return sum(scores)
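
A usage sketch with two short sequences, assuming the import path below; the result is the sum of BLOSUM62 values over the aligned pairs.

from symdesign.structure.sequence import sequence_difference  # Assumed import path

# BLOSUM62: (M, M) = 5, (K, R) = 2, (V, V) = 4
print(sequence_difference('MKV', 'MRV'))  # 11.0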

msa_from_dictionary

msa_from_dictionary(named_sequences: dict[str, str]) -> MultipleSeqAlignment

Create a BioPython MultipleSeqAlignment from a dictionary of named sequences

Parameters:

  • named_sequences (dict[str, str]) –

    {name: sequence, ...} ex: {'clean_asu': 'MNTEELQVAAFEI...', ...}

Returns: The MultipleSeqAlignment object for the provided sequences

Source code in symdesign/structure/sequence.py
def msa_from_dictionary(named_sequences: dict[str, str]) -> MultipleSeqAlignment:
    """Create a MultipleSequenceAlignment from a dictionary of named sequences

    Args:
        named_sequences: {name: sequence, ...} ex: {'clean_asu': 'MNTEELQVAAFEI...', ...}
    Returns:
        The MultipleSequenceAlignment object for the provided sequences
    """
    return MultipleSeqAlignment([SeqRecord(Seq(sequence), annotations={'molecule_type': 'Protein'}, id=name)
                                 for name, sequence in named_sequences.items()])
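
A usage sketch, assuming the import path below; the sequences must already be aligned (equal length), since MultipleSeqAlignment enforces a rectangular alignment.

from symdesign.structure.sequence import msa_from_dictionary  # Assumed import path

alignment = msa_from_dictionary({'design1': 'MKV-LL', 'design2': 'MRVALL'})
print(len(alignment))                    # 2 records
print(alignment.get_alignment_length())  # 6 columns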

msa_from_seq_records

msa_from_seq_records(seq_records: Iterable[SeqRecord]) -> MultipleSeqAlignment

Create a BioPython Multiple Sequence Alignment from a SeqRecord Iterable

Parameters:

  • seq_records (Iterable[SeqRecord]) –

    An iterable of SeqRecord objects, e.g. [SeqRecord(Seq('MNTEELQVAAFEI...', ...), id="Alpha"), SeqRecord(Seq('MNTEEL-VAAFEI...', ...), id="Beta"), ...]

Returns: The MultipleSeqAlignment object built from the provided SeqRecords

Source code in symdesign/structure/sequence.py
def msa_from_seq_records(seq_records: Iterable[SeqRecord]) -> MultipleSeqAlignment:
    """Create a BioPython Multiple Sequence Alignment from a SeqRecord Iterable

    Args:
        seq_records: An iterable of SeqRecord objects, e.g.
            [SeqRecord(Seq('MNTEELQVAAFEI...', ...), id="Alpha"),
             SeqRecord(Seq('MNTEEL-VAAFEI...', ...), id="Beta"), ...]
    Returns:
        The MultipleSeqAlignment object built from the provided SeqRecords
    """
    return MultipleSeqAlignment(seq_records)

make_mutations_chain_agnostic

make_mutations_chain_agnostic(mutations)

Remove chain identifier from mutation dictionary

Parameters:

  • mutations (dict) –

    {design: {chain_id: {mutation_index: {'from': 'A', 'to': 'K'}, ...}, ...}, ...}

Returns: (dict): {design: {mutation_index: {'from': 'A', 'to': 'K'}, ...}, ...}

Source code in symdesign/structure/sequence.py
def make_mutations_chain_agnostic(mutations):
    """Remove chain identifier from mutation dictionary

    Args:
        mutations (dict): {design: {chain_id: {mutation_index: {'from': 'A', 'to': 'K'}, ...}, ...}, ...}
    Returns:
        (dict): {design: {mutation_index: {'from': 'A', 'to': 'K'}, ...}, ...}
    """
    flattened_mutations = {}
    for design, chain_mutations in mutations.items():
        flattened_mutations[design] = {}
        # Use a distinct name to avoid shadowing the 'mutations' argument
        for chain, chain_specific_mutations in chain_mutations.items():
            flattened_mutations[design].update(chain_specific_mutations)

    return flattened_mutations
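
A toy example, assuming the import path below; mutations from chains A and B are merged into one chain-agnostic dictionary per design.

from symdesign.structure.sequence import make_mutations_chain_agnostic  # Assumed import path

mutations = {
    'design1': {'A': {15: {'from': 'A', 'to': 'K'}},
                'B': {22: {'from': 'S', 'to': 'T'}}},
}
print(make_mutations_chain_agnostic(mutations))
# {'design1': {15: {'from': 'A', 'to': 'K'}, 22: {'from': 'S', 'to': 'T'}}}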

simplify_mutation_dict

simplify_mutation_dict(mutations: dict[str, mutation_dictionary], to: bool = True) -> dict[str, mutation_dictionary]

Simplify mutation dictionary to 'to'/'from' AA key

Parameters:

  • mutations (dict[str, mutation_dictionary]) –

    Ex: {alias: {mutation_index: {'from': 'A', 'to': 'K'}, ...}, ...}

  • to (bool, default: True ) –

    Whether to simplify with the 'to' AA key (True) or the 'from' AA key (False)

Returns: The simplified mutation dictionary. Ex: {alias: {mutation_index: 'K', ...}, ...}

Source code in symdesign/structure/sequence.py
def simplify_mutation_dict(mutations: dict[str, mutation_dictionary], to: bool = True) \
        -> dict[str, mutation_dictionary]:
    """Simplify mutation dictionary to 'to'/'from' AA key

    Args:
        mutations: Ex: {alias: {mutation_index: {'from': 'A', 'to': 'K'}, ...}, ...}
        to: Whether to simplify with the 'to' AA key (True) or the 'from' AA key (False)
    Returns:
        The simplified mutation dictionary. Ex: {alias: {mutation_index: 'K', ...}, ...}
    """
    simplification = 'to' if to else 'from'

    for alias, indexed_mutations in mutations.items():
        for index, mutation in indexed_mutations.items():
            mutations[alias][index] = mutation[simplification]

    return mutations
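
A toy example, assuming the import path below; note that the input dictionary is simplified in place, so rebuild it before calling again with the other key.

from symdesign.structure.sequence import simplify_mutation_dict  # Assumed import path

mutations = {'design1': {15: {'from': 'A', 'to': 'K'}, 22: {'from': 'S', 'to': 'T'}}}
print(simplify_mutation_dict(mutations))  # {'design1': {15: 'K', 22: 'T'}}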

weave_mutation_dict

weave_mutation_dict(sorted_freq, mut_prob, resi_divergence, int_divergence, des_divergence)

Weave per-residue amino acid frequencies and divergence scores into a final, one-indexed dictionary

Parameters:

  • sorted_freq (dict) –

    {15: ['S', 'A', 'T'], ... }

  • mut_prob (dict) –

    {15: {'A': 0.05, 'C': 0.001, 'D': 0.1, ...}, 16: {}, ...}

  • resi_divergence (dict) –

    {15: 0.732, 16: 0.552, ...}

  • int_divergence (dict) –

    {15: 0.732, 16: 0.552, ...}

  • des_divergence (dict) –

    {15: 0.732, 16: 0.552, ...}

Returns: weaved_dict (dict): {16: {'S': 0.134, 'A': 0.050, ..., 'jsd': 0.732, 'int_jsd': 0.412, 'des_jsd': 0.401}, ...}

Source code in symdesign/structure/sequence.py
def weave_mutation_dict(sorted_freq, mut_prob, resi_divergence, int_divergence, des_divergence):
    """Make final dictionary, index to sequence

    Args:
        sorted_freq (dict): {15: ['S', 'A', 'T'], ... }
        mut_prob (dict): {15: {'A': 0.05, 'C': 0.001, 'D': 0.1, ...}, 16: {}, ...}
        resi_divergence (dict): {15: 0.732, 16: 0.552, ...}
        int_divergence (dict): {15: 0.732, 16: 0.552, ...}
        des_divergence (dict): {15: 0.732, 16: 0.552, ...}
    Returns:
        weaved_dict (dict): {16: {'S': 0.134, 'A': 0.050, ..., 'jsd': 0.732, 'int_jsd': 0.412, 'des_jsd': 0.401}, ...}
    """
    weaved_dict = {}
    for residue in sorted_freq:
        final_resi = residue + ZERO_OFFSET
        weaved_dict[final_resi] = {}
        for aa in sorted_freq[residue]:
            weaved_dict[final_resi][aa] = round(mut_prob[residue][aa], 3)
        weaved_dict[final_resi]['jsd'] = resi_divergence[residue]
        weaved_dict[final_resi]['int_jsd'] = int_divergence[residue]
        weaved_dict[final_resi]['des_jsd'] = des_divergence[residue]

    return weaved_dict
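
A toy example for a single position, assuming the import path below; the zero-indexed input position 15 becomes the one-indexed key 16, with probabilities rounded to three decimals.

from symdesign.structure.sequence import weave_mutation_dict  # Assumed import path

sorted_freq = {15: ['S', 'A']}
mut_prob = {15: {'S': 0.1319, 'A': 0.05}}
resi_divergence = {15: 0.732}
int_divergence = {15: 0.412}
des_divergence = {15: 0.401}

print(weave_mutation_dict(sorted_freq, mut_prob, resi_divergence, int_divergence, des_divergence))
# {16: {'S': 0.132, 'A': 0.05, 'jsd': 0.732, 'int_jsd': 0.412, 'des_jsd': 0.401}}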

clean_gaped_columns

clean_gaped_columns(alignment_dict, correct_index)

Clean an alignment dictionary by keeping only the columns listed in correct_index, re-keyed as contiguous, zero-indexed positions

Source code in symdesign/structure/sequence.py
def clean_gaped_columns(alignment_dict, correct_index):  # UNUSED
    """Cleans an alignment dictionary by revising key list with correctly indexed positions. 0 indexed"""
    return {i: alignment_dict[index] for i, index in enumerate(correct_index)}

msa_to_prob_distribution

msa_to_prob_distribution(alignment)

Turn Alignment dictionary into a probability distribution

Parameters:

  • alignment (dict) –

    {'meta': {'num_sequences': 214, 'query': 'MGSTHLVLK...', 'query_with_gaps': 'MGS--THLVLK...'},
     'msa': (Bio.Align.MultipleSeqAlignment),
     'counts': {1: {'A': 13, 'C': 1, 'D': 23, ...}, 2: {}, ...},
     'frequencies': {1: {'A': 0.05, 'C': 0.001, 'D': 0.1, ...}, 2: {}, ...},
     'rep': {1: 210, 2: 211, ...}}
    The msa formatted with counts and indexed by residue

Returns: (dict): The same alignment dictionary with 'frequencies' populated from 'counts' divided by 'rep':
    {'meta': {'num_sequences': 214, 'query': 'MGSTHLVLK...', 'query_with_gaps': 'MGS--THLVLK...'},
     'msa': (Bio.Align.MultipleSeqAlignment),
     'counts': {1: {'A': 13, 'C': 1, 'D': 23, ...}, 2: {}, ...},
     'frequencies': {1: {'A': 0.05, 'C': 0.001, 'D': 0.1, ...}, 2: {}, ...},
     'rep': {1: 210, 2: 211, ...}}

Source code in symdesign/structure/sequence.py
def msa_to_prob_distribution(alignment):
    """Turn Alignment dictionary into a probability distribution

    Args:
        alignment (dict): {'meta': {'num_sequences': 214, 'query': 'MGSTHLVLK...', 'query_with_gaps': 'MGS--THLVLK...'},
                           'msa': (Bio.Align.MultipleSeqAlignment)
                           'counts': {1: {'A': 13, 'C': 1, 'D': 23, ...}, 2: {}, ...},
                           'frequencies': {1: {'A': 0.05, 'C': 0.001, 'D': 0.1, ...}, 2: {}, ...},
                           'rep': {1: 210, 2:211, ...}}
            The msa formatted with counts and indexed by residue
    Returns:
        (dict): {'meta': {'num_sequences': 214, 'query': 'MGSTHLVLK...', 'query_with_gaps': 'MGS--THLVLK...'},
                 'msa': (Bio.Align.MultipleSeqAlignment)
                 'counts': {1: {'A': 13, 'C': 1, 'D': 23, ...}, 2: {}, ...},
                 'frequencies': {1: {'A': 0.05, 'C': 0.001, 'D': 0.1, ...}, 2: {}, ...},
                 'rep': {1: 210, 2:211, ...}}
            The msa formatted with counts and indexed by residue
    """
    alignment['frequencies'] = {}
    for residue, amino_acid_counts in alignment['counts'].items():
        total_column_weight = alignment['rep'][residue]
        assert total_column_weight != 0, '%s: Processing error... Downstream cannot divide by 0. Position = %s' \
                                         % (msa_to_prob_distribution.__name__, residue)
        alignment['frequencies'][residue] = {aa: count / total_column_weight for aa, count in amino_acid_counts.items()}

    return alignment
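
A toy sketch using only the fields the function reads, assuming the import path below; each count is divided by that column's 'rep' weight to populate 'frequencies'.

from symdesign.structure.sequence import msa_to_prob_distribution  # Assumed import path

alignment = {
    'counts': {1: {'A': 15, 'C': 5}, 2: {'A': 2, 'C': 8}},
    'rep': {1: 20, 2: 10},
}
alignment = msa_to_prob_distribution(alignment)
print(alignment['frequencies'])  # {1: {'A': 0.75, 'C': 0.25}, 2: {'A': 0.2, 'C': 0.8}}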

window_score

window_score(scores: dict[int, float] | Sequence[float], window_len: int, lambda_: float = 0.5) -> dict

Takes an MSA score dict and transforms it so that each position is a weighted average of the surrounding positions. Positions with scores less than zero are not changed and are ignored in the calculation

Modified from Capra and Singh 2007 code

Notes

The input should be a one-indexed dictionary (if a dictionary)

Parameters:

  • scores (dict[int, float] | Sequence[float]) –

    A dictionary with scores.

  • window_len (int) –

    Number of residues on either side of the current residue

  • lambda_ (float, default: 0.5 ) –

    Float between 0 and 1 which parameterizes the amount of the score to pull from

Returns:

  • dict

    A dictionary with the modified scores for the specified window, one-indexed

Source code in symdesign/structure/sequence.py
def window_score(scores: dict[int, float] | Sequence[float], window_len: int, lambda_: float = 0.5) -> dict:  # UNUSED  incorporate into MultipleSequenceAlignment
    """Takes an MSA score dict and transforms so that each position is a weighted average of the surrounding positions.
    Positions with scores less than zero are not changed and are ignored calculation

    Modified from Capra and Singh 2007 code

    Notes:
        The input should be a one-indexed dictionary (if a dictionary)

    Args:
        scores: A dictionary with scores.
        window_len: Number of residues on either side of the current residue
        lambda_: Float between 0 and 1 which parameterizes the amount of the score to pull from

    Returns:
        A dictionary with the modified scores for the specified window, one-indexed
    """
    if window_len == 0:
        try:
            return dict(enumerate(scores.values(), ZERO_OFFSET))
        except AttributeError:  # Not a dict
            return dict(enumerate(scores, ZERO_OFFSET))
    else:
        window_scores = {}
        number_scores = len(scores)
        for i in range(ZERO_OFFSET, number_scores + ZERO_OFFSET):  # Start at 1 to match the one-indexed input
            s = number_terms = 0
            if i <= window_len:
                for j in range(ZERO_OFFSET, i + window_len + ZERO_OFFSET):
                    if i != j:
                        number_terms += 1
                        s += scores[j]
            elif i + window_len > number_scores:
                for j in range(i - window_len, number_scores + ZERO_OFFSET):
                    if i != j:
                        number_terms += 1
                        s += scores[j]
            else:
                for j in range(i - window_len, i + window_len + ZERO_OFFSET):
                    if i != j:
                        number_terms += 1
                        s += scores[j]
            window_scores[i] = (1 - lambda_) * (s / number_terms) + lambda_ * scores[i]

        return window_scores
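
With the one-indexed loop above, a toy smoothing sketch (import path assumed; the function is flagged UNUSED upstream): each position becomes half the average of its neighbors plus half its own score.

from symdesign.structure.sequence import window_score  # Assumed import path

scores = {1: 0.2, 2: 0.5, 3: 0.9, 4: 0.4, 5: 0.1}  # One-indexed, per the Notes
smoothed = window_score(scores, window_len=1, lambda_=0.5)
print(round(smoothed[3], 3))  # 0.675 = 0.5 * ((0.5 + 0.4) / 2) + 0.5 * 0.9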

rank_possibilities

rank_possibilities(probability_dict)

Gather alternative residues and sort them by probability.

Parameters:

  • probability_dict (dict) –

    {15: {'A': 0.05, 'C': 0.001, 'D': 0.1, ...}, 16: {}, ...}

Returns: sorted_alternates_dict (dict): {15: ['S', 'A', 'T'], ... }

Source code in symdesign/structure/sequence.py
def rank_possibilities(probability_dict):  # UNUSED  incorporate into MultipleSequenceAlignment
    """Gather alternative residues and sort them by probability.

    Args:
        probability_dict (dict): {15: {'A': 0.05, 'C': 0.001, 'D': 0.1, ...}, 16: {}, ...}
    Returns:
         sorted_alternates_dict (dict): {15: ['S', 'A', 'T'], ... }
    """
    sorted_alternates_dict = {}
    for residue in probability_dict:
        residue_probability_list = []
        for aa in probability_dict[residue]:
            if probability_dict[residue][aa] > 0:
                residue_probability_list.append((aa, round(probability_dict[residue][aa], 5)))  # tuple instead of list
        residue_probability_list.sort(key=lambda tup: tup[1], reverse=True)
        # [('S', 0.13190), ('A', 0.0500), ...]
        sorted_alternates_dict[residue] = [aa[0] for aa in residue_probability_list]

    return sorted_alternates_dict
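
A toy example, assuming the import path below; zero-probability types are dropped and the rest are returned ordered from most to least probable.

from symdesign.structure.sequence import rank_possibilities  # Assumed import path

probabilities = {15: {'A': 0.05, 'S': 0.1319, 'T': 0.0, 'G': 0.02}}
print(rank_possibilities(probabilities))  # {15: ['S', 'A', 'G']}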

multi_chain_alignment

multi_chain_alignment(mutated_sequences, **kwargs)

Combine different chains' Multiple Sequence Alignments into a single MSA. One-indexed

Parameters:

  • mutated_sequences (dict) –

    {chain: {name: sequence, ...}, ...}

Returns: (MultipleSequenceAlignment): The MSA object with counts, frequencies, sequences, and indexed by residue

Source code in symdesign/structure/sequence.py
def multi_chain_alignment(mutated_sequences, **kwargs):
    """Combines different chain's Multiple Sequence Alignments into a single MSA. One-indexed

    Args:
        mutated_sequences (dict): {chain: {name: sequence, ...}, ...}
    Returns:
        (MultipleSequenceAlignment): The MSA object with counts, frequencies, sequences, and indexed by residue
    """
    #         (dict): {'meta': {'num_sequences': 214, 'query': 'MGSTHLVLK...', 'query_with_gaps': 'MGS--THLVLK...'},
    #                  'msa': (Bio.Align.MultipleSeqAlignment)
    #                  'counts': {1: {'A': 13, 'C': 1, 'D': 23, ...}, 2: {}, ...},
    #                  'frequencies': {1: {'A': 0.05, 'C': 0.001, 'D': 0.1, ...}, 2: {}, ...},
    #                  'rep': {1: 210, 2:211, ...}}
    #             The msa formatted with counts and indexed by residue

    # Combine alignments for all chains from design file Ex: A: 1-102, B: 1-130. Alignment: 1-232
    total_alignment = None
    for idx, named_sequences in enumerate(mutated_sequences.values()):
        if idx == 0:
            total_alignment = msa_from_dictionary(named_sequences)[:, :]
        else:
            total_alignment += msa_from_dictionary(named_sequences)[:, :]

    if total_alignment:
        return MultipleSequenceAlignment(alignment=total_alignment, **kwargs)
    else:
        raise RuntimeError(f'{multi_chain_alignment.__name__} - No sequences were found!')
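
A hedged sketch combining two chains for two designs, assuming the import path below; both chains must list the designs in the same order so that rows are concatenated correctly.

from symdesign.structure.sequence import multi_chain_alignment  # Assumed import path

mutated_sequences = {
    'A': {'design1': 'MKV', 'design2': 'MRV'},
    'B': {'design1': 'LLE', 'design2': 'LIE'},
}
# Columns 1-3 come from chain A and columns 4-6 from chain B
msa = multi_chain_alignment(mutated_sequences)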