
sequence

blank_profile_entry module-attribute

blank_profile_entry = dict(zip(utils.profile_keys, repeat(0)))

aa_nan_counts_alph3 module-attribute

aa_nan_counts_alph3 = dict(zip(protein_letters_alph3, repeat(nan)))

nan_profile_entry module-attribute

nan_profile_entry = dict(zip(utils.profile_keys, repeat(numpy.nan)))

Profile

Profile(entries: Iterable[ProfileEntry], dtype: str = 'profile', **kwargs)

Bases: UserList[ProfileEntry]

Parameters:

  • entries (Iterable[ProfileEntry]) –

    The per-residue entries to create the instance

  • dtype (str, default: 'profile' ) –

    The datatype of the profile.

  • **kwargs
Source code in symdesign/structure/sequence.py
def __init__(self, entries: Iterable[ProfileEntry], dtype: str = 'profile', **kwargs):
    """Construct the instance

    Args:
        entries: The per-residue entries to create the instance
        dtype: The datatype of the profile.
        **kwargs:
    """
    super().__init__(initlist=entries, **kwargs)  # initlist sets UserList.data
    self.dtype = dtype

data instance-attribute

data: list[ProfileEntry]

[{'A': 0, 'R': 0, ..., 'lod': {'A': -5, 'R': -5, ...}, 'type': 'W', 'info': 3.20, 'weight': 0.73}, {}, ...]
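
A minimal usage sketch (the import path and the abbreviated two-key entries below are illustrative assumptions; real entries carry every amino acid key as in the example above):

from symdesign.structure.sequence import Profile  # assumed import path

entries = [
    {'A': 0.7, 'R': 0.1, 'lod': {'A': 2, 'R': -1}, 'type': 'A', 'info': 1.10, 'weight': 0.50},
    {'A': 0.2, 'R': 0.6, 'lod': {'A': -1, 'R': 2}, 'type': 'R', 'info': 0.90, 'weight': 0.40},
]
profile = Profile(entries, dtype='profile')
profile.types    # ['A', 'R'] per the types property below
profile.weights  # [0.5, 0.4] per the weights property below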

available_keys property

available_keys: tuple[Any, ...]

Returns the available ProfileEntry keys that are present in the Profile

lods property

lods: list[AminoAcidDistribution]

The log of odds values, given for each amino acid type, for each entry in the Profile

types property

types: list[protein_letters_literal]

The amino acid type, for each entry in the Profile

weights property

weights: list[float]

The weight assigned to each entry in the Profile

info property

info: list[float]

The information present for each entry in the Profile

as_array

as_array(alphabet: str = protein_letters_alph1, lod: bool = False) -> ndarray

Convert the Profile into a numeric array

Parameters:

  • alphabet (str, default: protein_letters_alph1 ) –

    The amino acid alphabet to use. Array values will be returned in this order

  • lod (bool, default: False ) –

    Whether to return the array for the log of odds values

Returns:

  • ndarray

    The numerically encoded pssm where each entry along axis 0 is the position, and the entries on axis 1 are the frequency data at every indexed amino acid. Indices are according to the specified amino acid alphabet, i.e. array([[0.1, 0.01, 0.12, ...], ...])

Source code in symdesign/structure/sequence.py
def as_array(self, alphabet: str = protein_letters_alph1, lod: bool = False) -> np.ndarray:
    """Convert the Profile into a numeric array

    Args:
        alphabet: The amino acid alphabet to use. Array values will be returned in this order
        lod: Whether to return the array for the log of odds values

    Returns:
        The numerically encoded pssm where each entry along axis 0 is the position, and the entries on axis 1 are
            the frequency data at every indexed amino acid. Indices are according to the specified amino acid
            alphabet, i.e. array([[0.1, 0.01, 0.12, ...], ...])
    """
    if lod:
        if self.lods:
            data_source = self.lods
        else:
            raise ValueError(
                f"There aren't any values available for {self.__class__.__name__}.lods")
    else:
        data_source = self

    return np.array([[position_info[aa] for aa in alphabet]
                     for position_info in data_source], dtype=np.float32)
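
A short usage sketch, assuming profile is a Profile whose entries contain a frequency for every letter of the chosen alphabet:

frequencies = profile.as_array()        # shape (len(profile), 20), columns ordered by protein_letters_alph1
lod_array = profile.as_array(lod=True)  # uses the 'lod' values; raises ValueError if none are present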

write

write(file_name: AnyStr = None, name: str = None, out_dir: AnyStr = os.getcwd()) -> AnyStr

Create a PSI-BLAST format PSSM file from a PSSM dictionary. Assumes residue numbering is 1 to last entry

Parameters:

  • file_name (AnyStr, default: None ) –

    The explicit name of the file

  • name (str, default: None ) –

    The name of the file. Will be used as the default file_name base name if file_name not provided

  • out_dir (AnyStr, default: getcwd() ) –

    The location on disk to output the file. Only used if file_name not explicitly provided

Returns:

  • AnyStr

    Disk location of newly created .pssm file

Source code in symdesign/structure/sequence.py
def write(self, file_name: AnyStr = None, name: str = None, out_dir: AnyStr = os.getcwd()) -> AnyStr:
    """Create a PSI-BLAST format PSSM file from a PSSM dictionary. Assumes residue numbering is 1 to last entry

    Args:
        file_name: The explicit name of the file
        name: The name of the file. Will be used as the default file_name base name if file_name not provided
        out_dir: The location on disk to output the file. Only used if file_name not explicitly provided

    Returns:
        Disk location of newly created .pssm file
    """
    # Format the Profile according to the write_pssm_file mechanism
    # This takes the shape of 1 to the last entry
    data: list[list[float]] = [[position_info[aa] for aa in protein_letters_alph3] for position_info in self]
    if self.dtype == 'fragment':
        # Need to convert np.nan to zeros
        data = [[0 if np.isnan(num) else num for num in entry] for entry in data]
        logger.warning(f'Converting {self.dtype} type Profile np.nan values to 0.0')

    # Find out if the pssm has values expressed as frequencies (percentages) or as counts and modify accordingly
    lods = self.lods
    if isinstance(lods[0]['A'], float):
        separation1 = " " * 4
    else:
        separation1 = " " * 3
        # lod_freq = True
    # if type(pssm[first_key]['A']) == float:
    #     counts_freq = True

    if file_name is None:
        if name is None:
            raise ValueError(
                f"Must provide argument 'file_name' or 'name' as a str to {self.write.__name__}")
        else:
            file_name = os.path.join(out_dir, name)

    if os.path.splitext(file_name)[-1] == '':  # No extension
        file_name = f'{file_name}.pssm'

    with open(file_name, 'w') as f:
        f.write(f'\n\n{" " * 12}{separation1.join(protein_letters_alph3)}'
                f'{separation1}{(" " * 3).join(protein_letters_alph3)}\n')
        for residue_number, (entry, lod, _type, info, weight) in enumerate(
                zip(data, lods, self.types, self.info, self.weights), 1):
            if isinstance(lod['A'], float):  # relevant for favor_fragment
                lod_string = \
                    ' '.join(f'{lod[aa]:>4.2f}' for aa in protein_letters_alph3) + ' '
            else:
                lod_string = \
                    ' '.join(f'{lod[aa]:>3d}' for aa in protein_letters_alph3) + ' '

            if isinstance(entry[0], float):  # relevant for freq calculations
                counts_string = ' '.join(f'{floor(value * 100):>3.0f}' for value in entry) + ' '
            else:
                counts_string = ' '.join(f'{value:>3d}' for value in entry) + ' '

            f.write(f'{residue_number:>5d} {_type:1s}   {lod_string:80s} {counts_string:80s} '
                    f'{round(info, 4):4.2f} {round(weight, 4):4.2f}\n')

    return file_name
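
A usage sketch with a hypothetical name and output directory (the directory is assumed to exist):

pssm_path = profile.write(name='design', out_dir='output')   # writes and returns 'output/design.pssm'
pssm_path = profile.write(file_name='output/design.pssm')    # or give the full path explicitly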

GeneEntity

GeneEntity(**kwargs)

Bases: ABC

Contains the sequence information for a ContainsResidues.

Parameters:

  • **kwargs
Source code in symdesign/structure/sequence.py
def __init__(self, **kwargs):
    """Construct the instance

    Args:
        **kwargs:
    """
    super().__init__(**kwargs)  # GeneEntity
    self._evolutionary_profile = {}  # position specific scoring matrix
    self.h_fields = None
    self.j_couplings = None
    self.profile = {}  # design/structure specific scoring matrix

log abstractmethod property

log: Logger

sequence abstractmethod property

sequence: str

reference_sequence abstractmethod property

reference_sequence: str

number_of_residues property

number_of_residues: int

evolutionary_profile property writable

evolutionary_profile: dict

Access the evolutionary_profile

msa property writable

msa: MultipleSequenceAlignment | None

The MultipleSequenceAlignment object for the instance

sequence_numeric property

sequence_numeric: ndarray

Return the sequence as an integer array (number_of_residues, alphabet_length) of the amino acid characters

Maps "ACDEFGHIKLMNPQRSTVWY-" to the resulting index

hydrophobic_collapse

hydrophobic_collapse(**kwargs) -> ndarray

Return the hydrophobic collapse for the Sequence

Other Parameters:

  • hydrophobicity

    str = 'standard' – The hydrophobicity scale to consider. Either 'standard' (FILV), 'expanded' (FMILYVW), or provide one with 'custom' keyword argument

  • custom

    mapping[str, float | int] = None – A user defined mapping of amino acid type, hydrophobicity value pairs

  • alphabet_type

    alphabet_types = None – The amino acid alphabet if the sequence consists of integer characters

  • lower_window

    int = 3 – The smallest window used to measure

  • upper_window

    int = 9 – The largest window used to measure

Source code in symdesign/structure/sequence.py
def hydrophobic_collapse(self, **kwargs) -> np.ndarray:
    """Return the hydrophobic collapse for the Sequence

    Keyword Args:
        hydrophobicity: str = 'standard' – The hydrophobicity scale to consider. Either 'standard' (FILV),
            'expanded' (FMILYVW), or provide one with 'custom' keyword argument
        custom: mapping[str, float | int] = None – A user defined mapping of amino acid type, hydrophobicity value pairs
        alphabet_type: alphabet_types = None – The amino acid alphabet if the sequence consists of integer characters
        lower_window: int = 3 – The smallest window used to measure
        upper_window: int = 9 – The largest window used to measure
    """
    try:
        return self._hydrophobic_collapse
    except AttributeError:
        self._hydrophobic_collapse = metrics.hydrophobic_collapse_index(self.sequence, **kwargs)
        return self._hydrophobic_collapse

add_evolutionary_profile

add_evolutionary_profile(file: AnyStr = None, out_dir: AnyStr = os.getcwd(), profile_source: alignment_programs_literal = putils.hhblits, force: bool = False, **kwargs)

Add the evolutionary profile to the GeneEntity. If the profile isn't provided, it is generated through search of homologous protein sequences using the profile_source argument

Parameters:

  • file (AnyStr, default: None ) –

    Location where profile file should be loaded from

  • out_dir (AnyStr, default: getcwd() ) –

    Location where sequence files should be written

  • profile_source (alignment_programs_literal, default: hhblits ) –

    One of 'hhblits' or 'psiblast'

  • force (bool, default: False ) –

    Whether to force generation of a new profile

Sets

self.evolutionary_profile (ProfileDict)

Source code in symdesign/structure/sequence.py
def add_evolutionary_profile(self, file: AnyStr = None, out_dir: AnyStr = os.getcwd(),
                             profile_source: alignment_programs_literal = putils.hhblits, force: bool = False,
                             **kwargs):
    """Add the evolutionary profile to the GeneEntity. If the profile isn't provided, it is generated through search
    of homologous protein sequences using the profile_source argument

    Args:
        file: Location where profile file should be loaded from
        out_dir: Location where sequence files should be written
        profile_source: One of 'hhblits' or 'psiblast'
        force: Whether to force generation of a new profile

    Sets:
        self.evolutionary_profile (ProfileDict)
    """
    if profile_source not in alignment_programs:  # [putils.hhblits, 'psiblast']:
        raise ValueError(
            f'{self.add_evolutionary_profile.__name__}: Profile generation only possible from '
            f'{", ".join(alignment_programs)}, not {profile_source}')

    if file is not None:
        pssm_file = file
    else:  # Check to see if the files of interest already exist
        # Extract/Format Sequence Information. SEQRES is prioritized if available
        name = self.name
        sequence_file = self.write_sequence_to_fasta(out_dir=out_dir)
        temp_file = Path(out_dir, f'{name}.hold')
        pssm_file = os.path.join(out_dir, f'{name}.hmm')
        if not os.path.exists(pssm_file) or force:
            if not os.path.exists(temp_file):  # No work on this pssm file has been initiated
                # Create blocking file to prevent excess work
                with open(temp_file, 'w') as f:
                    self.log.info(f"Fetching '{name}' sequence data")

                self.log.info(f'Generating Evolutionary Profile for {name}')
                getattr(self, profile_source)(sequence_file=sequence_file, out_dir=out_dir)
                temp_file.unlink(missing_ok=True)
                # if os.path.exists(temp_file):
                #     os.remove(temp_file)
            else:  # Block is in place, another process is working
                self.log.info(f"Waiting for '{name}' profile generation...")
                while not os.path.exists(pssm_file):
                    if int(time.time()) - int(os.path.getmtime(temp_file)) > 5400:  # > 1 hr 30 minutes have passed
                        # os.remove(temp_file)
                        temp_file.unlink(missing_ok=True)
                        raise StructureException(
                            f'{self.add_evolutionary_profile.__name__}: Generation of the profile for {name} '
                            'took longer than the time limit\nKilled')
                    time.sleep(20)

    # Set self.evolutionary_profile
    self.evolutionary_profile = parse_hhblits_pssm(pssm_file)

create_null_profile

create_null_profile(nan: bool = False, zero_index: bool = False, **kwargs) -> ProfileDict

Make a blank profile

Parameters:

  • nan (bool, default: False ) –

    Whether to fill the null profile with np.nan

  • zero_index (bool, default: False ) –

    If True, return the dictionary with zero indexing

Returns:

  • ProfileDict

    Dictionary containing profile information with keys as the index (zero or one-indexed), values as PSSM

  • Ex ( ProfileDict ) –

    {1: {'A': 0, 'R': 0, ..., 'lod': {'A': -5, 'R': -5, ...}, 'type': 'W', 'info': 3.20, 'weight': 0.73}, 2: {}, ...}

Source code in symdesign/structure/sequence.py
def create_null_profile(self, nan: bool = False, zero_index: bool = False, **kwargs) -> ProfileDict:
    """Make a blank profile

    Args:
        nan: Whether to fill the null profile with np.nan
        zero_index: If True, return the dictionary with zero indexing

    Returns:
        Dictionary containing profile information with keys as the index (zero or one-indexed), values as PSSM
        Ex: {1: {'A': 0, 'R': 0, ..., 'lod': {'A': -5, 'R': -5, ...}, 'type': 'W', 'info': 3.20, 'weight': 0.73},
             2: {}, ...}
    """
    offset = 0 if zero_index else ZERO_OFFSET

    if nan:
        _profile_entry = nan_profile_entry
    else:
        _profile_entry = blank_profile_entry

    profile = {residue: _profile_entry.copy()
               for residue in range(offset, offset + self.number_of_residues)}

    for residue_data, residue_type in zip(profile.values(), self.sequence):
        residue_data['type'] = residue_type

    return profile
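
A sketch of the returned mapping, assuming entity is a GeneEntity subclass instance with the 3-residue sequence 'MKV':

null = entity.create_null_profile()              # {1: {...}, 2: {...}, 3: {...}} with counts filled by 0
null[1]['type']                                  # 'M', taken from the instance sequence
nan_null = entity.create_null_profile(nan=True)  # same keys, values filled with np.nan instead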

create_null_entries staticmethod

create_null_entries(entry_numbers: Iterable[int], nan: bool = False, **kwargs) -> ProfileDict

Make a blank profile

Parameters:

  • entry_numbers (Iterable[int]) –

    The numbers to generate null entries for

  • nan (bool, default: False ) –

    Whether to fill the null profile with np.nan

Returns:

  • ProfileDict

    Dictionary containing profile information with the specified entries as the index, values as PSSM

  • Ex ( ProfileDict ) –

    {1: {'A': 0, 'R': 0, ..., 'lod': {'A': -5, 'R': -5, ...}, 'info': 3.20, 'weight': 0.73}, 2: {}, ...}

  • ProfileDict

    Importantly, there is no 'type' key. This must be added

Source code in symdesign/structure/sequence.py
@staticmethod
def create_null_entries(entry_numbers: Iterable[int], nan: bool = False, **kwargs) -> ProfileDict:
    """Make a blank profile

    Args:
        entry_numbers: The numbers to generate null entries for
        nan: Whether to fill the null profile with np.nan

    Returns:
        Dictionary containing profile information with the specified entries as the index, values as PSSM
        Ex: {1: {'A': 0, 'R': 0, ..., 'lod': {'A': -5, 'R': -5, ...}, 'info': 3.20, 'weight': 0.73},
             2: {}, ...}
        Importantly, there is no 'type' key. This must be added
    """
    # offset = 0 if zero_index else ZERO_OFFSET

    if nan:
        _profile_entry = nan_profile_entry
    else:
        _profile_entry = blank_profile_entry

    return {entry: _profile_entry.copy() for entry in entry_numbers}

hhblits

hhblits(out_dir: AnyStr = os.getcwd(), **kwargs) -> list[str] | None

Generate a position specific scoring matrix from hhblits using Hidden Markov Models

Parameters:

  • out_dir (AnyStr, default: getcwd() ) –

    Disk location where generated file should be written

Other Parameters:

  • sequence_file

    AnyStr = None - The file containing the sequence to use

  • threads

    Number of CPUs to use for the process

  • return_command

    Whether to simply return the hhblits command

Returns:

  • list[str] | None

    The command if return_command is True, otherwise None

Source code in symdesign/structure/sequence.py
def hhblits(self, out_dir: AnyStr = os.getcwd(), **kwargs) -> list[str] | None:
    """Generate a position specific scoring matrix from hhblits using Hidden Markov Models

    Args:
        out_dir: Disk location where generated file should be written

    Keyword Args:
        sequence_file: AnyStr = None - The file containing the sequence to use
        threads: Number of CPUs to use for the process
        return_command: Whether to simply return the hhblits command

    Returns:
        The command if return_command is True, otherwise None
    """
    result = hhblits(self.name, out_dir=out_dir, **kwargs)

    if result:  # return_command is True
        return result
    # Otherwise, make alignment file(s)
    name = self.name
    # Set file attributes according to logic of hhblits()
    a3m_file = os.path.join(out_dir, f'{name}.a3m')
    msa_file = os.path.join(out_dir, f'{name}.sto')
    fasta_msa = os.path.join(out_dir, f'{name}.fasta')
    # Preferred alignment type
    p = subprocess.Popen([putils.reformat_msa_exe_path, a3m_file, msa_file, '-num', '-uc'])
    p.communicate()
    p = subprocess.Popen([putils.reformat_msa_exe_path, a3m_file, fasta_msa, '-M', 'first', '-r'])
    p.communicate()

add_msa_from_file

add_msa_from_file(msa_file: AnyStr, file_format: msa_supported_types_literal = 'stockholm')

Add a multiple sequence alignment to the profile. Handles correct sizing of the MSA

Parameters:

  • msa_file (AnyStr) –

    The multiple sequence alignment file to add to the Entity

  • file_format (msa_supported_types_literal, default: 'stockholm' ) –

    The file type to read the multiple sequence alignment

Source code in symdesign/structure/sequence.py
def add_msa_from_file(self, msa_file: AnyStr, file_format: msa_supported_types_literal = 'stockholm'):
    """Add a multiple sequence alignment to the profile. Handles correct sizing of the MSA

    Args:
        msa_file: The multiple sequence alignment file to add to the Entity
        file_format: The file type to read the multiple sequence alignment
    """
    if file_format == 'stockholm':
        constructor = MultipleSequenceAlignment.from_stockholm
    elif file_format == 'fasta':
        constructor = MultipleSequenceAlignment.from_fasta
    else:
        raise ValueError(
            f"The file format '{file_format}' isn't an available format. Available formats include "
            f"{msa_supported_types}")

    self.msa = constructor(msa_file)

collapse_profile

collapse_profile(msa_file: AnyStr = None, **kwargs) -> ndarray

Make a profile out of the hydrophobic collapse index (HCI) for each sequence in a multiple sequence alignment

Takes ~5-10 seconds depending on the size of the msa

Calculate HCI for each sequence in the MSA (which are different lengths). This is the Hydro Collapse array. For each sequence, make a Gap mask, with full shape (length, number_of_residues) to account for gaps in each sequence. Apply the mask using a map between the Gap mask and the Hydro Collapse array. Finally, drop the columns from the array that are gaps in the reference sequence.

iter array    Gap mask     Hydro Collapse array    Aligned HCI          Final HCI
(iter)        (0 is gap)   (compute for each)      (account for gaps)   (drop idx 2)
it 1 2 3 4    0 | 1 | 2    0 | 1 | 2               0 | 1 | 2            0 | 1 | 3 | ... N
0  0 1 2 2    1 | 1 | 0    0.5 0.2 0.5         =   0.5 0.2 0.0    ->    0.5 0.2 0.4 ... 0.3
1  0 0 1 2    0 | 1 | 1    0.4 0.7 0.4         =   0.0 0.4 0.7    ->    0.0 0.4 0.4 ... 0.1
2  0 0 1 2    0 | 1 | 1    0.3 0.6 0.3         =   0.0 0.3 0.6    ->    0.0 0.3 0.4 ... 0.0

Where index 0 is the MSA query sequence

After a cumulative summation along each sequence, the iterator is multiplied by the gap mask. The Hydro Collapse array is then indexed by the gapped iterator, placing either the Hydro Collapse value or np.nan (wherever the index is 0, i.e. a gap). After calculation, the element at index 2 is dropped from the array when the aligned sequence gaps are removed. Finally, only the indices of the query sequence are kept in the profile, essentially giving the HCI for each sequence in its native context, adjusted to the specific context of the protein sequence at hand

Parameters:

  • msa_file (AnyStr, default: None ) –

    The multiple sequence alignment file to use for collapse. Will use .msa attribute if not provided

Other Parameters:

  • file_format

    msa_supported_types_literal = 'stockholm' - The file type to read the multiple sequence alignment

  • hydrophobicity

    str = 'standard' – The hydrophobicity scale to consider. Either 'standard' (FILV), 'expanded' (FMILYVW), or provide one with 'custom' keyword argument

  • custom

    mapping[str, float | int] = None – A user defined mapping of amino acid type, hydrophobicity value pairs

  • alphabet_type

    alphabet_types = None – The amino acid alphabet if the sequence consists of integer characters

  • lower_window

    int = 3 – The smallest window used to measure

  • upper_window

    int = 9 – The largest window used to measure

Returns:

  • ndarray

    Array with shape (length, number_of_residues) containing the per-residue hydrophobic collapse values for each sequence in the profile. The "query" sequence from the MultipleSequenceAlignment.query is located at index 0 on axis=0

Source code in symdesign/structure/sequence.py
def collapse_profile(self, msa_file: AnyStr = None, **kwargs) -> np.ndarray:
    """Make a profile out of the hydrophobic collapse index (HCI) for each sequence in a multiple sequence alignment

    Takes ~5-10 seconds depending on the size of the msa

    Calculate HCI for each sequence in the MSA (which are different lengths). This is the Hydro Collapse array. For
    each sequence, make a Gap mask, with full shape (length, number_of_residues) to account for gaps in
    each sequence. Apply the mask using a map between the Gap mask and the Hydro Collapse array. Finally, drop the
    columns from the array that are gaps in the reference sequence.

    iter array   -   Gap mask      -       Hydro Collapse array     -     Aligned HCI     - -     Final HCI

    ------------

    iter - - - - - - 0 is gap    - -     compute for each     -     account for gaps   -  (drop idx 2)

    it 1 2 3 4  - - 0 | 1 | 2 - - - - - - - 0 | 1 | 2 - - - - - - 0 | 1 | 2 - - - - - - - 0 | 1 | 3 | ... N

    0 0 1 2 2  - - 1 | 1 | 0 - - -   - - - 0.5 0.2 0.5 - -  = - - 0.5 0.2 0.0 -  ->   - - 0.5 0.2 0.4 ... 0.3

    1 0 0 1 2  - - 0 | 1 | 1 - - -   - - - 0.4 0.7 0.4 - -  = - - 0.0 0.4 0.7 -  ->   - - 0.0 0.4 0.4 ... 0.1

    2 0 0 1 2  - - 0 | 1 | 1 - - -   - - - 0.3 0.6 0.3 - -  = - - 0.0 0.3 0.6 -  ->   - - 0.0 0.3 0.4 ... 0.0

    Where index 0 is the MSA query sequence

    After a cumulative summation along each sequence, the iterator is multiplied by the gap mask. Next the Hydro
    Collapse array is indexed by the gapped iterator. This places the Hydro Collapse value or np.nan (if there is a 0 index,
    i.e. a gap). After calculation, the element at index 2 is dropped from the array when the aligned sequence gaps
    are removed. Finally, only the indices of the query sequence are left in the profile, essentially giving the HCI
    for each sequence in the native context, adjusted to the specific context of the protein sequence at hand

    Args:
        msa_file: The multiple sequence alignment file to use for collapse. Will use .msa attribute if not provided

    Keyword Args:
        file_format: msa_supported_types_literal = 'stockholm' - The file type to read the multiple sequence
            alignment
        hydrophobicity: str = 'standard' – The hydrophobicity scale to consider. Either 'standard' (FILV),
            'expanded' (FMILYVW), or provide one with 'custom' keyword argument
        custom: mapping[str, float | int] = None – A user defined mapping of amino acid type, hydrophobicity value
            pairs
        alphabet_type: alphabet_types = None – The amino acid alphabet if the sequence consists of integer
            characters
        lower_window: int = 3 – The smallest window used to measure
        upper_window: int = 9 – The largest window used to measure

    Returns:
        Array with shape (length, number_of_residues) containing the per-residue hydrophobic collapse values for
            each sequence in the profile. The "query" sequence from the MultipleSequenceAlignment.query
            is located at index 0 on axis=0
    """
    try:
        return self._collapse_profile
    except AttributeError:
        msa = self.msa
        if not msa:
            self.add_msa_from_file(msa_file)
            msa = self.msa

        # Make the output array. Use one additional length to add np.nan value at the 0 index for gaps
        evolutionary_collapse_np = np.zeros((msa.length, msa.number_of_positions + 1))
        evolutionary_collapse_np[:, 0] = np.nan  # np.nan for all missing indices
        for idx, sequence in enumerate(msa.sequences):
            non_gaped_sequence = str(sequence).replace('-', '')
            evolutionary_collapse_np[idx, 1:len(non_gaped_sequence) + 1] = \
                metrics.hydrophobic_collapse_index(non_gaped_sequence, **kwargs)
        # Todo this should be possible now metrics.hydrophobic_collapse_index(self.msa.array)

        msa_sequence_indices = msa.sequence_indices
        iterator_np = np.cumsum(msa_sequence_indices, axis=1) * msa_sequence_indices
        aligned_hci_np = np.take_along_axis(evolutionary_collapse_np, iterator_np, axis=1)
        # Select only the query sequence indices
        # sequence_hci_np = aligned_hci_np[:, self.msa.query_indices]
        # print('aligned_hci_np', aligned_hci_np.shape, aligned_hci_np)
        # print('self.msa.query_indices', self.msa.query_indices.shape, self.msa.query_indices)
        self._collapse_profile = aligned_hci_np[:, msa.query_indices]
        # self._collapse_profile = pd.DataFrame(aligned_hci_np[:, self.msa.query_indices],
        #                                       columns=list(range(1, self.msa.query_length + 1)))  # One-indexed
        # summary = pd.concat([sequence_hci_df, pd.concat([sequence_hci_df.mean(), sequence_hci_df.std()], axis=1,
        #                                                 keys=['mean', 'std']).T])

        return self._collapse_profile
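
The cumulative-sum/gap-mask indexing described above can be reproduced in isolation. The following self-contained NumPy sketch uses toy values (it is not part of the library) and mirrors the three-column example in the docstring:

import numpy as np

# Toy data: 3 aligned sequences over 4 alignment columns; 1 = residue present, 0 = gap
gap_mask = np.array([[1, 1, 0, 1],
                     [0, 1, 1, 1],
                     [0, 1, 1, 1]])
# HCI computed on each ungapped sequence (3 residues each here)
hci_per_sequence = [np.array([0.5, 0.2, 0.5]),
                    np.array([0.4, 0.7, 0.4]),
                    np.array([0.3, 0.6, 0.3])]

# Padded array with np.nan at index 0 so that gap positions map to nan
padded = np.full((3, 4), np.nan)
for idx, hci in enumerate(hci_per_sequence):
    padded[idx, 1:len(hci) + 1] = hci

iterator = np.cumsum(gap_mask, axis=1) * gap_mask           # running residue count per row, 0 at gaps
aligned_hci = np.take_along_axis(padded, iterator, axis=1)  # nan wherever the alignment has a gap
final_hci = aligned_hci[:, gap_mask[0].astype(bool)]        # keep only columns present in the query (row 0)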

direct_coupling_analysis

direct_coupling_analysis(msa_file: AnyStr = None, **kwargs) -> ndarray

Using Boltzmann machine direct coupling analysis (bmDCA), score each sequence in an alignment based on the statistical energy compared to the learned DCA model

Parameters:

  • msa_file (AnyStr, default: None ) –

    The multiple sequence alignment file to use for collapse. Will use .msa attribute if not provided

Other Parameters:

  • file_format

    msa_supported_types_literal = 'stockholm' - The file type to read the multiple sequence alignment

Returns:

  • ndarray

    Array with shape (length, number_of_residues) where the values are the energy for each residue/sequence based on direct coupling analysis parameters

Source code in symdesign/structure/sequence.py
def direct_coupling_analysis(self, msa_file: AnyStr = None, **kwargs) -> np.ndarray:
    """Using boltzmann machine direct coupling analysis (bmDCA), score each sequence in an alignment based on the
     statistical energy compared to the learned DCA model

    Args:
        msa_file: The multiple sequence alignment file to use for collapse. Will use .msa attribute if not provided

    Keyword Args:
        file_format: msa_supported_types_literal = 'stockholm' - The file type to read the multiple sequence
            alignment

    Returns:
        Array with shape (length, number_of_residues) where the values are the energy for each residue/sequence
            based on direct coupling analysis parameters
    """
    # Check if required attributes are present
    _raise = False
    missing_attrs = []
    msa = self.msa
    if not msa:
        if msa_file:
            self.add_msa_from_file(msa_file)
            msa = self.msa
        else:
            missing_attrs.append('.msa')
    h_fields = self.h_fields
    if not h_fields:
        missing_attrs.append('.h_fields')
    j_couplings = self.j_couplings
    if not j_couplings:
        missing_attrs.append('.j_couplings')

    if missing_attrs:
        raise AttributeError(
            f"The required attribute(s) {', '.join(missing_attrs)} aren't available. Add to the "
            f'Entity before {self.direct_coupling_analysis.__name__}')

    analysis_length = msa.query_length
    idx_range = np.arange(analysis_length)
    # h_fields = bmdca.load_fields(os.path.join(data_dir, '%s_bmDCA' % self.name, 'parameters_h_final.bin'))
    # h_fields = h_fields.T  # this isn't required when coming in Fortran order, i.e. (21, analysis_length)
    # sum the h_fields values for each sequence position in every sequence
    h_values = h_fields[msa.numerical_alignment, idx_range[None, :]].sum(axis=1)
    h_sum = h_values.sum(axis=1)

    # coming in as a 4 dimension (analysis_length, analysis_length, alphabet_number, alphabet_number) ndarray
    # j_couplings = bmdca.load_couplings(os.path.join(data_dir, '%s_bmDCA' % self.name, 'parameters_J_final.bin'))
    i_idx = np.repeat(idx_range, analysis_length)
    j_idx = np.tile(idx_range, analysis_length)
    i_aa = np.repeat(msa.numerical_alignment, analysis_length)
    j_aa = np.tile(msa.numerical_alignment, msa.query_length)
    j_values = np.zeros((msa.length, len(i_idx)))
    for idx in range(msa.length):
        j_values[idx] = j_couplings[i_idx, j_idx, i_aa, j_aa]
    # this mask is not necessary when the array comes in as a non-symmetry matrix. All i > j result in 0 values...
    # mask = np.triu(np.ones((analysis_length, analysis_length)), k=1).flatten()
    # j_sum = j_values[:, mask].sum(axis=1)
    # sum the j_values for every design (axis 0) at every residue position (axis 1)
    j_values = np.array(np.split(j_values, 3, axis=1)).sum(axis=2).T
    j_sum = j_values.sum(axis=1)
    # couplings_idx = np.stack((i_idx, j_idx, i_aa, j_aa), axis=1)
    # this stacks all arrays like so
    #  [[[ i_idx1, i_idx2, ..., i_idxN],
    #    [ j_idx1, j_idx2, ..., j_idxN],  <- this is for one sequence
    #    [ i_aa 1, i_aa 2, ..., i_aa N],
    #    [ j_aa 1, j_aa 2, ..., j_aa N]],
    #   [[NEXT SEQUENCE],
    #    [
    # this stacks all arrays the transpose, which would match the indexing style on j_couplings much better...
    # couplings_idx = np.stack((i_idx, j_idx, i_aa, j_aa), axis=2)
    # j_sum = np.zeros((self.msa.length, len(couplings_idx)))
    # for idx in range(self.msa.length):
    #     j_sum[idx] = j_couplings[couplings_idx[idx]]
    # return -h_sum - j_sum
    return -h_values - j_values
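
The statistical energy being evaluated is that of a Potts model, E(s) = -sum_i h_i(s_i) - sum_{i<j} J_ij(s_i, s_j). A self-contained sketch with random toy parameters (names and shapes are assumptions; it returns one energy per sequence, whereas the method above keeps the per-residue terms):

import numpy as np

rng = np.random.default_rng(0)
seq_length, alphabet_size = 4, 21
h = rng.normal(size=(seq_length, alphabet_size))                              # fields
J = rng.normal(size=(seq_length, seq_length, alphabet_size, alphabet_size))   # couplings
sequences = rng.integers(0, alphabet_size, size=(3, seq_length))              # 3 numerically encoded sequences

i_idx, j_idx = np.triu_indices(seq_length, k=1)                               # count each residue pair once
h_sum = h[np.arange(seq_length), sequences].sum(axis=1)
j_sum = J[i_idx, j_idx, sequences[:, i_idx], sequences[:, j_idx]].sum(axis=1)
energies = -h_sum - j_sum                                                     # one statistical energy per sequence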

write_sequence_to_fasta

write_sequence_to_fasta(dtype: sequence_type_literal = 'reference_sequence', file_name: AnyStr = None, name: str = None, out_dir: AnyStr = os.getcwd()) -> AnyStr

Write a sequence to a .fasta file with fasta format and return the file location. '.fasta' is appended if not specified in the name argument

Parameters:

  • dtype (sequence_type_literal, default: 'reference_sequence' ) –

    The type of sequence to write. Can be the keywords 'reference_sequence' or 'sequence'

  • file_name (AnyStr, default: None ) –

    The explicit name of the file

  • name (str, default: None ) –

    The name of the sequence record. If not provided, the instance name will be used. Will be used as the default file_name base name if file_name not provided

  • out_dir (AnyStr, default: getcwd() ) –

    The location on disk to output the file. Only used if file_name not explicitly provided

Returns:

  • AnyStr

    The path to the output file

Source code in symdesign/structure/sequence.py
def write_sequence_to_fasta(self, dtype: sequence_type_literal = 'reference_sequence', file_name: AnyStr = None,
                            name: str = None, out_dir: AnyStr = os.getcwd()) -> AnyStr:
    """Write a sequence to a .fasta file with fasta format and return the file location.
    '.fasta' is appended if not specified in the name argument

    Args:
        dtype: The type of sequence to write. Can be the keywords 'reference_sequence' or 'sequence'
        file_name: The explicit name of the file
        name: The name of the sequence record. If not provided, the instance name will be used.
            Will be used as the default file_name base name if file_name not provided
        out_dir: The location on disk to output the file. Only used if file_name not explicitly provided
    Returns:
        The path to the output file
    """
    if dtype in sequence_types:
        # Get the attribute from the instance
        sequence = getattr(self, dtype)
    else:
        raise ValueError(
            f"Couldn't find a sequence matching the {dtype=}"
        )

    if name is None:
        name = self.name
    if file_name is None:
        file_name = os.path.join(out_dir, name)
        if not file_name.endswith('.fasta'):
            file_name = f'{file_name}.fasta'

    return write_sequence_to_fasta(sequence=sequence, name=name, file_name=file_name)

sequence_to_one_hot

sequence_to_one_hot(sequence: Sequence[str], translation_table: dict[str, int] = None, alphabet_order: int = 1) -> ndarray

Convert a sequence into a numeric array

Parameters:

  • sequence (Sequence[str]) –

    The sequence to encode

  • translation_table (dict[str, int], default: None ) –

    If a translation table (in bytes) is provided, it will be used. If not, use alphabet_order

  • alphabet_order (int, default: 1 ) –

    The alphabetical order of the amino acid alphabet. Can be either 1 or 3

Returns:

  • ndarray

    The one-hot encoded sequence with shape (sequence length, translation_table length)

Source code in symdesign/structure/sequence.py
def sequence_to_one_hot(sequence: Sequence[str], translation_table: dict[str, int] = None,
                        alphabet_order: int = 1) -> np.ndarray:
    """Convert a sequence into a numeric array

    Args:
        sequence: The sequence to encode
        translation_table: If a translation table (in bytes) is provided, it will be used. If not, use alphabet_order
        alphabet_order: The alphabetical order of the amino acid alphabet. Can be either 1 or 3

    Returns:
        The one-hot encoded sequence with shape (sequence length, translation_table length)
    """
    numeric_sequence = sequence_to_numeric(sequence, translation_table, alphabet_order=alphabet_order)
    if translation_table is None:
        # Assumes that alphabet_order is used and there aren't missing letters...
        num_entries = 20
    else:
        num_entries = len(translation_table)
    one_hot = np.zeros((len(sequence), num_entries), dtype=np.int32)
    try:
        # Set a 1 at each residue position's amino acid index
        one_hot[np.arange(len(sequence)), numeric_sequence] = 1
    except IndexError:  # Our assumption above was wrong
        from symdesign import sequence as _seq
        embedding = getattr(_seq, f'numerical_translation_alph{alphabet_order}_bytes') \
            if translation_table is None else translation_table
        raise ValueError(
            f"Couldn't produce a proper one-hot encoding for the provided sequence embedding: {embedding}")
    return one_hot
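
A usage sketch, assuming the default 1-letter alphabetical ordering and a sequence of canonical residues:

one_hot = sequence_to_one_hot('ACD')               # shape (3, 20) with the default alphabet_order=1
assert one_hot.sum(axis=1).tolist() == [1, 1, 1]   # exactly one hot index per residue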

sequence_to_numeric

sequence_to_numeric(sequence: Sequence[str], translation_table: dict[str, int] = None, alphabet_order: int = 1) -> ndarray

Convert a sequence into a numeric array

Parameters:

  • sequence (Sequence[str]) –

    The sequence to encode

  • translation_table (dict[str, int], default: None ) –

    If a translation table (in bytes) is provided, it will be used. If not, use alphabet_order

  • alphabet_order (int, default: 1 ) –

    The alphabetical order of the amino acid alphabet. Can be either 1 or 3

Returns:

  • ndarray

    The numerically encoded sequence where each entry along axis=0 is the indexed amino acid. Indices are according to the 1 letter alphabetical amino acid

Source code in symdesign/structure/sequence.py
def sequence_to_numeric(sequence: Sequence[str], translation_table: dict[str, int] = None,
                        alphabet_order: int = 1) -> np.ndarray:
    """Convert a sequence into a numeric array

    Args:
        sequence: The sequence to encode
        translation_table: If a translation table (in bytes) is provided, it will be used. If not, use alphabet_order
        alphabet_order: The alphabetical order of the amino acid alphabet. Can be either 1 or 3

    Returns:
        The numerically encoded sequence where each entry along axis=0 is the indexed amino acid. Indices are according
            to the 1 letter alphabetical amino acid
    """
    _array = np.array(list(sequence), np.string_)
    if translation_table is not None:
        return np.vectorize(translation_table.__getitem__)(_array)
    else:
        if alphabet_order == 1:
            return np.vectorize(numerical_translation_alph1_bytes.__getitem__)(_array)
        elif alphabet_order == 3:
            raise NotImplementedError('Need to make the "numerical_translation_alph3_bytes" table')
            return np.vectorize(numerical_translation_alph3_bytes.__getitem__)(_array)
        else:
            raise ValueError(
                f"The 'alphabet_order' {alphabet_order} isn't valid. Choose from either 1 or 3")

sequences_to_numeric

sequences_to_numeric(sequences: Iterable[Sequence[str]], translation_table: dict[str, int] = None, alphabet_order: int = 1) -> ndarray

Convert sequences into a numeric array

Parameters:

  • sequences (Iterable[Sequence[str]]) –

    The sequences to encode

  • translation_table (dict[str, int], default: None ) –

    If a translation table (in bytes) is provided, it will be used. If not, use alphabet_order

  • alphabet_order (int, default: 1 ) –

    The alphabetical order of the amino acid alphabet. Can be either 1 or 3

Returns:

  • ndarray

    The numerically encoded sequence where each entry along axis=0 is the indexed amino acid. Indices are according to the 1 letter alphabetical amino acid

Source code in symdesign/structure/sequence.py
def sequences_to_numeric(sequences: Iterable[Sequence[str]], translation_table: dict[str, int] = None,
                         alphabet_order: int = 1) -> np.ndarray:
    """Convert sequences into a numeric array

    Args:
        sequences: The sequences to encode
        translation_table: If a translation table (in bytes) is provided, it will be used. If not, use alphabet_order
        alphabet_order: The alphabetical order of the amino acid alphabet. Can be either 1 or 3

    Returns:
        The numerically encoded sequence where each entry along axis=0 is the indexed amino acid. Indices are according
            to the 1 letter alphabetical amino acid
    """
    _array = np.array([list(sequence) for sequence in sequences], np.string_)
    if translation_table is not None:
        return np.vectorize(translation_table.__getitem__)(_array)
    else:
        if alphabet_order == 1:
            return np.vectorize(numerical_translation_alph1_bytes.__getitem__)(_array)
        elif alphabet_order == 3:
            raise NotImplementedError('Need to make the "numerical_translation_alph3_bytes" table')
            return np.vectorize(numerical_translation_alph3_bytes.__getitem__)(_array)
        else:
            raise ValueError(
                f"The 'alphabet_order' {alphabet_order} isn't valid. Choose from either 1 or 3")

pssm_as_array

pssm_as_array(pssm: ProfileDict, alphabet: str = protein_letters_alph1, lod: bool = False) -> ndarray

Convert a position specific profile matrix into a numeric array

Parameters:

  • pssm (ProfileDict) –

    {1: {'A': 0, 'R': 0, ..., 'lod': {'A': -5, 'R': -5, ...}, 'type': 'W', 'info': 3.20, 'weight': 0.73}, 2: {}, ...}

  • alphabet (str, default: protein_letters_alph1 ) –

    The amino acid alphabet to use. Array values will be returned in this order

  • lod (bool, default: False ) –

    Whether to return the array for the log of odds values

Returns:

  • ndarray

    The numerically encoded pssm where each entry along axis 0 is the position, and the entries on axis 1 are the frequency data at every indexed amino acid. Indices are according to the specified amino acid alphabet, i.e. array([[0.1, 0.01, 0.12, ...], ...])

Source code in symdesign/structure/sequence.py
def pssm_as_array(pssm: ProfileDict, alphabet: str = protein_letters_alph1, lod: bool = False) \
        -> np.ndarray:
    """Convert a position specific profile matrix into a numeric array

    Args:
        pssm: {1: {'A': 0, 'R': 0, ..., 'lod': {'A': -5, 'R': -5, ...}, 'type': 'W', 'info': 3.20, 'weight': 0.73},
                  2: {}, ...}
        alphabet: The amino acid alphabet to use. Array values will be returned in this order
        lod: Whether to return the array for the log of odds values

    Returns:
        The numerically encoded pssm where each entry along axis 0 is the position, and the entries on axis 1 are the
            frequency data at every indexed amino acid. Indices are according to the specified amino acid alphabet,
            i.e. array([[0.1, 0.01, 0.12, ...], ...])
    """
    if lod:
        return np.array([[position_info['lod'][aa] for aa in alphabet]
                         for position_info in pssm.values()], dtype=np.float32)
    else:
        return np.array([[position_info[aa] for aa in alphabet]
                         for position_info in pssm.values()], dtype=np.float32)
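
A sketch with a minimal two-position profile and a reduced alphabet (real entries cover the full alphabet):

pssm = {
    1: {'A': 0.9, 'C': 0.1, 'lod': {'A': 3, 'C': -2}, 'type': 'A', 'info': 1.0, 'weight': 0.5},
    2: {'A': 0.2, 'C': 0.8, 'lod': {'A': -1, 'C': 3}, 'type': 'C', 'info': 0.8, 'weight': 0.4},
}
pssm_as_array(pssm, alphabet='AC')            # array([[0.9, 0.1], [0.2, 0.8]], dtype=float32)
pssm_as_array(pssm, alphabet='AC', lod=True)  # array([[ 3., -2.], [-1.,  3.]], dtype=float32)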

concatenate_profile

concatenate_profile(profiles: Iterable[Any], start_at: int = 1) -> dict[int, Any]

Combine a list of profiles (parsed PSSMs) by incrementing the entry index for each additional profile

Parameters:

  • profiles (Iterable[Any]) –

    The profiles to concatenate

  • start_at (int, default: 1 ) –

    The integer to start the resulting dictionary at

Returns: The concatenated input profiles as a single PSSM: {1: {'A': 0.04, 'C': 0.12, ..., 'lod': {'A': -5, 'C': -9, ...}, 'type': 'W', 'info': 0.00, 'weight': 0.00}, ...}

Source code in symdesign/structure/sequence.py
def concatenate_profile(profiles: Iterable[Any], start_at: int = 1) -> dict[int, Any]:
    """Combine a list of profiles (parsed PSSMs) by incrementing the entry index for each additional profile

    Args:
        profiles: The profiles to concatenate
        start_at: The integer to start the resulting dictionary at

    Returns:
        The concatenated input profiles as a single PSSM
            {1: {'A': 0.04, 'C': 0.12, ..., 'lod': {'A': -5, 'C': -9, ...}, 'type': 'W', 'info': 0.00,
                 'weight': 0.00}, ...}
    """
    _count = count(start_at)
    return {next(_count): position_profile for profile in profiles for position_profile in profile.values()}
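
A sketch with abbreviated entries showing how the indices are renumbered:

profile_a = {1: {'type': 'A'}, 2: {'type': 'C'}}
profile_b = {1: {'type': 'W'}}
concatenate_profile([profile_a, profile_b])
# {1: {'type': 'A'}, 2: {'type': 'C'}, 3: {'type': 'W'}}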

write_pssm_file

write_pssm_file(pssm: ProfileDict, file_name: AnyStr = None, name: str = None, out_dir: AnyStr = os.getcwd()) -> AnyStr | None

Create a PSI-BLAST format PSSM file from a PSSM dictionary. Assumes residue numbering is correct!

Parameters:

  • pssm (ProfileDict) –

    A dictionary which has the keys: 'A', 'C', ... (all aa's), 'lod', 'type', 'info', 'weight'

  • file_name (AnyStr, default: None ) –

    The explicit name of the file

  • name (str, default: None ) –

    The name of the file. Will be used as the default file_name base name if file_name not provided

  • out_dir (AnyStr, default: getcwd() ) –

    The location on disk to output the file. Only used if file_name not explicitly provided

Returns:

  • AnyStr | None

    Disk location of newly created .pssm file

Source code in symdesign/structure/sequence.py
def write_pssm_file(pssm: ProfileDict, file_name: AnyStr = None, name: str = None,
                    out_dir: AnyStr = os.getcwd()) -> AnyStr | None:
    """Create a PSI-BLAST format PSSM file from a PSSM dictionary. Assumes residue numbering is correct!

    Args:
        pssm: A dictionary which has the keys: 'A', 'C', ... (all aa's), 'lod', 'type', 'info', 'weight'
        file_name: The explicit name of the file
        name: The name of the file. Will be used as the default file_name base name if file_name not provided
        out_dir: The location on disk to output the file. Only used if file_name not explicitly provided

    Returns:
        Disk location of newly created .pssm file
    """
    if not pssm:
        return None

    # Find out if the pssm has values expressed as frequencies (percentages) or as counts and modify accordingly
    if isinstance(list(pssm.values())[0]['lod']['A'], float):
        separation1 = " " * 4
    else:
        separation1 = " " * 3

    if file_name is None:
        if name is None:
            raise ValueError(f'Must provide argument "file_name" or "name" as a str to {write_pssm_file.__name__}')
        else:
            file_name = os.path.join(out_dir, name)

    if os.path.splitext(file_name)[-1] == '':  # No extension
        file_name = f'{file_name}.pssm'

    with open(file_name, 'w') as f:
        f.write(f'\n\n{" " * 12}{separation1.join(protein_letters_alph3)}'
                f'{separation1}{(" " * 3).join(protein_letters_alph3)}\n')
        for residue_number, profile in pssm.items():
            if isinstance(profile['lod']['A'], float):  # lod_freq:  # relevant for favor_fragment
                lod_string = ' '.join(f'{profile["lod"][aa]:>4.2f}' for aa in protein_letters_alph3) \
                    + ' '
            else:
                lod_string = ' '.join(f'{profile["lod"][aa]:>3d}' for aa in protein_letters_alph3) \
                    + ' '

            if isinstance(profile['A'], float):  # counts_freq: # relevant for freq calculations
                counts_string = ' '.join(f'{floor(profile[aa] * 100):>3.0f}' for aa in
                                         protein_letters_alph3) \
                    + ' '
            else:
                counts_string = ' '.join(f'{profile[aa]:>3d}' for aa in protein_letters_alph3) \
                    + ' '
            f.write(f'{residue_number:>5d} {profile["type"]:1s}   {lod_string:80s} {counts_string:80s} '
                    f'{round(profile.get("info", 0.), 4):4.2f} {round(profile.get("weight", 0.), 4):4.2f}\n')

    return file_name
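
A usage sketch, assuming pssm is a populated ProfileDict with the keys listed above and that the output directory exists:

path = write_pssm_file(pssm, name='design', out_dir='output')  # writes and returns 'output/design.pssm'
write_pssm_file({}, name='design')                             # an empty profile returns None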

format_frequencies

format_frequencies(frequency_list: list, flip: bool = False) -> dict[str, dict[str, float]]

Format list of paired frequency data into parsable paired format

Parameters:

  • frequency_list (list) –

    [(('D', 'A'), 0.0822), (('D', 'V'), 0.0685), ...]

  • flip (bool, default: False ) –

    Whether to invert the mapping of the internal tuple

Returns: {'A': {'S': 0.02, 'T': 0.12}, ...}

Source code in symdesign/structure/sequence.py
def format_frequencies(frequency_list: list, flip: bool = False) -> dict[str, dict[str, float]]:
    """Format list of paired frequency data into parsable paired format

    Args:
        frequency_list: [(('D', 'A'), 0.0822), (('D', 'V'), 0.0685), ...]
        flip: Whether to invert the mapping of the internal tuple
    Returns:
        {'A': {'S': 0.02, 'T': 0.12}, ...}
    """
    if flip:
        i, j = 1, 0
    else:
        i, j = 0, 1
    freq_d = {}
    for tup in frequency_list:
        aa_mapped = tup[0][i]  # 0
        aa_paired = tup[0][j]  # 1
        freq = tup[1]
        if aa_mapped in freq_d:
            freq_d[aa_mapped][aa_paired] = freq
        else:
            freq_d[aa_mapped] = {aa_paired: freq}

    return freq_d
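
A usage sketch:

pairs = [(('D', 'A'), 0.0822), (('D', 'V'), 0.0685), (('T', 'S'), 0.012)]
format_frequencies(pairs)
# {'D': {'A': 0.0822, 'V': 0.0685}, 'T': {'S': 0.012}}
format_frequencies(pairs, flip=True)
# {'A': {'D': 0.0822}, 'V': {'D': 0.0685}, 'S': {'T': 0.012}}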

overlap_consensus

overlap_consensus(issm, aa_set)

Find the overlap constrained consensus sequence

Parameters:

  • issm (dict) –

    {1: {'A': 0.1, 'C': 0.0, ...}, 14: {...}, ...}

  • aa_set (dict) –

    {residue: {'A', 'I', 'M', 'V'}, ...}

Returns: (dict): {23: 'T', 29: 'A', ...}

Source code in symdesign/structure/sequence.py
def overlap_consensus(issm, aa_set):  # UNUSED
    """Find the overlap constrained consensus sequence

    Args:
        issm (dict): {1: {'A': 0.1, 'C': 0.0, ...}, 14: {...}, ...}
        aa_set (dict): {residue: {'A', 'I', 'M', 'V'}, ...}
    Returns:
        (dict): {23: 'T', 29: 'A', ...}
    """
    consensus = {}
    for res in aa_set:
        max_freq = 0.0
        for aa in aa_set[res]:
            # if max_freq < issm[(res, partner)][]:
            if issm[res][aa] > max_freq:
                max_freq = issm[res][aa]
                consensus[res] = aa

    return consensus

get_cluster_dicts

get_cluster_dicts(db: str = putils.biological_interfaces, id_list: list[str] = None) -> dict[str, dict]

Generate an interface specific scoring matrix from the fragment library

Parameters:

  • db (str, default: biological_interfaces ) –

    The source of the fragment information

  • id_list (list[str], default: None ) –

    [1_2_24, ...]

Returns:

  • cluster_dict ( dict[str, dict] ) –

    {'1_2_45': {'size': ..., 'rmsd': ..., 'rep': ..., 'mapped': ..., 'paired': ...}, ...}

Source code in symdesign/structure/sequence.py
def get_cluster_dicts(db: str = putils.biological_interfaces, id_list: list[str] = None) -> dict[str, dict]:
    """Generate an interface specific scoring matrix from the fragment library

    Args:
        db: The source of the fragment information
        id_list: [1_2_24, ...]

    Returns:
         cluster_dict: {'1_2_45': {'size': ..., 'rmsd': ..., 'rep': ..., 'mapped': ..., 'paired': ...}, ...}
    """
    info_db = putils.frag_directory[db]
    if id_list is None:
        directory_list = utils.get_base_root_paths_recursively(info_db)
    else:
        directory_list = []
        for _id in id_list:
            c_id = _id.split('_')
            _dir = os.path.join(info_db, c_id[0], c_id[0] + '_' + c_id[1], c_id[0] + '_' + c_id[1] + '_' + c_id[2])
            directory_list.append(_dir)

    cluster_dict = {}
    for cluster in directory_list:
        filename = os.path.join(cluster, os.path.basename(cluster) + '.pkl')
        cluster_dict[os.path.basename(cluster)] = utils.unpickle(filename)

    return cluster_dict

fragment_overlap

fragment_overlap(residues, interaction_graph, freq_map)

Take fragment contact list to find the possible AA types allowed in fragment pairs from the contact list

Parameters:

  • residues (iter) –

    Iterable of residue numbers

  • interaction_graph (dict) –

    {52: [54, 56, 72, 206], ...}

  • freq_map (dict) –

    {(78, 87, ...): {'A': {'S': 0.02, 'T': 0.12}, ...}, ...}

Returns: overlap (dict): {residue: {'A', 'I', 'M', 'V'}, ...}

Source code in symdesign/structure/sequence.py
def fragment_overlap(residues, interaction_graph, freq_map):
    """Take fragment contact list to find the possible AA types allowed in fragment pairs from the contact list

    Args:
        residues (iter): Iterable of residue numbers
        interaction_graph (dict): {52: [54, 56, 72, 206], ...}
        freq_map (dict): {(78, 87, ...): {'A': {'S': 0.02, 'T': 0.12}, ...}, ...}
    Returns:
        overlap (dict): {residue: {'A', 'I', 'M', 'V'}, ...}
    """
    overlap = {}
    for res in residues:
        overlap[res] = set()
        if res in interaction_graph:  # check for existence as some fragment info is not in the interface set
            # overlap[res] = set()
            for partner in interaction_graph[res]:
                if (res, partner) in freq_map:
                    overlap[res] |= set(freq_map[(res, partner)].keys())

    for res in residues:
        if res in interaction_graph:  # check for existence as some fragment info is not in the interface set
            for partner in interaction_graph[res]:
                if (res, partner) in freq_map:
                    overlap[res] &= set(freq_map[(res, partner)].keys())

    return overlap
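
A sketch with toy inputs. Residue 52 has two fragment partners; only the amino acid types allowed by every partner survive the intersection:

residues = [52, 53]
interaction_graph = {52: [54, 56]}
freq_map = {(52, 54): {'A': 0.5, 'V': 0.5}, (52, 56): {'V': 0.7, 'I': 0.3}}
fragment_overlap(residues, interaction_graph, freq_map)
# {52: {'V'}, 53: set()}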

offset_index

offset_index(dictionary: dict[int, Any], to_zero: bool = False) -> dict[int, dict]

Modify the index of a sequence dictionary. The default shifts to one-indexed; to_zero=True gives zero-indexed

Source code in symdesign/structure/sequence.py
def offset_index(dictionary: dict[int, Any], to_zero: bool = False) -> dict[int, dict]:
    """Modify the index of a sequence dictionary. Default is to one-indexed. to_zero=True gives zero-indexed"""
    if to_zero:
        return {residue - ZERO_OFFSET: dictionary[residue] for residue in dictionary}
    else:
        return {residue + ZERO_OFFSET: dictionary[residue] for residue in dictionary}
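
A usage sketch, assuming ZERO_OFFSET == 1 as the one-indexed default implies:

zero_indexed = {0: {'A': 0.1}, 1: {'A': 0.3}}
one_indexed = offset_index(zero_indexed)     # {1: {'A': 0.1}, 2: {'A': 0.3}}
offset_index(one_indexed, to_zero=True)      # {0: {'A': 0.1}, 1: {'A': 0.3}}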

residue_object_to_number

residue_object_to_number(residue_dict: dict[str, Iterable['structure.base.Residue']]) -> dict[str, list[tuple[int, ...]]]

Convert sets of PDB.Residue objects to residue numbers

Parameters:

  • residue_dict (dict) –

    {'key1': [(residue1_ca_atom, residue2_ca_atom, ...), ...] ...}

Returns: residue_dict (dict): {'key1': [(78, 87, ...),], ...} - Entry mapped to residue sets

Source code in symdesign/structure/sequence.py
def residue_object_to_number(
    residue_dict: dict[str, Iterable['structure.base.Residue']]
) -> dict[str, list[tuple[int, ...]]]:
    """Convert sets of PDB.Residue objects to residue numbers

    Args:
        residue_dict (dict): {'key1': [(residue1_ca_atom, residue2_ca_atom, ...), ...] ...}
    Returns:
        residue_dict (dict): {'key1': [(78, 87, ...),], ...} - Entry mapped to residue sets
    """
    for entry, residue_sets in residue_dict.items():
        # Replace each set of Residue objects with a tuple of their residue numbers
        residue_dict[entry] = [tuple(residue.residue_number for residue in residue_set)
                               for residue_set in residue_sets]

    return residue_dict

convert_to_residue_cluster_map

convert_to_residue_cluster_map(residue_cluster_dict, frag_range)

Make a residue and cluster/fragment index map

Parameters:

  • residue_cluster_dict (dict) –

    {'1_2_45': [(residue1_ca_atom, residue2_ca_atom), ...] ...}

  • frag_range (tuple) –

    A range of the fragment size to search over. Ex: (-2, 3) for fragments of length 5

Returns: cluster_map (dict): {48: {'source': 'mapped', 'cluster': [(-2, 1_1_54), ...]}, ...} Where the key is the 0 indexed residue id

Source code in symdesign/structure/sequence.py
def convert_to_residue_cluster_map(residue_cluster_dict, frag_range):
    """Make a residue and cluster/fragment index map

    Args:
        residue_cluster_dict (dict): {'1_2_45': [(residue1_ca_atom, residue2_ca_atom), ...] ...}
        frag_range (tuple): A range of the fragment size to search over. Ex: (-2, 3) for fragments of length 5
    Returns:
        cluster_map (dict): {48: {'source': 'mapped', 'cluster': [(-2, 1_1_54), ...]}, ...}
            Where the key is the 0 indexed residue id
    """
    cluster_map = {}
    for cluster in residue_cluster_dict:
        for pair in range(len(residue_cluster_dict[cluster])):
            for i, residue_atom in enumerate(residue_cluster_dict[cluster][pair]):
                # for each residue in map add the same cluster to the range of fragment residue numbers
                residue_num = residue_atom.residue_number - ZERO_OFFSET  # zero index
                for j in range(*frag_range):
                    if residue_num + j not in cluster_map:
                        if i == 0:
                            cluster_map[residue_num + j] = {'source': 'mapped', 'cluster': []}
                        else:
                            cluster_map[residue_num + j] = {'source': 'paired', 'cluster': []}
                    cluster_map[residue_num + j]['cluster'].append((j, cluster))

    return cluster_map
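
A hedged sketch with stand-in residue objects; only the .residue_number attribute is accessed, so SimpleNamespace suffices for illustration, and the import path is assumed.

from types import SimpleNamespace

from symdesign.structure.sequence import convert_to_residue_cluster_map  # Assumed import path

# Stand-ins for the mapped and paired fragment residues of cluster 1_2_45
res48, res122 = SimpleNamespace(residue_number=48), SimpleNamespace(residue_number=122)
residue_cluster_dict = {'1_2_45': [(res48, res122)]}

cluster_map = convert_to_residue_cluster_map(residue_cluster_dict, frag_range=(-2, 3))
print(cluster_map[47])   # {'source': 'mapped', 'cluster': [(0, '1_2_45')]} - zero-indexed residue 47 is position 48
print(cluster_map[121])  # {'source': 'paired', 'cluster': [(0, '1_2_45')]}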

consensus_sequence

consensus_sequence(pssm)

Return the consensus sequence from a PSSM

Parameters:

  • pssm (dict) –

    pssm dictionary

Returns: consensus_identities (dict): {1: 'M', 2: 'H', ...} One-indexed

Source code in symdesign/structure/sequence.py
def consensus_sequence(pssm):
    """Return the consensus sequence from a PSSM

    Args:
        pssm (dict): pssm dictionary
    Returns:
        consensus_identities (dict): {1: 'M', 2: 'H', ...} One-indexed
    """
    consensus_identities = {}
    for residue in pssm:
        max_lod = 0
        max_res = pssm[residue]['type']
        for aa in protein_letters_alph3:
            if pssm[residue]['lod'][aa] > max_lod:
                max_lod = pssm[residue]['lod'][aa]
                max_res = aa
        consensus_identities[residue + ZERO_OFFSET] = max_res

    return consensus_identities
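
A minimal sketch, assuming the import path below; the pssm carries only the fields consensus_sequence reads. Position 1 has no positive lod, so its native type is kept, while position 2 takes the highest positive lod.

from symdesign.structure.sequence import consensus_sequence, protein_letters_alph3  # Assumed import path

flat_lods = dict.fromkeys(protein_letters_alph3, 0)
pssm = {
    0: {'type': 'M', 'lod': dict(flat_lods)},
    1: {'type': 'H', 'lod': {**flat_lods, 'K': 3}},
}
print(consensus_sequence(pssm))  # {1: 'M', 2: 'K'}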

sequence_difference

sequence_difference(seq1: Sequence, seq2: Sequence, d: dict = None, matrix: str = 'BLOSUM62') -> float

Returns the sequence difference between two sequence iterators

Parameters:

  • seq1 (Sequence) –

    Either an iterable with residue type as array, or key, with residue type as d[seq1][residue]['type']

  • seq2 (Sequence) –

    Either an iterable with residue type as array, or key, with residue type as d[seq2][residue]['type']

  • d (dict, default: None ) –

    The dictionary to look up seq1 and seq2 if they are keys in d

  • matrix (str, default: 'BLOSUM62' ) –

    The type of matrix to score the sequence differences on

Returns: The computed sequence difference between seq1 and seq2

Source code in symdesign/structure/sequence.py
def sequence_difference(seq1: Sequence, seq2: Sequence, d: dict = None, matrix: str = 'BLOSUM62') -> float:  # TODO AMS
    """Returns the sequence difference between two sequence iterators

    Args:
        seq1: Either an iterable with residue type as array, or key, with residue type as d[seq1][residue]['type']
        seq2: Either an iterable with residue type as array, or key, with residue type as d[seq2][residue]['type']
        d: The dictionary to look up seq1 and seq2 if they are keys in d
        matrix: The type of matrix to score the sequence differences on
    Returns:
        The computed sequence difference between seq1 and seq2
    """
    if d is not None:
        pairs = [(d[seq1][residue]['type'], d[seq2][residue]['type']) for residue in d[seq1]]
    else:
        pairs = zip(seq1, seq2)

    matrix_ = substitution_matrices.load(matrix)
    # Try the pair in both orientations; score characters missing from the matrix (e.g. gaps) as 0
    scores = [matrix_.get((letter1, letter2), matrix_.get((letter2, letter1), 0))
              for letter1, letter2 in pairs]

    return sum(scores)
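
A usage sketch with two short sequences, assuming the import path below; the result is the sum of BLOSUM62 values over the aligned pairs.

from symdesign.structure.sequence import sequence_difference  # Assumed import path

# BLOSUM62: (M, M) = 5, (K, R) = 2, (V, V) = 4
print(sequence_difference('MKV', 'MRV'))  # 11.0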

msa_from_dictionary

msa_from_dictionary(named_sequences: dict[str, str]) -> MultipleSeqAlignment

Create a BioPython MultipleSeqAlignment from a dictionary of named sequences

Parameters:

  • named_sequences (dict[str, str]) –

    {name: sequence, ...} ex: {'clean_asu': 'MNTEELQVAAFEI...', ...}

Returns: The MultipleSeqAlignment object for the provided sequences

Source code in symdesign/structure/sequence.py
def msa_from_dictionary(named_sequences: dict[str, str]) -> MultipleSeqAlignment:
    """Create a MultipleSequenceAlignment from a dictionary of named sequences

    Args:
        named_sequences: {name: sequence, ...} ex: {'clean_asu': 'MNTEELQVAAFEI...', ...}
    Returns:
        The MultipleSequenceAlignment object for the provided sequences
    """
    return MultipleSeqAlignment([SeqRecord(Seq(sequence), annotations={'molecule_type': 'Protein'}, id=name)
                                 for name, sequence in named_sequences.items()])
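
A usage sketch, assuming the import path below; the sequences must already be aligned (equal length), since MultipleSeqAlignment enforces a rectangular alignment.

from symdesign.structure.sequence import msa_from_dictionary  # Assumed import path

alignment = msa_from_dictionary({'design1': 'MKV-LL', 'design2': 'MRVALL'})
print(len(alignment))                    # 2 records
print(alignment.get_alignment_length())  # 6 columns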

msa_from_seq_records

msa_from_seq_records(seq_records: Iterable[SeqRecord]) -> MultipleSeqAlignment

Create a BioPython Multiple Sequence Alignment from a SeqRecord Iterable

Parameters:

  • seq_records (Iterable[SeqRecord]) –

    An iterable of SeqRecord objects, e.g. [SeqRecord(Seq('MNTEELQVAAFEI...', ...), id="Alpha"), SeqRecord(Seq('MNTEEL-VAAFEI...', ...), id="Beta"), ...]

Returns: The MultipleSeqAlignment object built from the provided SeqRecords

Source code in symdesign/structure/sequence.py
def msa_from_seq_records(seq_records: Iterable[SeqRecord]) -> MultipleSeqAlignment:
    """Create a BioPython Multiple Sequence Alignment from a SeqRecord Iterable

    Args:
        seq_records: An iterable of SeqRecord objects, e.g.
            [SeqRecord(Seq('MNTEELQVAAFEI...', ...), id="Alpha"),
             SeqRecord(Seq('MNTEEL-VAAFEI...', ...), id="Beta"), ...]
    Returns:
        The MultipleSeqAlignment object built from the provided SeqRecords
    """
    return MultipleSeqAlignment(seq_records)

make_mutations_chain_agnostic

make_mutations_chain_agnostic(mutations)

Remove chain identifier from mutation dictionary

Parameters:

  • mutations (dict) –

    {design: {chain_id: {mutation_index: {'from': 'A', 'to': 'K'}, ...}, ...}, ...}

Returns: (dict): {design: {mutation_index: {'from': 'A', 'to': 'K'}, ...}, ...}

Source code in symdesign/structure/sequence.py
def make_mutations_chain_agnostic(mutations):
    """Remove chain identifier from mutation dictionary

    Args:
        mutations (dict): {design: {chain_id: {mutation_index: {'from': 'A', 'to': 'K'}, ...}, ...}, ...}
    Returns:
        (dict): {design: {mutation_index: {'from': 'A', 'to': 'K'}, ...}, ...}
    """
    flattened_mutations = {}
    for design, chain_mutations in mutations.items():
        flattened_mutations[design] = {}
        # Use a distinct name to avoid shadowing the 'mutations' argument
        for chain, chain_specific_mutations in chain_mutations.items():
            flattened_mutations[design].update(chain_specific_mutations)

    return flattened_mutations
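
A toy example, assuming the import path below; mutations from chains A and B are merged into one chain-agnostic dictionary per design.

from symdesign.structure.sequence import make_mutations_chain_agnostic  # Assumed import path

mutations = {
    'design1': {'A': {15: {'from': 'A', 'to': 'K'}},
                'B': {22: {'from': 'S', 'to': 'T'}}},
}
print(make_mutations_chain_agnostic(mutations))
# {'design1': {15: {'from': 'A', 'to': 'K'}, 22: {'from': 'S', 'to': 'T'}}}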

simplify_mutation_dict

simplify_mutation_dict(mutations: dict[str, mutation_dictionary], to: bool = True) -> dict[str, mutation_dictionary]

Simplify mutation dictionary to 'to'/'from' AA key

Parameters:

  • mutations (dict[str, mutation_dictionary]) –

    Ex: {alias: {mutation_index: {'from': 'A', 'to': 'K'}, ...}, ...}

  • to (bool, default: True ) –

    Whether to simplify with the 'to' AA key (True) or the 'from' AA key (False)

Returns: The simplified mutation dictionary. Ex: {alias: {mutation_index: 'K', ...}, ...}

Source code in symdesign/structure/sequence.py
def simplify_mutation_dict(mutations: dict[str, mutation_dictionary], to: bool = True) \
        -> dict[str, mutation_dictionary]:
    """Simplify mutation dictionary to 'to'/'from' AA key

    Args:
        mutations: Ex: {alias: {mutation_index: {'from': 'A', 'to': 'K'}, ...}, ...}
        to: Whether to simplify with the 'to' AA key (True) or the 'from' AA key (False)
    Returns:
        The simplified mutation dictionary. Ex: {alias: {mutation_index: 'K', ...}, ...}
    """
    simplification = 'to' if to else 'from'

    for alias, indexed_mutations in mutations.items():
        for index, mutation in indexed_mutations.items():
            mutations[alias][index] = mutation[simplification]

    return mutations
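
A toy example, assuming the import path below; note that the input dictionary is simplified in place, so rebuild it before calling again with the other key.

from symdesign.structure.sequence import simplify_mutation_dict  # Assumed import path

mutations = {'design1': {15: {'from': 'A', 'to': 'K'}, 22: {'from': 'S', 'to': 'T'}}}
print(simplify_mutation_dict(mutations))  # {'design1': {15: 'K', 22: 'T'}}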

weave_mutation_dict

weave_mutation_dict(sorted_freq, mut_prob, resi_divergence, int_divergence, des_divergence)

Weave per-residue amino acid frequencies and divergence scores into a final, one-indexed dictionary

Parameters:

  • sorted_freq (dict) –

    {15: ['S', 'A', 'T'], ... }

  • mut_prob (dict) –

    {15: {'A': 0.05, 'C': 0.001, 'D': 0.1, ...}, 16: {}, ...}

  • resi_divergence (dict) –

    {15: 0.732, 16: 0.552, ...}

  • int_divergence (dict) –

    {15: 0.732, 16: 0.552, ...}

  • des_divergence (dict) –

    {15: 0.732, 16: 0.552, ...}

Returns: weaved_dict (dict): {16: {'S': 0.134, 'A': 0.050, ..., 'jsd': 0.732, 'int_jsd': 0.412, 'des_jsd': 0.401}, ...}

Source code in symdesign/structure/sequence.py
def weave_mutation_dict(sorted_freq, mut_prob, resi_divergence, int_divergence, des_divergence):
    """Make final dictionary, index to sequence

    Args:
        sorted_freq (dict): {15: ['S', 'A', 'T'], ... }
        mut_prob (dict): {15: {'A': 0.05, 'C': 0.001, 'D': 0.1, ...}, 16: {}, ...}
        resi_divergence (dict): {15: 0.732, 16: 0.552, ...}
        int_divergence (dict): {15: 0.732, 16: 0.552, ...}
        des_divergence (dict): {15: 0.732, 16: 0.552, ...}
    Returns:
        weaved_dict (dict): {16: {'S': 0.134, 'A': 0.050, ..., 'jsd': 0.732, 'int_jsd': 0.412, 'des_jsd': 0.401}, ...}
    """
    weaved_dict = {}
    for residue in sorted_freq:
        final_resi = residue + ZERO_OFFSET
        weaved_dict[final_resi] = {}
        for aa in sorted_freq[residue]:
            weaved_dict[final_resi][aa] = round(mut_prob[residue][aa], 3)
        weaved_dict[final_resi]['jsd'] = resi_divergence[residue]
        weaved_dict[final_resi]['int_jsd'] = int_divergence[residue]
        weaved_dict[final_resi]['des_jsd'] = des_divergence[residue]

    return weaved_dict
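
A toy example for a single position, assuming the import path below; the zero-indexed input position 15 becomes the one-indexed key 16, with probabilities rounded to three decimals.

from symdesign.structure.sequence import weave_mutation_dict  # Assumed import path

sorted_freq = {15: ['S', 'A']}
mut_prob = {15: {'S': 0.1319, 'A': 0.05}}
resi_divergence = {15: 0.732}
int_divergence = {15: 0.412}
des_divergence = {15: 0.401}

print(weave_mutation_dict(sorted_freq, mut_prob, resi_divergence, int_divergence, des_divergence))
# {16: {'S': 0.132, 'A': 0.05, 'jsd': 0.732, 'int_jsd': 0.412, 'des_jsd': 0.401}}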

clean_gaped_columns

clean_gaped_columns(alignment_dict, correct_index)

Clean an alignment dictionary by keeping only the columns listed in correct_index, re-keyed as contiguous, zero-indexed positions

Source code in symdesign/structure/sequence.py
def clean_gaped_columns(alignment_dict, correct_index):  # UNUSED
    """Cleans an alignment dictionary by revising key list with correctly indexed positions. 0 indexed"""
    return {i: alignment_dict[index] for i, index in enumerate(correct_index)}

msa_to_prob_distribution

msa_to_prob_distribution(alignment)

Turn Alignment dictionary into a probability distribution

Parameters:

  • alignment (dict) –

    {'meta': {'num_sequences': 214, 'query': 'MGSTHLVLK...', 'query_with_gaps': 'MGS--THLVLK...'},
     'msa': (Bio.Align.MultipleSeqAlignment),
     'counts': {1: {'A': 13, 'C': 1, 'D': 23, ...}, 2: {}, ...},
     'frequencies': {1: {'A': 0.05, 'C': 0.001, 'D': 0.1, ...}, 2: {}, ...},
     'rep': {1: 210, 2: 211, ...}}
    The msa formatted with counts and indexed by residue

Returns: (dict): The same alignment dictionary with 'frequencies' populated from 'counts' divided by 'rep':
    {'meta': {'num_sequences': 214, 'query': 'MGSTHLVLK...', 'query_with_gaps': 'MGS--THLVLK...'},
     'msa': (Bio.Align.MultipleSeqAlignment),
     'counts': {1: {'A': 13, 'C': 1, 'D': 23, ...}, 2: {}, ...},
     'frequencies': {1: {'A': 0.05, 'C': 0.001, 'D': 0.1, ...}, 2: {}, ...},
     'rep': {1: 210, 2: 211, ...}}

Source code in symdesign/structure/sequence.py
def msa_to_prob_distribution(alignment):
    """Turn Alignment dictionary into a probability distribution

    Args:
        alignment (dict): {'meta': {'num_sequences': 214, 'query': 'MGSTHLVLK...', 'query_with_gaps': 'MGS--THLVLK...'},
                           'msa': (Bio.Align.MultipleSeqAlignment)
                           'counts': {1: {'A': 13, 'C': 1, 'D': 23, ...}, 2: {}, ...},
                           'frequencies': {1: {'A': 0.05, 'C': 0.001, 'D': 0.1, ...}, 2: {}, ...},
                           'rep': {1: 210, 2:211, ...}}
            The msa formatted with counts and indexed by residue
    Returns:
        (dict): {'meta': {'num_sequences': 214, 'query': 'MGSTHLVLK...', 'query_with_gaps': 'MGS--THLVLK...'},
                 'msa': (Bio.Align.MultipleSeqAlignment)
                 'counts': {1: {'A': 13, 'C': 1, 'D': 23, ...}, 2: {}, ...},
                 'frequencies': {1: {'A': 0.05, 'C': 0.001, 'D': 0.1, ...}, 2: {}, ...},
                 'rep': {1: 210, 2:211, ...}}
            The msa formatted with counts and indexed by residue
    """
    alignment['frequencies'] = {}
    for residue, amino_acid_counts in alignment['counts'].items():
        total_column_weight = alignment['rep'][residue]
        assert total_column_weight != 0, '%s: Processing error... Downstream cannot divide by 0. Position = %s' \
                                         % (msa_to_prob_distribution.__name__, residue)
        alignment['frequencies'][residue] = {aa: count / total_column_weight for aa, count in amino_acid_counts.items()}

    return alignment
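
A toy sketch using only the fields the function reads, assuming the import path below; each count is divided by that column's 'rep' weight to populate 'frequencies'.

from symdesign.structure.sequence import msa_to_prob_distribution  # Assumed import path

alignment = {
    'counts': {1: {'A': 15, 'C': 5}, 2: {'A': 2, 'C': 8}},
    'rep': {1: 20, 2: 10},
}
alignment = msa_to_prob_distribution(alignment)
print(alignment['frequencies'])  # {1: {'A': 0.75, 'C': 0.25}, 2: {'A': 0.2, 'C': 0.8}}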

window_score

window_score(scores: dict[int, float] | Sequence[float], window_len: int, lambda_: float = 0.5) -> dict

Takes an MSA score dict and transforms it so that each position is a weighted average of the surrounding positions. Positions with scores less than zero are not changed and are ignored in the calculation

Modified from Capra and Singh 2007 code

Notes

The input should be a one-indexed dictionary (if a dictionary)

Parameters:

  • scores (dict[int, float] | Sequence[float]) –

    A dictionary with scores.

  • window_len (int) –

    Number of residues on either side of the current residue

  • lambda_ (float, default: 0.5 ) –

    Float between 0 and 1 which parameterizes the amount of the score to pull from

Returns:

  • dict

    A dictionary with the modified scores for the specified window, one-indexed

Source code in symdesign/structure/sequence.py
def window_score(scores: dict[int, float] | Sequence[float], window_len: int, lambda_: float = 0.5) -> dict:  # UNUSED  incorporate into MultipleSequenceAlignment
    """Takes an MSA score dict and transforms so that each position is a weighted average of the surrounding positions.
    Positions with scores less than zero are not changed and are ignored calculation

    Modified from Capra and Singh 2007 code

    Notes:
        The input should be a one-indexed dictionary (if a dictionary)

    Args:
        scores: A dictionary with scores.
        window_len: Number of residues on either side of the current residue
        lambda_: Float between 0 and 1 which parameterizes the amount of the score to pull from

    Returns:
        A dictionary with the modified scores for the specified window, one-indexed
    """
    if window_len == 0:
        try:
            return dict(enumerate(scores.values(), ZERO_OFFSET))
        except AttributeError:  # Not a dict
            return dict(enumerate(scores, ZERO_OFFSET))
    else:
        window_scores = {}
        number_scores = len(scores)
        for i in range(ZERO_OFFSET, number_scores + ZERO_OFFSET):  # Start at 1 to match the one-indexed input
            s = number_terms = 0
            if i <= window_len:
                for j in range(ZERO_OFFSET, i + window_len + ZERO_OFFSET):
                    if i != j:
                        number_terms += 1
                        s += scores[j]
            elif i + window_len > number_scores:
                for j in range(i - window_len, number_scores + ZERO_OFFSET):
                    if i != j:
                        number_terms += 1
                        s += scores[j]
            else:
                for j in range(i - window_len, i + window_len + ZERO_OFFSET):
                    if i != j:
                        number_terms += 1
                        s += scores[j]
            window_scores[i] = (1 - lambda_) * (s / number_terms) + lambda_ * scores[i]

        return window_scores
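
With the one-indexed loop above, a toy smoothing sketch (import path assumed; the function is flagged UNUSED upstream): each position becomes half the average of its neighbors plus half its own score.

from symdesign.structure.sequence import window_score  # Assumed import path

scores = {1: 0.2, 2: 0.5, 3: 0.9, 4: 0.4, 5: 0.1}  # One-indexed, per the Notes
smoothed = window_score(scores, window_len=1, lambda_=0.5)
print(round(smoothed[3], 3))  # 0.675 = 0.5 * ((0.5 + 0.4) / 2) + 0.5 * 0.9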

rank_possibilities

rank_possibilities(probability_dict)

Gather alternative residues and sort them by probability.

Parameters:

  • probability_dict (dict) –

    {15: {'A': 0.05, 'C': 0.001, 'D': 0.1, ...}, 16: {}, ...}

Returns: sorted_alternates_dict (dict): {15: ['S', 'A', 'T'], ... }

Source code in symdesign/structure/sequence.py
def rank_possibilities(probability_dict):  # UNUSED  incorporate into MultipleSequenceAlignment
    """Gather alternative residues and sort them by probability.

    Args:
        probability_dict (dict): {15: {'A': 0.05, 'C': 0.001, 'D': 0.1, ...}, 16: {}, ...}
    Returns:
         sorted_alternates_dict (dict): {15: ['S', 'A', 'T'], ... }
    """
    sorted_alternates_dict = {}
    for residue in probability_dict:
        residue_probability_list = []
        for aa in probability_dict[residue]:
            if probability_dict[residue][aa] > 0:
                residue_probability_list.append((aa, round(probability_dict[residue][aa], 5)))  # tuple instead of list
        residue_probability_list.sort(key=lambda tup: tup[1], reverse=True)
        # [('S', 0.13190), ('A', 0.0500), ...]
        sorted_alternates_dict[residue] = [aa[0] for aa in residue_probability_list]

    return sorted_alternates_dict
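
A toy example, assuming the import path below; zero-probability types are dropped and the rest are returned ordered from most to least probable.

from symdesign.structure.sequence import rank_possibilities  # Assumed import path

probabilities = {15: {'A': 0.05, 'S': 0.1319, 'T': 0.0, 'G': 0.02}}
print(rank_possibilities(probabilities))  # {15: ['S', 'A', 'G']}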

multi_chain_alignment

multi_chain_alignment(mutated_sequences, **kwargs)

Combine different chains' Multiple Sequence Alignments into a single MSA. One-indexed

Parameters:

  • mutated_sequences (dict) –

    {chain: {name: sequence, ...}, ...}

Returns: (MultipleSequenceAlignment): The MSA object with counts, frequencies, sequences, and indexed by residue

Source code in symdesign/structure/sequence.py
def multi_chain_alignment(mutated_sequences, **kwargs):
    """Combines different chain's Multiple Sequence Alignments into a single MSA. One-indexed

    Args:
        mutated_sequences (dict): {chain: {name: sequence, ...}, ...}
    Returns:
        (MultipleSequenceAlignment): The MSA object with counts, frequencies, sequences, and indexed by residue
    """
    #         (dict): {'meta': {'num_sequences': 214, 'query': 'MGSTHLVLK...', 'query_with_gaps': 'MGS--THLVLK...'},
    #                  'msa': (Bio.Align.MultipleSeqAlignment)
    #                  'counts': {1: {'A': 13, 'C': 1, 'D': 23, ...}, 2: {}, ...},
    #                  'frequencies': {1: {'A': 0.05, 'C': 0.001, 'D': 0.1, ...}, 2: {}, ...},
    #                  'rep': {1: 210, 2:211, ...}}
    #             The msa formatted with counts and indexed by residue

    # Combine alignments for all chains from design file Ex: A: 1-102, B: 1-130. Alignment: 1-232
    total_alignment = None
    for idx, named_sequences in enumerate(mutated_sequences.values()):
        if idx == 0:
            total_alignment = msa_from_dictionary(named_sequences)[:, :]
        else:
            total_alignment += msa_from_dictionary(named_sequences)[:, :]

    if total_alignment:
        return MultipleSequenceAlignment(alignment=total_alignment, **kwargs)
    else:
        raise RuntimeError(f'{multi_chain_alignment.__name__} - No sequences were found!')
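
A hedged sketch combining two chains for two designs, assuming the import path below; both chains must list the designs in the same order so that rows are concatenated correctly.

from symdesign.structure.sequence import multi_chain_alignment  # Assumed import path

mutated_sequences = {
    'A': {'design1': 'MKV', 'design2': 'MRV'},
    'B': {'design1': 'LLE', 'design2': 'LIE'},
}
# Columns 1-3 come from chain A and columns 4-6 from chain B
msa = multi_chain_alignment(mutated_sequences)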