Skip to content

expression

Add expression tags onto the termini of specific designs

get_sequence_features

get_sequence_features(sequence: Sequence[str | int]) -> dict[str, float]

For the specified amino acid sequence perform biochemical calculations related to solution properties

Parameters:

  • sequence (Sequence[str | int]) –

    The sequence to measure

Returns:

  • dict[str, float]

    A feature dictionary with each feature from the set below mapped to a float.

  • dict[str, float]

    {'number_of_residues', 'extinction_coefficient_reduced', 'extinction_coefficient_oxidized', 'instability_index', 'molecular_weight', 'isoelectric_point', 'absorbance_0.1_red', 'absorbance_0.1_ox'}

Source code in symdesign/sequence/expression.py
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
def get_sequence_features(sequence: Sequence[str | int]) -> dict[str, float]:
    """Measure a set of solution-related biochemical properties for an amino acid sequence

    The sequence is converted to its numeric form once and that form is handed to each of the individual calculations

    Args:
        sequence: The sequence to measure

    Returns:
        A feature dictionary with each feature from the set below mapped to a number.
        {'number_of_residues',
         'extinction_coefficient_reduced',
         'extinction_coefficient_oxidized',
         'instability_index',
         'molecular_weight',
         'isoelectric_point',
         'absorbance_0.1_red',
         'absorbance_0.1_ox',
         }
    """
    numeric_sequence = format_sequence_to_numeric(sequence)
    oxidized_coef, reduced_coef = molecular_extinction_coefficient(numeric_sequence)
    molecular_weight = calculate_protein_molecular_weight(numeric_sequence)
    # The 'absorbance_0.1' features are each extinction coefficient divided by the molecular weight
    oxidized_absorbance = oxidized_coef / molecular_weight
    reduced_absorbance = reduced_coef / molecular_weight

    features = {'number_of_residues': len(numeric_sequence)}
    features['extinction_coefficient_reduced'] = reduced_coef
    features['extinction_coefficient_oxidized'] = oxidized_coef
    features['instability_index'] = calculate_instability_index(numeric_sequence)
    features['molecular_weight'] = molecular_weight
    features['isoelectric_point'] = calculate_protein_isoelectric_point(numeric_sequence)
    features['absorbance_0.1_red'] = reduced_absorbance
    features['absorbance_0.1_ox'] = oxidized_absorbance
    return features

calculate_protein_molecular_weight

calculate_protein_molecular_weight(sequence: Sequence[str | int]) -> float

Find the total molecular mass for the amino acids present in a sequence

Parameters:

  • sequence (Sequence[str | int]) –

    The sequence to measure

Returns:

  • float

    The molecular weight in daltons/atomic mass units

Source code in symdesign/sequence/expression.py
74
75
76
77
78
79
80
81
82
83
84
85
86
def calculate_protein_molecular_weight(sequence: Sequence[str | int]) -> float:
    """Sum the molecular mass of every amino acid present in a sequence

    Args:
        sequence: The sequence to measure

    Returns:
        The molecular weight in daltons/atomic mass units
    """
    numeric_sequence = format_sequence_to_numeric(sequence)
    # Look up the in-polymer mass of each residue by its numeric index
    residue_masses = aa_polymer_molecular_weights[numeric_sequence]
    # The n- and c-termini are free, so one h2o_mass is added to the summed residue masses
    return residue_masses.sum() + h2o_mass

calculate_protein_isoelectric_point

calculate_protein_isoelectric_point(sequence: Sequence[str | int], threshold: float = 0.01) -> float

Find the total isoelectric point (pI) for the amino acids present in a sequence

Parameters:

  • sequence (Sequence[str | int]) –

    The sequence to measure

  • threshold (float, default: 0.01 ) –

    The pH unit value to consider the found pI passing

Returns:

  • float

    The estimated isoelectric point (pI) of the sequence in pH units

Source code in symdesign/sequence/expression.py
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
def calculate_protein_isoelectric_point(sequence: Sequence[str | int], threshold: float = 0.01) -> float:
    """Find the total isoelectric point (pI) for the amino acids present in a sequence

    Args:
        sequence: The sequence to measure
        threshold: The pH unit value to consider the found pI passing

    Returns:
        The molecular weight in daltons/atomic mass units
    """
    seq_index = format_sequence_to_numeric(sequence)
    # # Take the mean of every individual pI
    # return aa_isoelectric_point[seq_index].mean()

    n_cys = seq_index.count(cys_index)
    n_asp = seq_index.count(asp_index)
    n_glu = seq_index.count(glu_index)
    n_his = seq_index.count(his_index)
    n_lys = seq_index.count(lys_index)
    n_arg = seq_index.count(arg_index)
    n_tyr = seq_index.count(tyr_index)

    # def calculate_aa_charge_contribution(aa: str, pka_sol: float = 7.5):
    #     negative_charged = {'C': 7.555, 'D': 3.872, 'E': 4.412, 'Y': 10.85}
    #     positive_charged = {'H': 5.637, 'K': 9.052, 'R': 11.84}
    #     pka_aa = positive_charged.get(aa)
    #     if pka_aa:
    #         pka_aa = 1 / (1 + 10**(pka_sol-pka_aa))  # positive
    #     else:
    #         pka_aa = -1 / (1 + 10**(negative_charged[aa]-pka_sol))  # negative

    # Array version
    # Make an array of the counts padding respective ends with n- and c-termini counts (i.e. 1)
    positive_counts = np.array([1, n_his, n_lys, n_arg])
    negative_counts = np.array([n_cys, n_asp, n_glu, n_tyr, 1])

    iterative_multiplier = 1
    delta_magnitude = 4
    test_pi = 7.  # - delta_magnitude  # <- ensures that the while loop search starts at 7.
    # pka_aa_pos = 1 / (1 + 10 ** (test_pi - positive_charged_pka))  # positive
    # pka_aa_neg = -1 / (1 + 10 ** (negative_charged_pka - test_pi))  # negative
    # pos_charge = pka_aa_pos * positive_counts
    # neg_charge = pka_aa_neg * negative_counts
    # total_charge = neg_charge + pos_charge
    # remaining_charge = abs(total_charge)
    # negative_last = abs(neg_charge) > abs(pos_charge)
    # neg_charge = pos_charge = negative_last = 0
    if threshold < 0:
        raise ValueError(
            f"The argument 'threshold' can't be lower than 0. Got {threshold}")
    remaining_charge = threshold + 1
    direction = 0
    count_ = count()
    tested_pis = {}
    while remaining_charge >= threshold:
        if next(count_) > 0:
            # Check which type of charge has a larger magnitude
            if abs(neg_charge) > abs(pos_charge):
                # Estimation is more negative
                direction = -1
                if last_direction + direction == 0:
                    # pass  # This was also negative last iteration, haven't gone far enough
                # else:  # Searched this direction far enough, turn around and decrease step
                    iterative_multiplier /= 2
            else:  # Positive larger
                direction = 1
                if last_direction + direction == 0:
                    # Searched this direction far enough, turn around and decrease step
                    iterative_multiplier /= 2
                # else:
                #     pass  # This was positive last iteration, haven't gone far enough
        last_direction = direction

        # Calculate the error
        pi_modifier = direction * delta_magnitude * iterative_multiplier
        test_pi_ = pi_modifier + test_pi
        if test_pi_ in tested_pis:
            # This has already been checked. Cut the space in half again
            iterative_multiplier /= 2
            pi_modifier = direction * delta_magnitude * iterative_multiplier

        test_pi += pi_modifier

        pka_aa_pos = 1 / (1 + 10**(test_pi-positive_charged_pka))  # positive
        pka_aa_neg = -1 / (1 + 10**(negative_charged_pka-test_pi))  # negative
        # Multiply the individual contributions by the number of observations
        pos_charge = (pka_aa_pos * positive_counts).sum()
        neg_charge = (pka_aa_neg * negative_counts).sum()
        total_charge = neg_charge + pos_charge
        remaining_charge = abs(total_charge)
        logger.debug(f'pI = {test_pi}, remainder = {remaining_charge}')  # Iteration {next(count_)}:
        tested_pis[test_pi] = remaining_charge

    return test_pi + (total_charge/2)  # Approximately the real pi

calculate_instability_index

calculate_instability_index(sequence: Sequence[str | int]) -> float

Find the total instability index for the amino acids present in a sequence. See PMID:2075190

Parameters:

  • sequence (Sequence[str | int]) –

    The sequence to measure

Returns:

  • float

    The value of the stability index where a value less than 40 indicates stability

Source code in symdesign/sequence/expression.py
209
210
211
212
213
214
215
216
217
218
219
220
def calculate_instability_index(sequence: Sequence[str | int]) -> float:
    """Calculate the instability index for the amino acids present in a sequence. See PMID:2075190

    Args:
        sequence: The sequence to measure

    Returns:
        The value of the stability index where a value less than 40 indicates stability
    """
    numeric_sequence = format_sequence_to_numeric(sequence)
    # Pair every residue with the residue that follows it and look up the value for each adjacent pair
    leading_residues = numeric_sequence[:-1]
    following_residues = numeric_sequence[1:]
    pair_values = instability_array[leading_residues, following_residues]
    # Scale the summed pair values by 10 and normalize by the length of the input sequence
    return (pair_values.sum() * 10) / len(sequence)

molecular_extinction_coefficient

molecular_extinction_coefficient(sequence: Sequence[str | int]) -> tuple[float, float]

Calculate the molecular extinction coefficient for an amino acid sequence using the formulas: E(ProtOx) = Numb(Tyr) * Ext(Tyr) + Numb(Trp) * Ext(Trp) + Numb(Cystine) * Ext(Cystine); and E(ProtRed) = Numb(Tyr) * Ext(Tyr) + Numb(Trp) * Ext(Trp)

Parameters:

  • sequence (Sequence[str | int]) –

    The sequence to measure

Returns:

  • tuple[float, float]

    The pair of molecular extinction coefficients, first with all Cysteine oxidized, then reduced

Source code in symdesign/sequence/expression.py
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
def molecular_extinction_coefficient(sequence: Sequence[str | int]) -> tuple[float, float]:
    """Calculate the molecular extinction coefficient for an amino acid sequence using the formula
    E(ProtOx) = Numb(Tyr) * Ext(Tyr) + Numb(Trp) * Ext(Trp) + Numb(Cystine) * Ext(Cystine)
    E(ProtRed) = Numb(Tyr) * Ext(Tyr) + Numb(Trp) * Ext(Trp)

    Args:
        sequence: The sequence to measure

    Returns:
        The pair of molecular extinction coefficients, first with all Cysteine oxidized, then reduced
    """
    numeric_sequence = format_sequence_to_numeric(sequence)
    tyr_count = numeric_sequence.count(tyr_index)
    trp_count = numeric_sequence.count(trp_index)
    cys_count = numeric_sequence.count(cys_index)

    # The reduced form only has the Tyr and Trp contributions
    coef_red = tyr_count*ext_coef_tyr + trp_count*ext_coef_trp
    # NOTE(review): the formula is stated per Cystine, while this multiplies by the count of Cys residues.
    #  Confirm that ext_coef_cys_ox is defined per residue
    coef_ox = coef_red + cys_count*ext_coef_cys_ox
    return coef_ox, coef_red

find_matching_expression_tags

find_matching_expression_tags(uniprot_id: str = None, entity_id: str = None, pdb_code: str = None, chain: str = None, **kwargs) -> list | list[dict[str, str]]

Find matching expression tags by PDB ID reference

Parameters:

  • uniprot_id (str, default: None ) –

    The uniprot_id to query tags from

  • entity_id (str, default: None ) –
  • pdb_code (str, default: None ) –

    The pdb to query tags from. Requires chain argument as well

  • chain (str, default: None ) –

    The chain to query tags from. Requires pdb argument as well

Other Parameters:

  • alignment_length

    int = 12 - The length to slice the sequence plus any identified tags

Returns:

  • list | list[dict[str, str]]

    [{'name': 'his_tag', 'termini': 'n', 'sequence': 'MSGHHHHHHGKLKPNDLRI'}, ...], or [] if none found

Source code in symdesign/sequence/expression.py
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
def find_matching_expression_tags(uniprot_id: str = None, entity_id: str = None,
                                  pdb_code: str = None, chain: str = None, **kwargs) -> list | list[dict[str, str]]:
    """Find expression tags on the reference sequences of entities that match the provided identifier

    When a uniprot_id is provided, or is resolved from pdb_code and chain, the entities queried by that uniprot_id
    are used even if an entity_id was also provided

    Args:
        uniprot_id: The uniprot_id to query tags from
        entity_id: The entity identifier to query tags from. Only consulted for the UniProt lookup when it is None
        pdb_code: The pdb to query tags from. Requires chain argument as well
        chain: The chain to query tags from. Requires pdb argument as well

    Keyword Args:
        alignment_length: int = 12 - The length to slice the sequence plus any identified tags

    Returns:
        [{'name': 'his_tag', 'termini': 'n', 'sequence': 'MSGHHHHHHGKLKPNDLRI'}, ...], or [] if none found
    """
    found_tags = []
    if entity_id is not None:
        entity_ids = [entity_id]
    else:
        entity_ids = []
        if pdb_code and chain:
            # Resolve the pdb_code/chain pair to a UniProt identifier, replacing any uniprot_id that was passed
            uniprot_id = pull_uniprot_id_by_pdb(uniprot_pdb_d, pdb_code, chain=chain)
            if uniprot_id is None:
                logger.error(f"The 'pdb_code'.'chain' combination '{pdb_code}.{chain}' found no valid identifiers")
                return found_tags

    if uniprot_id:
        entity_ids = query.pdb.pdb_id_matching_uniprot_id(uniprot_id=uniprot_id)
    elif not entity_ids:
        logger.error("One of 'uniprot_id' OR 'entity_id' OR 'pdb_code' and 'chain' is required")
        return found_tags

    # From PDB API. Fetch every reference sequence before searching any of them for tags
    reference_sequences = [query.pdb.get_entity_reference_sequence(entity_id=each_id) for each_id in entity_ids]
    for reference_sequence in reference_sequences:
        found_tags.extend(find_expression_tags(reference_sequence, **kwargs))

    return found_tags

report_termini_availability

report_termini_availability(matching_pdb_tags: list[dict[str, str]], n: bool = True, c: bool = True) -> str

From a list of possible tags, solve for the tag with the most observations in the PDB. If there are discrepancies, query the user for a solution

Parameters:

  • matching_pdb_tags (list[dict[str, str]]) –

    [{'name': 'his_tag', 'termini': 'n', 'sequence': 'MSGHHHHHHGKLKPNDLRI'}, ...]

  • n (bool, default: True ) –

    Whether the n-termini can be tagged

  • c (bool, default: True ) –

    Whether the c-termini can be tagged

Returns:

  • str

    The termini which should be tagged, either 'n', 'c' or 'skip' if the sequence shouldn't be used

Source code in symdesign/sequence/expression.py
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
def report_termini_availability(matching_pdb_tags: list[dict[str, str]], n: bool = True, c: bool = True) -> str:
    """Report which termini is recommended for tagging given prior tag observations and termini availability, then
    query the user for the termini they prefer

    Args:
        matching_pdb_tags: [{'name': 'his_tag', 'termini': 'n', 'sequence': 'MSGHHHHHHGKLKPNDLRI'}, ...]
        n: Whether the n-termini can be tagged
        c: Whether the c-termini can be tagged

    Returns:
        The termini which should be tagged, either 'n', 'c' or 'skip' if the sequence shouldn't be used
    """
    # Tally the tag location and type, i.e. {'n': {his_tag: 2}, 'c': {spy_catcher: 1}}
    pdb_tag_tally: dict[str, dict[str, int]] = {'n': defaultdict(int), 'c': defaultdict(int)}
    for observed_tag in matching_pdb_tags:
        observed_termini = observed_tag['termini']
        observed_name = observed_tag['name']
        pdb_tag_tally[observed_termini][observed_name] += 1

    n_term = sum(pdb_tag_tally['n'].values())
    c_term = sum(pdb_tag_tally['c'].values())

    nothing_observed = n_term == 0 and c_term == 0
    nothing_available = not c and not n
    if nothing_observed or nothing_available:
        evidence_string = 'Based on termini availability'
    else:
        evidence_string = 'Based on prior observations and available termini'

    more_n_observations = n_term > c_term
    more_c_observations = n_term < c_term
    if n and (more_n_observations or (more_c_observations and not c)):
        termini_string = 'the \033[38;5;208mn\033[0;0m termini is recommended to be tagged.'
    elif c and (more_c_observations or (more_n_observations and not n)):
        termini_string = 'the \033[38;5;208mC\033[0;0m termini is recommended to be tagged.'
    elif nothing_available:
        termini_string = '\033[38;5;208mneither\033[0;0m termini can be tagged.'
    elif c and not n:
        termini_string = 'the \033[38;5;208mc\033[0;0m termini can be tagged.'
    elif not c and n:
        termini_string = 'the \033[38;5;208mn\033[0;0m termini can be tagged.'
    else:
        termini_string = '\033[38;5;208mboth\033[0;0m termini can be tagged.'

    termini_header = 'Termini', 'Tag name', 'Count'
    formatted_tags = []
    for termini, name_counts in pdb_tag_tally.items():
        for name, counts in name_counts.items():
            formatted_tags.append((termini, name, counts))

    observation_string = ''
    if formatted_tags:
        tag_table = '\n\t'.join(utils.pretty_format_table(formatted_tags, header=termini_header))
        observation_string = f' Observed tag options are as follows:\n\t{tag_table}\n'

    print(f'{observation_string}{evidence_string}, {termini_string}')
    return utils.validate_input("Which termini would you prefer? 'skip' discards this sequence",
                                ['n', 'c', 'skip'])

select_tags_for_sequence

select_tags_for_sequence(sequence_id: str, matching_pdb_tags: list[dict[str, str]], preferred: str = None, n: bool = True, c: bool = True) -> dict[str, str | None]

From a list of possible tags, solve for the tag with the most observations in the PDB. If there are discrepancies, query the user for a solution

Parameters:

  • sequence_id (str) –

    The sequence identifier

  • matching_pdb_tags (list[dict[str, str]]) –

    [{'name': 'his_tag', 'termini': 'n', 'sequence': 'MSGHHHHHHGKLKPNDLRI'}, ...]

  • preferred (str, default: None ) –

    The name of a preferred tag provided by the user

  • n (bool, default: True ) –

    Whether the n-termini can be tagged

  • c (bool, default: True ) –

    Whether the c-termini can be tagged

Returns:

  • dict[str, str | None]

    {'name': 'his_tag', 'termini': 'n', 'sequence': 'MSGHHHHHHGKLKPNDLRI'}

Source code in symdesign/sequence/expression.py
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
def select_tags_for_sequence(sequence_id: str, matching_pdb_tags: list[dict[str, str]], preferred: str = None,
                             n: bool = True, c: bool = True) -> dict[str, str | None]:
    """From a list of possible tags, solve for the tag with the most observations in the PDB. If there are
    discrepancies, query the user for a solution

    The termini is chosen first, from the tallied observations and the available termini, querying the user when
    neither termini is available or the observations are tied. The most observed tag at that termini is then
    recommended, and the user can accept it or browse all the options

    Args:
        sequence_id: The sequence identifier
        matching_pdb_tags: [{'name': 'his_tag', 'termini': 'n', 'sequence': 'MSGHHHHHHGKLKPNDLRI'}, ...]
        preferred: The name of a preferred tag provided by the user. When provided and it isn't the most observed
            tag at the chosen termini, no tag is selected
        n: Whether the n-termini can be tagged
        c: Whether the c-termini can be tagged

    Returns:
        {'name': 'his_tag', 'termini': 'n', 'sequence': 'MSGHHHHHHGKLKPNDLRI'}, or
        {'name': None, 'termini': None, 'sequence': None} if no tag was found or the user skipped tagging
    """
    # The value returned from every path where no tag is selected
    final_tag = {'name': None, 'termini': None, 'sequence': None}
    # Tally the tag location and type, i.e. {'n': {his_tag: 2}, 'c': {spy_catcher: 1}}
    pdb_tag_tally: dict[str, dict[str, int]] = {'n': defaultdict(int), 'c': defaultdict(int)}
    for partner_tag in matching_pdb_tags:
        partner_termini = partner_tag['termini']
        partner_name = partner_tag['name']
        pdb_tag_tally[partner_termini][partner_name] += 1

    # Total observations at each termini
    n_term = sum([pdb_tag_tally['n'][tag_name] for tag_name in pdb_tag_tally.get('n', {})])
    c_term = sum([pdb_tag_tally['c'][tag_name] for tag_name in pdb_tag_tally.get('c', {})])
    if n_term == 0 and c_term == 0:  # No tags found
        return final_tag

    termini_header = 'Termini', 'Tag name', 'Count'
    # Flatten the tally into (termini, name, count) rows for display
    formatted_tags = [(termini, name, counts) for termini, name_counts in pdb_tag_tally.items()
                      for name, counts in name_counts.items()]
    # Prefer the available termini with more observations. If only one termini is available, use it even when the
    # other termini has more observations
    if n_term > c_term and n or (n_term < c_term and n and not c):
        termini = 'n'
    elif n_term < c_term and c or (n_term > c_term and c and not n):
        termini = 'c'
    elif not c and not n:
        print(f'For sequence target {sequence_id}, \033[38;5;208mneither\033[0;0m termini is available for tagging.\n'
              'You can configure tags now regardless and modify the choice later, or skip tagging.\n'
              'The tag options are as follows:\n\t%s\n' %
              '\n\t'.join(utils.pretty_format_table(formatted_tags, header=termini_header)))
        termini = utils.validate_input(f"Which termini would you prefer [n/c]? To skip, input 'skip'",
                                       ['n', 'c', 'skip'])
        if termini == 'skip':
            return final_tag
    else:  # The number of observations at each termini is equal
        if c and not n:
            termini = 'c'
        elif not c and n:
            termini = 'n'
        else:
            print(f'For sequence target {sequence_id}, \033[38;5;208mboth\033[0;0m termini are available and have the '
                  f'same number of matched tags.\nThe tag options are as follows:\n\t%s\n' %
                  '\n\t'.join(utils.pretty_format_table(formatted_tags, header=termini_header)))
            termini = utils.validate_input(f"Which termini would you prefer [n/c]? To skip, input 'skip'",
                                           ['n', 'c', 'skip'])
            if termini == 'skip':
                return final_tag

    # Find the most common tag at the specific termini
    max_tag_type = None
    max_tag_count = 0
    for tag_name, tag_count in pdb_tag_tally[termini].items():
        if tag_count > max_tag_count:
            max_tag_count = tag_count
            max_tag_type = tag_name

    # Ensure at least one tag was found
    if max_tag_type is None:
        return final_tag
    else:
        all_tags = [max_tag_type]
        # Check if there are equally represented tags
        for tag_name, tag_count in pdb_tag_tally[termini].items():
            if tag_name != max_tag_type and tag_count == max_tag_count:
                all_tags.append(tag_name)

    # Finally report results to the user and solve ambiguous tags
    # Only the first of any equally represented tags is recommended
    tag_type = all_tags[0]
    # NOTE(review): 'custom' is never read in this function
    custom = False
    if preferred:
        if preferred == tag_type:
            default = 'y'
        else:
            # The preferred tag isn't the most observed tag at this termini, so no tag is selected.
            # A disabled alternative would have used the preferred tag anyway and queried for the termini
            return final_tag
    else:
        logger.info(f'For {sequence_id}, the \033[38;5;208mrecommended\033[0;0m tag options are:\n'
                    f'\tTermini-{termini} Type-{tag_type}\n'
                    'If the Termini or Type is undesired, you can see the underlying options by specifying '
                    f"'options'. Otherwise, '{tag_type}' will be chosen")
        default = utils.validate_input('If you would like to proceed with the \033[38;5;208mrecommended\033[0;0m '
                                       "options, enter 'y', otherwise, specify 'options' to see all possibilities",
                                       ['y', 'options'])
    if default == 'y':
        final_tag_name = tag_type
        final_tag_termini = termini
    else:  # default == 'options'
        print(f'For sequence target {sequence_id}, all tag options are as follows:\n%s\n' %
              '\n\t'.join(utils.pretty_format_table(formatted_tags, header=termini_header)))
        print(f'All tags:\n{matching_pdb_tags}\n')
        termini = utils.validate_input("Which termini would you prefer [n/c]? To skip, input 'skip'",
                                       ['n', 'c', 'skip'])
        if termini == 'skip':
            return final_tag
        final_tag_termini = termini

        # Number the tag names observed at the chosen termini starting from 1
        print('\n\t%s' % '\n\t'.join([f'{i} - {tag}' for i, tag in enumerate(pdb_tag_tally[termini], 1)]))
        number_termini_tags = len(pdb_tag_tally[termini])
        # NOTE(review): if the chosen termini has no observed tags, the list of valid inputs below is empty.
        #  Confirm how utils.validate_input behaves when given no options
        tag_input = utils.validate_input('What tag would you like to use? Enter the number of the above options',
                                         list(map(str, range(1, 1 + number_termini_tags))))
        tag_input = int(tag_input)
        if tag_input <= number_termini_tags:
            final_tag_name = list(pdb_tag_tally[termini].keys())[tag_input - 1]
        else:
            # Todo this isn't currently available
            # NOTE(review): presumably utils.validate_input only returns one of the provided options, in which
            #  case tag_input never exceeds number_termini_tags and this branch can't be reached - confirm
            print('All available tags are:\n\t%s' %
                  '\n\t'.join([f'{i} - {tag}' for i, tag in enumerate(expression_tags, 1)]))
            number_expression_tags = len(expression_tags)
            tag_input = utils.validate_input('What tag would you like to use? Enter the number of the above options',
                                             list(map(str, range(1, 1 + number_expression_tags))))
            tag_input = int(tag_input)
            final_tag_name = list(expression_tags.keys())[tag_input - 1]

            # This value is replaced or discarded by the matching tag sequence search below
            final_tag_sequence = expression_tags[final_tag_name]

    # Solve for the desired tag by collecting the observed sequences with the chosen name and termini
    all_matching_tag_sequences = [tag['sequence'] for tag in matching_pdb_tags
                                  if final_tag_name == tag['name'] and final_tag_termini == tag['termini']]
    # Todo align multiple and choose the consensus

    if all_matching_tag_sequences:  # For now, grab the first available tag
        logger.debug(f'Grabbing the first matching tag out of {len(all_matching_tag_sequences)} possible')
        final_tag_sequence = all_matching_tag_sequences[0]
    else:
        logger.critical(f"{select_tags_for_sequence.__name__}: This logic shouldn't have been possible")
        return final_tag

    return dict(name=final_tag_name, termini=final_tag_termini, sequence=final_tag_sequence)

add_expression_tag

add_expression_tag(tag: str, sequence: str) -> str

Add an expression tag to a sequence by aligning a tag specified with PDB reference sequence to the sequence

Parameters:

  • tag (str) –

    The tag with additional PDB reference sequence appended

  • sequence (str) –

    The sequence of interest

Returns:

  • str

    The final sequence with the tag added

Source code in symdesign/sequence/expression.py
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
def add_expression_tag(tag: str, sequence: str) -> str:
    """Merge an expression tag onto a sequence using the alignment of the tag, including its PDB reference
    sequence context, to the sequence

    Args:
        tag: The tag with additional PDB reference sequence appended
        sequence: The sequence of interest

    Returns:
        The final sequence with the tag added. The sequence is returned unchanged if no tag is provided
    """
    if not tag:
        return sequence

    aligned_tag, aligned_sequence = generate_alignment(tag, sequence)
    # At each aligned position keep the residue from the sequence, unless the sequence has a gap there, in which
    # case the character from the aligned tag fills in
    return ''.join(tag_aa if sequence_aa == '-' else sequence_aa
                   for tag_aa, sequence_aa in zip(aligned_tag, aligned_sequence))

find_expression_tags

find_expression_tags(sequence: str, alignment_length: int = 12) -> list | list[dict[str, str]]

Find all expression_tags on an input sequence from a reference set of expression_tags. Returns the matching tag sequence with additional protein sequence context equal to the passed alignment_length

Parameters:

  • sequence (str) –

    The sequence of interest i.e. 'MSGHHHHHHGKLKPNDLRI...'

  • alignment_length (int, default: 12 ) –

    The length to slice the sequence plus any identified tags

Returns:

  • list | list[dict[str, str]]

    A list of the available tags with a featured dictionary for each tag. Formatted as - {'name': str, 'termini': 'n'/'c', 'sequence': 'MSGHHHHHHGKLKPNDLRI'}. Returns [] if no tags are found

Source code in symdesign/sequence/expression.py
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
def find_expression_tags(sequence: str, alignment_length: int = 12) -> list | list[dict[str, str]]:
    """Find all expression_tags on an input sequence from a reference set of expression_tags. Returns the matching tag
    sequence with additional protein sequence context equal to the passed alignment_length

    Only the first occurrence of each reference tag is reported. A tag that starts in the first half of the
    sequence is assigned to the n-terminus and is returned with the residues that follow it. Otherwise, the tag is
    assigned to the c-terminus and is returned with the residues that precede it

    Args:
        sequence: The sequence of interest i.e. 'MSGHHHHHHGKLKPNDLRI...'
        alignment_length: The number of residues of sequence context to include in addition to the tag

    Returns:
        A list with a dictionary for each tag found. Formatted as -
            {'name': str, 'termini': 'n'/'c', 'sequence': 'HHHHHHGKLKPNDLRI'}. Returns [] if no tags are found
    """
    half_sequence_length = len(sequence) / 2
    matching_tags = []
    for name, tag_sequence in expression_tags.items():
        tag_index = sequence.find(tag_sequence)
        if tag_index == -1:  # No match was found
            continue

        # Save the tag name, the termini of the sequence it is closest to, and the source sequence context
        final_tag_index = tag_index + len(tag_sequence)
        if tag_index < half_sequence_length:
            termini = 'n'
            # The tag, followed by the context. Slicing past the end of the sequence is truncated safely
            matching_sequence = sequence[tag_index:final_tag_index + alignment_length]
        else:
            termini = 'c'
            # The context, followed by the tag. Clamp the start at 0, otherwise a negative start would be
            # interpreted as an offset from the end of the sequence and drop the requested context
            context_start = max(0, tag_index - alignment_length)
            matching_sequence = sequence[context_start:final_tag_index]
        matching_tags.append({'name': name, 'termini': termini, 'sequence': matching_sequence})

    return matching_tags

remove_internal_tags

remove_internal_tags(sequence: str, tag_names: list[str] = None) -> str

Remove matching tag sequences only, from the specified sequence

Defaults to known tags in constants.expression_tags

Parameters:

  • sequence (str) –

    The sequence of interest i.e. 'MSGHHHHHHGKLKPNDLRI...'

  • tag_names (list[str], default: None ) –

    If only certain tags should be removed, a list with the names of known tags

Returns: The modified sequence without the tag, i.e. 'MSGGKLKPNDLRI...'

Source code in symdesign/sequence/expression.py
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
def remove_internal_tags(sequence: str, tag_names: list[str] = None) -> str:
    """Excise known expression tag sequences from the specified sequence, leaving all other residues in place

    Defaults to known tags in constants.expression_tags. Only the first occurrence of each tag is excised

    Args:
        sequence: The sequence of interest i.e. 'MSGHHHHHHGKLKPNDLRI...'
        tag_names: If only certain tags should be removed, a list with the names of known tags
    Returns:
        The modified sequence without the tag, i.e. 'MSGGKLKPNDLRI...'
    """
    # Narrow the search to the requested tags. A name missing from expression_tags raises a KeyError
    searchable_tags = \
        {tag_name: expression_tags[tag_name] for tag_name in tag_names} if tag_names else expression_tags

    for tag_sequence in searchable_tags.values():
        # Limit the replacement to a single occurrence. Sequences without the tag are passed through unchanged
        sequence = sequence.replace(tag_sequence, '', 1)

    return sequence

remove_terminal_tags

remove_terminal_tags(sequence: str, tag_names: list[str] = None, termini: termini_literal = None) -> str

Remove matching tag sequences and any remaining termini from the specified sequence

Defaults to known tags in constants.expression_tags

Parameters:

  • sequence (str) –

    The sequence of interest i.e. 'MSGHHHHHHGKLKPNDLRI...'

  • tag_names (list[str], default: None ) –

    If only certain tags should be removed, a list with the names of known tags

  • termini (termini_literal, default: None ) –

    Pass 'n' or 'c' if particular termini should be cleaned of tags

Returns: The modified sequence without the tagged termini, i.e. 'GKLKPNDLRI...'

Source code in symdesign/sequence/expression.py
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
def remove_terminal_tags(sequence: str, tag_names: list[str] = None, termini: termini_literal = None) -> str:
    """Trim known expression tag sequences, and every residue between the tag and the terminus, from the sequence

    Defaults to known tags in constants.expression_tags. Only the first occurrence of each tag is searched for

    Args:
        sequence: The sequence of interest i.e. 'MSGHHHHHHGKLKPNDLRI...'
        tag_names: If only certain tags should be removed, a list with the names of known tags
        termini: Pass 'n' or 'c' if particular termini should be cleaned of tags. When not provided, each tag is
            trimmed from the n-termini if it starts in the first half of the input sequence, otherwise from the
            c-termini
    Raises:
        KeyError: If a name in tag_names isn't a known tag
        ValueError: If termini is provided, but isn't 'n' or 'c'
    Returns:
        The modified sequence without the tagged termini, i.e. 'GKLKPNDLRI...'
    """
    # Narrow the search to the requested tags. A name missing from expression_tags raises a KeyError
    searchable_tags = \
        {tag_name: expression_tags[tag_name] for tag_name in tag_names} if tag_names else expression_tags

    # A tag found at an index below n_term_limit is trimmed from the n-termini, all others from the c-termini.
    # The limit is measured on the input sequence once, and isn't updated as the sequence is trimmed
    input_length = len(sequence)
    if termini is None:  # Decide by which half of the input sequence the tag starts in
        n_term_limit = input_length / 2
    elif termini == 'n':  # Every tag index is below the input length
        n_term_limit = input_length
    elif termini == 'c':  # No tag index is below zero
        n_term_limit = 0
    else:
        raise ValueError("Must pass either 'n' or 'c' for the argument 'termini'")

    for tag_sequence in searchable_tags.values():
        tag_start = sequence.find(tag_sequence)
        if tag_start < 0:  # No match was found
            continue

        if tag_start < n_term_limit:
            # Discard the tag and everything before it
            tag_end = tag_start + len(tag_sequence)
            sequence = sequence[tag_end:]
        else:
            # Discard the tag and everything after it
            sequence = sequence[:tag_start]

    return sequence