Skip to content

pdb

GB module-attribute

GB = 'GenBank'

The module level identifier for a GenBankID

NOR module-attribute

NOR = 'Norine'

The module level identifier for a NorineID

UKB module-attribute

UKB = 'UniProt'

The module level identifier for a UniProtID

qsbio_confirmed module-attribute

qsbio_confirmed: Annotated[dict[str, list[int]], "PDB EntryID (lowercase) mapped to biological assembly numbers for ID's with QSBio confidence as high or very high"] = unpickle(qs_bio)

PDB EntryID (lowercase) mapped to biological assembly numbers for IDs with QSBio confidence as high or very high

retrieve_entity_id_by_sequence

retrieve_entity_id_by_sequence(sequence: str) -> str | None

From a given sequence, retrieve the top matching Entity ID from the PDB API

Parameters:

  • sequence (str) –

    The sequence used to query for the EntityID

Returns:

  • str | None

    '1ABC_1'

Source code in symdesign/resources/query/pdb.py
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
def retrieve_entity_id_by_sequence(sequence: str) -> str | None:
    """From a given sequence, retrieve the top matching Entity ID from the PDB API

    Args:
        sequence: The sequence used to query for the EntityID

    Returns:
        The top-ranked EntityID formatted like '1ABC_1', or None when no match was found
    """
    # Restrict the search to the top-ranked hits only; the best match is first
    entity_ids = find_matching_entities_by_sequence(sequence, all_matching=False)
    if not entity_ids:
        return None
    logger.debug(f'Sequence search found the matching EntityIDs: {", ".join(entity_ids)}')
    return entity_ids[0]

find_matching_entities_by_sequence

find_matching_entities_by_sequence(sequence: str = None, return_id: return_types_literal = 'polymer_entity', **kwargs) -> list[str] | None

Search the PDB for matching IDs given a sequence and a return_type. Pass all_matching=False to retrieve the top 10 IDs, otherwise return all IDs

Parameters:

  • sequence (str, default: None ) –

    The sequence used to query for EntityID's

  • return_id (return_types_literal, default: 'polymer_entity' ) –

    The type of value to return

Returns:

  • list[str] | None

    The EntityID's matching the sequence

Source code in symdesign/resources/query/pdb.py
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
def find_matching_entities_by_sequence(sequence: str | None = None, return_id: return_types_literal = 'polymer_entity',
                                       **kwargs) -> list[str] | None:
    """Search the PDB for matching IDs given a sequence and a return_type. Pass all_matching=False to retrieve the top
    10 IDs, otherwise return all IDs

    Args:
        sequence: The sequence used to query for EntityID's
        return_id: The type of value to return
        **kwargs: Additional keyword arguments passed through to generate_query() (e.g. all_matching)

    Returns:
        The EntityID's matching the sequence, or None when the PDB API found no match

    Raises:
        KeyError: If return_id isn't one of the supported return types
    """
    if return_id not in return_type_args:
        raise KeyError(
            f"The specified return_id '{return_id}' isn't supported. Allowed values: {', '.join(return_type_args)}")
    logger.debug(f'Using the default sequence similarity parameters: '
                 f'{", ".join(f"{k}: {v}" for k, v in default_sequence_values.items())}')
    sequence_query = format_terminal_group(service='sequence', sequence=sequence)
    sequence_query_results = query_pdb(
        generate_query(sequence_query, return_id=return_id, cluster_uniprot=True, **kwargs))
    if sequence_query_results:
        return parse_pdb_response_for_ids(sequence_query_results)
    else:
        logger.warning(f"Sequence wasn't found by the PDB API:\n{sequence}")
        return None  # [None]

parse_pdb_response_for_ids

parse_pdb_response_for_ids(response: dict[str, dict[str, str]], groups: bool = False) -> list[str]

Parse JSON PDB API returns for identifiers

Parameters:

  • response (dict[str, dict[str, str]]) –
  • groups (bool, default: False ) –

    Whether the identifiers are clustered by group

Returns:

  • list[str]

    The list of identifiers from the response

Source code in symdesign/resources/query/pdb.py
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
def parse_pdb_response_for_ids(response: dict[str, dict[str, str]], groups: bool = False) -> list[str]:
    """Parse JSON PDB API returns for identifiers

    Args:
        response: The JSON-derived dictionary returned by the PDB search API
        groups: Whether the identifiers are clustered by group

    Returns:
        The list of identifiers from the response
    """
    # Grouped results live under 'group_set'; flat results under 'result_set'.
    # A missing key yields an empty list rather than raising
    result_key = 'group_set' if groups else 'result_set'
    return [entry['identifier'] for entry in response.get(result_key, [])]

query_pdb

query_pdb(query_: dict[Any] | str, json_formatted: bool = False) -> dict[str, Any] | None

Take a JSON formatted PDB API query and return the results

PDB response can look like: {'query_id': 'ecc736b3-f19c-4a54-a5d6-3db58ce6520b', 'result_type': 'entry', 'total_count': 104, 'result_set': [{'identifier': '4A73', 'score': 1.0, 'services': [{'service_type': 'text', 'nodes': [{'node_id': 11198, 'original_score': 222.23667907714844, 'norm_score': 1.0}]}]}, {'identifier': '5UCQ', 'score': 1.0, 'services': [{'service_type': 'text', 'nodes': [{'node_id': 11198, 'original_score': 222.23667907714844, 'norm_score': 1.0}]}]}, {'identifier': '6P3L', 'score': 1.0, 'services': [{'service_type': 'text', 'nodes': [{'node_id': 11198, 'original_score': 222.23667907714844, 'norm_score': 1.0}]}]}, ... ] }

Parameters:

  • query_ (dict[Any] | str) –

    The query formatted as a dictionary or a JSON string

  • json_formatted (bool, default: False ) –

    Whether the query is already formatted as a JSON string

Returns:

  • dict[str, Any] | None

    The response formatted as a dictionary from the JSON format or None if the query failed

Source code in symdesign/resources/query/pdb.py
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
def query_pdb(query_: dict[str, Any] | str, json_formatted: bool = False) -> dict[str, Any] | None:
    """Take a JSON formatted PDB API query and return the results

    PDB response can look like:
    {'query_id': 'ecc736b3-f19c-4a54-a5d6-3db58ce6520b',
     'result_type': 'entry',
    'total_count': 104,
    'result_set': [{'identifier': '4A73', 'score': 1.0,
                    'services': [{'service_type': 'text', 'nodes': [{'node_id': 11198,
                                                                     'original_score': 222.23667907714844,
                                                                     'norm_score': 1.0}]}]},
                   {'identifier': '5UCQ', 'score': 1.0,
                    'services': [{'service_type': 'text', 'nodes': [{'node_id': 11198,
                                                                     'original_score': 222.23667907714844,
                                                                     'norm_score': 1.0}]}]},
                   {'identifier': '6P3L', 'score': 1.0,
                    'services': [{'service_type': 'text', 'nodes': [{'node_id': 11198,
                                                                     'original_score': 222.23667907714844,
                                                                     'norm_score': 1.0}]}]},
                    ...
                  ]
    }

    Args:
        query_: The query formatted as a dictionary or a JSON string
        json_formatted: Whether the query is already formatted as a JSON string

    Returns:
        The response formatted as a dictionary from the JSON format or None if the query failed
    """
    if json_formatted:
        formatted_query_ = query_
    else:
        formatted_query_ = dumps(query_)

    query_response = None
    iteration = 0
    while True:
        try:
            query_response = requests.get(pdb_query_url, params={'json': formatted_query_})
            # logger.debug(f'Found the PDB query with url: {query_response.url}')
            if query_response.status_code == 200:
                return query_response.json()
            elif query_response.status_code == 204:
                # 204 means the query itself succeeded but matched nothing; no point retrying
                logger.warning('No response was returned. Your query likely found no matches!')
                break
            elif query_response.status_code == 429:
                logger.debug('Too many requests, pausing momentarily')
                time.sleep(2)
                # Fix: count rate-limit retries toward the attempt cap. Previously this branch
                # never incremented, so a persistent 429 would loop forever
                iteration += 1
            else:
                logger.debug(f'Your query returned an unrecognized status code ({query_response.status_code})')
                time.sleep(1)
                iteration += 1
        except requests.exceptions.ConnectionError:
            logger.debug('Requests ran into a connection error')
            time.sleep(1)
            iteration += 1

        if iteration > 5:
            logger.error('The maximum number of resource fetch attempts was made with no resolution. '
                         f'Offending request {getattr(query_response, "url", pdb_query_url)}')  # Todo format url
            break
            # raise DesignError('The maximum number of resource fetch attempts was made with no resolution. '
            #                   'Offending request %s' % getattr(query_response, 'url', pdb_query_url))
    return None

pdb_id_matching_uniprot_id

pdb_id_matching_uniprot_id(uniprot_id, return_id: return_types_literal = 'polymer_entity') -> list[str]

Find all matching PDB entries from a specified UniProt ID and specific return ID

Parameters:

  • uniprot_id

    The UniProt ID of interest

  • return_id (return_types_literal, default: 'polymer_entity' ) –

    The type of value to return

Returns:

  • list[str]

    The list of matching IDs

Source code in symdesign/resources/query/pdb.py
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
def pdb_id_matching_uniprot_id(uniprot_id, return_id: return_types_literal = 'polymer_entity') -> list[str]:
    """Find all matching PDB entries from a specified UniProt ID and specific return ID

    Args:
        uniprot_id: The UniProt ID of interest
        return_id: The type of value to return

    Returns:
        The list of matching IDs
    """
    if return_id not in return_type_args:
        raise KeyError(
            f"The specified return_id '{return_id}' isn't supported. Allowed values: {', '.join(return_type_args)}")
    # Two terminal filters ANDed together: the cross-reference database must be UniProt
    # and the accession must match the requested ID
    database_filter = {
        'attribute': 'rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_name',
        'negation': False, 'operator': 'exact_match', 'value': 'UniProt'}
    accession_filter = {
        'attribute': 'rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession',
        'negation': False, 'operator': 'in', 'value': [uniprot_id]}

    terminal_groups = [format_terminal_group(service='text', **database_filter),
                       format_terminal_group(service='text', **accession_filter)]
    search_query = generate_query(generate_group('and', terminal_groups), return_id=return_id)
    response = query_pdb(search_query)
    return parse_pdb_response_for_ids(response) if response else []

generate_query

generate_query(search: dict, return_id: return_types_literal = 'entry', cluster_uniprot: bool = False, cluster_sequence: bool = False, return_groups: bool = False, all_matching: bool = True) -> dict[str, dict | str]

Format a PDB query with the specific return type and parameters affecting search results

Parameters:

  • search (dict) –

    Contains the key, value pairs in accordance with groups and terminal groups

  • return_id (return_types_literal, default: 'entry' ) –

    The type of ID that should be returned

  • cluster_uniprot (bool, default: False ) –

    Whether the query generated is a sequence type query

  • cluster_sequence (bool, default: False ) –

    Whether the query generated is clustered by sequence similarity

  • return_groups (bool, default: False ) –

    Whether to return results as group IDs

  • all_matching (bool, default: True ) –

    Whether to get all matching IDs

Returns:

  • dict[str, dict | str]

    The formatted query to be sent via HTTP GET

Source code in symdesign/resources/query/pdb.py
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
def generate_query(search: dict, return_id: return_types_literal = 'entry', cluster_uniprot: bool = False,
                   cluster_sequence: bool = False, return_groups: bool = False, all_matching: bool = True) \
        -> dict[str, dict | str]:
    """Format a PDB query with the specific return type and parameters affecting search results

    Args:
        search: Contains the key, value pairs in accordance with groups and terminal groups
        return_id: The type of ID that should be returned
        cluster_uniprot: Whether the query generated is a sequence type query
        cluster_sequence: Whether the query generated is clustered by sequence similarity
        return_groups: Whether to return results as group IDs
        all_matching: Whether to get all matching IDs

    Returns:
        The formatted query to be sent via HTTP GET
    """
    if return_id not in return_type_args:
        raise KeyError(
            f"The specified return type '{return_id}' isn't supported. Viable types include "
            f"{', '.join(return_type_args)}")

    # Base options: experimental structures only, best scores first
    request_options: dict = {
        'results_content_type': ['experimental'],  # "computational" for Alphafold
        'sort': [{'sort_by': 'score', 'direction': 'desc'}],
        'scoring_strategy': 'combined',
    }
    if cluster_uniprot or cluster_sequence:
        # cluster_uniprot takes precedence when both flags are set
        cluster_options = (sequence_request_options if cluster_uniprot
                           else sequence_cluster_request_options).copy()
        if return_groups:
            cluster_options['group_by_return_type'] = 'groups'
        request_options.update(cluster_options)
    elif return_groups:
        # Grouping is only meaningful alongside one of the clustering modes
        logger.warning(
            "The argument 'return_groups' wasn't used as neither 'cluster_uniprot' or 'cluster_sequence' were provided")

    if all_matching:
        request_options['return_all_hits'] = True

    return {'query': search, 'return_type': return_id, 'request_options': request_options}

retrieve_pdb_entries_by_advanced_query

retrieve_pdb_entries_by_advanced_query(save: bool = True, return_results: bool = True, force_schema_update: bool = False, entity: bool = False, assembly: bool = False, chain: bool = False, entry: bool = False, **kwargs) -> str | list | None

Parameters:

  • save (bool, default: True ) –
  • return_results (bool, default: True ) –
  • force_schema_update (bool, default: False ) –
  • entity (bool, default: False ) –
  • assembly (bool, default: False ) –
  • chain (bool, default: False ) –
  • entry (bool, default: False ) –

Returns:

Source code in symdesign/resources/query/pdb.py
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
def retrieve_pdb_entries_by_advanced_query(save: bool = True, return_results: bool = True,
                                           force_schema_update: bool = False, entity: bool = False,
                                           assembly: bool = False, chain: bool = False, entry: bool = False, **kwargs) \
        -> str | list | None:
    """

    Args:
        save:
        return_results:
        force_schema_update:
        entity:
        assembly:
        chain:
        entry:

    Returns:

    """
    # {attribute: {'dtype': 'string', 'description': 'XYZ', 'operators': {'equals',}, 'choices': []}, ...}

    def search_schema(term):
        return [(key, schema[key]['description']) for key in schema if schema[key]['description'] and
                term.lower() in schema[key]['description'].lower()]

    def make_groups(*args, recursive_depth=0):
        # Todo remove ^ * expression?
        # on initialization have [{}, {}, ...]
        #  was [(), (), ...]
        # on recursion get (terminal_queries, grouping,
        terminal_queries = args[0]
        work_on_group = args[recursive_depth]
        all_grouping_indices = {i for i in range(1, 1 + len(work_on_group))}

        group_introduction = f'\n{header_string % "Grouping Instructions"}\n' \
                             f'Because you have {len(terminal_queries)} search queries, you need to combine these to ' \
                             'a total search strategy. This is accomplished by grouping your search queries together ' \
                             f'using the operations {group_operators}. You must eventually group all queries into a ' \
                             'single logical operation.\nIf you have multiple groups, you will need to group those ' \
                             'groups, so on and so forth.\nIndicate your group selections with a space separated list!'\
                             ' You will choose the group operation to combine this list afterwards.\nFollow prior ' \
                             "prompts if you need a reminder of how group#'s relate to query#'s"
        group_grouping_intro = '\nGroups remain, you must group groups as before.'
        group_inquiry_string = '\nWhich of these (identified by #) would you like to combine into a group?%s' % \
                               input_string
        group_specification_string = 'You specified "%s" as a single group.'
        group_logic_string = f'\nWhat group operator {group_operators} would you like for this group?{input_string}'

        available_query_string = '\nYour available queries are:\n%s\n' % \
                                 '\n'.join(query_display_string % (query_num, service.upper(), attribute,
                                                                   'NOT ' if negate else '', operator.upper(), value)
                                           for query_num, (service, attribute, operator, negate, value)
                                           in enumerate(list(terminal_queries.values()), 1))

        if recursive_depth == 0:
            intro_string = group_introduction
            available_entity_string = available_query_string
        else:
            intro_string = group_grouping_intro
            available_entity_string = '\nYour available groups are:\n%s\n' % \
                                      '\n'.join(f'\tGroup Group #{i}{format_string.format(*group)}'
                                                for i, group in enumerate(list(work_on_group.values()), 1))

        print(intro_string)  # provide an introduction
        print(available_entity_string)  # display available entities which switch between guery and group...

        selected_grouping_indices = deepcopy(all_grouping_indices)
        groupings = []
        while len(selected_grouping_indices) > 1:  # check if more work needs to be done
            while True:  # ensure grouping input is viable
                while True:
                    grouping = set(map(int, input(group_inquiry_string).split()))  # get new grouping
                    # error on isdigit() ^
                    if len(grouping) > 1:
                        break
                    else:
                        print('More than one group is required. Your group "%s" is invalid' % grouping)
                while True:
                    confirm = input('%s\n%s' % (group_specification_string % grouping, confirmation_string))
                    if confirm.lower() in bool_d:
                        break
                    else:
                        print('%s %s is not a valid choice!' % (invalid_string, confirm))

                if bool_d[confirmation.lower()] or confirmation.isspace():  # confirm that grouping is as specified
                    while True:  # check if logic input is viable
                        group_logic = input(group_logic_string).lower()
                        if group_logic in group_operators:
                            break
                        else:
                            print(invalid_string)
                    groupings.append((grouping, group_logic))
                    break

            # remove specified from the pool of available until all are gone
            selected_grouping_indices = selected_grouping_indices.difference(grouping)

        if len(selected_grouping_indices) > 0:
            groupings.append((selected_grouping_indices, 'and'))  # When only 1 remains, automatically add 'and'
            # Todo test logic of and with one group?

        args.extend((groupings,))  # now [{} {}, ..., ([(grouping, group_logic), (), ...])
        # once all groupings are grouped, recurse
        if len(groupings) > 1:
            # todo without the return call, the stack never comes back to update args?
            make_groups(*args, recursive_depth=recursive_depth + 1)

        return list(args)  # list() may be unnecessary

    # Start the user input routine -------------------------------------------------------------------------------------
    schema = get_rcsb_metadata_schema(force_update=force_schema_update)
    print(f'\n{header_string % "PDB API Advanced Query"}\n'
          f'This prompt will walk you through generating an advanced search query and retrieving the matching '
          "set of entry ID's from the PDB. This automatically parses the ID's of interest for downstream use, which "
          'can save you some headache. If you want to take advantage of the PDB webpage GUI to perform the advanced '
          f'search, visit:\n\t{pdb_advanced_search_url}\nThen enter "json" in the prompt below and follow those '
          'instructions.\n\n'
          'Otherwise, this command line prompt takes advantage of the same GUI functionality. If you have a '
          'search specified from a prior query that you want to process again, using "json" will be useful as well. '
          'To proceed with the command line search just hit "Enter"')
    program_start = input(input_string)
    if program_start.lower() == 'json':
        if entity:
            return_type = 'Polymer Entities'  # 'polymer_entity'
        elif assembly:
            return_type = 'Assemblies'  # 'assembly'
        elif chain:
            return_type = 'Polymer Entities'  # This isn't available on web GUI -> 'polymer_instance'
        elif entry:
            return_type = 'Structures'  # 'entry'
        else:
            return_type = 'Structures'  # 'entry'

        return_type_prompt = f'At the bottom left of the dialog, there is a drop down menu next to "Return". ' \
                             f'Choose {return_type}'
        print('DETAILS: To save time formatting and immediately move to your design pipeline, build your Query with the'
              ' PDB webpage GUI, then save the resulting JSON text to a file. To do this, first build your full query '
              f'on the advanced search page, {return_type_prompt} then click the Search button (magnifying glass icon).'
              ' After the page loads, a new section of the search page should appear above the Advanced Search Query '
              'Builder dialog. There, click the JSON|->| button to open a new page with an automatically built JSON '
              'representation of your query. Save the entirety of this JSON formatted query to a file to return your '
              "chosen ID's\n")
        # ('Paste your JSON object below. IMPORTANT select from the opening \'{\' to '
        #  '\'"return_type": "entry"\' and paste. Before hitting enter, add a closing \'}\'. This hack '
        #  'ensures ALL results are retrieved with no sorting or pagination applied\n\n%s' %
        #  input_string)
        prior_query = input(f'Please specify the path where the JSON query file is located{input_string}')
        while not os.path.exists(prior_query):
            prior_query = input(f"The specified path '{prior_query}' doesn't exist! Please try again{input_string}")

        with open(prior_query, 'r') as f:
            json_input = load(f)

        # remove any paginate instructions from the json_input
        json_input['request_options'].pop('paginate', None)
        # if all_matching:
        # Ensure we get all matching
        json_input['request_options'].update({'return_all_hits': True})
        response_d = query_pdb(json_input)
    # elif program_start.lower() == 'previous':
    #     while True:
    #         prior_query = input('Please specify the path where the search file is located%s' % input_string)
    #         if os.path.exists(prior_query):
    #             with open(prior_query, 'r') as f:
    #                 search_query = loads(f.readlines())
    #         else:
    #             print('The specified path \'%s\' doesn\'t exist! Please try again.' % prior_query)
    else:
        if entity:
            return_type = 'polymer_entity'
        elif assembly:
            return_type = 'assembly'
        elif chain:
            return_type = 'polymer_instance'
        elif entry:
            return_type = 'entry'
        else:
            return_identifier_string = '\nFor each set of options, choose the option from the first column for the ' \
                                       'description in the second.\nWhat type of identifier do you want to search the '\
                                       f'PDB for?%s{input_string}' % user_input_format % \
                                       '\n'.join(format_string.format(*item) for item in return_types.items())
            return_type = validate_input(return_identifier_string, return_type_args)

        terminal_group_queries = []
        # terminal_group_queries = {}
        increment = 1
        while True:
            # Todo only text search is available now
            # query_builder_service_string = '\nWhat type of search method would you like to use?%s%s' % \
            #                                (user_input_format % '\n'.join(format_string % item
            #                                                               for item in services.items()), input_string)
            query_builder_attribute_string = \
                '\nWhat type of attribute would you like to use? Examples include:\n\t%s\n\n' \
                f'For a more thorough list indicate "s" for search.\nAlternatively, you can browse {attribute_url}\n' \
                f'Ensure that your spelling is exact if you want your query to succeed!{input_string}' % \
                '\n\t'.join(utils.pretty_format_table(attributes.items(), header=('Option', 'Description')))
            query_builder_operator_string = '\nWhat operator would you like to use?\nPossible operators include:' \
                                            '\n\t%s\nIf you would like to negate the operator, on input type "not" ' \
                                            f'after your selection. Ex: equals not{input_string}'
            query_builder_value_string = '\nWhat value should be %s? Required type is: %s.%s%s'
            query_display_string = 'Query #%d: Search the PDB by "%s" for "%s" attributes "%s%s" "%s".'

            while True:  # start the query builder routine
                while True:
                    # service = input(query_builder_service_string)
                    service = 'text'  # Todo
                    if service in services:
                        break
                    else:
                        print(invalid_string)

                # {attribute: {'dtype': 'string', 'description': 'XYZ', 'operators': {'equals',}, 'choices': []}, ...}
                while True:
                    attribute = input(query_builder_attribute_string)
                    while attribute.lower() == 's':  # If the user would like to search all possible
                        search_term = input('What term would you like to search?%s' % input_string)
                        attribute = input(f'Found the following instances of "{search_term.upper()}":\n%s\nWhich option'
                                          f' are you interested in? Enter "s" to repeat search.{input_string}' %
                                          user_input_format %
                                          '\n'.join(format_string.format(*key_description_pair) for key_description_pair
                                                    in search_schema(search_term)))
                        if attribute != 's':
                            break
                    if attribute in schema:  # Confirm the user wants to go forward with this
                        break
                    else:
                        print(f'***ERROR: {attribute} was not found in PDB schema***')
                        # while True:  # confirm that the confirmation input is valid
                        #     confirmation = input('ERROR: %s was not found in PDB schema! If you proceed, your search is'
                        #                          ' almost certain to fail.\nProceed anyway? [y/n]%s' %
                        #                          (attribute, input_string))
                        #     if confirmation.lower() in bool_d:
                        #         break
                        #     else:
                        #         print('%s %s is not a valid choice!' % invalid_string, confirmation)
                        # if bool_d[confirmation.lower()] or confirmation.isspace():  # break the attribute routine on y or ''
                        #     break

                while True:  # Retrieve the operator for the search
                    while True:  # Check if the operator should be negated
                        operator = input(query_builder_operator_string % ', '.join(schema[attribute]['operators']))
                        if len(operator.split()) > 1:
                            negation = operator.split()[1]
                            operator = operator.split()[0]
                            if negation.lower() == 'not':  # Can negate any search
                                negate = True
                                break
                            else:
                                print(f"{invalid_string} {negation} is not a recognized negation!\n "
                                      f"Try '{operator} not' instead or remove extra input")
                        else:
                            negate = False
                            break
                    if operator in schema[attribute]['operators']:
                        break
                    else:
                        print(f"{invalid_string} {operator} isn't a valid operator")

                op_in = True
                while op_in:  # Check if operator is 'in'
                    if operator == 'in':
                        print("\nThe 'in' operator can take multiple values. If you want multiple values, specify "
                              'each as a separate input')
                    else:
                        op_in = False

                    while True:  # Retrieve the value for the search
                        value = input(query_builder_value_string % (operator.upper(), instance_d[schema[attribute]['dtype']]
                                                                    , ('\nPossible choices:\n\t%s' %
                                                                       ', '.join(schema[attribute]['choices'])
                                                                       if schema[attribute]['choices'] else ''),
                                                                    input_string))
                        if isinstance(value, instance_d[schema[attribute]['dtype']]):  # check if the right data type
                            break
                        else:
                            try:  # try to convert the input value to the specified type
                                value = instance_d[schema[attribute]['dtype']](value)
                                if schema[attribute]['choices']:  # if there is a choice
                                    if value in schema[attribute]['choices']:  # check if the value is in the possible choices
                                        break
                                    else:  # if not, confirm the users desire to do this
                                        while True:  # confirm that the confirmation input is valid
                                            confirmation = input('%s was not found in the possible choices: %s\nProceed'
                                                                 ' anyway? [y/n]%s' %
                                                                 (value, ', '.join(schema[attribute]['choices']),
                                                                  input_string))
                                            if confirmation.lower() in bool_d:
                                                break
                                            else:
                                                print(f"{invalid_string} {confirmation} isn't a valid choice")
                                        if bool_d[confirmation.lower()] or confirmation.isspace():  # break the value routine on y or ''
                                            break

                                else:
                                    break
                            except ValueError:  # catch any conversion issue like float('A')
                                print(f"{invalid_string} {value} isn't a valid {instance_d[schema[attribute]['dtype']]}"
                                      " value!")

                    while op_in:
                        # TODO ensure that the in parameters are spit out as a list
                        additional = input(additional_input_string % " value to your 'in' operator")
                        if additional.lower() in bool_d:
                            if bool_d[additional.lower()] or additional.isspace():
                                break  # Stop the inner 'in' check loop
                            else:
                                op_in = False  # Stop the inner and outer 'in' while loops
                        else:
                            print(f"{invalid_string} {additional} isn't a valid choice")

                while True:
                    confirmation = input('\n%s\n%s' % (query_display_string %
                                                       (increment, service.upper(), attribute,
                                                        'NOT ' if negate else '', operator.upper(), value),
                                                       confirmation_string))
                    if confirmation.lower() in bool_d:
                        break
                    else:
                        print(f"{invalid_string} {confirmation} isn't a valid choice")
                if bool_d[confirmation.lower()] or confirmation.isspace():
                    break

            # terminal_group_queries[increment] = (service, attribute, operator, negate, value)
            terminal_group_queries.append(dict(service=service, attribute=attribute, operator=operator, negate=negate,
                                               value=value))
            increment += 1
            while True:
                additional = input(additional_input_string % ' query')
                if additional.lower() in bool_d:
                    break
                else:
                    print(f"{invalid_string} {confirmation} isn't a valid choice")
            if not bool_d[additional.lower()]:  # or confirmation.isspace():
                break

        # Group terminal queries into groups if there are more than 1
        if len(terminal_group_queries) > 1:
            recursive_query_tree = make_groups(terminal_group_queries)
            # expecting return of [terminal_group_queries, bottom group hierarchy, second group hierarchy, ..., top]
        else:
            recursive_query_tree = [terminal_group_queries]
            # recursive_query_tree = (terminal_group_queries, )
        # recursive_query_tree = (queries, grouping1, grouping2, etc.)
        for i, node in enumerate(recursive_query_tree):
            if i == 0:
                recursive_query_tree[i] = {j: format_terminal_group(**leaf) for j, leaf in enumerate(node, 1)}
                # recursive_query_tree[i] = {j: format_terminal_group(*node[leaf]) for j, leaf in enumerate(node, 1)}

                # terminal_group_queries = {j: format_terminal_group(*leaf) for j, leaf in enumerate(node)}
                # format_terminal_group(parameter_args, service=service)
                # terminal_group_queries[increment] = \
                #     format_terminal_group(attribute, operator, value, service=service)
            else:
                # if i == 1:
                #     child_groups = terminal_group_queries
                #     # child_groups = [terminal_group_queries[j] for j in child_nodes]
                # else:
                #     child_groups = recursive_query_tree[i]
                # operation, child_nodes = node
                # groups = {j: generate_group(operation, child_groups) for j, leaf in enumerate(node)}

                # NOPE Subtract the k indices to ensure that the user input numbers match with python zero indexing
                # i - 1 gives the index of the previous index of the recursive_query_tree to operate on
                # k pulls the groups specified in the input out to make a list with the corresponding terminai groups
                recursive_query_tree[i] = {j: generate_group(operation, [recursive_query_tree[i - 1][k]
                                                                         for k in child_group_nums])
                                           for j, (child_group_nums, operation) in enumerate(node, 1)}
                # for k in child_group_nums}
        final_query = recursive_query_tree[-1][1]  #

        search_query = generate_query(final_query, return_id=return_type)
        response_d = query_pdb(search_query)
    logger.debug(f'The server returned:\n{response_d}')

    if response_d:
        retrieved_ids = parse_pdb_response_for_ids(response_d)
    else:
        return []

    if save:
        utils.io_save(retrieved_ids)

    if return_results:
        return retrieved_ids

query_pdb_by

query_pdb_by(entry: str = None, assembly_id: str = None, assembly_integer: int | str = None, entity_id: str = None, entity_integer: int | str = None, chain: str = None, **kwargs) -> dict | list[list[str]]

Retrieve information from the PDB API by EntryID, AssemblyID, or EntityID

Parameters:

  • entry (str, default: None ) –

    The 4 character PDB EntryID of interest

  • assembly_id (str, default: None ) –

    The AssemblyID to query with format (1ABC-1)

  • assembly_integer (int | str, default: None ) –

    The particular assembly integer to query. Must include entry as well

  • entity_id (str, default: None ) –

    The PDB formatted EntityID. Has the format EntryID_Integer (1ABC_1)

  • entity_integer (int | str, default: None ) –

    The entity integer from the EntryID of interest

  • chain (str, default: None ) –

    The polymer "chain" identifier otherwise known as the "asym_id" from the PDB EntryID of interest

Returns: The query result

Source code in symdesign/resources/query/pdb.py
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
def query_pdb_by(entry: str = None, assembly_id: str = None, assembly_integer: int | str = None, entity_id: str = None,
                 entity_integer: int | str = None, chain: str = None, **kwargs) -> dict | list[list[str]]:
    """Retrieve information from the PDB API by EntryID, AssemblyID, or EntityID

    Args:
        entry: The 4 character PDB EntryID of interest
        assembly_id: The AssemblyID to query with format (1ABC-1)
        assembly_integer: The particular assembly integer to query. Must include entry as well
        entity_id: The PDB formatted EntityID. Has the format EntryID_Integer (1ABC_1)
        entity_integer: The entity integer from the EntryID of interest
        chain: The polymer "chain" identifier otherwise known as the "asym_id" from the PDB EntryID of interest
    Returns:
        The query result
    """
    if entry is not None:
        if len(entry) == 4:
            if entity_integer is not None:
                logger.debug(f'Querying PDB API with {entry}_{entity_integer}')
                return _get_entity_info(entry=entry, entity_integer=entity_integer)
            elif assembly_integer is not None:
                logger.debug(f'Querying PDB API with {entry}-{assembly_integer}')
                return _get_assembly_info(entry=entry, assembly_integer=assembly_integer)
            else:
                logger.debug(f'Querying PDB API with {entry}')
                data = _get_entry_info(entry)
                if chain:
                    integer = None
                    for entity_idx, chains in data.get('entity').items():
                        if chain in chains:
                            integer = entity_idx
                            break
                    if integer:
                        logger.debug(f'Querying PDB API with {entry}_{integer}')
                        return _get_entity_info(entry=entry, entity_integer=integer)
                    else:
                        raise KeyError(
                            f"No chainID '{chain}' found in PDB ID {entry}. Possible chains "
                            f'{", ".join(ch for chns in data.get("entity", {}).items() for ch in chns)}')
                else:
                    return data
        else:
            logger.debug(f"EntryID '{entry}' isn't the required format and will not be found with the PDB API")
    elif assembly_id is not None:
        entry, assembly_integer, *extra = assembly_id.split('-')
        if not extra and len(entry) == 4:
            logger.debug(f'Querying PDB API with {entry}-{assembly_integer}')
            return _get_assembly_info(entry=entry, assembly_integer=assembly_integer)

        logger.debug(f"AssemblyID '{assembly_id}' isn't the required format and will not be found with the PDB API")

    elif entity_id is not None:
        entry, entity_integer, *extra = entity_id.split('_')
        if not extra and len(entry) == 4:
            logger.debug(f'Querying PDB API with {entry}_{entity_integer}')
            return _get_entity_info(entry=entry, entity_integer=entity_integer)

        logger.debug(f"EntityID '{entity_id}' isn't the required format and will not be found with the PDB API")
    else:
        raise RuntimeError(
            f'No valid arguments passed to {query_pdb_by.__name__}. Valid arguments include: '
            f'entry, assembly_id, assembly_integer, entity_id, entity_integer, chain')

query_assembly_id

query_assembly_id(assembly_id: str = None, entry: str = None, assembly_integer: str | int = None) -> Response | None

Retrieve PDB AssemblyID information from the PDB API. More info at http://data.rcsb.org/#data-api

For all method types the following keys are available: {'rcsb_polymer_entity_annotation', 'entity_poly', 'rcsb_polymer_entity', 'entity_src_gen', 'rcsb_polymer_entity_feature_summary', 'rcsb_polymer_entity_align', 'rcsb_id', 'rcsb_cluster_membership', 'rcsb_polymer_entity_container_identifiers', 'rcsb_entity_host_organism', 'rcsb_latest_revision', 'rcsb_entity_source_organism'} NMR only - {'rcsb_polymer_entity_feature'} EM only - set() X-ray_only_keys - {'rcsb_cluster_flexibility'}

Parameters:

  • assembly_id (str, default: None ) –

    The AssemblyID to query with format (1ABC-1)

  • entry (str, default: None ) –

    The 4 character PDB EntryID of interest

  • assembly_integer (str | int, default: None ) –

    The particular assembly integer to query. Must include entry as well

Returns: The assembly information according to the PDB

Source code in symdesign/resources/query/pdb.py
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
def query_assembly_id(assembly_id: str = None, entry: str = None, assembly_integer: str | int = None) -> \
        requests.Response | None:
    """Retrieve PDB AssemblyID information from the PDB API. More info at http://data.rcsb.org/#data-api

    For all method types the following keys are available:
    {'rcsb_polymer_entity_annotation', 'entity_poly', 'rcsb_polymer_entity', 'entity_src_gen',
     'rcsb_polymer_entity_feature_summary', 'rcsb_polymer_entity_align', 'rcsb_id', 'rcsb_cluster_membership',
     'rcsb_polymer_entity_container_identifiers', 'rcsb_entity_host_organism', 'rcsb_latest_revision',
     'rcsb_entity_source_organism'}
    NMR only - {'rcsb_polymer_entity_feature'}
    EM only - set()
    X-ray_only_keys - {'rcsb_cluster_flexibility'}

    Args:
        assembly_id: The AssemblyID to query with format (1ABC-1)
        entry: The 4 character PDB EntryID of interest
        assembly_integer: The particular assembly integer to query. Must include entry as well
    Returns:
        The assembly information according to the PDB
    """
    if assembly_id:
        # A '1ABC-1' style identifier carries both pieces; assume the caller formatted it correctly
        entry, assembly_integer = assembly_id.split('-')[:2]

    if not entry or not assembly_integer:
        return None
    return connection_exception_handler(f'{pdb_rest_url}/assembly/{entry}/{assembly_integer}')

parse_assembly_json

parse_assembly_json(assembly_json: dict[str, Any]) -> list[list[str]]

For a PDB API AssemblyID, parse the associated 'clustered' chains

Parameters:

  • assembly_json (dict[str, Any]) –

    The json type dictionary returned from requests.Response.json()

Returns: The chain IDs which cluster in the assembly - Ex: [['A', 'A', 'A', ...], ...]

Source code in symdesign/resources/query/pdb.py
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
def parse_assembly_json(assembly_json: dict[str, Any]) -> list[list[str]]:
    """For a PDB API AssemblyID, parse the associated 'clustered' chains

    Args:
        assembly_json: The json type dictionary returned from requests.Response.json()
    Returns:
        The chain IDs which cluster in the assembly -
        Ex: [['A', 'A', 'A', ...], ...]
    """
    if not assembly_json:
        return []

    # Each 'rcsb_struct_symmetry' record has the form:
    # {symbol: "O", type: 'Octahedral', stoichiometry: [], oligomeric_state: "Homo 24-mer", clusters: [],
    #  rotation_axes: [], kind: "Global Symmetry"}
    # and each entry of 'clusters' the form {members: [], avg_rmsd: ...}, where avg_rmsd indicates how
    # similar the cluster members are. The cluster index is NOT a mapping to the entity index
    return [[member.get('asym_id') for member in cluster['members']]
            for symmetry in assembly_json['rcsb_struct_symmetry']
            for cluster in symmetry['clusters']]

query_entry_id

query_entry_id(entry: str = None) -> Response | None

Fetches the JSON object for the EntryID from the PDB API

The following information is returned: All methods (SOLUTION NMR, ELECTRON MICROSCOPY, X-RAY DIFFRACTION) have the following keys: {'rcsb_primary_citation', 'pdbx_vrpt_summary', 'pdbx_audit_revision_history', 'audit_author', 'pdbx_database_status', 'rcsb_id', 'pdbx_audit_revision_details', 'struct_keywords', 'rcsb_entry_container_identifiers', 'entry', 'rcsb_entry_info', 'struct', 'citation', 'exptl', 'rcsb_accession_info'} EM only keys: {'em3d_fitting', 'em3d_fitting_list', 'em_image_recording', 'em_specimen', 'em_software', 'em_entity_assembly', 'em_vitrification', 'em_single_particle_entity', 'em3d_reconstruction', 'em_experiment', 'pdbx_audit_support', 'em_imaging', 'em_ctf_correction'} Xray only keys: {'diffrn_radiation', 'cell', 'reflns', 'diffrn', 'software', 'refine_hist', 'diffrn_source', 'exptl_crystal', 'symmetry', 'diffrn_detector', 'refine', 'reflns_shell', 'exptl_crystal_grow'} NMR only keys: {'pdbx_nmr_exptl', 'pdbx_audit_revision_item', 'pdbx_audit_revision_category', 'pdbx_nmr_spectrometer', 'pdbx_nmr_refine', 'pdbx_nmr_representative', 'pdbx_nmr_software', 'pdbx_nmr_exptl_sample_conditions', 'pdbx_nmr_ensemble'}

entry_json['rcsb_entry_info'] = {'assembly_count': 1, 'branched_entity_count': 0, 'cis_peptide_count': 3, 'deposited_atom_count': 8492, 'deposited_model_count': 1, 'deposited_modeled_polymer_monomer_count': 989, 'deposited_nonpolymer_entity_instance_count': 0, 'deposited_polymer_entity_instance_count': 6, 'deposited_polymer_monomer_count': 1065, 'deposited_solvent_atom_count': 735, 'deposited_unmodeled_polymer_monomer_count': 76, 'diffrn_radiation_wavelength_maximum': 0.9797, 'diffrn_radiation_wavelength_minimum': 0.9797, 'disulfide_bond_count': 0, 'entity_count': 3, 'experimental_method': 'X-ray', 'experimental_method_count': 1, 'inter_mol_covalent_bond_count': 0, 'inter_mol_metalic_bond_count': 0, 'molecular_weight': 115.09, 'na_polymer_entity_types': 'Other', 'nonpolymer_entity_count': 0, 'polymer_composition': 'heteromeric protein', 'polymer_entity_count': 2, 'polymer_entity_count_dna': 0, 'polymer_entity_count_rna': 0, 'polymer_entity_count_nucleic_acid': 0, 'polymer_entity_count_nucleic_acid_hybrid': 0, 'polymer_entity_count_protein': 2, 'polymer_entity_taxonomy_count': 2, 'polymer_molecular_weight_maximum': 21.89, 'polymer_molecular_weight_minimum': 16.47, 'polymer_monomer_count_maximum': 201, 'polymer_monomer_count_minimum': 154, 'resolution_combined': [1.95], 'selected_polymer_entity_types': 'Protein (only)', 'software_programs_combined': ['PHASER', 'REFMAC', 'XDS', 'XSCALE'], 'solvent_entity_count': 1, 'diffrn_resolution_high': {'provenance_source': 'Depositor assigned', 'value': 1.95}}

Parameters:

  • entry (str, default: None ) –

    The PDB code to search for

Returns: The entry information according to the PDB

Source code in symdesign/resources/query/pdb.py
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
def query_entry_id(entry: str = None) -> requests.Response | None:
    """Fetches the JSON object for the EntryID from the PDB API

    The following information is returned:
    All methods (SOLUTION NMR, ELECTRON MICROSCOPY, X-RAY DIFFRACTION) have the following keys:
    {'rcsb_primary_citation', 'pdbx_vrpt_summary', 'pdbx_audit_revision_history', 'audit_author',
     'pdbx_database_status', 'rcsb_id', 'pdbx_audit_revision_details', 'struct_keywords',
     'rcsb_entry_container_identifiers', 'entry', 'rcsb_entry_info', 'struct', 'citation', 'exptl',
     'rcsb_accession_info'}
    EM only keys:
    {'em3d_fitting', 'em3d_fitting_list', 'em_image_recording', 'em_specimen', 'em_software', 'em_entity_assembly',
     'em_vitrification', 'em_single_particle_entity', 'em3d_reconstruction', 'em_experiment', 'pdbx_audit_support',
     'em_imaging', 'em_ctf_correction'}
    Xray only keys:
    {'diffrn_radiation', 'cell', 'reflns', 'diffrn', 'software', 'refine_hist', 'diffrn_source', 'exptl_crystal',
     'symmetry', 'diffrn_detector', 'refine', 'reflns_shell', 'exptl_crystal_grow'}
    NMR only keys:
    {'pdbx_nmr_exptl', 'pdbx_audit_revision_item', 'pdbx_audit_revision_category', 'pdbx_nmr_spectrometer',
     'pdbx_nmr_refine', 'pdbx_nmr_representative', 'pdbx_nmr_software', 'pdbx_nmr_exptl_sample_conditions',
     'pdbx_nmr_ensemble'}

    entry_json['rcsb_entry_info'] = \
        {'assembly_count': 1, 'branched_entity_count': 0, 'cis_peptide_count': 3, 'deposited_atom_count': 8492,
        'deposited_model_count': 1, 'deposited_modeled_polymer_monomer_count': 989,
        'deposited_nonpolymer_entity_instance_count': 0, 'deposited_polymer_entity_instance_count': 6,
        'deposited_polymer_monomer_count': 1065, 'deposited_solvent_atom_count': 735,
        'deposited_unmodeled_polymer_monomer_count': 76, 'diffrn_radiation_wavelength_maximum': 0.9797,
        'diffrn_radiation_wavelength_minimum': 0.9797, 'disulfide_bond_count': 0, 'entity_count': 3,
        'experimental_method': 'X-ray', 'experimental_method_count': 1, 'inter_mol_covalent_bond_count': 0,
        'inter_mol_metalic_bond_count': 0, 'molecular_weight': 115.09, 'na_polymer_entity_types': 'Other',
        'nonpolymer_entity_count': 0, 'polymer_composition': 'heteromeric protein', 'polymer_entity_count': 2,
        'polymer_entity_count_dna': 0, 'polymer_entity_count_rna': 0, 'polymer_entity_count_nucleic_acid': 0,
        'polymer_entity_count_nucleic_acid_hybrid': 0, 'polymer_entity_count_protein': 2,
        'polymer_entity_taxonomy_count': 2, 'polymer_molecular_weight_maximum': 21.89,
        'polymer_molecular_weight_minimum': 16.47, 'polymer_monomer_count_maximum': 201,
        'polymer_monomer_count_minimum': 154, 'resolution_combined': [1.95],
        'selected_polymer_entity_types': 'Protein (only)',
        'software_programs_combined': ['PHASER', 'REFMAC', 'XDS', 'XSCALE'], 'solvent_entity_count': 1,
        'diffrn_resolution_high': {'provenance_source': 'Depositor assigned', 'value': 1.95}}

    Args:
        entry: The PDB code to search for
    Returns:
        The entry information according to the PDB
    """
    # Without an entry code there is nothing to fetch
    if not entry:
        return None
    return connection_exception_handler(f'{pdb_rest_url}/entry/{entry}')

parse_entry_json

parse_entry_json(entry_json: dict[str, Any]) -> dict[str, dict]

For a PDB API EntryID, parse the associated entity ID's and chains

Parameters:

  • entry_json (dict[str, Any]) –

    The json type dictionary returned from requests.Response.json()

Returns: The structural information present in the PDB EntryID with format - {'method': xray, 'res': resolution, 'struct': {'space': space_group, 'a_b_c': (a, b, c), 'ang_a_b_c': (ang_a, ang_b, ang_c)} }

Source code in symdesign/resources/query/pdb.py
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
def parse_entry_json(entry_json: dict[str, Any]) -> dict[str, dict]:
    """For a PDB API EntryID, parse the associated entity ID's and chains

    Args:
        entry_json: The json type dictionary returned from requests.Response.json()
    Returns:
        The structural information present in the PDB EntryID with format -
        {'method': xray,
         'res': resolution,
         'struct': {'space': space_group, 'a_b_c': (a, b, c), 'ang_a_b_c': (ang_a, ang_b, ang_c)}
         }
    """
    experimental_method = entry_json['rcsb_entry_info'].get('experimental_method')
    if experimental_method:
        # Todo make ray, diffraction
        if 'ray' in experimental_method.lower() and 'cell' in entry_json and 'symmetry' in entry_json:
            # X-ray entries carry the unit cell dimensions and the space group
            cell = entry_json['cell']
            ang_a, ang_b, ang_c = cell['angle_alpha'], cell['angle_beta'], cell['angle_gamma']
            a, b, c = cell['length_a'], cell['length_b'], cell['length_c']
            space_group = entry_json['symmetry']['space_group_name_hm']
            struct_d = {'space': space_group, 'a_b_c': (a, b, c), 'ang_a_b_c': (ang_a, ang_b, ang_c)}
            resolution = entry_json['rcsb_entry_info']['resolution_combined'][0]
        elif experimental_method == 'EM':
            # EM entries report resolution under 'em3d_reconstruction'. Other EM-specific keys observed in
            # responses include 'em3d_fitting', 'em_ctf_correction', 'em_entity_assembly', 'em_experiment',
            # 'em_image_recording', 'em_imaging', 'em_particle_selection', 'em_single_particle_entity',
            # 'em_software', 'em_specimen', and 'em_vitrification'
            struct_d = {}
            # Access the first entry in the list with [0] v
            resolution = entry_json['em3d_reconstruction'][0]['resolution']
        else:  # Todo NMR
            logger.warning(f"No useful information added with the experimental method {experimental_method} as "
                           "this method hasn't been explored yet")
            struct_d = {}
            resolution = None
    else:
        logger.warning('Entry has no "experimental_method" keyword')
        struct_d = {}
        resolution = None

    # Guard .lower() since experimental_method is None when the keyword was missing; the unconditional
    # call previously raised AttributeError on that path
    return {'res': resolution, 'struct': struct_d,
            'method': experimental_method.lower() if experimental_method else None}

format_symmetry_group

format_symmetry_group(symmetry: str, homomeric_number: int = 1, heteromeric_number: int = None) -> str

Return a PDB API length limitation query

Parameters:

  • symmetry (str) –

    The symmetry to query for

  • homomeric_number (int, default: 1 ) –

    If the symmetry desired is homomeric, how many copies of the entity are desired

  • heteromeric_number (int, default: None ) –

    If the symmetry desired is heteromeric, how many entities are present

Returns: The symmetry formatted query limiting entity searches to the described symmetry

Source code in symdesign/resources/query/pdb.py
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
def format_symmetry_group(symmetry: str, homomeric_number: int = 1, heteromeric_number: int = None) -> str:
    """Return a PDB API symmetry limitation query

    Args:
        symmetry: The symmetry to query for
        homomeric_number: If the symmetry desired is homomeric, how many copies of the entity are desired
        heteromeric_number: If the symmetry desired is heteromeric, how many entities are present
    Returns:
        The symmetry formatted query limiting entity searches to the described symmetry
    """
    symmetry_lower = symmetry.lower()
    if 'c' in symmetry_lower:
        template = cyclic_symmetry_limiting_group
    elif 'd' in symmetry_lower:
        template = dihedral_symmetry_limiting_group
    else:  # Remaining (cubic) point group symmetry
        template = point_symmetry_limiting_group
    symmetry_query = template % symmetry

    subunit_number = utils.symmetry.valid_subunit_number.get(symmetry)
    # NOTE(review): the formatted terminal fragment is string-multiplied, i.e. repeated once per copy/entity
    if heteromeric_number:
        terminal = (heteromer_termini % subunit_number) * heteromeric_number
    else:  # Homomeric search
        terminal = (homomer_termini % subunit_number) * homomeric_number

    return symmetry_query + ',' + terminal

format_length_group

format_length_group(lower: int, upper: int) -> str

Return a PDB API length limitation query

Parameters:

  • lower (int) –

    The low end to limit entity length

  • upper (int) –

    The upper limit on entity length

Returns: The length formatted query limiting entity searches to between the values lower and upper (non-inclusive)

Source code in symdesign/resources/query/pdb.py
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
def format_length_group(lower: int, upper: int) -> str:
    """Return a PDB API length limitation query

    Args:
        lower: The low end to limit entity length
        upper: The upper limit on entity length
    Returns:
        The length formatted query limiting entity searches to between the values lower and upper (non-inclusive)
    """
    # Substitute both bounds into the length-restriction query template
    bounds = (lower, upper)
    return length_group % bounds

nanohedra_building_blocks_query

nanohedra_building_blocks_query(symmetry: str, lower: int = None, upper: int = None, thermophile: bool = False, return_groups: bool = False, limit_by_groups: Iterable[str] = None, search_by_groups: Iterable[str] = None) -> dict[Any] | None

Retrieve symmetric oligomers from the PDB to act as building blocks for nanohedra docking

Parameters:

  • symmetry (str) –

    The symmetry to query for

  • lower (int, default: None ) –

    The low end to limit entity length

  • upper (int, default: None ) –

    The upper limit on entity length

  • thermophile (bool, default: False ) –

    Whether to limit search to entries from thermophilic species

  • return_groups (bool, default: False ) –

    Whether to return results as groupID's

  • limit_by_groups (Iterable[str], default: None ) –

    Whether to limit the query, i.e. not return groupID's that are provided in this argument

  • search_by_groups (Iterable[str], default: None ) –

    Search only for groupID's that are provided to this argument

Returns: Matching EntityID's formatted as a dictionary from the JSON formatted response or None if the query failed

Source code in symdesign/resources/query/pdb.py
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
def nanohedra_building_blocks_query(
        symmetry: str, lower: int = None, upper: int = None, thermophile: bool = False, return_groups: bool = False,
        limit_by_groups: Iterable[str] = None, search_by_groups: Iterable[str] = None) -> dict[str, Any] | None:
    """Retrieve symmetric oligomers from the PDB to act as building blocks for nanohedra docking

    Args:
        symmetry: The symmetry to query for
        lower: The low end to limit entity length
        upper: The upper limit on entity length
        thermophile: Whether to limit search to entries from thermophilic species
        return_groups: Whether to return results as groupID's
        limit_by_groups: Whether to limit the query, i.e. not return groupID's that are provided in this argument
        search_by_groups: Search only for groupID's that are provided to this argument
    Returns:
        Matching EntityID's formatted as a dictionary from the JSON formatted response or None if the query failed
    """
    # Assemble the common quality, symmetry, and length restrictions into one comma-joined JSON fragment
    groups_and_terminal = common_quality_filters \
        + ',' + format_symmetry_group(symmetry) \
        + ',' + format_length_group(lower, upper)

    if thermophile:
        groups_and_terminal += ',' + thermophilic_json_terminal_operator
    if limit_by_groups:  # Exclude these groupID's from the results
        groups_and_terminal += ',' + not_in_entity_group_id_search_block \
                               % ','.join(f'"{id_}"' for id_ in limit_by_groups)
    if search_by_groups:  # Restrict the search to these groupID's only
        groups_and_terminal += ',' + in_entity_group_id_search_block \
                               % ','.join(f'"{id_}"' for id_ in search_by_groups)

    building_block_query = and_group_query % groups_and_terminal
    logger.debug(f'Found building_block_query: {building_block_query}')
    formatted_query = json.loads(building_block_query)

    return query_pdb(generate_query(formatted_query, return_id='polymer_entity',
                                    cluster_sequence=True, return_groups=return_groups, all_matching=True))

find_author_confirmed_assembly_from_entity_group

find_author_confirmed_assembly_from_entity_group(group_ids: Iterable[str], symmetry: str, lower: int = None, upper: int = None) -> dict[Any] | None

For specific groupID's, request all EntityID's that have an assembly confirmed by depositing authors from PDB API

Parameters:

  • group_ids (Iterable[str]) –

    The groupID's to limit search to

  • symmetry (str) –

    The symmetry to query for

  • lower (int, default: None ) –

    The low end to limit entity length

  • upper (int, default: None ) –

    The upper limit on entity length

Returns: Matching AssemblyID's formatted as a dictionary from the JSON formatted response or None if the query failed

Source code in symdesign/resources/query/pdb.py
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
def find_author_confirmed_assembly_from_entity_group(
        group_ids: Iterable[str], symmetry: str, lower: int = None, upper: int = None) -> dict[str, Any] | None:
    """For specific groupID's, request all EntityID's that have an assembly confirmed by depositing authors from PDB API

    Args:
        group_ids: The groupID's to limit search to
        symmetry: The symmetry to query for
        lower: The low end to limit entity length
        upper: The upper limit on entity length
    Returns:
        Matching AssemblyID's formatted as a dictionary from the JSON formatted response or None if the query failed
    """
    # Combine quality, symmetry, length, author-assembly, and group-membership restrictions
    group_id_spec = ','.join(f'"{id_}"' for id_ in group_ids)
    groups_and_terminal = common_quality_filters \
        + ',' + format_symmetry_group(symmetry) \
        + ',' + format_length_group(lower, upper) \
        + ',' + assembly_author_defined \
        + ',' + in_entity_group_id_search_block % group_id_spec
    author_confirmed_query = and_group_query % groups_and_terminal
    logger.debug(f'Found author_confirmed_query: {author_confirmed_query}')
    formatted_query = json.loads(author_confirmed_query)

    return query_pdb(generate_query(formatted_query, return_id='assembly', all_matching=True))

solve_author_confirmed_assemblies

solve_author_confirmed_assemblies(params: QueryParams, grouped_entity_ids: dict[str, list[str]]) -> tuple[list[str], list[str]]

From a map of Entity group ID's to resolution sorted EntityIDs, solve for those EntityIDs that have an assembly

First search for QSbio confirmed assemblies, then search the PDB API for 'author_defined_assembly' and 'author_and_software_defined_assembly'

Parameters:

  • params (QueryParams) –

    The parameter profile specified for the search procedure

  • grouped_entity_ids (dict[str, list[str]]) –

    A dictionary mapping groupID to EntryID's

Returns: A tuple of the objects ( The best EntityIDs according to incoming sorting and that pass the assembly test The Entity group ID of those groups that couldn't be solved )

Source code in symdesign/resources/query/pdb.py
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
def solve_author_confirmed_assemblies(params: QueryParams, grouped_entity_ids: dict[str, list[str]]) \
        -> tuple[list[str], list[str]]:
    """From a map of Entity group ID's to resolution sorted EntityIDs, solve for those EntityIDs that have an assembly

    First search for QSbio confirmed assemblies, then search the PDB API for 'author_defined_assembly' and
    'author_and_software_defined_assembly'

    Args:
        params: The parameter profile specified for the search procedure
        grouped_entity_ids: A dictionary mapping groupID to EntryID's
    Returns:
        A tuple of the objects (
            The best EntityIDs according to incoming sorting and that pass the assembly test
            The Entity group ID of those groups that couldn't be solved
        )
    """
    # Check if the top ids are actually bona-fide assemblies according to QSBio
    top_entity_ids: list[str | None] = []
    # If they aren't, then solve by PDB API query
    solve_group_by_pdb = []
    for group_id, entity_ids in grouped_entity_ids.items():
        for id_ in entity_ids:  # Take the first (best-sorted) QSBio-confirmed member
            if qsbio_confirmed.get(id_[:4].lower()) is not None:
                top_entity_ids.append(id_)
                break
        else:  # No assemblies are qsbio_confirmed. Solve by PDB assembly inference
            solve_group_by_pdb.append(group_id)
            top_entity_ids.append(None)

    author_confirmed_assembly_ids = []
    if solve_group_by_pdb:  # Only hit the PDB API when at least one group remains unsolved
        author_confirmed_assembly_result = \
            find_author_confirmed_assembly_from_entity_group(
                solve_group_by_pdb, params.symmetry, params.lower_length, params.upper_length)
        if author_confirmed_assembly_result:
            author_confirmed_assembly_ids = parse_pdb_response_for_ids(author_confirmed_assembly_result)

    # Limit AssemblyID's to EntryID's; use a set for O(1) membership tests below
    author_confirmed_entry_ids = {id_[:4] for id_ in author_confirmed_assembly_ids}
    remove_group_ids = []
    remove_group_indices = []

    # For orphaned groups, find and fit author confirmed assemblies in their corresponding groups
    for group_idx, (top_id, group_id) in enumerate(zip(top_entity_ids, grouped_entity_ids)):
        if top_id is None:
            for entity_id in grouped_entity_ids[group_id]:
                if entity_id[:4] in author_confirmed_entry_ids:
                    top_entity_ids[group_idx] = entity_id
                    break
            else:  # This still isn't solved. Remove from the pool
                remove_group_ids.append(group_id)
                remove_group_indices.append(group_idx)

    # Pop in reverse so earlier indices stay valid
    for group_idx in reversed(remove_group_indices):
        top_entity_ids.pop(group_idx)

    return top_entity_ids, remove_group_ids

entity_thermophilicity

entity_thermophilicity(entry: str = None, entity_integer: int | str = None, entity_id: str = None) -> float | None

Query the PDB API for an EntityID and return the thermophilicity of its source organism(s)

Parameters:

  • entry (str, default: None ) –

    The 4 character PDB EntryID of interest

  • entity_integer (int | str, default: None ) –

    The entity integer from the EntryID of interest

  • entity_id (str, default: None ) –

    The PDB formatted EntityID. Has the format EntryID_Integer (1ABC_1)

Returns: Value ranging from 0-1 where 1 is completely thermophilic according to taxonomic classification

Source code in symdesign/resources/query/pdb.py
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
def entity_thermophilicity(entry: str = None, entity_integer: int | str = None, entity_id: str = None) -> float | None:
    """Query the PDB API for an EntityID and return the thermophilicity of its source organism(s)

    Args:
        entry: The 4 character PDB EntryID of interest
        entity_integer: The entity integer from the EntryID of interest
        entity_id: The PDB formatted EntityID. Has the format EntryID_Integer (1ABC_1)
    Returns:
        Value ranging from 0-1 where 1 is completely thermophilic according to taxonomic classification
    """
    entity_request = query_entity_id(entry=entry, entity_integer=entity_integer, entity_id=entity_id)
    if entity_request:
        return thermophilicity_from_entity_json(entity_request.json())
    return None

thermophilicity_from_entity_json

thermophilicity_from_entity_json(entity_json: dict[str, Any]) -> float

Return the extent to which the entity json entry in question is thermophilic

Parameters:

  • entity_json (dict[str, Any]) –

    The return json from PDB API query

Returns: Value ranging from 0-1 where 1 is completely thermophilic

Source code in symdesign/resources/query/pdb.py
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
def thermophilicity_from_entity_json(entity_json: dict[str, Any]) -> float:
    """Return the extent to which the entity json entry in question is thermophilic

    Args:
        entity_json: The return json from PDB API query
    Returns:
        Value ranging from 0-1 where 1 is completely thermophilic
    """
    # 'rcsb_entity_source_organism' may be missing OR explicitly null in the API response;
    # `or []` covers both, where .get(key, default) only covers the missing-key case
    organisms = entity_json.get('rcsb_entity_source_organism') or []
    thermophilic_flags = [
        1 if int(organism.get('ncbi_taxonomy_id', -1)) in thermophilic_taxonomy_ids else 0
        for organism in organisms
    ]
    if thermophilic_flags:
        return sum(thermophilic_flags) / len(thermophilic_flags)
    return 0.

parse_entities_json

parse_entities_json(entity_jsons: Iterable[dict[str, Any]]) -> dict[str, dict]

Parameters:

  • entity_jsons (Iterable[dict[str, Any]]) –

    An Iterable of json like objects containing EntityID information as retrieved from the PDB API

Returns: The entity dictionary with format - {'EntityID': {'chains': ['A', 'B', ...], 'dbref': {'accession': ('Q96DC8',), 'db': 'UniProt'}, 'reference_sequence': 'MSLEHHHHHH...', 'thermophilicity': 1.0}, ...}

Source code in symdesign/resources/query/pdb.py
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
def parse_entities_json(entity_jsons: Iterable[dict[str, Any]]) -> dict[str, dict]:
    """Parse EntityID json objects from the PDB API into a flattened info mapping

    Args:
        entity_jsons: An Iterable of json like objects containing EntityID information as retrieved from the PDB API
    Returns:
        The entity dictionary with format -
        {'EntityID':
            {'chains': ['A', 'B', ...],
             'dbref': {'accession': ('Q96DC8',), 'db': 'UniProt'},
             'reference_sequence': 'MSLEHHHHHH...',
             'thermophilicity': 1.0},
         ...}
    """
    def extract_dbref(entity_ids_json: dict[str, Any]) -> dict[str, dict]:
        """For a PDB API EntityID, parse the associated chains and database reference identifiers

        Args:
            entity_ids_json: The json type dictionary returned from requests.Response.json()
        Returns:
            Ex: {'db': DATABASE, 'accession': 'Q96DC8'} where DATABASE can be one of 'GenBank', 'Norine', 'UniProt'
        """
        database_keys = ['db', 'accession']
        try:
            uniprot_ids = entity_ids_json['uniprot_ids']
            # Todo choose the most accurate if more than 2...
            #  'rcsb_polymer_entity_align' indicates how the model from the PDB aligns to UniprotKB through SIFTS
            if len(uniprot_ids) > 1:
                logger.warning(f'For Entity {entity_ids_json["rcsb_id"]}, found multiple UniProt Entries: '
                               f'{", ".join(uniprot_ids)}')
            db_d = dict(zip(database_keys, (UKB, tuple(uniprot_ids))))
        except KeyError:  # No 'uniprot_ids'
            # GenBank = GB, which is mostly RNA or DNA structures or antibody complexes
            # Norine = NOR, which is small peptide structures, sometimes bound to proteins...
            try:
                identifiers = [dict(db=ident['database_name'], accession=(ident['database_accession'],))
                               for ident in entity_ids_json.get('reference_sequence_identifiers', [])]
            except KeyError:  # There are really no identifiers of use
                return {}
            if not identifiers:
                return {}
            if len(identifiers) == 1:  # Only one solution
                db_d = identifiers[0]
            else:  # Find the most ideal accession_database UniProt > GenBank > Norine > ???
                # BUGFIX: the previous implementation unpacked each identifier dict directly,
                # which iterates its KEYS ('db', 'accession'), so the priority tests never matched.
                # Use a None sentinel so an unknown identifier at index 0 is recorded and kept
                whatever_else = None
                priority_l = [[] for _ in range(len(identifiers))]
                for idx, identifier in enumerate(identifiers):
                    database = identifier['db']
                    if database == UKB:
                        priority_l[0].append(idx)
                    elif database == GB:
                        # Two elements are required from above len check, never have IndexError
                        priority_l[1].append(idx)
                    # elif database == NOR:
                    #     priority_l[2].append(idx)
                    elif whatever_else is None:
                        # Only set the first time an unknown identifier is seen
                        whatever_else = idx

                # Loop through the list of prioritized identifiers
                for identifier_idx in priority_l:
                    if identifier_idx:  # A priority database was found, choose the corresponding identifier idx
                        # Make the db_d with the db name as first arg and all the identifiers as the second arg
                        db_d = dict(zip(database_keys,
                                        (identifiers[identifier_idx[0]]['db'], [identifiers[idx]['accession']
                                                                                for idx in identifier_idx])))
                        break
                else:  # No solution from priority; choose the first non-priority identifier
                    db_d = identifiers[whatever_else if whatever_else is not None else 0]

        return db_d

    entity_info = {}
    for entity_json in entity_jsons:
        if entity_json is None:
            continue
        entity_json_ids = entity_json.get('rcsb_polymer_entity_container_identifiers')
        if entity_json_ids:
            entity_info[entity_json_ids['rcsb_id'].lower()] = dict(
                chains=entity_json_ids['asym_ids'],
                dbref=extract_dbref(entity_json_ids),
                reference_sequence=entity_json['entity_poly']['pdbx_seq_one_letter_code_can'],
                thermophilicity=thermophilicity_from_entity_json(entity_json),
            )

    return entity_info

query_entity_id

query_entity_id(entry: str = None, entity_integer: str | int = None, entity_id: str = None) -> Response | None

Retrieve PDB EntityID information from the PDB API. More info at http://data.rcsb.org/#data-api

For all method types the following keys are available: {'rcsb_polymer_entity_annotation', 'entity_poly', 'rcsb_polymer_entity', 'entity_src_gen', 'rcsb_polymer_entity_feature_summary', 'rcsb_polymer_entity_align', 'rcsb_id', 'rcsb_cluster_membership', 'rcsb_polymer_entity_container_identifiers', 'rcsb_entity_host_organism', 'rcsb_latest_revision', 'rcsb_entity_source_organism'} NMR only - {'rcsb_polymer_entity_feature'} EM only - set() X-ray_only_keys - {'rcsb_cluster_flexibility'}

Parameters:

  • entry (str, default: None ) –

    The 4 character PDB EntryID of interest

  • entity_integer (str | int, default: None ) –

    The integer of the entity_id

  • entity_id (str, default: None ) –

    The PDB formatted EntityID. Has the format EntryID_Integer (1ABC_1)

Returns: The entity information according to the PDB

Source code in symdesign/resources/query/pdb.py
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
def query_entity_id(entry: str = None, entity_integer: str | int = None, entity_id: str = None) -> \
        requests.Response | None:
    """Retrieve PDB EntityID information from the PDB API. More info at http://data.rcsb.org/#data-api

    For all method types the following keys are available:
    {'rcsb_polymer_entity_annotation', 'entity_poly', 'rcsb_polymer_entity', 'entity_src_gen',
     'rcsb_polymer_entity_feature_summary', 'rcsb_polymer_entity_align', 'rcsb_id', 'rcsb_cluster_membership',
     'rcsb_polymer_entity_container_identifiers', 'rcsb_entity_host_organism', 'rcsb_latest_revision',
     'rcsb_entity_source_organism'}
    NMR only - {'rcsb_polymer_entity_feature'}
    EM only - set()
    X-ray_only_keys - {'rcsb_cluster_flexibility'}

    Args:
        entry: The 4 character PDB EntryID of interest
        entity_integer: The integer of the entity_id
        entity_id: The PDB formatted EntityID. Has the format EntryID_Integer (1ABC_1)
    Returns:
        The entity information according to the PDB
    """
    if entity_id:
        # Assume the EntryID_Integer format and keep only the first two fields
        entry, entity_integer, *_ = entity_id.split('_')

    if not entry or not entity_integer:
        return None
    return connection_exception_handler(f'{pdb_rest_url}/polymer_entity/{entry}/{entity_integer}')

get_entity_id

get_entity_id(entry: str = None, entity_integer: int | str = None, entity_id: str = None, chain: str = None) -> tuple[str, str] | tuple[None]

Resolve an (EntryID, entity integer) pair from various PDB identifiers or combinations thereof

Parameters:

  • entry (str, default: None ) –

    The 4 character PDB EntryID of interest

  • entity_integer (int | str, default: None ) –

    The entity integer from the EntryID of interest

  • entity_id (str, default: None ) –

    The PDB formatted EntityID. Has the format EntryID_Integer (1ABC_1)

  • chain (str, default: None ) –

    The polymer "chain" identifier otherwise known as the "asym_id" from the PDB EntryID of interest

Returns: The Entity_ID

Source code in symdesign/resources/query/pdb.py
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
def get_entity_id(entry: str = None, entity_integer: int | str = None, entity_id: str = None, chain: str = None) -> \
        tuple[str, str] | tuple[None]:
    """Retrieve a UniProtID from the PDB API by passing various PDB identifiers or combinations thereof

    Args:
        entry: The 4 character PDB EntryID of interest
        entity_integer: The entity integer from the EntryID of interest
        entity_id: The PDB formatted EntityID. Has the format EntryID_Integer (1ABC_1)
        chain: The polymer "chain" identifier otherwise known as the "asym_id" from the PDB EntryID of interest
    Returns:
        The Entity_ID
    """
    if entry is not None:
        if len(entry) != 4:
            logger.warning(f'EntryID "{entry}" is not of the required format and will not be found with the PDB API')
        elif entity_integer is not None:
            return entry, entity_integer
            # entity_id = f'{entry}_{entity_integer}'
        else:
            info = _get_entry_info(entry)
            chain_entity = {chain: entity_idx for entity_idx, chains in info.get('entity', {}).items() for chain in chains}
            if chain is not None:
                try:
                    return entry, chain_entity[chain]
                    # entity_id = f'{entry}_{chain_entity[chain]}'
                except KeyError:
                    raise KeyError(f'No chain "{chain}" found in PDB ID {entry}. '
                                   f'Possible chains {", ".join(chain_entity)}')
            else:
                entity_integer = next(iter(chain_entity.values()))
                logger.warning('Using the argument "entry" without either "entity_integer" or "chain" is not '
                               f'recommended. Choosing the first EntityID "{entry}_{entity_integer}"')
                return entry, entity_integer
                # entity_id = f'{entry}_{entity_integer}'

    elif entity_id is not None:
        entry, entity_integer, *extra = entity_id.split('_')
        if not extra and len(entry) == 4:
            return entry, entity_integer

        logger.debug(f"EntityID '{entity_id}' isn't the required format and will not be found with the PDB API")

    return None,

get_entity_uniprot_id

get_entity_uniprot_id(**kwargs) -> str | None

Retrieve a UniProtID from the PDB API by passing various PDB identifiers or combinations thereof

Other Parameters:

  • entry=None (str) –

    The 4 character PDB EntryID of interest

  • entity_integer=None (str) –

    The entity integer from the EntryID of interest

  • entity_id=None (str) –

    The PDB formatted EntityID. Has the format EntryID_Integer (1ABC_1)

  • chain=None (str) –

    The polymer "chain" identifier otherwise known as the "asym_id" from the PDB EntryID of interest

Returns:

  • str | None

    The UniProt ID

Source code in symdesign/resources/query/pdb.py
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
def get_entity_uniprot_id(**kwargs) -> str | None:
    """Retrieve a UniProtID from the PDB API by passing various PDB identifiers or combinations thereof

    Keyword Args:
        entry=None (str): The 4 character PDB EntryID of interest
        entity_integer=None (str): The entity integer from the EntryID of interest
        entity_id=None (str): The PDB formatted EntityID. Has the format EntryID_Integer (1ABC_1)
        chain=None (str): The polymer "chain" identifier otherwise known as the "asym_id" from the PDB EntryID of
            interest

    Returns:
        The UniProt ID
    """
    entity_request = query_entity_id(*get_entity_id(**kwargs))
    if entity_request:
        container_ids = entity_request.json().get('rcsb_polymer_entity_container_identifiers')
        if container_ids:
            # BUGFIX: the API field is 'uniprot_ids' (plural), as read elsewhere in this module;
            # return the first UniProt accession when one is present
            uniprot_ids = container_ids.get('uniprot_ids')
            if uniprot_ids:
                return uniprot_ids[0]
    return None

get_entity_reference_sequence

get_entity_reference_sequence(**kwargs) -> str | None

Query the PDB API for the reference amino acid sequence for a specified entity ID (PDB EntryID_Entity_ID)

Other Parameters:

  • entry=None (str) –

    The 4 character PDB EntryID of interest

  • entity_integer=None (str) –

    The entity integer from the EntryID of interest

  • entity_id=None (str) –

    The PDB formatted EntityID. Has the format EntryID_Integer (1ABC_1)

  • chain=None (str) –

    The polymer "chain" identifier otherwise known as the "asym_id" from the PDB EntryID of interest

Returns:

  • str | None

    One letter amino acid sequence

Source code in symdesign/resources/query/pdb.py
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
def get_entity_reference_sequence(**kwargs) -> str | None:
    """Query the PDB API for the reference amino acid sequence for a specified entity ID (PDB EntryID_Entity_ID)

    Keyword Args:
        entry=None (str): The 4 character PDB EntryID of interest
        entity_integer=None (str): The entity integer from the EntryID of interest
        entity_id=None (str): The PDB formatted EntityID. Has the format EntryID_Integer (1ABC_1)
        chain=None (str): The polymer "chain" identifier otherwise known as the "asym_id" from the PDB EntryID of
            interest

    Returns:
        One letter amino acid sequence
    """
    entity_request = query_entity_id(*get_entity_id(**kwargs))
    if not entity_request:
        return None
    # Non-canonical residues are reported as 'X' in the canonical one-letter sequence
    return entity_request.json().get('entity_poly')['pdbx_seq_one_letter_code_can']

get_rcsb_metadata_schema

get_rcsb_metadata_schema(file=os.path.join(current_dir, 'rcsb_schema.pkl'), search_only=True, force_update=False)

Parse the rcsb metadata schema for useful information from the format {"properties" : {"assignment_version" : {"type" : "string", "examples" : [ "V4_0_2" ], "description" : "Identifies the version of the feature assignment.", "rcsb_description" : [ {"text" : "Identifies the version of the feature assignment.", "context" : "dictionary"}, {"text" : "Feature Version", "context" : "brief"} ] }, ... "symmetry_type" : {"type" : "string", <-- provide data type provide options --> "enum" : [ "2D CRYSTAL", "3D CRYSTAL", "HELICAL", "POINT" ], provide description --> "description" : "The type of symmetry applied to the reconstruction", provide operators --> "rcsb_search_context" : [ "exact-match" ], "rcsb_full_text_priority" : 10, "rcsb_description" : [ {"text" : "The type of symmetry applied to the reconstruction", "context" : "dictionary"}, {"text" : "Symmetry Type (Em 3d Reconstruction)", "context" : "brief"} ] }, ... }, "title" : "Core Metadata", "additionalProperties" : false, "$comment" : "Schema version: 1.14.0" "required" : ["rcsb_id", "rcsb_entry_container_identifiers", "rcsb_entry_info", "rcsb_pubmed_container_identifiers", "rcsb_polymer_entity_container_identifiers", "rcsb_assembly_container_identifiers", "rcsb_uniprot_container_identifiers" ], "$schema" : "http://json-schema.org/draft-07/schema#", "description" : "Collective JSON schema that includes definitions for all indexed cores with RCSB metadata extensions.", } Returns: (dict): {attribute: {'dtype': 'string', 'description': 'XYZ', 'operators': {'equals'}, 'choices': []}, ...}

Source code in symdesign/resources/query/pdb.py
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
def get_rcsb_metadata_schema(file=os.path.join(current_dir, 'rcsb_schema.pkl'), search_only=True, force_update=False):
    """Parse the RCSB metadata schema for searchable attribute information

    Fetches the attribute metadata schema JSON from the RCSB, flattens nested "object"/"array"
    properties into dot-separated attribute names, and extracts each attribute's data type,
    description, valid search operators, and enumerated choices. Results are cached as a pickled
    file and reloaded on subsequent calls

    Args:
        file: The location of a pickled, previously parsed schema. Used as a cache
        search_only: Whether to drop attributes that have no search operators, i.e. aren't searchable
        force_update: Whether to re-fetch and re-parse the schema even if a cached file exists

    Returns:
        (dict): {attribute: {'dtype': 'string', 'description': 'XYZ', 'operators': {'equals'}, 'choices': []}, ...}
    """
    # Map each output key to the corresponding key in the schema JSON
    schema_pairs = {'dtype': 'type', 'description': 'description', 'operators': 'rcsb_search_context',
                    'choices': 'enum'}
    # Map each rcsb_search_context value to the operators it supports. 'suggest' only provides an
    # example to the user in the GUI, so it contributes no operators
    operator_d = {'full-text': 'contains_words, contains_phrase, exists',
                  'exact-match': 'in, exact_match, exists',
                  'default-match': 'equals, greater, less, greater_or_equal, less_or_equal, range, range_closed, '
                                   'exists',
                  'suggest': None}
    data_types = ['string', 'integer', 'number']

    def recurse_metadata(metadata_d, stack=tuple()):
        """Yield a tuple of nested keys for every leaf attribute with a primitive type. The marker
        'a' denotes an array level ('items') and 'o' an object level ('properties') in the path
        """
        for attribute, info in metadata_d.items():
            attribute_type = info['type']
            if attribute_type == 'array':  # 'items' must be a keyword in dictionary
                items_type = info['items']['type']
                if items_type in data_types:  # The array is the final attribute of the branch
                    yield stack + (attribute, 'a')
                elif items_type == 'object':  # Contains 'properties' with more attributes as leaves
                    yield from recurse_metadata(info['items']['properties'], stack=stack + (attribute, 'a', 'o'))
                else:
                    logger.debug('Array with type %s found in %s' % (info, stack))
            elif attribute_type == 'object':
                if 'properties' in info:  # Check may be unnecessary
                    yield from recurse_metadata(info['properties'], stack=stack + (attribute, 'o'))
                else:
                    logger.debug('Object with no properties found %s in %s' % (info, stack))
            elif attribute_type in data_types:
                yield stack + (attribute,)
            else:
                logger.debug('other type = %s' % attribute_type)

    # Fast path: reuse the cached, parsed schema
    if os.path.exists(file) and not force_update:
        return utils.unpickle(file)
    # Todo: also refresh when the cached file is older than a month

    logger.info('Gathering the most current PDB metadata. This may take a couple minutes...')
    metadata_json = requests.get(attribute_metadata_schema_json).json()
    metadata_properties_d = metadata_json['properties']
    schema_header_tuples = list(recurse_metadata(metadata_properties_d))

    # Path markers inserted by recurse_metadata() mapped to the schema key they index
    nesting_keys = {'a': 'items', 'o': 'properties'}
    schema_d = {}
    for i, attribute_tuple in enumerate(schema_header_tuples):
        attribute_full = '.'.join(attribute for attribute in attribute_tuple
                                  if attribute not in nesting_keys)
        if i < 5:
            logger.debug(attribute_full)
        # Walk the nested schema dict down to the leaf attribute. This replaces the prior approach
        # of eval()'ing a constructed index expression against the repr of the whole schema, which
        # was both unsafe and quadratic in the schema size
        leaf_d = metadata_properties_d
        for attribute in attribute_tuple:
            leaf_d = leaf_d[nesting_keys.get(attribute, attribute)]

        schema_d[attribute_full] = attribute_d = \
            {key: leaf_d.get(value) for key, value in schema_pairs.items()}

        if 'format' in leaf_d:  # Dates are strings carrying a 'format' key
            attribute_d['dtype'] = 'date'

        if attribute_d['description']:  # Convert the description to a simplified descriptor
            attribute_d['description'] = attribute_d['description'].split('\n')[0]

        if attribute_d['operators']:  # Convert the rcsb_search_context to valid operator(s)
            attribute_d['operators'] = set(', '.join(
                operator_d[search_context] for search_context in attribute_d['operators']
                if operator_d[search_context]).split(', '))
        elif search_only:  # Remove entries that don't have a corresponding operator as these aren't searchable
            schema_d.pop(attribute_full)

    utils.pickle_object(schema_d, file, out_path='')

    return schema_d