Skip to content

pdb

GB module-attribute

GB = 'GenBank'

The module level identifier for a GenBankID

NOR module-attribute

NOR = 'Norine'

The module level identifier for a NorineID

UKB module-attribute

UKB = 'UniProt'

The module level identifier for a UniProtID

qsbio_confirmed module-attribute

qsbio_confirmed: Annotated[dict[str, list[int]], "PDB EntryID (lowercase) mapped to biological assembly numbers for ID's with QSBio confidence as high or very high"] = unpickle(qs_bio)

PDB EntryID (lowercase) mapped to biological assembly numbers for IDs with QSBio confidence as high or very high

retrieve_entity_id_by_sequence

retrieve_entity_id_by_sequence(sequence: str) -> str | None

From a given sequence, retrieve the top matching Entity ID from the PDB API

Parameters:

  • sequence (str) –

    The sequence used to query for the EntityID

Returns:

  • str | None

    '1ABC_1'

Source code in symdesign/resources/query/pdb.py
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
def retrieve_entity_id_by_sequence(sequence: str) -> str | None:
    """From a given sequence, retrieve the top matching Entity ID from the PDB API

    Args:
        sequence: The sequence used to query for the EntityID

    Returns:
        The top-ranked EntityID formatted like '1ABC_1', or None when no match was found
    """
    # Restrict the search to the top-ranked hits only; the best match is first
    entity_ids = find_matching_entities_by_sequence(sequence, all_matching=False)
    if not entity_ids:
        return None
    logger.debug(f'Sequence search found the matching EntityIDs: {", ".join(entity_ids)}')
    return entity_ids[0]

find_matching_entities_by_sequence

find_matching_entities_by_sequence(sequence: str = None, return_id: return_types_literal = 'polymer_entity', **kwargs) -> list[str] | None

Search the PDB for matching IDs given a sequence and a return_type. Pass all_matching=False to retrieve the top 10 IDs, otherwise return all IDs

Parameters:

  • sequence (str, default: None ) –

    The sequence used to query for EntityID's

  • return_id (return_types_literal, default: 'polymer_entity' ) –

    The type of value to return

Returns:

  • list[str] | None

    The EntityID's matching the sequence

Source code in symdesign/resources/query/pdb.py
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
def find_matching_entities_by_sequence(sequence: str | None = None, return_id: return_types_literal = 'polymer_entity',
                                       **kwargs) -> list[str] | None:
    """Search the PDB for matching IDs given a sequence and a return_type. Pass all_matching=False to retrieve the top
    10 IDs, otherwise return all IDs

    Args:
        sequence: The sequence used to query for EntityID's
        return_id: The type of value to return
        **kwargs: Additional keyword arguments passed through to generate_query() (e.g. all_matching)

    Returns:
        The EntityID's matching the sequence, or None when the PDB API found no match

    Raises:
        KeyError: If return_id isn't one of the supported return types
    """
    if return_id not in return_type_args:
        raise KeyError(
            f"The specified return_id '{return_id}' isn't supported. Allowed values: {', '.join(return_type_args)}")
    logger.debug(f'Using the default sequence similarity parameters: '
                 f'{", ".join(f"{k}: {v}" for k, v in default_sequence_values.items())}')
    sequence_query = format_terminal_group(service='sequence', sequence=sequence)
    sequence_query_results = query_pdb(
        generate_query(sequence_query, return_id=return_id, cluster_uniprot=True, **kwargs))
    if sequence_query_results:
        return parse_pdb_response_for_ids(sequence_query_results)
    else:
        logger.warning(f"Sequence wasn't found by the PDB API:\n{sequence}")
        return None  # [None]

parse_pdb_response_for_ids

parse_pdb_response_for_ids(response: dict[str, dict[str, str]], groups: bool = False) -> list[str]

Parse JSON PDB API returns for identifiers

Parameters:

  • response (dict[str, dict[str, str]]) –
  • groups (bool, default: False ) –

    Whether the identifiers are clustered by group

Returns:

  • list[str]

    The list of identifiers from the response

Source code in symdesign/resources/query/pdb.py
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
def parse_pdb_response_for_ids(response: dict[str, dict[str, str]], groups: bool = False) -> list[str]:
    """Parse JSON PDB API returns for identifiers

    Args:
        response: The JSON-derived dictionary returned by the PDB search API
        groups: Whether the identifiers are clustered by group

    Returns:
        The list of identifiers from the response
    """
    # Grouped results live under 'group_set'; flat results under 'result_set'.
    # A missing key yields an empty list rather than raising
    result_key = 'group_set' if groups else 'result_set'
    return [entry['identifier'] for entry in response.get(result_key, [])]

query_pdb

query_pdb(query_: dict[Any] | str, json_formatted: bool = False) -> dict[str, Any] | None

Take a JSON formatted PDB API query and return the results

PDB response can look like: {'query_id': 'ecc736b3-f19c-4a54-a5d6-3db58ce6520b', 'result_type': 'entry', 'total_count': 104, 'result_set': [{'identifier': '4A73', 'score': 1.0, 'services': [{'service_type': 'text', 'nodes': [{'node_id': 11198, 'original_score': 222.23667907714844, 'norm_score': 1.0}]}]}, {'identifier': '5UCQ', 'score': 1.0, 'services': [{'service_type': 'text', 'nodes': [{'node_id': 11198, 'original_score': 222.23667907714844, 'norm_score': 1.0}]}]}, {'identifier': '6P3L', 'score': 1.0, 'services': [{'service_type': 'text', 'nodes': [{'node_id': 11198, 'original_score': 222.23667907714844, 'norm_score': 1.0}]}]}, ... ] }

Parameters:

  • query_ (dict[Any] | str) –

    The query formatted as a dictionary or a JSON string

  • json_formatted (bool, default: False ) –

    Whether the query is already formatted as a JSON string

Returns:

  • dict[str, Any] | None

    The response formatted as a dictionary from the JSON format or None if the query failed

Source code in symdesign/resources/query/pdb.py
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
def query_pdb(query_: dict[str, Any] | str, json_formatted: bool = False) -> dict[str, Any] | None:
    """Take a JSON formatted PDB API query and return the results

    PDB response can look like:
    {'query_id': 'ecc736b3-f19c-4a54-a5d6-3db58ce6520b',
     'result_type': 'entry',
    'total_count': 104,
    'result_set': [{'identifier': '4A73', 'score': 1.0,
                    'services': [{'service_type': 'text', 'nodes': [{'node_id': 11198,
                                                                     'original_score': 222.23667907714844,
                                                                     'norm_score': 1.0}]}]},
                   {'identifier': '5UCQ', 'score': 1.0,
                    'services': [{'service_type': 'text', 'nodes': [{'node_id': 11198,
                                                                     'original_score': 222.23667907714844,
                                                                     'norm_score': 1.0}]}]},
                   {'identifier': '6P3L', 'score': 1.0,
                    'services': [{'service_type': 'text', 'nodes': [{'node_id': 11198,
                                                                     'original_score': 222.23667907714844,
                                                                     'norm_score': 1.0}]}]},
                    ...
                  ]
    }

    Args:
        query_: The query formatted as a dictionary or a JSON string
        json_formatted: Whether the query is already formatted as a JSON string

    Returns:
        The response formatted as a dictionary from the JSON format or None if the query failed
    """
    if json_formatted:
        formatted_query_ = query_
    else:
        formatted_query_ = dumps(query_)

    query_response = None
    iteration = 0
    while True:
        try:
            query_response = requests.get(pdb_query_url, params={'json': formatted_query_})
            # logger.debug(f'Found the PDB query with url: {query_response.url}')
            if query_response.status_code == 200:
                return query_response.json()
            elif query_response.status_code == 204:
                # 204 means the query itself succeeded but matched nothing; no point retrying
                logger.warning('No response was returned. Your query likely found no matches!')
                break
            elif query_response.status_code == 429:
                logger.debug('Too many requests, pausing momentarily')
                time.sleep(2)
                # Fix: count rate-limit retries toward the attempt cap. Previously this branch
                # never incremented, so a persistent 429 would loop forever
                iteration += 1
            else:
                logger.debug(f'Your query returned an unrecognized status code ({query_response.status_code})')
                time.sleep(1)
                iteration += 1
        except requests.exceptions.ConnectionError:
            logger.debug('Requests ran into a connection error')
            time.sleep(1)
            iteration += 1

        if iteration > 5:
            logger.error('The maximum number of resource fetch attempts was made with no resolution. '
                         f'Offending request {getattr(query_response, "url", pdb_query_url)}')  # Todo format url
            break
            # raise DesignError('The maximum number of resource fetch attempts was made with no resolution. '
            #                   'Offending request %s' % getattr(query_response, 'url', pdb_query_url))
    return None

pdb_id_matching_uniprot_id

pdb_id_matching_uniprot_id(uniprot_id, return_id: return_types_literal = 'polymer_entity') -> list[str]

Find all matching PDB entries from a specified UniProt ID and specific return ID

Parameters:

  • uniprot_id

    The UniProt ID of interest

  • return_id (return_types_literal, default: 'polymer_entity' ) –

    The type of value to return

Returns:

  • list[str]

    The list of matching IDs

Source code in symdesign/resources/query/pdb.py
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
def pdb_id_matching_uniprot_id(uniprot_id, return_id: return_types_literal = 'polymer_entity') -> list[str]:
    """Find all matching PDB entries from a specified UniProt ID and specific return ID

    Args:
        uniprot_id: The UniProt ID of interest
        return_id: The type of value to return

    Returns:
        The list of matching IDs
    """
    if return_id not in return_type_args:
        raise KeyError(
            f"The specified return_id '{return_id}' isn't supported. Allowed values: {', '.join(return_type_args)}")
    # Two terminal filters ANDed together: the cross-reference database must be UniProt
    # and the accession must match the requested ID
    database_filter = {
        'attribute': 'rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_name',
        'negation': False, 'operator': 'exact_match', 'value': 'UniProt'}
    accession_filter = {
        'attribute': 'rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession',
        'negation': False, 'operator': 'in', 'value': [uniprot_id]}

    terminal_groups = [format_terminal_group(service='text', **database_filter),
                       format_terminal_group(service='text', **accession_filter)]
    search_query = generate_query(generate_group('and', terminal_groups), return_id=return_id)
    response = query_pdb(search_query)
    return parse_pdb_response_for_ids(response) if response else []

generate_query

generate_query(search: dict, return_id: return_types_literal = 'entry', cluster_uniprot: bool = False, cluster_sequence: bool = False, return_groups: bool = False, all_matching: bool = True) -> dict[str, dict | str]

Format a PDB query with the specific return type and parameters affecting search results

Parameters:

  • search (dict) –

    Contains the key, value pairs in accordance with groups and terminal groups

  • return_id (return_types_literal, default: 'entry' ) –

    The type of ID that should be returned

  • cluster_uniprot (bool, default: False ) –

    Whether the query generated is a sequence type query

  • cluster_sequence (bool, default: False ) –

    Whether the query generated is clustered by sequence similarity

  • return_groups (bool, default: False ) –

    Whether to return results as group IDs

  • all_matching (bool, default: True ) –

    Whether to get all matching IDs

Returns:

  • dict[str, dict | str]

    The formatted query to be sent via HTTP GET

Source code in symdesign/resources/query/pdb.py
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
def generate_query(search: dict, return_id: return_types_literal = 'entry', cluster_uniprot: bool = False,
                   cluster_sequence: bool = False, return_groups: bool = False, all_matching: bool = True) \
        -> dict[str, dict | str]:
    """Format a PDB query with the specific return type and parameters affecting search results

    Args:
        search: Contains the key, value pairs in accordance with groups and terminal groups
        return_id: The type of ID that should be returned
        cluster_uniprot: Whether the query generated is a sequence type query
        cluster_sequence: Whether the query generated is clustered by sequence similarity
        return_groups: Whether to return results as group IDs
        all_matching: Whether to get all matching IDs

    Returns:
        The formatted query to be sent via HTTP GET
    """
    if return_id not in return_type_args:
        raise KeyError(
            f"The specified return type '{return_id}' isn't supported. Viable types include "
            f"{', '.join(return_type_args)}")

    # Base options: experimental structures only, best scores first
    request_options: dict = {
        'results_content_type': ['experimental'],  # "computational" for Alphafold
        'sort': [{'sort_by': 'score', 'direction': 'desc'}],
        'scoring_strategy': 'combined',
    }
    if cluster_uniprot or cluster_sequence:
        # cluster_uniprot takes precedence when both flags are set
        cluster_options = (sequence_request_options if cluster_uniprot
                           else sequence_cluster_request_options).copy()
        if return_groups:
            cluster_options['group_by_return_type'] = 'groups'
        request_options.update(cluster_options)
    elif return_groups:
        # Grouping is only meaningful alongside one of the clustering modes
        logger.warning(
            "The argument 'return_groups' wasn't used as neither 'cluster_uniprot' or 'cluster_sequence' were provided")

    if all_matching:
        request_options['return_all_hits'] = True

    return {'query': search, 'return_type': return_id, 'request_options': request_options}

retrieve_pdb_entries_by_advanced_query

retrieve_pdb_entries_by_advanced_query(save: bool = True, return_results: bool = True, force_schema_update: bool = False, entity: bool = False, assembly: bool = False, chain: bool = False, entry: bool = False, **kwargs) -> str | list | None

Parameters:

  • save (bool, default: True ) –
  • return_results (bool, default: True ) –
  • force_schema_update (bool, default: False ) –
  • entity (bool, default: False ) –
  • assembly (bool, default: False ) –
  • chain (bool, default: False ) –
  • entry (bool, default: False ) –

Returns:

Source code in symdesign/resources/query/pdb.py
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
def retrieve_pdb_entries_by_advanced_query(save: bool = True, return_results: bool = True,
                                           force_schema_update: bool = False, entity: bool = False,
                                           assembly: bool = False, chain: bool = False, entry: bool = False, **kwargs) \
        -> str | list | None:
    """

    Args:
        save:
        return_results:
        force_schema_update:
        entity:
        assembly:
        chain:
        entry:

    Returns:

    """
    # {attribute: {'dtype': 'string', 'description': 'XYZ', 'operators': {'equals',}, 'choices': []}, ...}

    def search_schema(term):
        return [(key, schema[key]['description']) for key in schema if schema[key]['description'] and
                term.lower() in schema[key]['description'].lower()]

    def make_groups(*args, recursive_depth=0):
        # Todo remove ^ * expression?
        # on initialization have [{}, {}, ...]
        #  was [(), (), ...]
        # on recursion get (terminal_queries, grouping,
        terminal_queries = args[0]
        work_on_group = args[recursive_depth]
        all_grouping_indices = {i for i in range(1, 1 + len(work_on_group))}

        group_introduction = f'\n{header_string % "Grouping Instructions"}\n' \
                             f'Because you have {len(terminal_queries)} search queries, you need to combine these to ' \
                             'a total search strategy. This is accomplished by grouping your search queries together ' \
                             f'using the operations {group_operators}. You must eventually group all queries into a ' \
                             'single logical operation.\nIf you have multiple groups, you will need to group those ' \
                             'groups, so on and so forth.\nIndicate your group selections with a space separated list!'\
                             ' You will choose the group operation to combine this list afterwards.\nFollow prior ' \
                             "prompts if you need a reminder of how group#'s relate to query#'s"
        group_grouping_intro = '\nGroups remain, you must group groups as before.'
        group_inquiry_string = '\nWhich of these (identified by #) would you like to combine into a group?%s' % \
                               input_string
        group_specification_string = 'You specified "%s" as a single group.'
        group_logic_string = f'\nWhat group operator {group_operators} would you like for this group?{input_string}'

        available_query_string = '\nYour available queries are:\n%s\n' % \
                                 '\n'.join(query_display_string % (query_num, service.upper(), attribute,
                                                                   'NOT ' if negate else '', operator.upper(), value)
                                           for query_num, (service, attribute, operator, negate, value)
                                           in enumerate(list(terminal_queries.values()), 1))

        if recursive_depth == 0:
            intro_string = group_introduction
            available_entity_string = available_query_string
        else:
            intro_string = group_grouping_intro
            available_entity_string = '\nYour available groups are:\n%s\n' % \
                                      '\n'.join(f'\tGroup Group #{i}{format_string.format(*group)}'
                                                for i, group in enumerate(list(work_on_group.values()), 1))

        print(intro_string)  # provide an introduction
        print(available_entity_string)  # display available entities which switch between guery and group...

        selected_grouping_indices = deepcopy(all_grouping_indices)
        groupings = []
        while len(selected_grouping_indices) > 1:  # check if more work needs to be done
            while True:  # ensure grouping input is viable
                while True:
                    grouping = set(map(int, input(group_inquiry_string).split()))  # get new grouping
                    # error on isdigit() ^
                    if len(grouping) > 1:
                        break
                    else:
                        print('More than one group is required. Your group "%s" is invalid' % grouping)
                while True:
                    confirm = input('%s\n%s' % (group_specification_string % grouping, confirmation_string))
                    if confirm.lower() in bool_d:
                        break
                    else:
                        print('%s %s is not a valid choice!' % (invalid_string, confirm))

                if bool_d[confirmation.lower()] or confirmation.isspace():  # confirm that grouping is as specified
                    while True:  # check if logic input is viable
                        group_logic = input(group_logic_string).lower()
                        if group_logic in group_operators:
                            break
                        else:
                            print(invalid_string)
                    groupings.append((grouping, group_logic))
                    break

            # remove specified from the pool of available until all are gone
            selected_grouping_indices = selected_grouping_indices.difference(grouping)

        if len(selected_grouping_indices) > 0:
            groupings.append((selected_grouping_indices, 'and'))  # When only 1 remains, automatically add 'and'
            # Todo test logic of and with one group?

        args.extend((groupings,))  # now [{} {}, ..., ([(grouping, group_logic), (), ...])
        # once all groupings are grouped, recurse
        if len(groupings) > 1:
            # todo without the return call, the stack never comes back to update args?
            make_groups(*args, recursive_depth=recursive_depth + 1)

        return list(args)  # list() may be unnecessary

    # Start the user input routine -------------------------------------------------------------------------------------
    schema = get_rcsb_metadata_schema(force_update=force_schema_update)
    print(f'\n{header_string % "PDB API Advanced Query"}\n'
          f'This prompt will walk you through generating an advanced search query and retrieving the matching '
          "set of entry ID's from the PDB. This automatically parses the ID's of interest for downstream use, which "
          'can save you some headache. If you want to take advantage of the PDB webpage GUI to perform the advanced '
          f'search, visit:\n\t{pdb_advanced_search_url}\nThen enter "json" in the prompt below and follow those '
          'instructions.\n\n'
          'Otherwise, this command line prompt takes advantage of the same GUI functionality. If you have a '
          'search specified from a prior query that you want to process again, using "json" will be useful as well. '
          'To proceed with the command line search just hit "Enter"')
    program_start = input(input_string)
    if program_start.lower() == 'json':
        if entity:
            return_type = 'Polymer Entities'  # 'polymer_entity'
        elif assembly:
            return_type = 'Assemblies'  # 'assembly'
        elif chain:
            return_type = 'Polymer Entities'  # This isn't available on web GUI -> 'polymer_instance'
        elif entry:
            return_type = 'Structures'  # 'entry'
        else:
            return_type = 'Structures'  # 'entry'

        return_type_prompt = f'At the bottom left of the dialog, there is a drop down menu next to "Return". ' \
                             f'Choose {return_type}'
        print('DETAILS: To save time formatting and immediately move to your design pipeline, build your Query with the'
              ' PDB webpage GUI, then save the resulting JSON text to a file. To do this, first build your full query '
              f'on the advanced search page, {return_type_prompt} then click the Search button (magnifying glass icon).'
              ' After the page loads, a new section of the search page should appear above the Advanced Search Query '
              'Builder dialog. There, click the JSON|->| button to open a new page with an automatically built JSON '
              'representation of your query. Save the entirety of this JSON formatted query to a file to return your '
              "chosen ID's\n")
        # ('Paste your JSON object below. IMPORTANT select from the opening \'{\' to '
        #  '\'"return_type": "entry"\' and paste. Before hitting enter, add a closing \'}\'. This hack '
        #  'ensures ALL results are retrieved with no sorting or pagination applied\n\n%s' %
        #  input_string)
        prior_query = input(f'Please specify the path where the JSON query file is located{input_string}')
        while not os.path.exists(prior_query):
            prior_query = input(f"The specified path '{prior_query}' doesn't exist! Please try again{input_string}")

        with open(prior_query, 'r') as f:
            json_input = load(f)

        # remove any paginate instructions from the json_input
        json_input['request_options'].pop('paginate', None)
        # if all_matching:
        # Ensure we get all matching
        json_input['request_options'].update({'return_all_hits': True})
        response_d = query_pdb(json_input)
    # elif program_start.lower() == 'previous':
    #     while True:
    #         prior_query = input('Please specify the path where the search file is located%s' % input_string)
    #         if os.path.exists(prior_query):
    #             with open(prior_query, 'r') as f:
    #                 search_query = loads(f.readlines())
    #         else:
    #             print('The specified path \'%s\' doesn\'t exist! Please try again.' % prior_query)
    else:
        if entity:
            return_type = 'polymer_entity'
        elif assembly:
            return_type = 'assembly'
        elif chain:
            return_type = 'polymer_instance'
        elif entry:
            return_type = 'entry'
        else:
            return_identifier_string = '\nFor each set of options, choose the option from the first column for the ' \
                                       'description in the second.\nWhat type of identifier do you want to search the '\
                                       f'PDB for?%s{input_string}' % user_input_format % \
                                       '\n'.join(format_string.format(*item) for item in return_types.items())
            return_type = validate_input(return_identifier_string, return_type_args)

        terminal_group_queries = []
        # terminal_group_queries = {}
        increment = 1
        while True:
            # Todo only text search is available now
            # query_builder_service_string = '\nWhat type of search method would you like to use?%s%s' % \
            #                                (user_input_format % '\n'.join(format_string % item
            #                                                               for item in services.items()), input_string)
            query_builder_attribute_string = \
                '\nWhat type of attribute would you like to use? Examples include:\n\t%s\n\n' \
                f'For a more thorough list indicate "s" for search.\nAlternatively, you can browse {attribute_url}\n' \
                f'Ensure that your spelling is exact if you want your query to succeed!{input_string}' % \
                '\n\t'.join(utils.pretty_format_table(attributes.items(), header=('Option', 'Description')))
            query_builder_operator_string = '\nWhat operator would you like to use?\nPossible operators include:' \
                                            '\n\t%s\nIf you would like to negate the operator, on input type "not" ' \
                                            f'after your selection. Ex: equals not{input_string}'
            query_builder_value_string = '\nWhat value should be %s? Required type is: %s.%s%s'
            query_display_string = 'Query #%d: Search the PDB by "%s" for "%s" attributes "%s%s" "%s".'

            while True:  # start the query builder routine
                while True:
                    # service = input(query_builder_service_string)
                    service = 'text'  # Todo
                    if service in services:
                        break
                    else:
                        print(invalid_string)

                # {attribute: {'dtype': 'string', 'description': 'XYZ', 'operators': {'equals',}, 'choices': []}, ...}
                while True:
                    attribute = input(query_builder_attribute_string)
                    while attribute.lower() == 's':  # If the user would like to search all possible
                        search_term = input('What term would you like to search?%s' % input_string)
                        attribute = input(f'Found the following instances of "{search_term.upper()}":\n%s\nWhich option'
                                          f' are you interested in? Enter "s" to repeat search.{input_string}' %
                                          user_input_format %
                                          '\n'.join(format_string.format(*key_description_pair) for key_description_pair
                                                    in search_schema(search_term)))
                        if attribute != 's':
                            break
                    if attribute in schema:  # Confirm the user wants to go forward with this
                        break
                    else:
                        print(f'***ERROR: {attribute} was not found in PDB schema***')
                        # while True:  # confirm that the confirmation input is valid
                        #     confirmation = input('ERROR: %s was not found in PDB schema! If you proceed, your search is'
                        #                          ' almost certain to fail.\nProceed anyway? [y/n]%s' %
                        #                          (attribute, input_string))
                        #     if confirmation.lower() in bool_d:
                        #         break
                        #     else:
                        #         print('%s %s is not a valid choice!' % invalid_string, confirmation)
                        # if bool_d[confirmation.lower()] or confirmation.isspace():  # break the attribute routine on y or ''
                        #     break

                while True:  # Retrieve the operator for the search
                    while True:  # Check if the operator should be negated
                        operator = input(query_builder_operator_string % ', '.join(schema[attribute]['operators']))
                        if len(operator.split()) > 1:
                            negation = operator.split()[1]
                            operator = operator.split()[0]
                            if negation.lower() == 'not':  # Can negate any search
                                negate = True
                                break
                            else:
                                print(f"{invalid_string} {negation} is not a recognized negation!\n "
                                      f"Try '{operator} not' instead or remove extra input")
                        else:
                            negate = False
                            break
                    if operator in schema[attribute]['operators']:
                        break
                    else:
                        print(f"{invalid_string} {operator} isn't a valid operator")

                op_in = True
                while op_in:  # Check if operator is 'in'
                    if operator == 'in':
                        print("\nThe 'in' operator can take multiple values. If you want multiple values, specify "
                              'each as a separate input')
                    else:
                        op_in = False

                    while True:  # Retrieve the value for the search
                        value = input(query_builder_value_string % (operator.upper(), instance_d[schema[attribute]['dtype']]
                                                                    , ('\nPossible choices:\n\t%s' %
                                                                       ', '.join(schema[attribute]['choices'])
                                                                       if schema[attribute]['choices'] else ''),
                                                                    input_string))
                        if isinstance(value, instance_d[schema[attribute]['dtype']]):  # check if the right data type
                            break
                        else:
                            try:  # try to convert the input value to the specified type
                                value = instance_d[schema[attribute]['dtype']](value)
                                if schema[attribute]['choices']:  # if there is a choice
                                    if value in schema[attribute]['choices']:  # check if the value is in the possible choices
                                        break
                                    else:  # if not, confirm the users desire to do this
                                        while True:  # confirm that the confirmation input is valid
                                            confirmation = input('%s was not found in the possible choices: %s\nProceed'
                                                                 ' anyway? [y/n]%s' %
                                                                 (value, ', '.join(schema[attribute]['choices']),
                                                                  input_string))
                                            if confirmation.lower() in bool_d:
                                                break
                                            else:
                                                print(f"{invalid_string} {confirmation} isn't a valid choice")
                                        if bool_d[confirmation.lower()] or confirmation.isspace():  # break the value routine on y or ''
                                            break

                                else:
                                    break
                            except ValueError:  # catch any conversion issue like float('A')
                                print(f"{invalid_string} {value} isn't a valid {instance_d[schema[attribute]['dtype']]}"
                                      " value!")

                    while op_in:
                        # TODO ensure that the in parameters are spit out as a list
                        additional = input(additional_input_string % " value to your 'in' operator")
                        if additional.lower() in bool_d:
                            if bool_d[additional.lower()] or additional.isspace():
                                break  # Stop the inner 'in' check loop
                            else:
                                op_in = False  # Stop the inner and outer 'in' while loops
                        else:
                            print(f"{invalid_string} {additional} isn't a valid choice")

                while True:
                    confirmation = input('\n%s\n%s' % (query_display_string %
                                                       (increment, service.upper(), attribute,
                                                        'NOT ' if negate else '', operator.upper(), value),
                                                       confirmation_string))
                    if confirmation.lower() in bool_d:
                        break
                    else:
                        print(f"{invalid_string} {confirmation} isn't a valid choice")
                if bool_d[confirmation.lower()] or confirmation.isspace():
                    break

            # terminal_group_queries[increment] = (service, attribute, operator, negate, value)
            terminal_group_queries.append(dict(service=service, attribute=attribute, operator=operator, negate=negate,
                                               value=value))
            increment += 1
            while True:
                additional = input(additional_input_string % ' query')
                if additional.lower() in bool_d:
                    break
                else:
                    print(f"{invalid_string} {confirmation} isn't a valid choice")
            if not bool_d[additional.lower()]:  # or confirmation.isspace():
                break

        # Group terminal queries into groups if there are more than 1
        if len(terminal_group_queries) > 1:
            recursive_query_tree = make_groups(terminal_group_queries)
            # expecting return of [terminal_group_queries, bottom group hierarchy, second group hierarchy, ..., top]
        else:
            recursive_query_tree = [terminal_group_queries]
            # recursive_query_tree = (terminal_group_queries, )
        # recursive_query_tree = (queries, grouping1, grouping2, etc.)
        for i, node in enumerate(recursive_query_tree):
            if i == 0:
                recursive_query_tree[i] = {j: format_terminal_group(**leaf) for j, leaf in enumerate(node, 1)}
                # recursive_query_tree[i] = {j: format_terminal_group(*node[leaf]) for j, leaf in enumerate(node, 1)}

                # terminal_group_queries = {j: format_terminal_group(*leaf) for j, leaf in enumerate(node)}
                # format_terminal_group(parameter_args, service=service)
                # terminal_group_queries[increment] = \
                #     format_terminal_group(attribute, operator, value, service=service)
            else:
                # if i == 1:
                #     child_groups = terminal_group_queries
                #     # child_groups = [terminal_group_queries[j] for j in child_nodes]
                # else:
                #     child_groups = recursive_query_tree[i]
                # operation, child_nodes = node
                # groups = {j: generate_group(operation, child_groups) for j, leaf in enumerate(node)}

                # NOPE Subtract the k indices to ensure that the user input numbers match with python zero indexing
                # i - 1 gives the index of the previous index of the recursive_query_tree to operate on
                # k pulls the groups specified in the input out to make a list with the corresponding terminai groups
                recursive_query_tree[i] = {j: generate_group(operation, [recursive_query_tree[i - 1][k]
                                                                         for k in child_group_nums])
                                           for j, (child_group_nums, operation) in enumerate(node, 1)}
                # for k in child_group_nums}
        final_query = recursive_query_tree[-1][1]  #

        search_query = generate_query(final_query, return_id=return_type)
        response_d = query_pdb(search_query)
    logger.debug(f'The server returned:\n{response_d}')

    if response_d:
        retrieved_ids = parse_pdb_response_for_ids(response_d)
    else:
        return []

    if save:
        utils.io_save(retrieved_ids)

    if return_results:
        return retrieved_ids

query_pdb_by

query_pdb_by(entry: str = None, assembly_id: str = None, assembly_integer: int | str = None, entity_id: str = None, entity_integer: int | str = None, chain: str = None, **kwargs) -> dict | list[list[str]]

Retrieve information from the PDB API by EntryID, AssemblyID, or EntityID

Parameters:

  • entry (str, default: None ) –

    The 4 character PDB EntryID of interest

  • assembly_id (str, default: None ) –

    The AssemblyID to query with format (1ABC-1)

  • assembly_integer (int | str, default: None ) –

    The particular assembly integer to query. Must include entry as well

  • entity_id (str, default: None ) –

    The PDB formatted EntityID. Has the format EntryID_Integer (1ABC_1)

  • entity_integer (int | str, default: None ) –

    The entity integer from the EntryID of interest

  • chain (str, default: None ) –

    The polymer "chain" identifier otherwise known as the "asym_id" from the PDB EntryID of interest

Returns: The query result

Source code in symdesign/resources/query/pdb.py
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
def query_pdb_by(entry: str = None, assembly_id: str = None, assembly_integer: int | str = None, entity_id: str = None,
                 entity_integer: int | str = None, chain: str = None, **kwargs) -> dict | list[list[str]]:
    """Retrieve information from the PDB API by EntryID, AssemblyID, or EntityID

    Args:
        entry: The 4 character PDB EntryID of interest
        assembly_id: The AssemblyID to query with format (1ABC-1)
        assembly_integer: The particular assembly integer to query. Must include entry as well
        entity_id: The PDB formatted EntityID. Has the format EntryID_Integer (1ABC_1)
        entity_integer: The entity integer from the EntryID of interest
        chain: The polymer "chain" identifier otherwise known as the "asym_id" from the PDB EntryID of interest
    Returns:
        The query result
    """
    if entry is not None:
        if len(entry) == 4:
            if entity_integer is not None:
                logger.debug(f'Querying PDB API with {entry}_{entity_integer}')
                return _get_entity_info(entry=entry, entity_integer=entity_integer)
            elif assembly_integer is not None:
                logger.debug(f'Querying PDB API with {entry}-{assembly_integer}')
                return _get_assembly_info(entry=entry, assembly_integer=assembly_integer)
            else:
                logger.debug(f'Querying PDB API with {entry}')
                data = _get_entry_info(entry)
                if chain:
                    integer = None
                    for entity_idx, chains in data.get('entity').items():
                        if chain in chains:
                            integer = entity_idx
                            break
                    if integer:
                        logger.debug(f'Querying PDB API with {entry}_{integer}')
                        return _get_entity_info(entry=entry, entity_integer=integer)
                    else:
                        raise KeyError(
                            f"No chainID '{chain}' found in PDB ID {entry}. Possible chains "
                            f'{", ".join(ch for chns in data.get("entity", {}).items() for ch in chns)}')
                else:
                    return data
        else:
            logger.debug(f"EntryID '{entry}' isn't the required format and will not be found with the PDB API")
    elif assembly_id is not None:
        entry, assembly_integer, *extra = assembly_id.split('-')
        if not extra and len(entry) == 4:
            logger.debug(f'Querying PDB API with {entry}-{assembly_integer}')
            return _get_assembly_info(entry=entry, assembly_integer=assembly_integer)

        logger.debug(f"AssemblyID '{assembly_id}' isn't the required format and will not be found with the PDB API")

    elif entity_id is not None:
        entry, entity_integer, *extra = entity_id.split('_')
        if not extra and len(entry) == 4:
            logger.debug(f'Querying PDB API with {entry}_{entity_integer}')
            return _get_entity_info(entry=entry, entity_integer=entity_integer)

        logger.debug(f"EntityID '{entity_id}' isn't the required format and will not be found with the PDB API")
    else:
        raise RuntimeError(
            f'No valid arguments passed to {query_pdb_by.__name__}. Valid arguments include: '
            f'entry, assembly_id, assembly_integer, entity_id, entity_integer, chain')

query_assembly_id

query_assembly_id(assembly_id: str = None, entry: str = None, assembly_integer: str | int = None) -> Response | None

Retrieve PDB AssemblyID information from the PDB API. More info at http://data.rcsb.org/#data-api

For all method types the following keys are available: {'rcsb_polymer_entity_annotation', 'entity_poly', 'rcsb_polymer_entity', 'entity_src_gen', 'rcsb_polymer_entity_feature_summary', 'rcsb_polymer_entity_align', 'rcsb_id', 'rcsb_cluster_membership', 'rcsb_polymer_entity_container_identifiers', 'rcsb_entity_host_organism', 'rcsb_latest_revision', 'rcsb_entity_source_organism'} NMR only - {'rcsb_polymer_entity_feature'} EM only - set() X-ray_only_keys - {'rcsb_cluster_flexibility'}

Parameters:

  • assembly_id (str, default: None ) –

    The AssemblyID to query with format (1ABC-1)

  • entry (str, default: None ) –

    The 4 character PDB EntryID of interest

  • assembly_integer (str | int, default: None ) –

    The particular assembly integer to query. Must include entry as well

Returns: The assembly information according to the PDB

Source code in symdesign/resources/query/pdb.py
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
def query_assembly_id(assembly_id: str = None, entry: str = None, assembly_integer: str | int = None) -> \
        requests.Response | None:
    """Retrieve PDB AssemblyID information from the PDB API. More info at http://data.rcsb.org/#data-api

    For all method types the following keys are available:
    {'rcsb_polymer_entity_annotation', 'entity_poly', 'rcsb_polymer_entity', 'entity_src_gen',
     'rcsb_polymer_entity_feature_summary', 'rcsb_polymer_entity_align', 'rcsb_id', 'rcsb_cluster_membership',
     'rcsb_polymer_entity_container_identifiers', 'rcsb_entity_host_organism', 'rcsb_latest_revision',
     'rcsb_entity_source_organism'}
    NMR only - {'rcsb_polymer_entity_feature'}
    EM only - set()
    X-ray_only_keys - {'rcsb_cluster_flexibility'}

    Args:
        assembly_id: The AssemblyID to query with format (1ABC-1)
        entry: The 4 character PDB EntryID of interest
        assembly_integer: The particular assembly integer to query. Must include entry as well
    Returns:
        The assembly information according to the PDB
    """
    if assembly_id:
        # A '1ABC-1' style identifier carries both pieces; assume the caller formatted it correctly
        entry, assembly_integer = assembly_id.split('-')[:2]

    if not entry or not assembly_integer:
        return None
    return connection_exception_handler(f'{pdb_rest_url}/assembly/{entry}/{assembly_integer}')

parse_assembly_json

parse_assembly_json(assembly_json: dict[str, Any]) -> list[list[str]]

For a PDB API AssemblyID, parse the associated 'clustered' chains

Parameters:

  • assembly_json (dict[str, Any]) –

    The json type dictionary returned from requests.Response.json()

Returns: The chain IDs which cluster in the assembly - Ex: [['A', 'A', 'A', ...], ...]

Source code in symdesign/resources/query/pdb.py
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
def parse_assembly_json(assembly_json: dict[str, Any]) -> list[list[str]]:
    """For a PDB API AssemblyID, parse the associated 'clustered' chains

    Args:
        assembly_json: The json type dictionary returned from requests.Response.json()
    Returns:
        The chain IDs which cluster in the assembly -
        Ex: [['A', 'A', 'A', ...], ...]
    """
    if not assembly_json:
        return []

    # Each 'rcsb_struct_symmetry' record has the form:
    # {symbol: "O", type: 'Octahedral', stoichiometry: [], oligomeric_state: "Homo 24-mer", clusters: [],
    #  rotation_axes: [], kind: "Global Symmetry"}
    # and each entry of 'clusters' the form {members: [], avg_rmsd: ...}, where avg_rmsd indicates how
    # similar the cluster members are. The cluster index is NOT a mapping to the entity index
    return [[member.get('asym_id') for member in cluster['members']]
            for symmetry in assembly_json['rcsb_struct_symmetry']
            for cluster in symmetry['clusters']]

query_entry_id

query_entry_id(entry: str = None) -> Response | None

Fetches the JSON object for the EntryID from the PDB API

The following information is returned: All methods (SOLUTION NMR, ELECTRON MICROSCOPY, X-RAY DIFFRACTION) have the following keys: {'rcsb_primary_citation', 'pdbx_vrpt_summary', 'pdbx_audit_revision_history', 'audit_author', 'pdbx_database_status', 'rcsb_id', 'pdbx_audit_revision_details', 'struct_keywords', 'rcsb_entry_container_identifiers', 'entry', 'rcsb_entry_info', 'struct', 'citation', 'exptl', 'rcsb_accession_info'} EM only keys: {'em3d_fitting', 'em3d_fitting_list', 'em_image_recording', 'em_specimen', 'em_software', 'em_entity_assembly', 'em_vitrification', 'em_single_particle_entity', 'em3d_reconstruction', 'em_experiment', 'pdbx_audit_support', 'em_imaging', 'em_ctf_correction'} Xray only keys: {'diffrn_radiation', 'cell', 'reflns', 'diffrn', 'software', 'refine_hist', 'diffrn_source', 'exptl_crystal', 'symmetry', 'diffrn_detector', 'refine', 'reflns_shell', 'exptl_crystal_grow'} NMR only keys: {'pdbx_nmr_exptl', 'pdbx_audit_revision_item', 'pdbx_audit_revision_category', 'pdbx_nmr_spectrometer', 'pdbx_nmr_refine', 'pdbx_nmr_representative', 'pdbx_nmr_software', 'pdbx_nmr_exptl_sample_conditions', 'pdbx_nmr_ensemble'}

entry_json['rcsb_entry_info'] = {'assembly_count': 1, 'branched_entity_count': 0, 'cis_peptide_count': 3, 'deposited_atom_count': 8492, 'deposited_model_count': 1, 'deposited_modeled_polymer_monomer_count': 989, 'deposited_nonpolymer_entity_instance_count': 0, 'deposited_polymer_entity_instance_count': 6, 'deposited_polymer_monomer_count': 1065, 'deposited_solvent_atom_count': 735, 'deposited_unmodeled_polymer_monomer_count': 76, 'diffrn_radiation_wavelength_maximum': 0.9797, 'diffrn_radiation_wavelength_minimum': 0.9797, 'disulfide_bond_count': 0, 'entity_count': 3, 'experimental_method': 'X-ray', 'experimental_method_count': 1, 'inter_mol_covalent_bond_count': 0, 'inter_mol_metalic_bond_count': 0, 'molecular_weight': 115.09, 'na_polymer_entity_types': 'Other', 'nonpolymer_entity_count': 0, 'polymer_composition': 'heteromeric protein', 'polymer_entity_count': 2, 'polymer_entity_count_dna': 0, 'polymer_entity_count_rna': 0, 'polymer_entity_count_nucleic_acid': 0, 'polymer_entity_count_nucleic_acid_hybrid': 0, 'polymer_entity_count_protein': 2, 'polymer_entity_taxonomy_count': 2, 'polymer_molecular_weight_maximum': 21.89, 'polymer_molecular_weight_minimum': 16.47, 'polymer_monomer_count_maximum': 201, 'polymer_monomer_count_minimum': 154, 'resolution_combined': [1.95], 'selected_polymer_entity_types': 'Protein (only)', 'software_programs_combined': ['PHASER', 'REFMAC', 'XDS', 'XSCALE'], 'solvent_entity_count': 1, 'diffrn_resolution_high': {'provenance_source': 'Depositor assigned', 'value': 1.95}}

Parameters:

  • entry (str, default: None ) –

    The PDB code to search for

Returns: The entry information according to the PDB

Source code in symdesign/resources/query/pdb.py
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
def query_entry_id(entry: str = None) -> requests.Response | None:
    """Fetches the JSON object for the EntryID from the PDB API

    The following information is returned:
    All methods (SOLUTION NMR, ELECTRON MICROSCOPY, X-RAY DIFFRACTION) have the following keys:
    {'rcsb_primary_citation', 'pdbx_vrpt_summary', 'pdbx_audit_revision_history', 'audit_author',
     'pdbx_database_status', 'rcsb_id', 'pdbx_audit_revision_details', 'struct_keywords',
     'rcsb_entry_container_identifiers', 'entry', 'rcsb_entry_info', 'struct', 'citation', 'exptl',
     'rcsb_accession_info'}
    EM only keys:
    {'em3d_fitting', 'em3d_fitting_list', 'em_image_recording', 'em_specimen', 'em_software', 'em_entity_assembly',
     'em_vitrification', 'em_single_particle_entity', 'em3d_reconstruction', 'em_experiment', 'pdbx_audit_support',
     'em_imaging', 'em_ctf_correction'}
    Xray only keys:
    {'diffrn_radiation', 'cell', 'reflns', 'diffrn', 'software', 'refine_hist', 'diffrn_source', 'exptl_crystal',
     'symmetry', 'diffrn_detector', 'refine', 'reflns_shell', 'exptl_crystal_grow'}
    NMR only keys:
    {'pdbx_nmr_exptl', 'pdbx_audit_revision_item', 'pdbx_audit_revision_category', 'pdbx_nmr_spectrometer',
     'pdbx_nmr_refine', 'pdbx_nmr_representative', 'pdbx_nmr_software', 'pdbx_nmr_exptl_sample_conditions',
     'pdbx_nmr_ensemble'}

    entry_json['rcsb_entry_info'] = \
        {'assembly_count': 1, 'branched_entity_count': 0, 'cis_peptide_count': 3, 'deposited_atom_count': 8492,
        'deposited_model_count': 1, 'deposited_modeled_polymer_monomer_count': 989,
        'deposited_nonpolymer_entity_instance_count': 0, 'deposited_polymer_entity_instance_count': 6,
        'deposited_polymer_monomer_count': 1065, 'deposited_solvent_atom_count': 735,
        'deposited_unmodeled_polymer_monomer_count': 76, 'diffrn_radiation_wavelength_maximum': 0.9797,
        'diffrn_radiation_wavelength_minimum': 0.9797, 'disulfide_bond_count': 0, 'entity_count': 3,
        'experimental_method': 'X-ray', 'experimental_method_count': 1, 'inter_mol_covalent_bond_count': 0,
        'inter_mol_metalic_bond_count': 0, 'molecular_weight': 115.09, 'na_polymer_entity_types': 'Other',
        'nonpolymer_entity_count': 0, 'polymer_composition': 'heteromeric protein', 'polymer_entity_count': 2,
        'polymer_entity_count_dna': 0, 'polymer_entity_count_rna': 0, 'polymer_entity_count_nucleic_acid': 0,
        'polymer_entity_count_nucleic_acid_hybrid': 0, 'polymer_entity_count_protein': 2,
        'polymer_entity_taxonomy_count': 2, 'polymer_molecular_weight_maximum': 21.89,
        'polymer_molecular_weight_minimum': 16.47, 'polymer_monomer_count_maximum': 201,
        'polymer_monomer_count_minimum': 154, 'resolution_combined': [1.95],
        'selected_polymer_entity_types': 'Protein (only)',
        'software_programs_combined': ['PHASER', 'REFMAC', 'XDS', 'XSCALE'], 'solvent_entity_count': 1,
        'diffrn_resolution_high': {'provenance_source': 'Depositor assigned', 'value': 1.95}}

    Args:
        entry: The PDB code to search for
    Returns:
        The entry information according to the PDB
    """
    # Without an entry code there is nothing to fetch
    if not entry:
        return None
    return connection_exception_handler(f'{pdb_rest_url}/entry/{entry}')

parse_entry_json

parse_entry_json(entry_json: dict[str, Any]) -> dict[str, dict]

For a PDB API EntryID, parse the associated entity ID's and chains

Parameters:

  • entry_json (dict[str, Any]) –

    The json type dictionary returned from requests.Response.json()

Returns: The structural information present in the PDB EntryID with format - {'method': xray, 'res': resolution, 'struct': {'space': space_group, 'a_b_c': (a, b, c), 'ang_a_b_c': (ang_a, ang_b, ang_c)} }

Source code in symdesign/resources/query/pdb.py
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
def parse_entry_json(entry_json: dict[str, Any]) -> dict[str, dict]:
    """For a PDB API EntryID, parse the associated entity ID's and chains

    Args:
        entry_json: The json type dictionary returned from requests.Response.json()
    Returns:
        The structural information present in the PDB EntryID with format -
        {'method': xray,
         'res': resolution,
         'struct': {'space': space_group, 'a_b_c': (a, b, c), 'ang_a_b_c': (ang_a, ang_b, ang_c)}
         }
    """
    experimental_method = entry_json['rcsb_entry_info'].get('experimental_method')
    if experimental_method:
        # Todo make ray, diffraction
        if 'ray' in experimental_method.lower() and 'cell' in entry_json and 'symmetry' in entry_json:
            # X-ray entries carry the unit cell dimensions and the space group
            cell = entry_json['cell']
            ang_a, ang_b, ang_c = cell['angle_alpha'], cell['angle_beta'], cell['angle_gamma']
            a, b, c = cell['length_a'], cell['length_b'], cell['length_c']
            space_group = entry_json['symmetry']['space_group_name_hm']
            struct_d = {'space': space_group, 'a_b_c': (a, b, c), 'ang_a_b_c': (ang_a, ang_b, ang_c)}
            resolution = entry_json['rcsb_entry_info']['resolution_combined'][0]
        elif experimental_method == 'EM':
            # EM entries report resolution under 'em3d_reconstruction'. Other EM-specific keys observed in
            # responses include 'em3d_fitting', 'em_ctf_correction', 'em_entity_assembly', 'em_experiment',
            # 'em_image_recording', 'em_imaging', 'em_particle_selection', 'em_single_particle_entity',
            # 'em_software', 'em_specimen', and 'em_vitrification'
            struct_d = {}
            # Access the first entry in the list with [0] v
            resolution = entry_json['em3d_reconstruction'][0]['resolution']
        else:  # Todo NMR
            logger.warning(f"No useful information added with the experimental method {experimental_method} as "
                           "this method hasn't been explored yet")
            struct_d = {}
            resolution = None
    else:
        logger.warning('Entry has no "experimental_method" keyword')
        struct_d = {}
        resolution = None

    # Guard .lower() since experimental_method is None when the keyword was missing; the unconditional
    # call previously raised AttributeError on that path
    return {'res': resolution, 'struct': struct_d,
            'method': experimental_method.lower() if experimental_method else None}

format_symmetry_group

format_symmetry_group(symmetry: str, homomeric_number: int = 1, heteromeric_number: int = None) -> str

Return a PDB API length limitation query

Parameters:

  • symmetry (str) –

    The symmetry to query for

  • homomeric_number (int, default: 1 ) –

    If the symmetry desired is homomeric, how many copies of the entity are desired

  • heteromeric_number (int, default: None ) –

    If the symmetry desired is heteromeric, how many entities are present

Returns: The symmetry formatted query limiting entity searches to the described symmetry

Source code in symdesign/resources/query/pdb.py
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
def format_symmetry_group(symmetry: str, homomeric_number: int = 1, heteromeric_number: int = None) -> str:
    """Return a PDB API symmetry limitation query

    Args:
        symmetry: The symmetry to query for
        homomeric_number: If the symmetry desired is homomeric, how many copies of the entity are desired
        heteromeric_number: If the symmetry desired is heteromeric, how many entities are present
    Returns:
        The symmetry formatted query limiting entity searches to the described symmetry
    """
    symmetry_lower = symmetry.lower()
    if 'c' in symmetry_lower:
        template = cyclic_symmetry_limiting_group
    elif 'd' in symmetry_lower:
        template = dihedral_symmetry_limiting_group
    else:  # Remaining (cubic) point group symmetry
        template = point_symmetry_limiting_group
    symmetry_query = template % symmetry

    subunit_number = utils.symmetry.valid_subunit_number.get(symmetry)
    # NOTE(review): the formatted terminal fragment is string-multiplied, i.e. repeated once per copy/entity
    if heteromeric_number:
        terminal = (heteromer_termini % subunit_number) * heteromeric_number
    else:  # Homomeric search
        terminal = (homomer_termini % subunit_number) * homomeric_number

    return symmetry_query + ',' + terminal

format_length_group

format_length_group(lower: int, upper: int) -> str

Return a PDB API length limitation query

Parameters:

  • lower (int) –

    The low end to limit entity length

  • upper (int) –

    The upper limit on entity length

Returns: The length formatted query limiting entity searches to between the values lower and upper (non-inclusive)

Source code in symdesign/resources/query/pdb.py
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
def format_length_group(lower: int, upper: int) -> str:
    """Return a PDB API length limitation query

    Args:
        lower: The low end to limit entity length
        upper: The upper limit on entity length
    Returns:
        The length formatted query limiting entity searches to between the values lower and upper (non-inclusive)
    """
    # Substitute both bounds into the length-restriction query template
    bounds = (lower, upper)
    return length_group % bounds

nanohedra_building_blocks_query

nanohedra_building_blocks_query(symmetry: str, lower: int = None, upper: int = None, thermophile: bool = False, return_groups: bool = False, limit_by_groups: Iterable[str] = None, search_by_groups: Iterable[str] = None) -> dict[Any] | None

Retrieve symmetric oligomers from the PDB to act as building blocks for nanohedra docking

Parameters:

  • symmetry (str) –

    The symmetry to query for

  • lower (int, default: None ) –

    The low end to limit entity length

  • upper (int, default: None ) –

    The upper limit on entity length

  • thermophile (bool, default: False ) –

    Whether to limit search to entries from thermophilic species

  • return_groups (bool, default: False ) –

    Whether to return results as groupID's

  • limit_by_groups (Iterable[str], default: None ) –

    Whether to limit the query, i.e. not return groupID's that are provided in this argument

  • search_by_groups (Iterable[str], default: None ) –

    Search only for groupID's that are provided to this argument

Returns: Matching EntityID's formatted as a dictionary from the JSON formatted response or None if the query failed

Source code in symdesign/resources/query/pdb.py
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
def nanohedra_building_blocks_query(
        symmetry: str, lower: int = None, upper: int = None, thermophile: bool = False, return_groups: bool = False,
        limit_by_groups: Iterable[str] = None, search_by_groups: Iterable[str] = None) -> dict[str, Any] | None:
    """Retrieve symmetric oligomers from the PDB to act as building blocks for nanohedra docking

    Args:
        symmetry: The symmetry to query for
        lower: The low end to limit entity length
        upper: The upper limit on entity length
        thermophile: Whether to limit search to entries from thermophilic species
        return_groups: Whether to return results as groupID's
        limit_by_groups: Whether to limit the query, i.e. not return groupID's that are provided in this argument
        search_by_groups: Search only for groupID's that are provided to this argument
    Returns:
        Matching EntityID's formatted as a dictionary from the JSON formatted response or None if the query failed
    """
    # Assemble the common quality, symmetry, and length restrictions into one comma-joined JSON fragment
    groups_and_terminal = common_quality_filters \
        + ',' + format_symmetry_group(symmetry) \
        + ',' + format_length_group(lower, upper)

    if thermophile:
        groups_and_terminal += ',' + thermophilic_json_terminal_operator
    if limit_by_groups:  # Exclude these groupID's from the results
        groups_and_terminal += ',' + not_in_entity_group_id_search_block \
                               % ','.join(f'"{id_}"' for id_ in limit_by_groups)
    if search_by_groups:  # Restrict the search to these groupID's only
        groups_and_terminal += ',' + in_entity_group_id_search_block \
                               % ','.join(f'"{id_}"' for id_ in search_by_groups)

    building_block_query = and_group_query % groups_and_terminal
    logger.debug(f'Found building_block_query: {building_block_query}')
    formatted_query = json.loads(building_block_query)

    return query_pdb(generate_query(formatted_query, return_id='polymer_entity',
                                    cluster_sequence=True, return_groups=return_groups, all_matching=True))

find_author_confirmed_assembly_from_entity_group

find_author_confirmed_assembly_from_entity_group(group_ids: Iterable[str], symmetry: str, lower: int = None, upper: int = None) -> dict[Any] | None

For specific groupID's, request all EntityID's that have an assembly confirmed by depositing authors from PDB API

Parameters:

  • group_ids (Iterable[str]) –

    The groupID's to limit search to

  • symmetry (str) –

    The symmetry to query for

  • lower (int, default: None ) –

    The low end to limit entity length

  • upper (int, default: None ) –

    The upper limit on entity length

Returns: Matching AssemblyID's formatted as a dictionary from the JSON formatted response or None if the query failed

Source code in symdesign/resources/query/pdb.py
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
def find_author_confirmed_assembly_from_entity_group(
        group_ids: Iterable[str], symmetry: str, lower: int = None, upper: int = None) -> dict[str, Any] | None:
    """For specific groupID's, request all EntityID's that have an assembly confirmed by depositing authors from PDB API

    Args:
        group_ids: The groupID's to limit search to
        symmetry: The symmetry to query for
        lower: The low end to limit entity length
        upper: The upper limit on entity length
    Returns:
        Matching AssemblyID's formatted as a dictionary from the JSON formatted response or None if the query failed
    """
    # Combine quality, symmetry, length, author-assembly, and group-membership restrictions
    group_id_spec = ','.join(f'"{id_}"' for id_ in group_ids)
    groups_and_terminal = common_quality_filters \
        + ',' + format_symmetry_group(symmetry) \
        + ',' + format_length_group(lower, upper) \
        + ',' + assembly_author_defined \
        + ',' + in_entity_group_id_search_block % group_id_spec
    author_confirmed_query = and_group_query % groups_and_terminal
    logger.debug(f'Found author_confirmed_query: {author_confirmed_query}')
    formatted_query = json.loads(author_confirmed_query)

    return query_pdb(generate_query(formatted_query, return_id='assembly', all_matching=True))

solve_author_confirmed_assemblies

solve_author_confirmed_assemblies(params: QueryParams, grouped_entity_ids: dict[str, list[str]]) -> tuple[list[str], list[str]]

From a map of Entity group ID's to resolution sorted EntityIDs, solve for those EntityIDs that have an assembly

First search for QSbio confirmed assemblies, then search the PDB API for 'author_defined_assembly' and 'author_and_software_defined_assembly'

Parameters:

  • params (QueryParams) –

    The parameter profile specified for the search procedure

  • grouped_entity_ids (dict[str, list[str]]) –

    A dictionary mapping groupID to EntryID's

Returns: A tuple of the objects ( The best EntityIDs according to incoming sorting and that pass the assembly test The Entity group ID of those groups that couldn't be solved )

Source code in symdesign/resources/query/pdb.py
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
def solve_author_confirmed_assemblies(params: QueryParams, grouped_entity_ids: dict[str, list[str]]) \
        -> tuple[list[str], list[str]]:
    """From a map of Entity group ID's to resolution sorted EntityIDs, solve for those EntityIDs that have an assembly

    First search for QSbio confirmed assemblies, then search the PDB API for 'author_defined_assembly' and
    'author_and_software_defined_assembly'

    Args:
        params: The parameter profile specified for the search procedure
        grouped_entity_ids: A dictionary mapping groupID to EntryID's
    Returns:
        A tuple of the objects (
            The best EntityIDs according to incoming sorting and that pass the assembly test
            The Entity group ID of those groups that couldn't be solved
        )
    """
    # Check if the top ids are actually bona-fide assemblies according to QSBio
    top_entity_ids: list[str | None] = []
    # If they aren't, then solve by PDB API query
    solve_group_by_pdb = []
    for group_id, entity_ids in grouped_entity_ids.items():
        for id_ in entity_ids:  # Take the first (best-sorted) QSBio-confirmed member
            if qsbio_confirmed.get(id_[:4].lower()) is not None:
                top_entity_ids.append(id_)
                break
        else:  # No assemblies are qsbio_confirmed. Solve by PDB assembly inference
            solve_group_by_pdb.append(group_id)
            top_entity_ids.append(None)

    author_confirmed_assembly_ids = []
    if solve_group_by_pdb:  # Only hit the PDB API when at least one group remains unsolved
        author_confirmed_assembly_result = \
            find_author_confirmed_assembly_from_entity_group(
                solve_group_by_pdb, params.symmetry, params.lower_length, params.upper_length)
        if author_confirmed_assembly_result:
            author_confirmed_assembly_ids = parse_pdb_response_for_ids(author_confirmed_assembly_result)

    # Limit AssemblyID's to EntryID's; use a set for O(1) membership tests below
    author_confirmed_entry_ids = {id_[:4] for id_ in author_confirmed_assembly_ids}
    remove_group_ids = []
    remove_group_indices = []

    # For orphaned groups, find and fit author confirmed assemblies in their corresponding groups
    for group_idx, (top_id, group_id) in enumerate(zip(top_entity_ids, grouped_entity_ids)):
        if top_id is None:
            for entity_id in grouped_entity_ids[group_id]:
                if entity_id[:4] in author_confirmed_entry_ids:
                    top_entity_ids[group_idx] = entity_id
                    break
            else:  # This still isn't solved. Remove from the pool
                remove_group_ids.append(group_id)
                remove_group_indices.append(group_idx)

    # Pop in reverse so earlier indices stay valid
    for group_idx in reversed(remove_group_indices):
        top_entity_ids.pop(group_idx)

    return top_entity_ids, remove_group_ids

entity_thermophilicity

entity_thermophilicity(entry: str = None, entity_integer: int | str = None, entity_id: str = None) -> float | None

Query the PDB API for an EntityID and return the thermophilicity of its source organism(s)

Parameters:

  • entry (str, default: None ) –

    The 4 character PDB EntryID of interest

  • entity_integer (int | str, default: None ) –

    The entity integer from the EntryID of interest

  • entity_id (str, default: None ) –

    The PDB formatted EntityID. Has the format EntryID_Integer (1ABC_1)

Returns: Value ranging from 0-1 where 1 is completely thermophilic according to taxonomic classification

Source code in symdesign/resources/query/pdb.py
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
def entity_thermophilicity(entry: str = None, entity_integer: int | str = None, entity_id: str = None) -> float | None:
    """Query the PDB API for an EntityID and return the thermophilicity of its source organism(s)

    Args:
        entry: The 4 character PDB EntryID of interest
        entity_integer: The entity integer from the EntryID of interest
        entity_id: The PDB formatted EntityID. Has the format EntryID_Integer (1ABC_1)
    Returns:
        Value ranging from 0-1 where 1 is completely thermophilic according to taxonomic classification
    """
    entity_request = query_entity_id(entry=entry, entity_integer=entity_integer, entity_id=entity_id)
    if entity_request:
        return thermophilicity_from_entity_json(entity_request.json())
    return None

thermophilicity_from_entity_json

thermophilicity_from_entity_json(entity_json: dict[str, Any]) -> float

Return the extent to which the entity json entry in question is thermophilic

Parameters:

  • entity_json (dict[str, Any]) –

    The return json from PDB API query

Returns: Value ranging from 0-1 where 1 is completely thermophilic

Source code in symdesign/resources/query/pdb.py
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
def thermophilicity_from_entity_json(entity_json: dict[str, Any]) -> float:
    """Return the extent to which the entity json entry in question is thermophilic

    Args:
        entity_json: The return json from PDB API query
    Returns:
        Value ranging from 0-1 where 1 is completely thermophilic
    """
    # 'rcsb_entity_source_organism' may be missing OR explicitly null in the API response;
    # `or []` covers both, where .get(key, default) only covers the missing-key case
    organisms = entity_json.get('rcsb_entity_source_organism') or []
    thermophilic_flags = [
        1 if int(organism.get('ncbi_taxonomy_id', -1)) in thermophilic_taxonomy_ids else 0
        for organism in organisms
    ]
    if thermophilic_flags:
        return sum(thermophilic_flags) / len(thermophilic_flags)
    return 0.

parse_entities_json

parse_entities_json(entity_jsons: Iterable[dict[str, Any]]) -> dict[str, dict]

Parameters:

  • entity_jsons (Iterable[dict[str, Any]]) –

    An Iterable of json like objects containing EntityID information as retrieved from the PDB API

Returns: The entity dictionary with format - {'EntityID': {'chains': ['A', 'B', ...], 'dbref': {'accession': ('Q96DC8',), 'db': 'UniProt'}, 'reference_sequence': 'MSLEHHHHHH...', 'thermophilicity': 1.0}, ...}

Source code in symdesign/resources/query/pdb.py
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
def parse_entities_json(entity_jsons: Iterable[dict[str, Any]]) -> dict[str, dict]:
    """Parse EntityID json objects from the PDB API into a flattened info mapping

    Args:
        entity_jsons: An Iterable of json like objects containing EntityID information as retrieved from the PDB API
    Returns:
        The entity dictionary with format -
        {'EntityID':
            {'chains': ['A', 'B', ...],
             'dbref': {'accession': ('Q96DC8',), 'db': 'UniProt'},
             'reference_sequence': 'MSLEHHHHHH...',
             'thermophilicity': 1.0},
         ...}
    """
    def extract_dbref(entity_ids_json: dict[str, Any]) -> dict[str, dict]:
        """For a PDB API EntityID, parse the associated chains and database reference identifiers

        Args:
            entity_ids_json: The json type dictionary returned from requests.Response.json()
        Returns:
            Ex: {'db': DATABASE, 'accession': 'Q96DC8'} where DATABASE can be one of 'GenBank', 'Norine', 'UniProt'
        """
        database_keys = ['db', 'accession']
        try:
            uniprot_ids = entity_ids_json['uniprot_ids']
            # Todo choose the most accurate if more than 2...
            #  'rcsb_polymer_entity_align' indicates how the model from the PDB aligns to UniprotKB through SIFTS
            if len(uniprot_ids) > 1:
                logger.warning(f'For Entity {entity_ids_json["rcsb_id"]}, found multiple UniProt Entries: '
                               f'{", ".join(uniprot_ids)}')
            db_d = dict(zip(database_keys, (UKB, tuple(uniprot_ids))))
        except KeyError:  # No 'uniprot_ids'
            # GenBank = GB, which is mostly RNA or DNA structures or antibody complexes
            # Norine = NOR, which is small peptide structures, sometimes bound to proteins...
            try:
                identifiers = [dict(db=ident['database_name'], accession=(ident['database_accession'],))
                               for ident in entity_ids_json.get('reference_sequence_identifiers', [])]
            except KeyError:  # There are really no identifiers of use
                return {}
            if not identifiers:
                return {}
            if len(identifiers) == 1:  # Only one solution
                db_d = identifiers[0]
            else:  # Find the most ideal accession_database UniProt > GenBank > Norine > ???
                # BUGFIX: the previous implementation unpacked each identifier dict directly,
                # which iterates its KEYS ('db', 'accession'), so the priority tests never matched.
                # Use a None sentinel so an unknown identifier at index 0 is recorded and kept
                whatever_else = None
                priority_l = [[] for _ in range(len(identifiers))]
                for idx, identifier in enumerate(identifiers):
                    database = identifier['db']
                    if database == UKB:
                        priority_l[0].append(idx)
                    elif database == GB:
                        # Two elements are required from above len check, never have IndexError
                        priority_l[1].append(idx)
                    # elif database == NOR:
                    #     priority_l[2].append(idx)
                    elif whatever_else is None:
                        # Only set the first time an unknown identifier is seen
                        whatever_else = idx

                # Loop through the list of prioritized identifiers
                for identifier_idx in priority_l:
                    if identifier_idx:  # A priority database was found, choose the corresponding identifier idx
                        # Make the db_d with the db name as first arg and all the identifiers as the second arg
                        db_d = dict(zip(database_keys,
                                        (identifiers[identifier_idx[0]]['db'], [identifiers[idx]['accession']
                                                                                for idx in identifier_idx])))
                        break
                else:  # No solution from priority; choose the first non-priority identifier
                    db_d = identifiers[whatever_else if whatever_else is not None else 0]

        return db_d

    entity_info = {}
    for entity_json in entity_jsons:
        if entity_json is None:
            continue
        entity_json_ids = entity_json.get('rcsb_polymer_entity_container_identifiers')
        if entity_json_ids:
            entity_info[entity_json_ids['rcsb_id'].lower()] = dict(
                chains=entity_json_ids['asym_ids'],
                dbref=extract_dbref(entity_json_ids),
                reference_sequence=entity_json['entity_poly']['pdbx_seq_one_letter_code_can'],
                thermophilicity=thermophilicity_from_entity_json(entity_json),
            )

    return entity_info

query_entity_id

query_entity_id(entry: str = None, entity_integer: str | int = None, entity_id: str = None) -> Response | None

Retrieve PDB EntityID information from the PDB API. More info at http://data.rcsb.org/#data-api

For all method types the following keys are available: {'rcsb_polymer_entity_annotation', 'entity_poly', 'rcsb_polymer_entity', 'entity_src_gen', 'rcsb_polymer_entity_feature_summary', 'rcsb_polymer_entity_align', 'rcsb_id', 'rcsb_cluster_membership', 'rcsb_polymer_entity_container_identifiers', 'rcsb_entity_host_organism', 'rcsb_latest_revision', 'rcsb_entity_source_organism'} NMR only - {'rcsb_polymer_entity_feature'} EM only - set() X-ray_only_keys - {'rcsb_cluster_flexibility'}

Parameters:

  • entry (str, default: None ) –

    The 4 character PDB EntryID of interest

  • entity_integer (str | int, default: None ) –

    The integer of the entity_id

  • entity_id (str, default: None ) –

    The PDB formatted EntityID. Has the format EntryID_Integer (1ABC_1)

Returns: The entity information according to the PDB

Source code in symdesign/resources/query/pdb.py
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
def query_entity_id(entry: str = None, entity_integer: str | int = None, entity_id: str = None) -> \
        requests.Response | None:
    """Retrieve PDB EntityID information from the PDB API. More info at http://data.rcsb.org/#data-api

    For all method types the following keys are available:
    {'rcsb_polymer_entity_annotation', 'entity_poly', 'rcsb_polymer_entity', 'entity_src_gen',
     'rcsb_polymer_entity_feature_summary', 'rcsb_polymer_entity_align', 'rcsb_id', 'rcsb_cluster_membership',
     'rcsb_polymer_entity_container_identifiers', 'rcsb_entity_host_organism', 'rcsb_latest_revision',
     'rcsb_entity_source_organism'}
    NMR only - {'rcsb_polymer_entity_feature'}
    EM only - set()
    X-ray_only_keys - {'rcsb_cluster_flexibility'}

    Args:
        entry: The 4 character PDB EntryID of interest
        entity_integer: The integer of the entity_id
        entity_id: The PDB formatted EntityID. Has the format EntryID_Integer (1ABC_1)
    Returns:
        The entity information according to the PDB
    """
    if entity_id:
        # Assume the EntryID_Integer format and keep only the first two fields
        entry, entity_integer, *_ = entity_id.split('_')

    if not entry or not entity_integer:
        return None
    return connection_exception_handler(f'{pdb_rest_url}/polymer_entity/{entry}/{entity_integer}')

get_entity_id

get_entity_id(entry: str = None, entity_integer: int | str = None, entity_id: str = None, chain: str = None) -> tuple[str, str] | tuple[None]

Resolve an (EntryID, entity integer) pair from various PDB identifiers or combinations thereof

Parameters:

  • entry (str, default: None ) –

    The 4 character PDB EntryID of interest

  • entity_integer (int | str, default: None ) –

    The entity integer from the EntryID of interest

  • entity_id (str, default: None ) –

    The PDB formatted EntityID. Has the format EntryID_Integer (1ABC_1)

  • chain (str, default: None ) –

    The polymer "chain" identifier otherwise known as the "asym_id" from the PDB EntryID of interest

Returns: The Entity_ID

Source code in symdesign/resources/query/pdb.py
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
def get_entity_id(entry: str = None, entity_integer: int | str = None, entity_id: str = None, chain: str = None) -> \
        tuple[str, str] | tuple[None]:
    """Retrieve a UniProtID from the PDB API by passing various PDB identifiers or combinations thereof

    Args:
        entry: The 4 character PDB EntryID of interest
        entity_integer: The entity integer from the EntryID of interest
        entity_id: The PDB formatted EntityID. Has the format EntryID_Integer (1ABC_1)
        chain: The polymer "chain" identifier otherwise known as the "asym_id" from the PDB EntryID of interest
    Returns:
        The Entity_ID
    """
    if entry is not None:
        if len(entry) != 4:
            logger.warning(f'EntryID "{entry}" is not of the required format and will not be found with the PDB API')
        elif entity_integer is not None:
            return entry, entity_integer
            # entity_id = f'{entry}_{entity_integer}'
        else:
            info = _get_entry_info(entry)
            chain_entity = {chain: entity_idx for entity_idx, chains in info.get('entity', {}).items() for chain in chains}
            if chain is not None:
                try:
                    return entry, chain_entity[chain]
                    # entity_id = f'{entry}_{chain_entity[chain]}'
                except KeyError:
                    raise KeyError(f'No chain "{chain}" found in PDB ID {entry}. '
                                   f'Possible chains {", ".join(chain_entity)}')
            else:
                entity_integer = next(iter(chain_entity.values()))
                logger.warning('Using the argument "entry" without either "entity_integer" or "chain" is not '
                               f'recommended. Choosing the first EntityID "{entry}_{entity_integer}"')
                return entry, entity_integer
                # entity_id = f'{entry}_{entity_integer}'

    elif entity_id is not None:
        entry, entity_integer, *extra = entity_id.split('_')
        if not extra and len(entry) == 4:
            return entry, entity_integer

        logger.debug(f"EntityID '{entity_id}' isn't the required format and will not be found with the PDB API")

    return None,

get_entity_uniprot_id

get_entity_uniprot_id(**kwargs) -> str | None

Retrieve a UniProtID from the PDB API by passing various PDB identifiers or combinations thereof

Other Parameters:

  • entry=None (str) –

    The 4 character PDB EntryID of interest

  • entity_integer=None (str) –

    The entity integer from the EntryID of interest

  • entity_id=None (str) –

    The PDB formatted EntityID. Has the format EntryID_Integer (1ABC_1)

  • chain=None (str) –

    The polymer "chain" identifier otherwise known as the "asym_id" from the PDB EntryID of interest

Returns:

  • str | None

    The UniProt ID

Source code in symdesign/resources/query/pdb.py
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
def get_entity_uniprot_id(**kwargs) -> str | None:
    """Retrieve a UniProtID from the PDB API by passing various PDB identifiers or combinations thereof

    Keyword Args:
        entry=None (str): The 4 character PDB EntryID of interest
        entity_integer=None (str): The entity integer from the EntryID of interest
        entity_id=None (str): The PDB formatted EntityID. Has the format EntryID_Integer (1ABC_1)
        chain=None (str): The polymer "chain" identifier otherwise known as the "asym_id" from the PDB EntryID of
            interest

    Returns:
        The UniProt ID
    """
    entity_request = query_entity_id(*get_entity_id(**kwargs))
    if entity_request:
        container_ids = entity_request.json().get('rcsb_polymer_entity_container_identifiers')
        if container_ids:
            # BUGFIX: the API field is 'uniprot_ids' (plural), as read elsewhere in this module;
            # return the first UniProt accession when one is present
            uniprot_ids = container_ids.get('uniprot_ids')
            if uniprot_ids:
                return uniprot_ids[0]
    return None

get_entity_reference_sequence

get_entity_reference_sequence(**kwargs) -> str | None

Query the PDB API for the reference amino acid sequence for a specified entity ID (PDB EntryID_Entity_ID)

Other Parameters:

  • entry=None (str) –

    The 4 character PDB EntryID of interest

  • entity_integer=None (str) –

    The entity integer from the EntryID of interest

  • entity_id=None (str) –

    The PDB formatted EntityID. Has the format EntryID_Integer (1ABC_1)

  • chain=None (str) –

    The polymer "chain" identifier otherwise known as the "asym_id" from the PDB EntryID of interest

Returns:

  • str | None

    One letter amino acid sequence

Source code in symdesign/resources/query/pdb.py
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
def get_entity_reference_sequence(**kwargs) -> str | None:
    """Query the PDB API for the reference amino acid sequence for a specified entity ID (PDB EntryID_Entity_ID)

    Keyword Args:
        entry=None (str): The 4 character PDB EntryID of interest
        entity_integer=None (str): The entity integer from the EntryID of interest
        entity_id=None (str): The PDB formatted EntityID. Has the format EntryID_Integer (1ABC_1)
        chain=None (str): The polymer "chain" identifier otherwise known as the "asym_id" from the PDB EntryID of
            interest

    Returns:
        One letter amino acid sequence
    """
    entity_request = query_entity_id(*get_entity_id(**kwargs))
    if not entity_request:
        return None
    # Non-canonical residues are reported as 'X' in the canonical one-letter sequence
    return entity_request.json().get('entity_poly')['pdbx_seq_one_letter_code_can']

get_rcsb_metadata_schema

get_rcsb_metadata_schema(file=os.path.join(current_dir, 'rcsb_schema.pkl'), search_only=True, force_update=False)

Parse the rcsb metadata schema for useful information from the format {"properties" : {"assignment_version" : {"type" : "string", "examples" : [ "V4_0_2" ], "description" : "Identifies the version of the feature assignment.", "rcsb_description" : [ {"text" : "Identifies the version of the feature assignment.", "context" : "dictionary"}, {"text" : "Feature Version", "context" : "brief"} ] }, ... "symmetry_type" : {"type" : "string", <-- provide data type provide options --> "enum" : [ "2D CRYSTAL", "3D CRYSTAL", "HELICAL", "POINT" ], provide description --> "description" : "The type of symmetry applied to the reconstruction", provide operators --> "rcsb_search_context" : [ "exact-match" ], "rcsb_full_text_priority" : 10, "rcsb_description" : [ {"text" : "The type of symmetry applied to the reconstruction", "context" : "dictionary"}, {"text" : "Symmetry Type (Em 3d Reconstruction)", "context" : "brief"} ] }, ... }, "title" : "Core Metadata", "additionalProperties" : false, "$comment" : "Schema version: 1.14.0" "required" : ["rcsb_id", "rcsb_entry_container_identifiers", "rcsb_entry_info", "rcsb_pubmed_container_identifiers", "rcsb_polymer_entity_container_identifiers", "rcsb_assembly_container_identifiers", "rcsb_uniprot_container_identifiers" ], "$schema" : "http://json-schema.org/draft-07/schema#", "description" : "Collective JSON schema that includes definitions for all indexed cores with RCSB metadata extensions.", } Returns: (dict): {attribute: {'dtype': 'string', 'description': 'XYZ', 'operators': {'equals'}, 'choices': []}, ...}

Source code in symdesign/resources/query/pdb.py
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
def get_rcsb_metadata_schema(file=os.path.join(current_dir, 'rcsb_schema.pkl'), search_only=True, force_update=False):
    """Parse the RCSB metadata schema for searchable attribute information

    Fetches the attribute metadata schema JSON from the RCSB, flattens nested "object"/"array"
    properties into dot-separated attribute names, and extracts each attribute's data type,
    description, valid search operators, and enumerated choices. Results are cached as a pickled
    file and reloaded on subsequent calls

    Args:
        file: The location of a pickled, previously parsed schema. Used as a cache
        search_only: Whether to drop attributes that have no search operators, i.e. aren't searchable
        force_update: Whether to re-fetch and re-parse the schema even if a cached file exists

    Returns:
        (dict): {attribute: {'dtype': 'string', 'description': 'XYZ', 'operators': {'equals'}, 'choices': []}, ...}
    """
    # Map each output key to the corresponding key in the schema JSON
    schema_pairs = {'dtype': 'type', 'description': 'description', 'operators': 'rcsb_search_context',
                    'choices': 'enum'}
    # Map each rcsb_search_context value to the operators it supports. 'suggest' only provides an
    # example to the user in the GUI, so it contributes no operators
    operator_d = {'full-text': 'contains_words, contains_phrase, exists',
                  'exact-match': 'in, exact_match, exists',
                  'default-match': 'equals, greater, less, greater_or_equal, less_or_equal, range, range_closed, '
                                   'exists',
                  'suggest': None}
    data_types = ['string', 'integer', 'number']

    def recurse_metadata(metadata_d, stack=tuple()):
        """Yield a tuple of nested keys for every leaf attribute with a primitive type. The marker
        'a' denotes an array level ('items') and 'o' an object level ('properties') in the path
        """
        for attribute, info in metadata_d.items():
            attribute_type = info['type']
            if attribute_type == 'array':  # 'items' must be a keyword in dictionary
                items_type = info['items']['type']
                if items_type in data_types:  # The array is the final attribute of the branch
                    yield stack + (attribute, 'a')
                elif items_type == 'object':  # Contains 'properties' with more attributes as leaves
                    yield from recurse_metadata(info['items']['properties'], stack=stack + (attribute, 'a', 'o'))
                else:
                    logger.debug('Array with type %s found in %s' % (info, stack))
            elif attribute_type == 'object':
                if 'properties' in info:  # Check may be unnecessary
                    yield from recurse_metadata(info['properties'], stack=stack + (attribute, 'o'))
                else:
                    logger.debug('Object with no properties found %s in %s' % (info, stack))
            elif attribute_type in data_types:
                yield stack + (attribute,)
            else:
                logger.debug('other type = %s' % attribute_type)

    # Fast path: reuse the cached, parsed schema
    if os.path.exists(file) and not force_update:
        return utils.unpickle(file)
    # Todo: also refresh when the cached file is older than a month

    logger.info('Gathering the most current PDB metadata. This may take a couple minutes...')
    metadata_json = requests.get(attribute_metadata_schema_json).json()
    metadata_properties_d = metadata_json['properties']
    schema_header_tuples = list(recurse_metadata(metadata_properties_d))

    # Path markers inserted by recurse_metadata() mapped to the schema key they index
    nesting_keys = {'a': 'items', 'o': 'properties'}
    schema_d = {}
    for i, attribute_tuple in enumerate(schema_header_tuples):
        attribute_full = '.'.join(attribute for attribute in attribute_tuple
                                  if attribute not in nesting_keys)
        if i < 5:
            logger.debug(attribute_full)
        # Walk the nested schema dict down to the leaf attribute. This replaces the prior approach
        # of eval()'ing a constructed index expression against the repr of the whole schema, which
        # was both unsafe and quadratic in the schema size
        leaf_d = metadata_properties_d
        for attribute in attribute_tuple:
            leaf_d = leaf_d[nesting_keys.get(attribute, attribute)]

        schema_d[attribute_full] = attribute_d = \
            {key: leaf_d.get(value) for key, value in schema_pairs.items()}

        if 'format' in leaf_d:  # Dates are strings carrying a 'format' key
            attribute_d['dtype'] = 'date'

        if attribute_d['description']:  # Convert the description to a simplified descriptor
            attribute_d['description'] = attribute_d['description'].split('\n')[0]

        if attribute_d['operators']:  # Convert the rcsb_search_context to valid operator(s)
            attribute_d['operators'] = set(', '.join(
                operator_d[search_context] for search_context in attribute_d['operators']
                if operator_d[search_context]).split(', '))
        elif search_only:  # Remove entries that don't have a corresponding operator as these aren't searchable
            schema_d.pop(attribute_full)

    utils.pickle_object(schema_d, file, out_path='')

    return schema_d