cluster

pose_pair_rmsd

pose_pair_rmsd(pose1: PoseJob, pose2: PoseJob) -> float

Calculate the RMSD between a pair of Poses using CB coordinates. Both poses must be the same length

Parameters:

pose1 (PoseJob) –

First PoseJob object
pose2 (PoseJob) –

Second PoseJob object

Returns: RMSD value

Source code in symdesign/protocols/cluster.py

def pose_pair_rmsd(pose1: PoseJob, pose2: PoseJob) -> float:
    """Measure the RMSD between two Poses from the superposition of their CB coordinates

    All residues are included in the measurement, not any particular subset, so both poses must be the same length

    Args:
        pose1: First PoseJob object
        pose2: Second PoseJob object
    Returns:
        The RMSD value reported by superposition3d() for the two sets of CB coordinates
    """
    cb_coords1 = pose1.pose.cb_coords
    cb_coords2 = pose2.pose.cb_coords
    # superposition3d() also returns the fitted rotation and translation, which aren't needed here
    rmsd, _rotation, _translation = superposition3d(cb_coords1, cb_coords2)

    return rmsd

pose_pair_by_rmsd

pose_pair_by_rmsd(compositions: Iterable[Sequence[PoseJob]]) -> dict[str | PoseJob, list[str | PoseJob]]

Perform rmsd comparison for all compositions of PoseJob instances

Parameters:

compositions (Iterable[Sequence[PoseJob]]) –

Groups of PoseJob instances that should be measured against one another pairwise

Returns: {PoseJob representative: [PoseJob members], ... }

Source code in symdesign/protocols/cluster.py

def pose_pair_by_rmsd(compositions: Iterable[Sequence[PoseJob]]) -> dict[str | PoseJob, list[str | PoseJob]]:
    """Perform rmsd comparison for all compositions of PoseJob instances

    Every group in compositions is clustered independently and the resulting clusters are merged into a single
    mapping. Empty groups are skipped

    Args:
        compositions: Groups of PoseJob instances that should be measured against one another pairwise
    Returns:
        {PoseJob representative: [PoseJob members], ... } for the clusters found in every group. Empty if no
            PoseJob instances were passed
    """
    pose_cluster_map = {}
    for pose_jobs in compositions:
        if not pose_jobs:
            # Nothing to measure, and an empty pair list can't be turned into a pairwise table
            continue

        # Make all PoseJob combinations for this group
        pose_job_pairs = list(combinations(pose_jobs, 2))
        results = [pose_pair_rmsd(*pair) for pair in pose_job_pairs]
        # Add all identical comparison results (all rmsd are 0 as they are with themselves)
        results.extend(repeat(0, len(pose_jobs)))
        # Add all identical PoseJob combinations to pose_job_pairs
        pose_job_pairs.extend(zip(pose_jobs, pose_jobs))

        # Accumulate the clusters instead of returning, otherwise only the first group is ever processed
        pose_cluster_map.update(cluster_poses_by_value(pose_job_pairs, results))

    return pose_cluster_map

ialign

ialign(*pdb_files: AnyStr, chain1: str = None, chain2: str = None, out_path: AnyStr = os.path.join(os.getcwd(), 'ialign')) -> float

Run non-sequential iAlign on two .pdb files

Parameters:

pdb_files (AnyStr, default: () ) –

The paths of the .pdb files to compare. The first two files passed (pdb_file1, pdb_file2) are used
chain1 (str, default: None ) –
chain2 (str, default: None ) –
out_path (AnyStr, default: join(getcwd(), 'ialign') ) –

The path to write iAlign results to

Returns: The IS score from Gao & Skolnick 2010

Source code in symdesign/protocols/cluster.py

def ialign(*pdb_files: AnyStr, chain1: str = None, chain2: str = None,
           out_path: AnyStr = os.path.join(os.getcwd(), 'ialign')) -> float:
    """Run non-sequential iAlign on two .pdb files

    The files are first copied into the 'temp' directory of the current working directory, iAlign is executed on
    the copies, and the first line of its output containing 'IS-score = ' is parsed

    Args:
        pdb_files: The paths of the files to compare. The first two are used and any additional files are ignored
        chain1: The value passed to iAlign as -c1 (chains of the first file). Defaults to 'AB'
        chain2: The value passed to iAlign as -c2 (chains of the second file). Defaults to 'AB'
        out_path: The path to write iAlign results to
    Returns:
        The IS score from Gao & Skolnick 2010. 0. if no numeric IS score was found in the iAlign output
    Raises:
        ValueError: If fewer than two pdb_files are passed
    """
    if chain1 is None:
        chain1 = 'AB'
    if chain2 is None:
        chain2 = 'AB'
    chains = ['-c1', chain1, '-c2', chain2]

    pdb_file1, pdb_file2, *_ = pdb_files
    temp_dir = os.path.join(os.getcwd(), 'temp')
    # The copy below silently fails if the destination directory is missing
    os.makedirs(temp_dir, exist_ok=True)
    temp_pdb_file1 = os.path.join(temp_dir, os.path.basename(pdb_file1.translate(utils.keep_digit_table)))
    temp_pdb_file2 = os.path.join(temp_dir, os.path.basename(pdb_file2.translate(utils.keep_digit_table)))
    # Move the desired files to a temporary file location
    # Arguments are passed as a list (no shell) so that paths with spaces or shell characters are copied intact
    for source, destination in ((pdb_file1, temp_pdb_file1), (pdb_file2, temp_pdb_file2)):
        copy_p = subprocess.run(['scp', source, destination])
        if copy_p.returncode != 0:
            logger.warning(f'Failed to copy {source} to {destination}')

    # Perform the iAlign process
    # Example: perl ../bin/ialign.pl -w output -s -a 0 1lyl.pdb AC 12as.pdb AB | grep "IS-score = "
    cmd = ['perl', putils.ialign_exe_path, '-s', '-w', out_path, '-p1', temp_pdb_file1, '-p2', temp_pdb_file2] + chains
    logger.debug(f'iAlign command: {subprocess.list2cmdline(cmd)}')
    ialign_p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    ialign_out, ialign_err = ialign_p.communicate()
    if ialign_err:
        logger.debug(f'iAlign stderr: {ialign_err.decode(errors="replace").strip()}')

    # Format the output
    # Example: IS-score = 0.38840, P-value = 0.3808E-003, Z-score =  7.873
    ialign_is_score = ''
    for line in ialign_out.decode(errors='replace').splitlines():
        if 'IS-score = ' in line:
            ialign_is_score = line
            break
    logger.debug(f'iAlign interface alignment: {ialign_is_score.strip()}')

    # Only the first 'name = value' field (the IS-score) is needed. When no score line was found this is ''
    is_score = ialign_is_score.split(',')[0].split('=')[-1].strip()
    try:
        is_score = float(is_score)
    except ValueError:  # is_score isn't a number
        logger.debug('No significant interface found')
        is_score = 0.

    return is_score

cluster_poses_by_value

cluster_poses_by_value(identifier_pairs: Iterable[tuple[Any, Any]], values: Iterable[float], epsilon: float = 1.0) -> dict[str | PoseJob, list[str | PoseJob]]

Take pairs of identifiers and a precomputed distance metric (such as RMSD) and cluster using DBSCAN algorithm

Parameters:

identifier_pairs (Iterable[tuple[Any, Any]]) –

The identifiers for each pair measurement
values (Iterable[float]) –

The corresponding measurement values for each pair of identifiers
epsilon (float, default: 1.0 ) –

The parameter for DBSCAN to influence the spread of clusters, needs to be tuned for measurement values

Returns: {PoseJob representative: [PoseJob members], ... }

Source code in symdesign/protocols/cluster.py

def cluster_poses_by_value(identifier_pairs: Iterable[tuple[Any, Any]], values: Iterable[float], epsilon: float = 1.) \
        -> dict[str | PoseJob, list[str | PoseJob]]:
    """Take pairs of identifiers and a precomputed distance metric (such as RMSD) and cluster using DBSCAN algorithm

    The expected input contains every pairwise combination of the identifiers as well as each identifier paired
    with itself, i.e.
        identifier_pairs = list(combinations(identifiers, 2)) + list(zip(identifiers, identifiers))
        values = pairwise_values + [0] * len(identifiers)

    Args:
        identifier_pairs: The identifiers for each pair measurement
        values: The corresponding measurement values for each pair of identifiers
        epsilon: The parameter for DBSCAN to influence the spread of clusters, needs to be tuned for measurement values
    Returns:
        {PoseJob representative: [PoseJob members], ... }. Identifiers that DBSCAN labels as outliers are each
            returned as the representative of a cluster containing only themselves
    """
    # Pivot the pair measurements into an identifier by identifier table
    pair_df = pd.Series(values, index=pd.MultiIndex.from_tuples(identifier_pairs)).fillna(0.).unstack()
    # NOTE(review): assumes utils.sym() returns the full symmetric 2D array for the table - confirm
    symmetric_pair_values = np.asarray(utils.sym(pair_df.to_numpy()))

    # Compute pose clusters using DBSCAN algorithm
    # precomputed specifies that a precomputed distance matrix is being passed
    dbscan = sklearn.cluster.DBSCAN(eps=epsilon, min_samples=2, metric='precomputed')
    dbscan.fit(symmetric_pair_values)
    labels = dbscan.labels_
    identifiers = pair_df.index
    outlier = -1  # -1 are outliers in DBSCAN

    # Find the cluster representative and members
    clustered_poses = {}
    for cluster_id in set(labels) - {outlier}:
        member_indices = np.flatnonzero(labels == cluster_id)
        # Restrict the symmetric distances to the cluster members. As this is symmetric, the axis of the mean
        # doesn't matter. The member with the minimum mean (most similar to others) is the representative
        member_distances = symmetric_pair_values[np.ix_(member_indices, member_indices)]
        # argmin() is relative to the cluster members, so map it back to a position in the full table
        representative_idx = member_indices[member_distances.mean(axis=0).argmin()]
        clustered_poses[identifiers[representative_idx]] = identifiers[member_indices].tolist()

    # Add all outliers to the clustered poses as a representative of a single member cluster
    for outlier_idx in np.flatnonzero(labels == outlier):
        outlier_identifier = identifiers[outlier_idx]
        clustered_poses[outlier_identifier] = [outlier_identifier]

    return clustered_poses

apply_transform_groups_to_guide_coordinates

apply_transform_groups_to_guide_coordinates(*transforms: tuple[dict[str:ndarray]]) -> list[ndarray]

For each incoming transformation, transform guide coordinates according to the specified transformations

Parameters:

transforms (tuple[dict[str:ndarray]], default: () ) –

The individual transformation groups that should be applied to a guide coordinate

Returns: Guide coordinates transformed for each passed transform in each passed transform group

Source code in symdesign/protocols/cluster.py

def apply_transform_groups_to_guide_coordinates(*transforms: tuple[dict[str, np.ndarray]]) -> list[np.ndarray]:
    """For each incoming transformation, transform guide coordinates according to the specified transformations

    The number of guide coordinate sets that are transformed is taken from the first transform group as the
    longest of its 'rotation' and 'translation' entries

    Args:
        transforms: The individual transformation groups that should be applied to a guide coordinate
    Returns:
        Guide coordinates transformed for each passed transform in each passed transform group
    Raises:
        IndexError: If no transforms are passed
        KeyError: If the first transform group has neither a 'rotation' nor a 'translation'
    """
    # Check for arguments up front so an unrelated IndexError raised below isn't reported as missing arguments
    if not transforms:
        raise IndexError(
            f'{apply_transform_groups_to_guide_coordinates.__name__}: No arguments passed for transforms')

    # Find how many transformations are present from the first transform group
    allowed_keys = ['rotation', 'translation']
    first_transform = transforms[0]
    operation_lengths = []
    for key in allowed_keys:
        operation = first_transform.get(key)
        if operation is not None:
            operation_lengths.append(len(operation))

    if not operation_lengths:
        raise KeyError(
            f'{apply_transform_groups_to_guide_coordinates.__name__}: Must pass one of the values '
            f'{" or ".join(allowed_keys)}')

    # Make a blank set of guide coordinates for each incoming transformation
    guide_coords = np.array([[0., 0., 0.], [1., 0., 0.], [0., 1., 0.]])
    tiled_guide_coords = np.tile(guide_coords, (max(operation_lengths), 1, 1))

    return [transform_coordinate_sets(tiled_guide_coords, **transform) for transform in transforms]

cluster_transformation_pairs

cluster_transformation_pairs(*transforms: tuple[dict[str, ndarray]], distance: float = 1.0, minimum_members: int = 2) -> tuple[NearestNeighbors, DBSCAN]

Cluster a group of transformation parameters sets to find those which occupy essentially the same space

Parameters:

transforms (tuple[dict[str, ndarray]], default: () ) –

Group containing multiple sets of transformation operations where each transformation operation set takes the form {'rotation': rot_array, 'translation': tx_array, 'rotation2': rot2_array, 'translation2': tx2_array}
distance (float, default: 1.0 ) –

The distance to query neighbors in transformational space
minimum_members (int, default: 2 ) –

The minimum number of members in each cluster

Returns: The sklearn tree with the calculated nearest neighbors, the DBSCAN clustering object

Source code in symdesign/protocols/cluster.py

def cluster_transformation_pairs(*transforms: tuple[dict[str, np.ndarray]], distance: float = 1.,
                                 minimum_members: int = 2) \
        -> tuple[sklearn.neighbors._unsupervised.NearestNeighbors, sklearn.cluster._dbscan.DBSCAN]:
    """Cluster groups of transformation parameter sets to find those which occupy essentially the same space

    Guide coordinates are transformed by every transform group, flattened to one row per transformation, and the
    groups are joined column-wise so that each row describes one combined transformation

    Args:
        transforms: Group containing multiple sets of transformation operations where each transformation operation set
            takes the form {'rotation': rot_array, 'translation': tx_array,
                            'rotation2': rot2_array, 'translation2': tx2_array}
        distance: The distance to query neighbors in transformational space
        minimum_members: The minimum number of members in each cluster
    Returns:
        The sklearn tree with the calculated nearest neighbors, the DBSCAN clustering object
    """
    flattened_guide_coords = []
    for guide_coords in apply_transform_groups_to_guide_coordinates(*transforms):
        flattened_guide_coords.append(guide_coords.reshape(-1, number_of_coordinate_values))
    transformation_features = np.concatenate(flattened_guide_coords, axis=1)

    # Build the tree structure that describes the distance of every transformed point to every other
    neighbor_tree = sklearn.neighbors.NearestNeighbors(algorithm='ball_tree', radius=distance)
    neighbor_tree.fit(transformation_features)
    # The fit data is queried implicitly as no X is passed. sort_results should put the smallest distances first,
    # however sklearn has been observed to still emit
    #  EfficiencyWarning: Precomputed sparse input was not sorted by data.
    # Todo find out why, perhaps the precomputed data is too small?
    neighbor_distance_graph = neighbor_tree.radius_neighbors_graph(mode='distance', sort_results=True)

    # Cluster using the sparse graph of neighbor distances as the precomputed metric
    dbscan_cluster: sklearn.cluster.DBSCAN = \
        sklearn.cluster.DBSCAN(eps=distance, min_samples=minimum_members, metric='precomputed')
    dbscan_cluster.fit(neighbor_distance_graph)

    return neighbor_tree, dbscan_cluster

find_cluster_representatives

find_cluster_representatives(transform_tree: NearestNeighbors, cluster: DBSCAN) -> tuple[list[int], ndarray]

Return the cluster representative indices and the cluster membership identity for all member data

Parameters:

transform_tree (NearestNeighbors) –

The sklearn tree with the calculated nearest neighbors
cluster (DBSCAN) –

The DBSCAN clustering object

Returns: The list of representative indices, array of all indices membership

Source code in symdesign/protocols/cluster.py

def find_cluster_representatives(transform_tree: sklearn.neighbors._unsupervised.NearestNeighbors,
                                 cluster: sklearn.cluster._dbscan.DBSCAN) \
        -> tuple[list[int], np.ndarray]:
    """Select a representative index for every cluster and report the cluster label of every data point

    The representative of a cluster is the member with the smallest mean distance to its neighbors in the tree.
    Every outlier is its own representative and these follow the cluster representatives in the returned list

    Args:
        transform_tree: The sklearn tree with the calculated nearest neighbors
        cluster: The DBSCAN clustering object
    Returns:
        The list of representative indices, array of all indices membership
    """
    # Query the neighbors of each point in the tree according to the fit distance. Only distances are used
    neighbor_distances, _neighbor_indices = transform_tree.radius_neighbors(sort_results=True)
    with warnings.catch_warnings():
        # A point without neighbors has an empty distance array whose mean emits a RuntimeWarning
        warnings.simplefilter('ignore', category=RuntimeWarning)
        mean_neighbor_distance = np.array([distances.mean() for distances in neighbor_distances])

    labels = cluster.labels_
    outlier = -1  # -1 are outliers in DBSCAN
    cluster_labels = set(labels) - {outlier}
    representatives = []
    for label in cluster_labels:
        members = np.flatnonzero(labels == label)
        # The position of the minimal mean distance among the members identifies the representative member
        best_member = mean_neighbor_distance[members].argmin()
        representatives.append(members[best_member])

    # Every outlier represents itself
    outlier_indices = np.flatnonzero(labels == outlier)
    representatives.extend(outlier_indices.tolist())

    return representatives, labels

cluster_pose_by_transformations

cluster_pose_by_transformations(compositions: list[PoseJob], **kwargs) -> dict[str | PoseJob, list[str | PoseJob]]

From a group of poses with matching protein composition, cluster the designs according to transformational parameters to identify the unique poses in each composition

Parameters:

compositions (list[PoseJob]) –

The group of PoseJob objects to pull transformation data from

Other Parameters:

distance –

float = 1. - The distance to query neighbors in transformational space
minimum_members –

int = 2 - The minimum number of members in each cluster

Returns:

dict[str | PoseJob, list[str | PoseJob]] –

Cluster with representative pose as the key and matching poses as the values

Source code in symdesign/protocols/cluster.py

def cluster_pose_by_transformations(compositions: list[PoseJob], **kwargs) -> dict[str | PoseJob, list[str | PoseJob]]:
    """Cluster poses with matching protein composition by their transformational parameters to identify the unique
    poses in the composition

    Args:
        compositions: The group of PoseJob objects to pull transformation data from

    Keyword Args:
        distance: float = 1. - The distance to query neighbors in transformational space
        minimum_members: int = 2 - The minimum number of members in each cluster

    Returns:
        Cluster with representative pose as the key and matching poses as the values
    """
    def stack_operations(pose_transforms) -> dict[str, np.ndarray]:
        """Stack the operations from one transformation of every pose into arrays keyed by operation type"""
        # NOTE(review): relies on each transformation's values being ordered as
        #  rotation, translation, rotation2, translation2 - confirm against PoseJob.transformations
        rotations1, translations1, rotations2, translations2 = \
            zip(*(transform.values() for transform in pose_transforms))
        # A new axis is added to translations so the operations broadcast together in transform_coordinate_sets()
        return {'rotation': np.array(rotations1), 'translation': np.array(translations1)[:, np.newaxis, :],
                'rotation2': np.array(rotations2), 'translation2': np.array(translations2)[:, np.newaxis, :]}

    # Each PoseJob provides a pair of transformations. Separate into all the first and all the second
    first_transforms, second_transforms = zip(*(pose_job.transformations for pose_job in compositions))
    transformation1 = stack_operations(first_transforms)
    transformation2 = stack_operations(second_transforms)

    # Cluster the PoseJob instances themselves by their stacked transformations
    return cluster_by_transformations(transformation1, transformation2, values=compositions, **kwargs)

cluster_by_transformations

cluster_by_transformations(*transforms: tuple[dict[str, ndarray]], values: list[Any] = None, **kwargs) -> dict[Any, list[Any]]

From a set of objects with associated transformational parameters, identify and cluster the unique objects by representatives and members

Parameters:

transforms (tuple[dict[str, ndarray]], default: () ) –

Group containing multiple sets of transformation operations where each transformation operation set takes the form {'rotation': rot_array, 'translation': tx_array, 'rotation2': rot2_array, 'translation2': tx2_array}
values (list[Any], default: None ) –

The group of objects to cluster

Other Parameters:

distance –

float = 1. - The distance to query neighbors in transformational space
minimum_members –

int = 2 - The minimum number of members in each cluster

Returns:

dict[Any, list[Any]] –

Clustered objects with representative as the key and members as the values

Source code in symdesign/protocols/cluster.py

def cluster_by_transformations(*transforms: tuple[dict[str, np.ndarray]], values: list[Any] = None, **kwargs) \
        -> dict[Any, list[Any]]:
    """From a set of objects with associated transformational parameters, identify and cluster the unique objects by
    representatives and members

    Args:
        transforms: Group containing multiple sets of transformation operations where each transformation operation set
            takes the form {'rotation': rot_array, 'translation': tx_array,
                            'rotation2': rot2_array, 'translation2': tx2_array}
        values: The group of objects to cluster, where the object at each index corresponds to the transformation at
            that index. If not provided, the integer index of each transformation is used as the object

    Keyword Args:
        distance: float = 1. - The distance to query neighbors in transformational space
        minimum_members: int = 2 - The minimum number of members in each cluster

    Returns:
        Clustered objects with representative as the key and members as the values. Outliers are the representative
            of a cluster containing only themselves
    """
    # Find the representatives of the cluster based on minimal distance of each point to its nearest neighbors
    # This section could be added to the Nanohedra docking routine
    cluster_representative_indices, cluster_labels = \
        find_cluster_representatives(*cluster_transformation_pairs(*transforms, **kwargs))

    if values is None:
        # The default can't be indexed, so report the clusters in terms of the transformation indices
        values = list(range(len(cluster_labels)))

    # Sort out clustered values from the input values
    outlier = -1
    cluster_map = {}
    for rep_idx in cluster_representative_indices:
        rep_label = cluster_labels[rep_idx]
        if rep_label == outlier:
            # Outlier representatives are handled together below
            continue
        member_indices = np.flatnonzero(cluster_labels == rep_label).tolist()
        cluster_map[values[rep_idx]] = [values[idx] for idx in member_indices]

    # Add all outliers
    for idx in np.flatnonzero(cluster_labels == outlier).tolist():
        cluster_map[values[idx]] = [values[idx]]

    return cluster_map

group_compositions

group_compositions(pose_jobs: list[PoseJob]) -> dict[tuple[str, ...], list[PoseJob]]

From a set of PoseJob instances, find all the compositions and group them together

Parameters:

pose_jobs (list[PoseJob]) –

The PoseJob to group according to composition

Returns: The PoseJob instances grouped by composition, mapping each tuple of entity names to the list of PoseJob instances that share it

Source code in symdesign/protocols/cluster.py

def group_compositions(pose_jobs: list[PoseJob]) -> dict[tuple[str, ...], list[PoseJob]]:
    """From a set of PoseJob instances, find all the compositions and group together

    Two PoseJob instances have the same composition when their entity_names contain the same names, regardless of
    the order of the names

    Args:
        pose_jobs: The PoseJob to group according to composition
    Returns:
        The PoseJob instances grouped by composition. Each key is the tuple of entity names, in the order they
            were first encountered for that composition, mapped to every PoseJob with that composition
    """
    compositions = {}
    # Maps the order-independent form of each composition to the key it was first stored under
    composition_keys = {}
    for pose_job in pose_jobs:
        entity_names = tuple(pose_job.entity_names)
        # Sorting the names makes every ordering of the same names identical without checking each permutation
        composition_key = composition_keys.setdefault(tuple(sorted(entity_names)), entity_names)
        compositions.setdefault(composition_key, []).append(pose_job)

    return compositions

invert_cluster_map

invert_cluster_map(cluster_map: dict[Any, list[Any]])

Return an inverted cluster map where the cluster members map to the representative

Parameters:

cluster_map (dict[Any, list[Any]]) –

The standard pose_cluster_map format

Returns: An inverted cluster_map where the members are keys and the representative is the value

Source code in symdesign/protocols/cluster.py

def invert_cluster_map(cluster_map: dict[Any, list[Any]]):
    """Flip a cluster map so that each cluster member points to its cluster representative

    Every representative is also included and always points to itself

    Args:
        cluster_map: The standard pose_cluster_map format
    Returns:
        An inverted cluster_map where the members are keys and the representative is the value
    """
    inverted_map = {}
    for cluster_rep, members in cluster_map.items():
        for member in members:
            inverted_map[member] = cluster_rep

    # Representatives are set last so they map to themselves even if they were listed as a member
    for cluster_rep in cluster_map:
        inverted_map[cluster_rep] = cluster_rep

    return inverted_map