Source code for hsr.similarity

# HSR: Hyper-Shape Recognition
# This file is part of HSR, which is licensed under the
# GNU Lesser General Public License v3.0 (or any later version).
# See the LICENSE file for more details.

# Script to calculate similarity scores between molecules and/or their fingerprints

from .utils import * 
from .fingerprint import *

def calculate_manhattan_distance(moments1: list, moments2: list):
    """
    Calculate the Manhattan (L1) distance between two lists.

    The distance is the plain sum of the absolute element-wise differences;
    it is NOT divided by the number of elements.

    Parameters
    ----------
    moments1 : list
        The first list of numerical values.
    moments2 : list
        The second list of numerical values, must be of the same length as moments1.

    Returns
    -------
    float
        The sum of the absolute differences between corresponding elements
        of the two lists (0 when both lists are empty).

    Raises
    ------
    ValueError
        If the two lists do not have the same length.
    """
    # Enforce the documented contract explicitly: silently truncating the
    # longer input would yield a distance that looks valid but is not.
    if len(moments1) != len(moments2):
        raise ValueError(
            "Cannot compute the Manhattan distance between sequences of "
            f"different lengths: {len(moments1)} and {len(moments2)}."
        )
    return sum(abs(value1 - value2) for value1, value2 in zip(moments1, moments2))
def calculate_similarity_from_distance(distance, n_components):
    """
    Convert a distance score into a similarity score.

    The distance is first divided by the number of fingerprint components,
    and the normalized value ``d`` is then mapped through the reciprocal
    function ``1 / (1 + d)``. A distance of 0 therefore gives a similarity
    of 1, and the similarity decreases towards 0 as the distance grows.

    Parameters
    ----------
    distance : float
        The distance score, a non-negative number.
    n_components : int
        The number of components in the fingerprint. Used as a divisor,
        so it must be non-zero.

    Returns
    -------
    float
        The similarity score derived from the distance.
    """
    normalized_distance = distance / n_components
    return 1 / (1 + normalized_distance)
def compute_similarity_score(fingerprint_1: list, fingerprint_2: list):
    """
    Compute the similarity score between two fingerprints.

    The Manhattan distance between the fingerprints is computed and then
    converted into a similarity, using the length of the first fingerprint
    as the number of components for normalization.

    Parameters
    ----------
    fingerprint_1 : list
        The fingerprint of the first molecule.
    fingerprint_2 : list
        The fingerprint of the second molecule.

    Returns
    -------
    float
        The computed similarity score.
    """
    manhattan = calculate_manhattan_distance(fingerprint_1, fingerprint_2)
    return calculate_similarity_from_distance(manhattan, len(fingerprint_1))
def compute_distance_from_ndarray(mol1_nd: np.array, mol2_nd: np.array, scaling='matrix', chirality=False):
    """
    Compute the distance score between two molecules given as N-dimensional arrays.

    A fingerprint is generated for each array and the Manhattan distance
    between the two fingerprints is returned.

    Parameters
    ----------
    mol1_nd : numpy.ndarray
        The N-dimensional array representing the first molecule.
    mol2_nd : numpy.ndarray
        The N-dimensional array representing the second molecule.
    scaling : str, float, or np.ndarray
        Specifies the scaling applied to reference points. If set to 'matrix' (default),
        a scaling matrix is automatically computed based on the PCA-transformed data.
        If a float is provided, it's used as a scaling factor. If a numpy.ndarray is
        provided, it's used as a scaling matrix.
    chirality : bool, optional
        Consider chirality in the generation of fingerprints if set to True.
        In that case a warning is printed when the two molecules are reported
        with different dimensionalities.

    Returns
    -------
    float
        The computed distance score between the two molecules.
    """
    result1 = generate_fingerprint_from_data(mol1_nd, scaling=scaling, chirality=chirality)
    result2 = generate_fingerprint_from_data(mol2_nd, scaling=scaling, chirality=chirality)

    # Without chirality the generator's return value is used directly as the fingerprint.
    if not chirality:
        return calculate_manhattan_distance(result1, result2)

    # With chirality the generator's result is unpacked as (fingerprint, dimensionality).
    f1, dimensionality1 = result1
    f2, dimensionality2 = result2
    if dimensionality1 != dimensionality2:
        print(f"WARNING: Comparison between molecules of different dimensionality: {dimensionality1} and {dimensionality2}.\n"
              "The similarity score may not be accurate!")
    return calculate_manhattan_distance(f1, f2)
def compute_similarity_from_ndarray(mol1_nd: np.array, mol2_nd: np.array, scaling='matrix', chirality=False):
    """
    Compute the similarity score between two molecules given as N-dimensional arrays.

    The distance between the fingerprints of the two arrays is computed and
    then converted into a similarity score. The distance is normalized by
    ``(mol1_nd.shape[1] + 1) * 3``, i.e. a component count derived from the
    number of columns of the first array.

    Parameters
    ----------
    mol1_nd : numpy.ndarray
        The N-dimensional array representing the first molecule.
    mol2_nd : numpy.ndarray
        The N-dimensional array representing the second molecule.
    scaling : str, float, or np.ndarray
        Specifies the scaling applied to reference points. If set to 'matrix' (default),
        a scaling matrix is automatically computed based on the PCA-transformed data.
        If a float is provided, it's used as a scaling factor. If a numpy.ndarray is
        provided, it's used as a scaling matrix.
    chirality : bool, optional
        Consider chirality in the generation of fingerprints if set to True.

    Returns
    -------
    float
        The computed similarity score between the two molecules.
    """
    distance = compute_distance_from_ndarray(mol1_nd, mol2_nd, scaling=scaling, chirality=chirality)
    # NOTE(review): presumably the fingerprint length for this number of columns - confirm.
    n_components = (mol1_nd.shape[1] + 1) * 3
    return calculate_similarity_from_distance(distance, n_components)
def compute_distance(mol1, mol2, features=DEFAULT_FEATURES, scaling='matrix', removeHs=False, chirality=False):
    """
    Compute the distance score between two molecules using their n-dimensional fingerprints.

    A fingerprint is generated for each molecule from its structure and the
    given set of features, and the Manhattan distance between the two
    fingerprints is computed.

    Parameters
    ----------
    mol1 : RDKit Mol
        The first RDKit molecule object.
    mol2 : RDKit Mol
        The second RDKit molecule object.
    features : dict, optional
        Dictionary of features to be considered. Default is DEFAULT_FEATURES.
    scaling : str, float, or np.ndarray
        Specifies the scaling applied to reference points. If set to 'matrix' (default),
        a scaling matrix is automatically computed based on the PCA-transformed data.
        If a float is provided, it's used as a scaling factor. If a numpy.ndarray is
        provided, it's used as a scaling matrix.
    removeHs : bool, optional
        If True, hydrogen atoms are removed from the molecule before generating the fingerprint.
    chirality : bool, optional
        Consider chirality in the generation of fingerprints if set to True.
        In that case a warning is printed when the two molecules are reported
        with different dimensionalities.

    Returns
    -------
    tuple
        A ``(distance, n_components)`` pair: the computed distance score between
        the two molecules and the length of the first molecule's fingerprint.
    """
    # Get molecules' fingerprints
    result1 = generate_fingerprint_from_molecule(mol1, features=features, scaling=scaling, removeHs=removeHs, chirality=chirality)
    result2 = generate_fingerprint_from_molecule(mol2, features=features, scaling=scaling, removeHs=removeHs, chirality=chirality)

    if chirality:
        # With chirality the generator's result is unpacked as (fingerprint, dimensionality).
        f1, dimensionality1 = result1
        f2, dimensionality2 = result2
        if dimensionality1 != dimensionality2:
            print(f"WARNING: Comparison between molecules of different dimensionality: {dimensionality1} and {dimensionality2}.\n"
                  "The similarity score may not be accurate!")
    else:
        f1, f2 = result1, result2

    # Compute distance score
    return calculate_manhattan_distance(f1, f2), len(f1)
def compute_similarity(mol1, mol2, features=DEFAULT_FEATURES, scaling='matrix', removeHs=False, chirality=False):
    """
    Compute the similarity score between two molecules using their n-dimensional fingerprints.

    The distance between the fingerprints of the two molecules is obtained
    from ``compute_distance`` together with the fingerprint length, and the
    distance is then converted into a similarity score normalized by that length.

    Parameters
    ----------
    mol1 : RDKit Mol
        The first RDKit molecule object.
    mol2 : RDKit Mol
        The second RDKit molecule object.
    features : dict, optional
        Dictionary of features to be considered. Default is DEFAULT_FEATURES.
    scaling : str, float, or np.ndarray
        Specifies the scaling applied to reference points. If set to 'matrix' (default),
        a scaling matrix is automatically computed based on the PCA-transformed data.
        If a float is provided, it's used as a scaling factor. If a numpy.ndarray is
        provided, it's used as a scaling matrix.
    removeHs : bool, optional
        If True, hydrogen atoms are removed from the molecule before generating the fingerprint.
    chirality : bool, optional
        Consider chirality in the generation of fingerprints if set to True.

    Returns
    -------
    float
        The computed similarity score between the two molecules.
    """
    dist, n_components = compute_distance(
        mol1,
        mol2,
        features=features,
        scaling=scaling,
        removeHs=removeHs,
        chirality=chirality,
    )
    return calculate_similarity_from_distance(dist, n_components)