Source code for hsr.similarity

# HSR: Hyper-Shape Recognition
# This file is part of HSR, which is licensed under the
# GNU Lesser General Public License v3.0 (or any later version).
# See the LICENSE file for more details.

# Script to calculate similarity scores between molecules and/or their fingerprints

from .utils import * 
from .fingerprint import *


[docs]
def calculate_manhattan_distance(moments1: list, moments2: list):
    """
    Calculate the Manhattan (L1) distance between two lists.

    Parameters
    ----------
    moments1 : list
        The first list of numerical values. Its length determines how many
        positions are compared.
    moments2 : list
        The second list of numerical values, must be of the same length as moments1.
        Elements beyond ``len(moments1)`` are never read.

    Returns
    -------
    float
        The sum of the absolute element-wise differences between the two
        lists (a total, not a mean). The result is 0 when moments1 is empty.

    Raises
    ------
    IndexError
        If moments2 has fewer elements than moments1.
    """
    manhattan_dist = 0
    # moments2 is indexed by position (instead of zipping the two inputs) so
    # that a too-short second list raises rather than being silently truncated.
    for position, value in enumerate(moments1):
        manhattan_dist += abs(value - moments2[position])
    return manhattan_dist



[docs]
def calculate_similarity_from_distance(distance, n_components):
    """
    Calculate similarity score from a distance score.

    This function converts a distance score into a similarity score using
    a reciprocal function. The distance is first normalized by the number of
    components of the fingerprint. The similarity score approaches 1 as the
    distance approaches 0, and it approaches 0 as the distance increases.

    Parameters
    ----------
    distance : float
        The distance score, a non-negative number.
    n_components : int
        The number of components in the fingerprint. Must be non-zero, since
        the distance is divided by it.

    Returns
    -------
    float
        The similarity score, computed as ``1 / (1 + distance / n_components)``.

    Raises
    ------
    ZeroDivisionError
        If n_components is 0 (for plain Python numbers).
    """
    # Normalize first so fingerprints of different sizes yield comparable scores.
    normalized_distance = distance / n_components
    return 1 / (1 + normalized_distance)




[docs]
def compute_similarity_score(fingerprint_1: list, fingerprint_2: list):
    """
    Compute the similarity score of two fingerprints.

    The Manhattan distance between the fingerprints is calculated and then
    converted into a similarity score, using the length of the first
    fingerprint as the number of components for the normalization.

    Parameters
    ----------
    fingerprint_1 : list
        Fingerprint of the first molecule.
    fingerprint_2 : list
        Fingerprint of the second molecule.

    Returns
    -------
    float
        Similarity score derived from the distance between the fingerprints.
    """
    # Arguments are evaluated left to right: distance first, then the size.
    return calculate_similarity_from_distance(
        calculate_manhattan_distance(fingerprint_1, fingerprint_2),
        len(fingerprint_1),
    )



[docs]
def compute_distance_from_ndarray(mol1_nd: np.array, mol2_nd: np.array, scaling='matrix', chirality=False):
    """
    Compute the distance score between two molecules given as N-dimensional arrays.

    A fingerprint is generated for each array and the Manhattan distance
    between the two fingerprints is returned. When chirality is enabled the
    fingerprint generator also returns a dimensionality for each molecule; a
    warning is printed if the two dimensionalities differ.

    Parameters
    ----------
    mol1_nd : numpy.ndarray
        N-dimensional array representation of the first molecule.
    mol2_nd : numpy.ndarray
        N-dimensional array representation of the second molecule.
    scaling : str, float, or np.ndarray
        Scaling applied to the reference points, forwarded unchanged to the
        fingerprint generator. With 'matrix' (default) a scaling matrix is
        computed automatically from the PCA-transformed data; a float is used
        as a scaling factor; a numpy.ndarray is used as a scaling matrix.
    chirality : bool, optional
        If True, chirality is taken into account when generating the
        fingerprints.

    Returns
    -------
    float
        Distance score between the two molecules.
    """
    if not chirality:
        # Without chirality the generator returns only the fingerprint.
        fp_first = generate_fingerprint_from_data(mol1_nd, scaling=scaling, chirality=chirality)
        fp_second = generate_fingerprint_from_data(mol2_nd, scaling=scaling, chirality=chirality)
        return calculate_manhattan_distance(fp_first, fp_second)

    # With chirality each call yields a (fingerprint, dimensionality) pair.
    fp_first, dimensionality1 = generate_fingerprint_from_data(mol1_nd, scaling=scaling, chirality=chirality)
    fp_second, dimensionality2 = generate_fingerprint_from_data(mol2_nd, scaling=scaling, chirality=chirality)

    mismatched = dimensionality1 != dimensionality2
    if mismatched:
        print(f"WARNING: Comparison between molecules of different dimensionality: {dimensionality1} and {dimensionality2}.\n"
               "The similarity score may not be accurate!")

    return calculate_manhattan_distance(fp_first, fp_second)



[docs]
def compute_similarity_from_ndarray(mol1_nd: np.array, mol2_nd: np.array, scaling='matrix', chirality=False):
    """
    Compute the similarity score between two molecules given as N-dimensional arrays.

    The distance score between the two molecules is computed from their
    fingerprints and then converted into a similarity score. The distance is
    normalized by ``(mol1_nd.shape[1] + 1) * 3``.

    Parameters
    ----------
    mol1_nd : numpy.ndarray
        N-dimensional array representation of the first molecule.
    mol2_nd : numpy.ndarray
        N-dimensional array representation of the second molecule.
    scaling : str, float, or np.ndarray
        Scaling applied to the reference points, forwarded unchanged to the
        distance computation. With 'matrix' (default) a scaling matrix is
        computed automatically from the PCA-transformed data; a float is used
        as a scaling factor; a numpy.ndarray is used as a scaling matrix.
    chirality : bool, optional
        If True, chirality is taken into account when generating the
        fingerprints.

    Returns
    -------
    float
        Similarity score between the two molecules.
    """
    manhattan = compute_distance_from_ndarray(
        mol1_nd,
        mol2_nd,
        scaling=scaling,
        chirality=chirality,
    )
    # NOTE(review): presumably this equals the fingerprint length for an array
    # with shape[1] columns -- confirm against the fingerprint generator.
    n_components = (mol1_nd.shape[1] + 1) * 3
    return calculate_similarity_from_distance(manhattan, n_components)



[docs]
def compute_distance(mol1, mol2, features=DEFAULT_FEATURES, scaling='matrix', removeHs=False, chirality=False):
    """
    Calculate the distance score between two molecules using their n-dimensional fingerprints.

    This function generates fingerprints for two molecules based on their structures and a set of features,
    and then computes the Manhattan distance between these fingerprints.

    Parameters
    ----------
    mol1 : RDKit Mol
        The first RDKit molecule object.
    mol2 : RDKit Mol
        The second RDKit molecule object.
    features : dict, optional
        Dictionary of features to be considered. Default is DEFAULT_FEATURES.
    scaling : str, float, or np.ndarray
        Specifies the scaling applied to reference points. If set to 'matrix' (default),
        a scaling matrix is automatically computed based on the PCA-transformed data.
        If a float is provided, it's used as a scaling factor. If a numpy.ndarray is provided,
        it's used as a scaling matrix.
    removeHs : bool, optional
        If True, hydrogen atoms are removed from the molecule before generating the fingerprint.
    chirality : bool, optional
        Consider chirality in the generation of fingerprints if set to True.

    Returns
    -------
    tuple
        A pair ``(distance, n_components)`` where ``distance`` is the computed
        distance score between the two molecules and ``n_components`` is the
        length of the first molecule's fingerprint (used by callers to
        normalize the distance).
    """
    # Get molecules' fingerprints
    if chirality:
        # With chirality the generator returns a (fingerprint, dimensionality) pair.
        f1, dimensionality1 = generate_fingerprint_from_molecule(mol1, features=features, scaling=scaling, removeHs=removeHs, chirality=chirality)
        f2, dimensionality2 = generate_fingerprint_from_molecule(mol2, features=features, scaling=scaling, removeHs=removeHs, chirality=chirality)

        # Warn (but still proceed) when the two dimensionalities differ
        if dimensionality1 != dimensionality2:
            print(f"WARNING: Comparison between molecules of different dimensionality: {dimensionality1} and {dimensionality2}.\n"
                   "The similarity score may not be accurate!")
    else:
        f1 = generate_fingerprint_from_molecule(mol1, features=features, scaling=scaling, removeHs=removeHs, chirality=chirality)
        f2 = generate_fingerprint_from_molecule(mol2, features=features, scaling=scaling, removeHs=removeHs, chirality=chirality)

    # Compute distance score
    distance = calculate_manhattan_distance(f1, f2)
    return distance, len(f1)



[docs]
def compute_similarity(mol1, mol2, features=DEFAULT_FEATURES, scaling='matrix', removeHs=False, chirality=False):
    """
    Compute the similarity score between two molecules from their n-dimensional fingerprints.

    The distance between the two molecules is obtained from ``compute_distance``,
    together with the number of fingerprint components, and is then converted
    into a similarity score.

    Parameters
    ----------
    mol1 : RDKit Mol
        First RDKit molecule object.
    mol2 : RDKit Mol
        Second RDKit molecule object.
    features : dict, optional
        Dictionary of features to be considered. Default is DEFAULT_FEATURES.
    scaling : str, float, or np.ndarray
        Scaling applied to the reference points, forwarded unchanged to the
        distance computation. With 'matrix' (default) a scaling matrix is
        computed automatically from the PCA-transformed data; a float is used
        as a scaling factor; a numpy.ndarray is used as a scaling matrix.
    removeHs : bool, optional
        If True, hydrogen atoms are removed from the molecule before the
        fingerprint is generated.
    chirality : bool, optional
        If True, chirality is taken into account when generating the
        fingerprints.

    Returns
    -------
    float
        Similarity score between the two molecules.
    """
    distance_and_size = compute_distance(
        mol1,
        mol2,
        features=features,
        scaling=scaling,
        removeHs=removeHs,
        chirality=chirality,
    )
    # compute_distance yields the distance and the fingerprint length together.
    manhattan, n_components = distance_and_size
    return calculate_similarity_from_distance(manhattan, n_components)