Source code for hsr.fingerprint

# HSR: Hyper-Shape Recognition
# This file is part of HSR, which is licensed under the
# GNU Lesser General Public License v3.0 (or any later version).
# See the LICENSE file for more details.

# Script that provides the fingerprints of the molecules for similarity comparison

import numpy as np
from scipy.spatial import distance
from scipy.stats import skew
from .pre_processing import *
from .pca_transform import *
from .utils import *


[docs]
def generate_reference_points(dimensionality, scaling=None):
    """
    Generate reference points in the n-dimensional space.

    The unscaled reference points are the origin (used as the centroid)
    followed by the unit point on each axis, i.e. ``dimensionality + 1`` rows
    of length ``dimensionality``.

    Parameters
    ----------
    dimensionality : int
        The number of dimensions.
    scaling : None, int, float, NumPy scalar or np.ndarray, optional
        The scaling applied to the reference points. A number (Python or
        NumPy integer/floating scalar) multiplies every coordinate; an array
        is applied as a matrix through ``np.dot(reference_points, scaling)``.
        ``None`` (default) leaves the points unscaled.

    Returns
    -------
    np.ndarray
        An array of reference points including the centroid and the points on each axis.

    Raises
    ------
    TypeError
        If ``scaling`` is not ``None``, a number, or a numpy array.
    """
    centroid = np.zeros(dimensionality)
    axis_points = np.eye(dimensionality)
    reference_points = np.vstack((centroid, axis_points))

    if scaling is None:
        return reference_points

    # NumPy scalars such as np.float32 or np.int64 are not subclasses of the
    # Python float/int types, so they are accepted explicitly here.
    if isinstance(scaling, (float, int, np.floating, np.integer)):
        reference_points *= scaling
    elif isinstance(scaling, np.ndarray):
        reference_points = np.dot(reference_points, scaling)
    else:
        raise TypeError("Scaling must be either a number (factor) or a numpy array (matrix).")

    return reference_points



[docs]
def compute_distances(molecule_data: np.ndarray, scaling=None):
    """
    Calculate the Euclidean distance between each point in molecule_data and scaled reference points.

    The reference points are produced by ``generate_reference_points`` for the
    number of columns of ``molecule_data`` and scaled according to ``scaling``
    (a factor or a matrix, depending on its type).

    Parameters
    ----------
    molecule_data : np.ndarray
        Data of the molecule with each row representing a point.
    scaling : float, np.ndarray
        The scaling applied to the reference points.

    Returns
    -------
    np.ndarray
        A matrix of distances, where each element [i, j] is the distance between the i-th molecule data point and the j-th reference point.
    """
    reference_points = generate_reference_points(molecule_data.shape[1], scaling)

    # A single pairwise-distance call replaces the per-pair Python loop over
    # distance.euclidean; rows index molecule points, columns reference points.
    return distance.cdist(molecule_data, reference_points, metric='euclidean')



[docs]
def compute_statistics(distances):
    """
    Compute mean, standard deviation and skewness for every row of ``distances``.

    Parameters
    ----------
    distances : np.ndarray
        Matrix of distances; the statistics are taken along axis 1, so each
        row yields one (mean, standard deviation, skewness) triple.

    Returns
    -------
    list
        Flat list of the triples, row after row:
        ``[mean_0, std_0, skew_0, mean_1, std_1, skew_1, ...]``.
    """
    per_row_moments = (
        np.mean(distances, axis=1),
        np.std(distances, axis=1),
        # NaN skewness values are replaced by np.nan_to_num.
        np.nan_to_num(skew(distances, axis=1)),
    )

    # One line per input row, columns ordered (mean, std, skewness).
    moment_table = np.column_stack(per_row_moments)

    # Row-major flattening keeps each row's three moments adjacent.
    return list(moment_table.ravel())



[docs]
def generate_fingerprint_from_transformed_data(molecule_data: np.ndarray, scaling):
    """
    Build a fingerprint from already-transformed molecular data.

    Distances from every data point to the scaled reference points are
    computed with ``compute_distances``; the distance matrix is then
    transposed so that ``compute_statistics`` summarises the distances
    per reference point.

    Parameters
    ----------
    molecule_data : np.ndarray
        Transformed data of the molecule, each row representing a transformed point.
    scaling : float, np.ndarray
        The scaling applied to the reference points.

    Returns
    -------
    list
        Fingerprint derived from the distance measurements to scaled reference points.
    """
    # Transpose: rows become reference points, so statistics are per reference point.
    distances_by_reference = compute_distances(molecule_data, scaling).T
    return compute_statistics(distances_by_reference)



[docs]
def generate_fingerprint_from_data(molecule_data: np.ndarray, scaling='matrix', chirality=False):
    """
    Generate a fingerprint directly from molecular data.

    This function takes the data of a molecule, applies PCA transformation considering chirality if needed,
    and computes the fingerprint.

    Parameters
    ----------
    molecule_data : np.ndarray
        Data of the molecule, with each row representing a point.
    scaling : str, float, or np.ndarray
        Specifies the scaling applied to reference points. If set to 'matrix' (default),
        a scaling matrix is automatically computed based on the PCA-transformed data.
        If a float is provided, it's used as a scaling factor. If a numpy.ndarray is provided,
        it's used as a scaling matrix.
    chirality : bool, optional
        Consider chirality in PCA transformation if set to True.

    Returns
    -------
    list or tuple
        Fingerprint of the molecule, and dimensionality if chirality is considered.
    """
    # With chirality enabled the PCA helper's result is unpacked as a
    # (transformed_data, dimensionality) pair; otherwise it is used directly.
    if chirality:
        transformed_data, dimensionality = compute_pca_using_covariance(molecule_data, chirality=chirality)
    else:
        transformed_data = compute_pca_using_covariance(molecule_data, chirality=chirality)

    # Determine scaling. The type is checked before comparing because
    # `ndarray == 'matrix'` can produce an element-wise boolean array, whose
    # truth value is ambiguous and makes the `if` raise ValueError.
    if isinstance(scaling, str) and scaling == 'matrix':  # Default behaviour
        scaling_value = compute_scaling_matrix(transformed_data)
    else:
        scaling_value = scaling

    fingerprint = generate_fingerprint_from_transformed_data(transformed_data, scaling_value)

    return (fingerprint, dimensionality) if chirality else fingerprint



[docs]
def generate_fingerprint_from_molecule(molecule, features=DEFAULT_FEATURES, scaling='matrix', chirality=False, removeHs=False):
    """
    Compute the fingerprint of a molecule from the selected features.

    The molecule is first converted to n-dimensional data with
    ``molecule_to_ndarray`` (optionally dropping hydrogens), and that data is
    then handed to ``generate_fingerprint_from_data`` together with the
    scaling and chirality options.

    Parameters
    ----------
    molecule : RDKit Mol
        RDKit molecule object.
    features : dict, optional
        Features to consider for molecule conversion. Default is DEFAULT_FEATURES.
    scaling : str, float, or np.ndarray
        Scaling applied to the reference points: 'matrix' to have a scaling
        matrix computed, a float to use a scaling factor, or a numpy.ndarray
        to use directly as the scaling matrix.
    chirality : bool, optional
        If True, chirality is considered in the PCA transformation.
    removeHs : bool, optional
        If True, hydrogen atoms are removed from the molecule before conversion.

    Returns
    -------
    list or tuple
        Fingerprint of the molecule. If chirality is considered, a
        ``(fingerprint, dimensionality)`` tuple is returned instead.
    """
    # Convert the molecule to n-dimensional data
    ndim_data = molecule_to_ndarray(molecule, features, removeHs=removeHs)

    # generate_fingerprint_from_data already yields either the bare fingerprint
    # or the (fingerprint, dimensionality) pair depending on `chirality`.
    return generate_fingerprint_from_data(ndim_data, scaling=scaling, chirality=chirality)