Source code for hsr.pre_processing

# HSR: Hyper-Shape Recognition
# This file is part of HSR, which is licensed under the
# GNU Lesser General Public License v3.0 (or any later version).
# See the LICENSE file for more details.

# Script to collect and pre-process molecules from files and
# convert them into data structures used to compute their similarity with a
# PCA-based method that considers coordinates and additional features.

import numpy as np
from rdkit import Chem
from rdkit.Chem.rdmolfiles import MolFromMol2File, MolFromMolFile, MolFromPDBFile, MolFromXYZFile, SDMolSupplier
from .utils import *



[docs]
def read_mol_from_file(path, removeHs=False, sanitize=False):
    """
    General reader for molecules from files.

    The reader is selected from the text after the last '.' in ``path``.
    The comparison is case-insensitive, so ``ligand.SDF`` and ``ligand.sdf``
    are handled the same way.

    Parameters
    ----------
    path : str 
        Path to the file. Supported extensions: mol, mol2, pdb, xyz, sdf.
    removeHs : bool, optional
        Whether to remove hydrogens. Defaults to False.
        Not forwarded to the XYZ reader.
    sanitize : bool, optional
        Whether to sanitize the molecules. Defaults to False.
        Not forwarded to the XYZ reader.

    Returns
    -------
    rdkit.Chem.rdchem.Mol or None
        A RDKit molecule object. For SDF files only the first entry yielded
        by the supplier is returned (None if the supplier yields nothing).
        None is also returned, after printing a message, when the extension
        is not supported.
    """
    # Keep the extension exactly as written for the diagnostic message, but
    # dispatch on a lower-cased copy so upper/mixed-case extensions work.
    raw_extension = path.split('.')[-1]
    extension = raw_extension.lower()

    if extension == 'mol':
        return MolFromMolFile(path, removeHs=removeHs, sanitize=sanitize)
    if extension == 'mol2':
        return MolFromMol2File(path, removeHs=removeHs, sanitize=sanitize)
    if extension == 'pdb':
        return MolFromPDBFile(path, removeHs=removeHs, sanitize=sanitize)
    if extension == 'xyz':
        # The XYZ reader is called with the path only.
        return MolFromXYZFile(path)
    if extension == 'sdf':
        suppl = Chem.SDMolSupplier(path, removeHs=removeHs, sanitize=sanitize)
        # Only the first record is of interest; fall back to None when the
        # supplier is exhausted immediately.
        return next(suppl, None)

    print(f"Unsupported file format: {raw_extension}")
    return None



[docs]
def load_molecules_from_sdf(path, removeHs=False, sanitize=False):
    """
    Read every usable molecule stored in an SDF file.

    Parameters
    ----------
    path : str
        Location of the SDF file on disk.
    removeHs : bool, optional
        Forwarded to the SDF supplier; whether hydrogens are removed.
        Defaults to False.
    sanitize : bool, optional
        Forwarded to the SDF supplier; whether molecules are sanitized.
        Defaults to False.

    Returns
    -------
    list of rdkit.Chem.rdchem.Mol
        The molecules in file order. Entries the supplier yields as None
        are left out of the list.
    """
    supplier = Chem.SDMolSupplier(path, removeHs=removeHs, sanitize=sanitize)

    loaded = []
    for entry in supplier:
        # Drop placeholders the supplier emits instead of a molecule.
        if entry is None:
            continue
        loaded.append(entry)
    return loaded



[docs]
def molecule_to_ndarray(molecule, features=DEFAULT_FEATURES, removeHs=False):
    """
    Generate a numpy array representing the given molecule in N dimensions.

    Each kept atom becomes one row: its x, y, z position taken from the
    conformer returned by ``molecule.GetConformer()``, followed by one column
    per entry of ``features`` (in the dictionary's iteration order). Every
    column, coordinates and features alike, is then centred by subtracting
    its mean over the atoms.

    Parameters
    ----------
    molecule : rdkit.Chem.rdchem.Mol
        The input RDKit molecule object.
    features : dict[str, callable], optional
        A dictionary where each key is a feature name (str) and the value is a callable 
        function to compute that feature. The function takes an RDKit atom object as input 
        and returns a feature value (a numeric type).
        If None or empty, only the three coordinate columns are produced.
        Defaults to DEFAULT_FEATURES.
    removeHs : bool, optional
        If True, hydrogen atoms (atomic number 1) will not be included in the
        array representation. Defaults to False.

    Returns
    -------
    numpy.ndarray
        Array with shape (number of atoms, 3 spatial coordinates + number of features),
        representing the molecule. If no atom is kept, an empty array with
        shape (0, 3 + number of features) is returned.
    """
    # None and {} both mean "coordinates only".
    feature_funcs = features if features else {}

    # Select the atoms once so coordinates and features share the same rows.
    atoms = [
        atom for atom in molecule.GetAtoms()
        if not (removeHs and atom.GetAtomicNum() == 1)
    ]

    if not atoms:
        # Nothing to stack or centre; return a correctly shaped empty array
        # instead of failing on mismatched dimensions in np.hstack.
        return np.empty((0, 3 + len(feature_funcs)))

    # The conformer does not change between atoms, so fetch it a single time.
    conformer = molecule.GetConformer()
    coordinates = []
    for atom in atoms:
        position = conformer.GetAtomPosition(atom.GetIdx())
        coordinates.append([position.x, position.y, position.z])

    columns = [np.array(coordinates)]
    for func in feature_funcs.values():
        # One column per feature, shaped (n_atoms, 1) so it stacks next to
        # the (n_atoms, 3) coordinate block.
        values = [func(atom) for atom in atoms]
        columns.append(np.array(values).reshape(-1, 1))

    mol_nd = np.hstack(columns)
    # Centering data
    return mol_nd - np.mean(mol_nd, axis=0)