# Source code for hsr.pre_processing

# HSR: Hyper-Shape Recognition
# This file is part of HSR, which is licensed under the
# GNU Lesser General Public License v3.0 (or any later version).
# See the LICENSE file for more details.

# Script to collect and pre-process molecules from files and
# convert them into data structures used to compute their similarity,
# based on a PCA method that considers coordinates and additional features.

import numpy as np
from rdkit import Chem
from rdkit.Chem.rdmolfiles import MolFromMol2File, MolFromMolFile, MolFromPDBFile, MolFromXYZFile, SDMolSupplier
from .utils import *


def read_mol_from_file(path, removeHs=False, sanitize=False):
    """
    General reader for molecules from files.

    The reader is chosen from the text after the last '.' in ``path``.
    The comparison is case-insensitive, so ``ligand.PDB`` and
    ``ligand.pdb`` are dispatched to the same reader.

    Parameters
    ----------
    path : str
        Path to the file. Recognised extensions are 'mol', 'mol2', 'pdb',
        'xyz' and 'sdf'.
    removeHs : bool, optional
        Whether to remove hydrogens. Defaults to False.
        Not forwarded to the XYZ reader.
    sanitize : bool, optional
        Whether to sanitize the molecules. Defaults to False.
        Not forwarded to the XYZ reader.

    Returns
    -------
    rdkit.Chem.rdchem.Mol or None
        A RDKit molecule object. For SDF files, only the first item
        produced by the supplier is returned (None if it produces nothing).
        None is also returned, after printing a message, when the
        extension is not one of the recognised formats.
    """
    raw_extension = path.split('.')[-1]
    # Dispatch on the lower-cased extension so upper- or mixed-case file
    # names (e.g. 'LIGAND.PDB') are not rejected as unsupported.
    extension = raw_extension.lower()

    if extension == 'mol':
        return MolFromMolFile(path, removeHs=removeHs, sanitize=sanitize)
    elif extension == 'mol2':
        return MolFromMol2File(path, removeHs=removeHs, sanitize=sanitize)
    elif extension == 'pdb':
        return MolFromPDBFile(path, removeHs=removeHs, sanitize=sanitize)
    elif extension == 'xyz':
        return MolFromXYZFile(path)
    elif extension == 'sdf':
        suppl = Chem.SDMolSupplier(path, removeHs=removeHs, sanitize=sanitize)
        # Only the first record of the SDF file is of interest here
        return next(suppl, None)
    else:
        # Report the extension exactly as it appeared in the path
        print(f"Unsupported file format: {raw_extension}")
        return None
def load_molecules_from_sdf(path, removeHs=False, sanitize=False):
    """
    Read every usable molecule from an SDF file into a list.

    Parameters
    ----------
    path : str
        Location of the SDF file.
    removeHs : bool, optional
        Passed to the SDF supplier; whether hydrogens are removed.
        Defaults to False.
    sanitize : bool, optional
        Passed to the SDF supplier; whether molecules are sanitized.
        Defaults to False.

    Returns
    -------
    list of rdkit.Chem.rdchem.Mol
        The molecules yielded by the supplier, in file order, with any
        ``None`` entries left out.
    """
    supplier = Chem.SDMolSupplier(path, removeHs=removeHs, sanitize=sanitize)

    loaded = []
    for entry in supplier:
        # Entries that come back as None are dropped from the result
        if entry is None:
            continue
        loaded.append(entry)
    return loaded
def molecule_to_ndarray(molecule, features=DEFAULT_FEATURES, removeHs=False):
    """
    Build a centered N-dimensional numpy representation of a molecule.

    Every retained atom contributes one row made of its three spatial
    coordinates followed by one value per entry of ``features``. After
    the rows are assembled, the mean of each column is subtracted so the
    returned data is centered at the origin.

    Parameters
    ----------
    molecule : rdkit.Chem.rdchem.Mol
        The input RDKit molecule object. Atom positions are read from
        ``molecule.GetConformer()``.
    features : dict[str, callable], optional
        Mapping from a feature name to a function that receives an RDKit
        atom and returns a numeric value for it. Columns follow the
        iteration order of this mapping. Defaults to DEFAULT_FEATURES.
    removeHs : bool, optional
        If True, atoms with atomic number 1 are left out of the array.
        Defaults to False.

    Returns
    -------
    numpy.ndarray
        Array with shape (number of atoms, 3 spatial coordinates +
        number of features), representing the molecule.
    """
    # One list per output column group; coordinates always come first
    columns = {'coordinates': []}
    if features:
        columns.update((name, []) for name in features)

    for atom in molecule.GetAtoms():
        # Keep the atom unless hydrogens are being dropped and it is one
        keep_atom = not removeHs or atom.GetAtomicNum() != 1
        if not keep_atom:
            continue

        point = molecule.GetConformer().GetAtomPosition(atom.GetIdx())
        columns['coordinates'].append([point.x, point.y, point.z])

        if features:
            for name, compute in features.items():
                columns[name].append(compute(atom))

    # Coordinates are already 2-D; each feature becomes a single column
    blocks = [
        np.array(values) if name == 'coordinates'
        else np.array(values).reshape(-1, 1)
        for name, values in columns.items()
    ]
    stacked = np.hstack(blocks)

    # Centering data
    return stacked - stacked.mean(axis=0)