Source code for hsr.pca_transform

# HSR: Hyper-Shape Recognition
# This file is part of HSR, which is licensed under the
# GNU Lesser General Public License v3.0 (or any later version).
# See the LICENSE file for more details.

# Script to perform Principal Component Analysis (PCA) analysis on n-dimensional molecular data 
# and return the transformed data for fingerprint calculation

import numpy as np
from scipy.stats import skew


[docs]
def compute_pca_using_covariance(original_data, chirality=False, return_axes=False, print_steps=False):
    """    
    Perform Principal Component Analysis (PCA) using eigendecomposition of the covariance matrix.

    This function conducts PCA on a given dataset to produce a consistent reference system,
    facilitating comparison between different molecules. 
    It emphasizes generating eigenvectors that provide deterministic outcomes and consistent orientations. 
    The function also includes an option to handle chiral molecules by ensuring a positive determinant 
    for the transformation matrix.

    Parameters
    ----------
    original_data : numpy.ndarray
        An N-dimensional array representing a molecule, where each row is a sample/point. 
        The array should have a shape (n_samples, n_features), where n_samples is the number 
        of samples and n_features is the number of features.

    chirality : bool, optional
        If set to True, the function ensures that the determinant of the transformation 
        matrix is positive, allowing for the distinction of chiral molecules. 
        Default is False.
    
    return_axes : bool, optional
        If True, returns the principal axes (eigenvectors) in addition to the transformed data. 
        Default is False.
    
    print_steps : bool, optional
        If True, prints the steps of the PCA process: covariance matrix, eigenvalues, eigenvectors and
        transformed data. Default is False. 

    Returns
    -------
    transformed_data : numpy.ndarray
        The dataset after PCA transformation. This data is aligned to the principal components 
        and is of the same shape as the original data.
    
    dimensionality : int
        The number of significant dimensions in the transformed data. 
        Only returnd if chirality is True.
    
    eigenvectors : numpy.ndarray, optional
        Only returned if return_axes is True. The principal axes of the transformation, represented 
        as eigenvectors. Each column corresponds to an eigenvector.
    """
    covariance_matrix = np.cov(original_data, rowvar=False, ddof=0,) 
    
    eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)
    eigenvalues, eigenvectors = eigenvalues[::-1], eigenvectors[:, ::-1]
    
    threshold = 1e-4
    significant_indices = np.where(abs(eigenvalues) > threshold)[0]

    # Handle chirality
    if chirality:
        original_eigenvectors_number = eigenvectors.shape[1]
        reduced_eigenvectors = extract_relevant_subspace(eigenvectors, significant_indices)
        # If the number of eigenvectors is different from the number of significant indices,
        # the chirality cannot be unambigously. The result may not be consistent.
        if original_eigenvectors_number != len(significant_indices):
            print(f'WARNING: Chirality may not be consistent. {original_eigenvectors_number-len(significant_indices)} vectors have arbitrary signs.')
        
        determinant = np.linalg.det(reduced_eigenvectors) 
        # if determinant < 0:
        #     eigenvectors[:, 0] *= -1
   
        adjusted_eigenvectors, n_changes, best_eigenvector_to_flip  = adjust_eigenvector_signs(original_data, eigenvectors[:, significant_indices], chirality) 
        eigenvectors[:, significant_indices] = adjusted_eigenvectors

        # if n_changes % 2 == 1:            
        #     eigenvectors[:, best_eigenvector_to_flip] *= -1
        
        # New approach to handle chirality by determinant imposition
        if determinant*(-1)**n_changes < 0:
            eigenvectors[:, best_eigenvector_to_flip] *= -1
    
        transformed_data = np.dot(original_data, eigenvectors)
        dimesnionality = len(significant_indices)
        
        if print_steps:
            print(f'Covariance matrix:\n{covariance_matrix}\n')
            print(f'Eigenvalues:\n{eigenvalues}\n')
            print(f'Eigenvectors:\n{eigenvectors}\n')
            print(f'Transformed data:\n{transformed_data}\n')
            
        if return_axes:
            return transformed_data, dimesnionality, eigenvectors
        else:
            return transformed_data, dimesnionality

    adjusted_eigenvectors, n_changes, best_eigenvector_to_flip  = adjust_eigenvector_signs(original_data, eigenvectors[:, significant_indices], chirality) 
    eigenvectors[:, significant_indices] = adjusted_eigenvectors
    
    transformed_data = np.dot(original_data, eigenvectors)
    
    if print_steps:
        print(f'Covariance matrix:\n{covariance_matrix}\n')
        print(f'Eigenvalues:\n{eigenvalues}\n')
        print(f'Eigenvectors:\n{eigenvectors}\n')
        print(f'Transformed data:\n{transformed_data}\n')

    if return_axes:
        return transformed_data, eigenvectors
    else:
        return  transformed_data



[docs]
def adjust_eigenvector_signs(original_data, eigenvectors, chirality=False, tolerance= 1e-10):
    """
    Adjust the sign of eigenvectors based on the data's projections.

    This function iterates through each eigenvector and determines its sign by examining 
    the direction of the data's maximum projection along that eigenvector. If the maximum 
    projection is negative, the sign of the eigenvector is flipped. The function also 
    handles special cases such as symmetric distributions of projections and can adjust 
    eigenvectors based on chirality considerations.
    
    Parameters
    ----------
    original_data : numpy.ndarray
        N-dimensional array representing a molecule, where each row is a sample/point.
    eigenvectors : numpy.ndarray
        Eigenvectors obtained from the PCA decomposition.
    chirality : bool, optional
        If True, the function also considers the skewness of the projections to decide 
        on flipping the eigenvector. This is necessary for distinguishing 
        chiral molecules. Defaults to False.
    tolerance : float, optional
        Tolerance used when comparing projections. Defaults to 1e-4.

    Returns
    -------
    eigenvectors : numpy.ndarray
        Adjusted eigenvectors with their sign possibly flipped.
    sign_changes : int
        The number of eigenvectors that had their signs changed.
    best_eigenvector_to_flip : int
        Index of the eigenvector with the highest skewness, relevant when chirality 
        is considered. This is the eigenvector most likely to be flipped to preserve 
        chirality.
    """
    sign_changes = 0
    symmetric_eigenvectors = []
    skewness_values = []
    best_eigenvector_to_flip = 0
    
    for i in range(eigenvectors.shape[1]):
        # Compute the projections of the original data onto the current eigenvector
        projections = original_data.dot(eigenvectors[:, i])
        
        # Compute skewness for current projections
        if chirality:
            current_skewness = skew(projections)
            skewness_values.append(abs(current_skewness))

        remaining_indices = np.arange(original_data.shape[0]) 
        max_abs_coordinate = np.max(np.abs(projections))

        while True:
            # Find the points with maximum absolute coordinate among the remaining ones
            mask_max = np.isclose(np.abs(projections[remaining_indices]), max_abs_coordinate, atol=tolerance)
            max_indices = remaining_indices[mask_max]  
            
            # If all points with the maximum absolute coordinate have the same sign, use them for a decision
            unique_signs = np.sign(projections[max_indices])
            if np.all(unique_signs == unique_signs[0]):
                break

            if len(max_indices) == 1:
                break
            
            # If there is a tie, ignore these points and find the maximum absolute coordinate again
            remaining_indices = remaining_indices[~mask_max]
            if len(remaining_indices) == 0: # if all points have the same component, break the loop
                symmetric_eigenvectors.append(i)
                break
            max_abs_coordinate = np.max(np.abs(projections[remaining_indices]))
        
        if len(remaining_indices) > 0 and projections[max_indices[0]] < 0:
            eigenvectors[:, i] *= -1
            sign_changes += 1
            
    if symmetric_eigenvectors:
        if sign_changes % 2 == 1:
            eigenvectors[:, symmetric_eigenvectors[0]] *= -1
            sign_changes = 0
            
    if chirality:
        best_eigenvector_to_flip = np.argmax(skewness_values)   
         
    return eigenvectors, sign_changes, best_eigenvector_to_flip 



[docs]
def extract_relevant_subspace(eigenvectors, significant_indices, tol=1e-10):
    """
    Extracts the subset of eigenvectors that's relevant for the determinant calculation.
    
    This function prunes eigenvectors by removing rows and columns that have all zeros 
    except for a single entry close to 1 or -1 within a given tolerance (eigenvectors with
    an eigenvalue equal to 0, and relative components). Then, it further 
    reduces the matrix using the provided significant indices to give a relevant 
    subset of eigenvectors.

    Parameters
    ----------
    eigenvectors : numpy.ndarray
        The eigenvectors matrix to prune and reduce.
    significant_indices : numpy.ndarray
        Indices of significant eigenvectors.
    tol : float, optional (default = 1e-10)
        Tolerance for determining whether a value is close to 0, 1, or -1.

    Returns
    -------
    numpy.ndarray
        The determinant-relevant subset of eigenvectors.
    """
    
    row_mask = ~np.all((np.abs(eigenvectors) < tol) | (np.abs(eigenvectors - 1) < tol) | (np.abs(eigenvectors + 1) < tol), axis=1)    
    col_mask = ~np.all((np.abs(eigenvectors.T) < tol) | (np.abs(eigenvectors.T - 1) < tol) | (np.abs(eigenvectors.T + 1) < tol), axis=1)
    pruned_eigenvectors = eigenvectors[row_mask][:, col_mask]
    reduced_eigenvectors = pruned_eigenvectors[significant_indices][:, significant_indices]
    return reduced_eigenvectors