Source code for modifinder.classes.Compound



import modifinder.utilities.general_utils as general_utils
from modifinder.utilities.mol_utils import _get_molecule
import rdkit.Chem.rdMolDescriptors as rdMolDescriptors
from rdkit import Chem
import numpy as np
import math
import uuid

# import modifinder as mf
from modifinder.classes.Spectrum import Spectrum
from modifinder.classes.StructureMeta import StructureMeta
from modifinder import convert


[docs]class Compound:
    """ A class to represent a compound 

    The compound always has spectrum data.
    If the compound structure is known, it also has a structure property.
    
    Attributes
    ----------
    
    id (str) : The id of the compound
        
    spectrum (Spectrum) : Spectrum Tuple representing mz, intensity, Precursor mass and Charge for mass spectrumetry data

    structure (Chem.Mol): The structure of the compound
    
    is_known (bool): A boolean indicating whether the compound is known, derived from the structure
    
    usi (str): The USI of the compound
    
    name (str): The name of the compound
    
    accession (str): The accession of the compound
    
    library_membership (str): The GNPS library membership of the compound
    
    Examples
    --------
    Create a compound by providing the necessary information:

    >>> compound = Compound(id="CCMSLIB00005435812", peaks=[[110.066925,38499.089844],[138.060638,412152.093750],[195.079575,6894530.000000],[195.200180,480874.812500],[196.082092,43027.628906]], precursor_mz=195.087, precursor_charge=1, adduct="[M+H]+", smiles="CN1C=NC2=C1C(=O)N(C(=O)N2C)C")

    Alternatively, you can create a compound by providing a dictionary of data:

    >>> data = {
    ...     "id": "CCMSLIB00005435812",
    ...     "peaks": [[110.066925,38499.089844],[138.060638,412152.093750],[195.079575,6894530.000000],[195.200180,480874.812500],[196.082092,43027.628906]],
    ...     "precursor_mz": 195.087,
    ...     "charge": 1,
    ...     "adduct": "[M+H]+",
    ...     "smiles": "CN1C=NC2=C1C(=O)N(C(=O)N2C)C"
    ... }
    >>> compound = Compound(data)

    You can also create a compound by providing a usi:

    >>> usi = "mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00005435812"
    >>> compound = Compound(usi)

    or with an accession:

    >>> accession = "CCMSLIB00005435812"
    >>> compound = Compound(accession)
    
    """
    
[docs]    def __init__(self, incoming_data=None, **kwargs):
        """Initialize the Compound class

        The compound class can be initialized in three different ways:
        1. By providing a dictionary of data to the *data* parameter that contains all the necessary information
        2. By providing a usi to the *data* parameter to retrieve the necessary information from GNPS
        3. By providing the necessary information as parameter
        
        If both the *data* and the parameters are provided, the parameters will override the data.
        
        If of the incoming data and kwargs are provided, an empty instance of the class will be created.

        Parameters
        ----------
        incoming_data : input data (optional, default: None)
            The data to initialize the class with, can be a dictionary of data, a usi, or a compound object. If not provided,
            the class will be initialized with the provided keyword arguments. If provided, the keyword arguments will still
            override the data.

        kwargs : keyword arguments (optional, default: No attributes)
            Attributes to initialize the class with, if provided, they will override the attributes from the data

        See Also
        --------
        modifinder.convert 
        
        Spectrum

        """
        
        # define the attributes of the class
        self._structure = None
        self.id = None
        self._spectrum = None
        self.usi = None
        self.is_known = None
        self.name = None
        self._distances = None
        self.additional_attributes = {}
        self._exact_mass = None

        if incoming_data is None and len(kwargs) == 0:
            return

        # attempt to initialize the class with the provided data
        if incoming_data is not None:
            convert.to_compound(incoming_data, use_object = self, **kwargs)
        else:
            self.update(**kwargs)

        # TODO: write setters for spectrum, structure to warn the user to update the dependent attributes

    @property
    def structure(self):
        return self._structure

    @structure.setter
    def structure(self, value):
        self._structure = value
        # Automatically update the related attributes
        if self._structure is not None:
            self._exact_mass = rdMolDescriptors.CalcExactMolWt(self._structure)
            self._distances = Chem.rdmolops.GetDistanceMatrix(self._structure)
        if self.is_known is None and self._structure is not None:
            self.is_known = True
        
        if self.is_known and self._structure is None:
            self.is_known = None
        
    @property
    def spectrum(self):
        return self._spectrum
    
    @spectrum.setter
    def spectrum(self, value):
        self._spectrum = value
            
    @property
    def distances(self) -> dict:
        """Get the distances between every pair of atoms in the compound
        
        Returns
        -------
            dict: A dictionary of distances between every pair of atoms in the compound
        """
        return self._distances
    
    @property
    def exact_mass(self) -> float:
        """Get the exact mass of the compound
        
        Returns
        -------
            float: The exact mass of the compound
        """
        return self._exact_mass
    
    
[docs]    def clear(self):
        """Clear the compound data and reset all the attributes to None"""
        self.structure = None
        self.id = None
        self.spectrum = None
        self.usi = None
        self.is_known = None
        self.name = None
        self._distances = None
        self.additional_attributes = {}
        self._exact_mass = None
        
    

[docs]    def update(self, _structure = None, structure = None, id: str = None, _spectrum: Spectrum = None, spectrum: Spectrum = None, usi: str = None, 
               is_known: bool = None, name: str = None,
               additional_attributes: dict = {}, 
               **kwargs):
        """Update the attributes of the class

        Parameters
        ----------
        _structure (Chem.Mol): The structure of the compound
        structure (Chem.Mol, Smiles, InChi): The structure of the compound
        id (str): The id of the compound
        _spectrum (Spectrum): an instance of Spectrum containing peak information [(mz, intensity)], precursor mass, charge, and adduct for mass spectrumetry data
        spectrum (Spectrum): an instance of Spectrum containing peak information [(mz, intensity)], precursor mass, charge, and adduct for mass spectrumetry data
        usi (str): The USI of the compound
        is_known (bool): A boolean indicating whether the compound is known, if set to False, annotators or other parts of the code will treat this compound as unknown, if not provided but the structure is provided, it will be set to True
        name (str): The name of the compound
        distances (dict): A dictionary of distances between every pair of atoms in the compound, if not provided, it will be calculated from the structure
        **kwargs: Additional data
            - A use case is for the scenarios where the *data* parameter is provided, these arguments will be used to parse and clean the data
        """
        # convert keys to lowercase
        lower_kwargs = {key.lower(): value for key, value in kwargs.items()}
        self.structure = _structure if _structure is not None else self.structure
        try:
            temp_structure = _get_molecule(structure, **lower_kwargs)
            self.structure = temp_structure if temp_structure is not None else self.structure
        except Exception:
            pass
        self.id = id if id is not None else self.id
        self.spectrum = _spectrum if _spectrum is not None else self.spectrum
        if spectrum is not None or "precursor_mz" in lower_kwargs:
            try:
                spectrum = convert.to_spectrum(spectrum, **lower_kwargs)
            except Exception as e:
                spectrum = None
                raise e
        
        if spectrum is not None and spectrum.mz is not None:
            self.spectrum = spectrum
        self.usi = usi if usi is not None else self.usi
        self.is_known = is_known if (is_known is not None) else self.is_known
        self.name = name if name is not None else self.name
        if self.name is None:
            self.name = lower_kwargs.get('compound_name', None)

        # update the rest of the attributes
        self.additional_attributes.update(additional_attributes)
        
        self._parse_data()
    

    def _parse_data(self):
        """ Parse missing and verify the data of the class"""
        # if self.is_known is None:
        #     self.is_known = (self.structure is not None)
            
        # if no id is provided, generate one
        if self.id is None:
            self.id = str(uuid.uuid4())
            
        
        # TODO: check for a valid compound
        # what is needed for a compound:
        # id, spectrum.peaks, spectrum.precursor_mass, spectrum.charge, spectrum.adduct
    

    def __str__(self):
        valuable_data_keys = ['id', 'name', 'usi']
        strings = [f"{key}: {self.__dict__[key]}" for key in valuable_data_keys if self.__dict__[key] is not None]
        joined = ', '.join(strings)

        result = f"Compound({joined}) with {len(self.spectrum.mz)} peaks"
        if self.structure is not None:
            result += f" and structure {Chem.MolToSmiles(self.structure)}"

        return result
    

[docs]    def copy(self):
        """Return a copy of the compound"""
        copied_compound = Compound()
        convert.to_compound(self, use_object = copied_compound)
        return copied_compound
    

[docs]    def get_meta_data(self):
        """ Get the meta data of the compound

        Returns:
            dict: A dictionary containing the meta data of the compound
        """
        description = {
            "num_peaks": len(self.spectrum.mz),
            "adduct": self.spectrum.adduct,
            "precursor_mz": self.spectrum.precursor_mz,
            "charge": self.spectrum.precursor_charge,
        }

        if self.name is not None:
            description["name"] = self.name
        
        if self.is_known:
            st_meta = StructureMeta.from_structure(self.structure)
            description.update(st_meta.__dict__)
        
        return description
    
    
[docs]    def print_peaks_to_fragments_map(self, peaks: list = None):
        """Print the peaks to fragments map
        
        Parameters
        ----------
        peaks : list, optional (default: None)
            The list of peaks to print the fragments for, if None, print all peaks
        """
        if peaks is None:
            peaks = range(len(self.spectrum.mz))
        
        for peak in peaks:
            print(f"Peak {peak}: {self.spectrum.mz[peak]}, Fragments: {self.spectrum.peak_fragments_map[peak]}")
        print()
        
    
[docs]    def find_existance(self, peakids: list):
        """
        For each atom, and for each peak in the list, find the fragments that the atom is part of
        
        Parameters
        ----------
            peakids : list of peak ids
        
        Returns
        -------
            existance : list of dicts for each atom, each dict contains the peak ids as keys and the fragments as values
            
        """
        
        existance = [dict() for i in range(len(self.structure.GetAtoms()))]
        for peak in peakids:
            for fragment in self.spectrum.peak_fragments_map[peak]:
                # get all the bits that are 1 in the fragment
                bin_fragment = bin(fragment)
                len_fragment = len(bin_fragment)
                hitAtoms = [len_fragment-i-1 for i in range(len(bin_fragment)) if bin_fragment[i] == '1']
                for atom in hitAtoms:
                    if peak not in existance[atom]:
                        existance[atom][peak] = []
                    existance[atom][peak].append(fragment)
        return existance
    
    
[docs]    def calculate_contribution_atom_in_peak(self, atom: int, peakindx:int, existance_data:list, CI:bool = False, CPA:bool = True, CFA:bool = True):
        """Calculates the contribution of an atom to a peak

        Parameters
        ----------
        atom : int
            The index of the atom
        peakindx : int
            The index of the peak
        existance_data : list of dicts
            The existance data for the atoms
        CI : bool, optional (default: False)
            Calculate the intensity factor
        CPA : bool, optional (default: True)
            Calculate the atom peak ambiguity factor
        CFA : bool, optional (default: True)
            Calculate the fragment ambiguity factor
        """
        contribution = 0
        if peakindx not in existance_data[atom]:
            return contribution
        
        intensity_factor = 1
        atom_peak_ambiguity_factor = 1
        fragment_ambiguity_factor = 1

        if CI:
            intensity_factor = self.spectrum.intensity[peakindx]
        if CPA:
            atom_peak_ambiguity_factor = 1/len(self.spectrum.peak_fragments_map[peakindx])

        for frag in existance_data[atom][peakindx]:
            if CFA:
                fragment_ambiguity_factor = 1/(bin(frag).count("1"))
            
            contribution += intensity_factor * atom_peak_ambiguity_factor * fragment_ambiguity_factor
        
        return contribution
    
    
[docs]    def calculate_contributions(self, peakids, CI = False, CPA = True, CFA = True, CPE = True):
        """ 
        input:
        peakids: list of peak ids
        CI: (Consider_Intensity) bool, if True, the intensity of the peaks is considered (default: False)
        CPA: (Consider_Peak_Ambiguity) bool, if True, the peak ambiguity (number of fragments assigned to a peak) is considered (default: True)
        CFA: (Consider_Fragment_Ambiguity) bool, if True, the fragment ambiguity (number of atoms in fragment) is considered (default: True)
        CPA: (Consider_Peak_Entropy) bool, if True, the peak entropy (how ambiguis the fragments are) is considered (default: True
        """
        num_atoms = len(self.structure.GetAtoms())
        existance_data = self.find_existance(peakids)
        contributions = [0 for i in range(num_atoms)]
        peak_atom_contributions = np.zeros((len(peakids), num_atoms))
        for i, peak in enumerate(peakids):
            for atom in range(num_atoms):
                peak_atom_contributions[i][atom] = self.calculate_contribution_atom_in_peak(atom, peak, existance_data, CI=CI, CPA=CPA, CFA=CFA)
        
        if CPE:
            peak_entropies = np.zeros(len(peakids))
            for i in range(len(peakids)):
                peak_entropies[i] = 1 - general_utils.entropy(peak_atom_contributions[i])
            
            # peak_entropies = peak_entropies / np.max(peak_entropies)
        else:
            peak_entropies = np.ones(len(peakids))
            
        
        for i in range(num_atoms):
            for j in range(len(peakids)):
                contributions[i] += peak_atom_contributions[j][i] * peak_entropies[j]
        
        return contributions


[docs]    def filter_fragments_by_atoms(self, atoms: list, peaks: list = None):
        """Filter the fragments by the atoms, remove fragments that do not contain at least one of the atoms
        Parameters
        ----------
            atoms: a list of atoms to filter the fragments
            peaks: a list of peaks to filter their fragments, if None, use all peaks
        
        Returns
        -------
            updated: the number of updated peaks
        """
        if peaks is None:
            peaks = [i for i in range(len(self.peaks))]
        
        updated = 0
        for i in peaks:
            updated_fragments = set()
            for fragment in self.spectrum.peak_fragments_map[i]:
                for atom in atoms:
                    if 1 << atom & fragment:
                        updated_fragments.add(fragment)
                        break

            if len(updated_fragments) != len(self.spectrum.peak_fragments_map[i]):
                updated += 1
            
            self.spectrum.peak_fragments_map[i] = updated_fragments

        return updated
    
    
    from typing import Tuple

[docs]    def calculate_peak_annotation_ambiguity(self, peaks: list=None) -> Tuple[float, float]:
        """Calculate the peak annotation ambiguity
        
        Parameters
        ----------
            peaks : list
                a list of peaks to calculate the ambiguity for, if None, use all peaks
        Returns
        -------
            (float, float) : a tuple of two values (ambiguity, ratio)
                ambiguity: the average number of fragments per annotated peaks
                ratio: the ratio of annotated peaks to all peaks
        """
        if peaks is None:
            peaks = [i for i in range(len(self.peaks))]
        ambiguity = 0
        annotated_peaks = 0
        for peak in peaks:
            if len(self.spectrum.peak_fragments_map[peak]) > 0:
                annotated_peaks += 1
                ambiguity += len(self.spectrum.peak_fragments_map[peak])
        if annotated_peaks == 0:
            return -1, 0
        if len(peaks) == 0:
            return -1, -1
        return ambiguity / annotated_peaks, annotated_peaks / len(peaks)


[docs]    def calculate_annotation_entropy(self, peaks: list=None):
        """Calculate the entropy of the annotation
        
        Parameters
        ----------
            peaks : list
                a list of peaks to calculate the entropy for, if None, use all peaks
        
        Returns
        -------
            float : the entropy of the annotation
        """
        if peaks is None:
            peaks = [i for i in range(len(self.peaks))]
        peak_entropies = [0 for i in range(len(self.peaks))]
        n = len(self.structure.GetAtoms())
        for peak in peaks:
            atoms_appearance = [0 for i in range(n)]
            for fragment in self.spectrum.peak_fragments_map[peak]:
                for atom in range(n):
                    if 1 << atom & fragment:
                        atoms_appearance[atom] += 1
            entropy = 0
            for atom in range(n):
                if atoms_appearance[atom] > 0:
                    p = atoms_appearance[atom] / len(self.spectrum.peak_fragments_map[peak])
                    entropy -= p * math.log(p)
            peak_entropies[peak] = entropy
        if len(peak_entropies) == 0:
            return -1
        entropy = sum(peak_entropies) / len(peak_entropies)
        return entropy