Source code for modifinder.utilities.mol_utils

"""
GNPS Utils - Molecule Utils
---------------------------
This file contains utility functions around molecules and molecules modification based on RDKit library.
"""

from rdkit import Chem
from rdkit.Chem import rdFMCS
from copy import deepcopy
from modifinder.utilities.network import get_data

import rdkit.rdBase as rkrb
import rdkit.RDLogger as rkl
logger = rkl.logger()
logger.setLevel(rkl.ERROR)
rkrb.DisableLog('rdApp.error')

[docs]def get_modification_nodes(mol1, mol2, in_mol1 = True): """ Calculates the modification sites between two molecules when one molecule is a substructure of the other molecule Input: :mol1: first molecule :mol2: second molecule :in_mol1: bool, if True, the modification sites are given in the mol1, if False, the modification sites are given in the mol2 Output: list of modification sites Example ------- .. code-block:: python import modifinder.utilities.mol_utils as mf_mu import modifinder.utilities.visualizer as mf_vis from matplotlib import pyplot as plt from rdkit import Chem def mol_with_atom_index(mol): for atom in mol.GetAtoms(): atom.SetAtomMapNum(atom.GetIdx()) return mol mol1 = mol_with_atom_index(Chem.MolFromSmiles("O=C1C2=C(N=CN2)N(C)C(N1C)=O")) mol2 = mol_with_atom_index(Chem.MolFromInchi("InChI=1S/C8H10N4O2/c1-10-4-9-6-5(10)7(13)12(3)8(14)11(6)2/h4H,1-3H3")) edit_in_mol1 = mf_mu.get_modification_nodes(mol1, mol2, True) edit_in_mol2 = mf_mu.get_modification_nodes(mol1, mol2, False) fig, ax = plt.subplots(1, 2, figsize=(10, 5)) ax[0].imshow(mf_vis.draw_molecule(mol1, highlightAtoms=edit_in_mol1)) ax[1].imshow(mf_vis.draw_molecule(mol2, highlightAtoms=edit_in_mol2)) for i in range(2): ax[i].axis("off") plt.show() print("edit_in_mol1", edit_in_mol1) print("edit_in_mol2", edit_in_mol2) >>> edit_in_mol1 6 >>> edit_in_mol2 9 .. image:: ../../_static/get_modification_nodes.png """ copy_mol1, copy_mol2 = _get_molecules(mol1, mol2) if copy_mol1.HasSubstructMatch(copy_mol2): return _calculateModificationSites(copy_mol1, copy_mol2, in_mol1) elif copy_mol2.HasSubstructMatch(copy_mol1): return _calculateModificationSites(copy_mol2, copy_mol1, not in_mol1) else: raise ValueError("One molecule should be a substructure of the other molecule")
[docs]def get_modification_edges(mol1, mol2, only_outward_edges = False): """ Calculates the modification edges between two molecules when one molecule is a substructure of the other molecule Input: :mol1: first molecule :mol2: second molecule :only_outward_edges: bool, if True, only the modification edges that go from atoms in the substructure to atoms outside the substructure are returned Output: list of the the modification edges in the parent molecule as tuples of atom indices """ copy_mol1, copy_mol2 = _get_molecules(mol1, mol2) if not mol1.HasSubstructMatch(mol2): if mol2.HasSubstructMatch(mol1): copy_mol1, copy_mol2 = copy_mol2, copy_mol1 else: raise ValueError("One molecule should be a substructure of the other molecule") matches = _find_minimal_modification_edges_match(copy_mol1, copy_mol2) modificationEdgesOutward, modificationEdgesInside, _ = _get_edge_modifications(copy_mol1, copy_mol2, matches) if only_outward_edges: return modificationEdgesOutward else: return modificationEdgesOutward + modificationEdgesInside
[docs]def get_edit_distance(mol1, mol2): """ Calculates the edit distance between mol1 and mol2. Input: :mol1: first molecule :mol2: second molecule Output: :edit_distance: edit distance between mol1 and mol2 """ copy_mol1, copy_mol2 = _get_molecules(mol1, mol2) mcs1 = rdFMCS.FindMCS([copy_mol1, copy_mol2]) mcs_mol = Chem.MolFromSmarts(mcs1.smartsString) dist1 = get_modification_edges(copy_mol1, mcs_mol) dist2 = get_modification_edges(copy_mol2, mcs_mol) return len(dist1) + len(dist2)
[docs]def get_edit_distance_detailed(mol1, mol2, mcs = None): """ Calculates the edit distance between mol1 and mol2. Input: :mol1: first molecule :mol2: second molecule :mcs: the maximum common substructure between mol1 and mol2 Output: :removed edges: the removed modification edges :added edges: the added modification edges """ copy_mol1, copy_mol2 = _get_molecules(mol1, mol2) if mcs is None: mcs1 = rdFMCS.FindMCS([copy_mol1, copy_mol2]) mcs_mol = Chem.MolFromSmarts(mcs1.smartsString) else: mcs_mol = Chem.Mol(mcs) dist1 = get_modification_edges(copy_mol1, mcs_mol) dist2 = get_modification_edges(copy_mol2, mcs_mol) return len(dist1), len(dist2)
[docs]def get_transition(input1, input2): """ Calculates the transition between mol1 and mol2. Input: :input1: first molecule :input2: second molecule Output: :result: a dictionary with the following keys: **merged_mol**: the merged molecule **common_bonds**: the common bonds between mol1 and mol2 **common_atoms**: the common atoms between mol1 and mol2 **removed_atoms**: the removed atoms from mol1 **added_atoms**: the added atoms from mol2 **modified_added_edges_inside**: the added edges inside the common substructure **modified_added_edges_bridge**: the added edges between the common substructure and the added atoms **modified_removed_edges_inside**: the removed edges inside the common substructure **modified_removed_edges_bridge**: the removed edges between the common substructure and the removed **added_edges**: the added edges that are not modification edges **removed_edges**: the removed edges that are not modification edges """ mol1, mol2 = _get_molecules(input1, input2) copy_mol1, copy_mol2 = Chem.Mol(mol1), Chem.Mol(mol2) # finding the maximum common substructure mcs1 = rdFMCS.FindMCS([mol1, mol2]) mcs_mol = Chem.MolFromSmarts(mcs1.smartsString) # finding the mapping between the atoms of the common substructure mol1_indices = _find_minimal_modification_edges_match(mol1, mcs_mol) mol2_indices = _find_minimal_modification_edges_match(mol2, mcs_mol) map_mcs_from_mol1_to_mol2 = dict() map_mcs_from_mol2_to_mol1 = dict() for i in range(mcs_mol.GetNumAtoms()): map_mcs_from_mol1_to_mol2[mol1_indices[i]] = mol2_indices[i] map_mcs_from_mol2_to_mol1[mol2_indices[i]] = mol1_indices[i] # calculating the removed atoms from mol1 removed_atoms = list(set(range(mol1.GetNumAtoms())) - set(mol1_indices)) modified_removed_edges_bridge, modified_removed_edges_inside, removed_edges = _get_edge_modifications(mol1, mcs_mol, mol1_indices) # calculating the added atoms from mol2 added_atoms = set(range(mol2.GetNumAtoms())) - set(mol2_indices) modified_added_edges_bridge, modified_added_edges_inside, added_edges = _get_edge_modifications(mol2, mcs_mol, mol2_indices) # creating the merged molecule editable_mol = Chem.EditableMol(copy_mol1) ## adding the atoms from mol2 n = mol1.GetNumAtoms() map_old_mol2_to_added = dict() # mapping from the old mol2 indices to the new indices in the merged molecule (for the added atoms) for atom in added_atoms: editable_mol.AddAtom(mol2.GetAtomWithIdx(atom)) map_old_mol2_to_added[atom] = n n += 1 for atom in mol2_indices: map_old_mol2_to_added[atom] = map_mcs_from_mol2_to_mol1[atom] ## adding the bonds from mol2 updated_modified_added_edges_inside = [] for bond in modified_added_edges_inside: editable_mol.AddBond(map_old_mol2_to_added[bond[0]], map_old_mol2_to_added[bond[1]], order=copy_mol2.GetBondBetweenAtoms(bond[0], bond[1]).GetBondType()) updated_modified_added_edges_inside.append((map_old_mol2_to_added[bond[0]], map_old_mol2_to_added[bond[1]])) updated_modified_added_edges_bridge = [] for bond in modified_added_edges_bridge: if bond[0] in added_atoms: index1 = map_old_mol2_to_added[bond[0]] index2 = map_mcs_from_mol2_to_mol1[bond[1]] else: index1 = map_mcs_from_mol2_to_mol1[bond[0]] index2 = map_old_mol2_to_added[bond[1]] updated_modified_added_edges_bridge.append((index1, index2)) editable_mol.AddBond(index1, index2, order=copy_mol2.GetBondBetweenAtoms(bond[0], bond[1]).GetBondType()) updated_added_edges = [] for bond in added_edges: index1 = map_old_mol2_to_added[bond[0]] index2 = map_old_mol2_to_added[bond[1]] updated_added_edges.append((index1, index2)) editable_mol.AddBond(index1, index2, order=copy_mol2.GetBondBetweenAtoms(bond[0], bond[1]).GetBondType()) ## adding the common bonds common_bonds = [] for bond in mol1.GetBonds(): pair = (bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()) if pair in modified_removed_edges_inside or pair in modified_removed_edges_bridge or pair in removed_edges: continue pair2 = (bond.GetEndAtomIdx(), bond.GetBeginAtomIdx()) if pair2 in modified_removed_edges_inside or pair2 in modified_removed_edges_bridge or pair2 in removed_edges: continue common_bonds.append(pair) common_atoms = mol1_indices mol1_atoms = removed_atoms mol2_atoms = [map_old_mol2_to_added[i] for i in added_atoms] # create a structure for result result = { 'merged_mol': editable_mol.GetMol(), 'common_bonds': common_bonds, 'common_atoms': common_atoms, 'removed_atoms': mol1_atoms, 'added_atoms': mol2_atoms, 'added_edges': updated_added_edges, 'removed_edges': removed_edges, 'modified_added_edges_inside': updated_modified_added_edges_inside, 'modified_added_edges_bridge': updated_modified_added_edges_bridge, 'modified_removed_edges_inside': modified_removed_edges_inside, 'modified_removed_edges_bridge': modified_removed_edges_bridge } return result
[docs]def get_modification_graph(main_struct, sub_struct): """ Calculates the substructure difference between main_struct and sub_struct when there is exactly one modification edge, if there are multiple modification edges, one of the modifications is returned randomly. Input: :main_struct: main molecule :sub_struct: substructure molecule Output: :all_modifications: a list of the modifications structures, each modification is a tuple of: 1. the modified subgraph molecule (as an rdkit editable molecule) 2. a dictionary that maps the wildcard atom indices in subgraph to its true index in the main molecule 3. the SMARTS representation of the modification """ main_struct, sub_struct = _get_molecules(main_struct, sub_struct) if not main_struct.HasSubstructMatch(sub_struct): if sub_struct.HasSubstructMatch(main_struct): return get_modification_graph(sub_struct, main_struct) else: raise ValueError("One molecule should be a substructure of the other molecule") atoms_of_substructure = _find_minimal_modification_edges_match(main_struct, sub_struct) modificationEdgesOutward, modificationEdgesInside, noneModificationDifferentEdges = _get_edge_modifications(main_struct, sub_struct, atoms_of_substructure) all_modification_edges = modificationEdgesOutward + modificationEdgesInside + noneModificationDifferentEdges def dfs(mol, index, visited, color, all_modification_edges): if index in visited: return visited[index] = color for bond in all_modification_edges: if bond[0] == index: target = bond[1] elif bond[1] == index: target = bond[0] else: continue if target not in visited: dfs(mol, target, visited, color, all_modification_edges) return visited = {} color = 0 for atom in range(main_struct.GetNumAtoms()): if atom not in visited and atom not in atoms_of_substructure: dfs(main_struct, atom, visited, color, all_modification_edges) color += 1 all_modifications = [] for modification in range(color): true_map = dict() edit_mol = Chem.RWMol(main_struct) # delete any atom that is not in the modification for atom in range(main_struct.GetNumAtoms()-1, -1, -1): if atom not in visited or visited[atom] != modification: edit_mol.RemoveAtom(atom) elif atom in atoms_of_substructure: edit_mol.GetAtomWithIdx(atom).SetProp('atomNote', f'substructure_{atom}') new_edit_mol = Chem.RWMol() for atom in edit_mol.GetAtoms(): try: note = atom.GetProp('atomNote') except: note = None if note is not None and note.startswith('substructure'): new_edit_mol.AddAtom(Chem.Atom(0)) true_map[atom.GetIdx()] = note.split('_')[1] else: new_edit_mol.AddAtom(atom) for bond in edit_mol.GetBonds(): new_edit_mol.AddBond(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx(), bond.GetBondType()) # new_edit_mol = new_edit_mol.GetMol() all_modifications.append((new_edit_mol, true_map, Chem.MolToSmarts(new_edit_mol))) return all_modifications
[docs]def attach_mols(main_mol, attachment_mol, attach_location_main, attach_location_attachment, bond_type): """ Attaches the attachment structure to main molecule at the attach_location_main and attach_location_attachment with bond_type. Input: :main_mol: rdkit molecule of the main molecule :attachment_mol: rdkit molecule of the attachment molecule :attach_location_main: the index of the atom in the main molecule where the attachment should be done :attach_location_attachment: the index of the atom in the attachment molecule where the attachment should be done :bond_type: the type of the bond between the main molecule and the attachment molecule Output: :new_mol: the new molecule after attachment """ new_mol = Chem.CombineMols(main_mol, attachment_mol) new_mol = Chem.EditableMol(new_mol) new_mol.AddBond(attach_location_main, attach_location_attachment + main_mol.GetNumAtoms(), bond_type) new_mol = new_mol.GetMol() return new_mol
[docs]def generate_possible_stuctures(main_struct, sub_struct): """ Generates all possible structures after attaching the difference between sub_struct and main_struct to main_struct. Input: :main_struct: main molecule :sub_struct: substructure molecule Output: :list of possible_structures: all possible structures after attachment with the index of the atom Example ------- .. code-block:: python import modifinder.utilities.mol_utils as mf_mu import modifinder.utilities.visualizer as mf_vis from matplotlib import pyplot as plt from rdkit import Chem from rdkit.Chem import Draw modification = Chem.MolFromSmiles("C1=C(NC(=O)N=C1)N") mol1 = Chem.MolFromSmiles("CC1=C(NC(=O)N=C1)N") res = mf_mu.generate_possible_stuctures(mol1, modification) img = mf_vis.draw_modifications(modification, mol1) plt.imshow(img) plt.axis("off") plt.show() res_mols = [x[1] for x in res] res_index = ["attach modification at location: " + str(x[0]) for x in res] img = Draw.MolsToGridImage(res_mols, molsPerRow=2, subImgSize=(200, 200), legends=res_index, returnPNG=False) img.save("molgrid.png") display(img) .. image:: ../../_static/generate_possible_stuctures1.png .. image:: ../../_static/generate_possible_stuctures2.png """ main_struct, sub_struct = _get_molecules(main_struct, sub_struct) if not main_struct.HasSubstructMatch(sub_struct): if sub_struct.HasSubstructMatch(main_struct): return generate_possible_stuctures(sub_struct, main_struct) else: raise ValueError("One molecule should be a substructure of the other molecule") all_modifications = get_modification_graph(main_struct, sub_struct) if len(all_modifications) == 0: raise ValueError("No modification is found") if len(all_modifications) > 1: raise ValueError("Multiple modifications are found") if len(all_modifications[0][1]) > 1: raise ValueError("Multiple modifications are found") if len(all_modifications[0][1]) == 0: raise ValueError("ISSUE WITH CODE, PLEASE REPORT") wild_atom = list(all_modifications[0][1].keys())[0] neighbor = all_modifications[0][0].GetAtomWithIdx(wild_atom).GetNeighbors()[0] bondType = all_modifications[0][0].GetBondBetweenAtoms(wild_atom, neighbor.GetIdx()).GetBondType() all_modifications[0][0].RemoveAtom(wild_atom) frag = all_modifications[0][0] index_in_frag = neighbor.GetIdx() structs = [] for atom in sub_struct.GetAtoms(): temp_struct = _attach_struct_try(sub_struct, frag, atom.GetIdx(), index_in_frag, bondType) if temp_struct is None: continue structs.append((atom.GetIdx(), temp_struct)) return structs
def _attach_struct_try(main_struct, frag, main_location, frag_location, bond_type): mol = attach_mols(main_struct, frag, main_location, frag_location, bond_type) try: Chem.SanitizeMol(mol) return mol except Exception: for bond in [Chem.BondType.SINGLE, Chem.BondType.DOUBLE, Chem.BondType.TRIPLE]: if bond != bond_type: try: mol = attach_mols(main_struct, frag, main_location, frag_location, bond) Chem.SanitizeMol(mol) return mol except Exception: continue return None def _find_minimal_modification_sites_match(mol, substructure): """ Finds the matching that results in the minimum number of modification sites, if multiple matches are found. If multiple results are found, one random result is returned. Input: mol: first molecule substructure: substructure molecule Output: match array """ matches = mol.GetSubstructMatches(substructure) if len(matches) == 1: return matches[0] else: min_modifications = float('inf') min_match = None for match in matches: modif_sites = set() for bond in mol.GetBonds(): if bond.GetBeginAtomIdx() in match and ~(bond.GetEndAtomIdx() in match): modif_sites.add(bond.GetBeginAtomIdx()) if bond.GetEndAtomIdx() in match and ~(bond.GetBeginAtomIdx() in match): modif_sites.add(bond.GetEndAtomIdx()) if len(modif_sites) < min_modifications: min_modifications = len(modif_sites) min_match = match return min_match def _find_minimal_modification_edges_match(mol, substructure): """ Finds the matching that results in the minimum number of modification edges, if multiple matches are found. If multiple matches have the minimum number, one random result is returned. Input: mol: first molecule substructure: substructure molecule Output: match array """ if not mol.HasSubstructMatch(substructure): raise ValueError("The substructure is not a substructure of the molecule") matches = mol.GetSubstructMatches(substructure) if len(matches) == 1: return matches[0] else: min_modifications = float('inf') min_match = None for match in matches: modificationEdgesOutward, modificationEdgesInside, _ = _get_edge_modifications(mol, substructure, match) num_modifications = len(modificationEdgesOutward) + len(modificationEdgesInside) if num_modifications < min_modifications: min_modifications = num_modifications min_match = match return min_match def _get_edge_modifications(mol, substructure, match): """ Calculates the modification edges to get mol from substructure given the mapping of the atoms Input: mol: first molecule substructure: substructure molecule match: the match array, mapping atoms from substructure to mol Output: modificationEdgesOutward: the modification edges that go from atoms in the substructure to atoms outside the substructure modificationEdgesInside: the modification edges that happen within the atoms in the substructure noneModificationDifferentEdges: the edges that are not modification edges but are different between the two molecules (edges that are completely outside the substructure) """ reverse_match = {v: k for k, v in enumerate(match)} intersect = set(match) modificationEdgesOutward = [] modificationEdgesInside = [] noneModificationDifferentEdges = [] for bond in mol.GetBonds(): if bond.GetBeginAtomIdx() in intersect or bond.GetEndAtomIdx() in intersect: if bond.GetBeginAtomIdx() in intersect and bond.GetEndAtomIdx() in intersect: a1 = reverse_match[bond.GetBeginAtomIdx()] a2 = reverse_match[bond.GetEndAtomIdx()] if substructure.GetBondBetweenAtoms(a1, a2) is None: modificationEdgesInside.append((bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())) else: modificationEdgesOutward.append((bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())) else: noneModificationDifferentEdges.append((bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())) return modificationEdgesOutward, modificationEdgesInside, noneModificationDifferentEdges def _calculateModificationSites(mol, substructure, inParent = True): """ Calculates the number of modification sites to get mol from substructure (works when one moleculte is a substructure of the other molecule) if multiple matches are found, the modification sites that result in the minimum number of modifications are returned Input: mol1: first molecule substructure: substructure molecule inParent: bool, if True, the modification sites are given in the parent molecule, if False, the modification sites are given in the substructure Output: count: modification sites """ matches = _find_minimal_modification_edges_match(mol, substructure) intersect = set(matches) modificationSites = [] for atom in mol.GetAtoms(): if atom.GetIdx() not in intersect: for tempAtom in intersect: if mol.GetBondBetweenAtoms(atom.GetIdx(), tempAtom) is not None: modificationSites.append(tempAtom) if inParent: return modificationSites else: res = [] if type(matches[0]) is tuple: # multiple matches (substructure is happening multiple times in the parent molecule) for match in matches: subMatches = list(match) temp_res = [] for atom in modificationSites: idx = subMatches.index(atom) # based on rdkit library, the value in matches array are sorted by index temp_res.append(idx) res.append(temp_res) else: for atom in modificationSites: subMatches = list(matches) idx = subMatches.index(atom) # based on rdkit library, the value in matches array are sorted by index res.append(idx) return res def _get_molecules(mol1, mol2): copy_mol1 = deepcopy(mol1) copy_mol2 = deepcopy(mol2) copy_mol1 = _get_molecule(copy_mol1) copy_mol2 = _get_molecule(copy_mol2) if copy_mol1 is None or copy_mol2 is None: raise ValueError("The molecules are not valid") return copy_mol1, copy_mol2 def _get_molecule(identifier = None, smiles=None, inchi=None, molblock=None, smarts=None, **kwargs): """ Get the rdkit molecule from smiles, inchi or GNPS identifier Parameters: ---------- identifier: str GNPS identifier, smiles, inchi, molblock or smart smiles: str smiles string inchi: str inchi string molblock: str molblock string smart: str :return: rdkit molecule or None """ if identifier is not None: if isinstance(identifier, str): try: molecule = Chem.MolFromSmiles(identifier) if molecule: return molecule except Exception: pass try: molecule = Chem.MolFromInchi(identifier) if molecule: return molecule except Exception: pass try: molecule = Chem.MolFromMolBlock(identifier) if molecule: return molecule except Exception: pass try: molecule = Chem.MolFromSmarts(identifier) if molecule: return molecule except Exception: pass try: data = get_data(identifier) smiles = data.get("Smiles", None) if smiles: molecule = Chem.MolFromSmiles(smiles) if molecule: return molecule except Exception: raise ValueError("The identifier is not valid or it is not supported") if isinstance(identifier, Chem.Mol): return identifier if smiles is not None: molecule = Chem.MolFromSmiles(smiles) if molecule: return molecule else: raise ValueError("The smiles string is not valid") if inchi is not None: molecule = Chem.MolFromInchi(inchi) if molecule: return molecule else: raise ValueError("The inchi string is not valid") if molblock is not None: molecule = Chem.MolFromMolBlock(molblock) if molecule: return molecule else: raise ValueError("The molblock string is not valid") if smarts is not None: molecule = Chem.MolFromSmarts(smarts) if molecule: return molecule else: raise ValueError("The smart string is not valid") raise ValueError("No valid input is provided")