Spaces:
Sleeping
Sleeping
""" Chemoinformatics utilities for PROTAC Splitter. """ | |
import logging | |
from typing import List, Union, Optional, Literal | |
from multiprocessing import Process, Queue | |
from hashlib import sha256 | |
from rdkit import Chem | |
from rdkit.Chem import rdFingerprintGenerator | |
def GetSubstructMatchesWorker(q, mol, substruct, useChirality, maxMatches):
    """ Run a substructure search and deliver the result through a queue.

    Meant to be used as the target of a separate process, so that the caller
    can give up on a search that takes too long.

    Args:
        q: Queue-like object; the matches are handed over via its `put` method.
        mol: The molecule to search in.
        substruct: The substructure to look for.
        useChirality: Forwarded to `mol.GetSubstructMatches`.
        maxMatches: Forwarded to `mol.GetSubstructMatches`.
    """
    matches = mol.GetSubstructMatches(
        substruct,
        useChirality=useChirality,
        maxMatches=maxMatches,
    )
    # Materialize as a plain list before handing the result to the queue.
    q.put(list(matches))
def GetSubstructMatchesWithTimeout(
    mol: Chem.Mol,
    substruct: Chem.Mol,
    useChirality: bool = True,
    maxMatches: int = 50,
    timeout: Union[int, float] = 10,
) -> Optional[List[List[int]]]:
    """ Get substructure matches with a timeout.

    The search runs in a separate process (`GetSubstructMatchesWorker`) so that
    it can be abandoned if it takes too long.

    Args:
        mol (Chem.Mol): The molecule to search for substructure matches.
        substruct (Chem.Mol): The substructure to search for in the molecule.
        useChirality (bool, optional): Whether to use chirality in the substructure search. Defaults to True.
        maxMatches (int, optional): The maximum number of matches to return. Defaults to 50.
        timeout (int | float, optional): The timeout in seconds. Defaults to 10.

    Returns:
        Optional[List[List[int]]]: A list containing the atom indices of each substructure match. Returns None if no result arrived within `timeout` seconds (search too slow, or the worker died without producing a result).
    """
    # Function-scope import: only the `Empty` exception of the stdlib `queue`
    # module is needed, and it is what `multiprocessing.Queue.get` raises.
    from queue import Empty

    q = Queue()
    p = Process(
        target=GetSubstructMatchesWorker,
        args=(q, mol, substruct, useChirality, maxMatches),
    )
    p.start()
    matches = None
    try:
        # Read from the queue BEFORE joining the worker: a child that has put
        # data on a queue does not exit until that data has been flushed to the
        # pipe, so joining first can stall until the timeout and discard a
        # result that was actually ready.
        matches = q.get(timeout=timeout)
    except Empty:
        # Nothing arrived in time; fall through and report None.
        pass
    finally:
        # Only kill the worker when no result was received; otherwise it has
        # nothing left to do and exits on its own.
        if matches is None and p.is_alive():
            p.terminate()
        p.join()
    return matches
def remove_stereo(smiles: str) -> Optional[str]:
    """
    Remove stereochemistry from a SMILES string.

    Args:
        smiles (str): The input SMILES string.

    Returns:
        Optional[str]: The SMILES string with stereochemistry removed, or None if the
        input could not be parsed or the conversion failed (a warning is logged).
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Handle unparsable input explicitly instead of relying on the
            # exception raised when None is passed on to RDKit.
            logging.warning(f"Error removing stereochemistry: could not parse SMILES '{smiles}'")
            return None
        Chem.rdmolops.RemoveStereochemistry(mol)
        return Chem.MolToSmiles(mol)
    except Exception as e:
        logging.warning(f"Error removing stereochemistry: {e}")
        return None
def get_mol(smiles: str, remove_stereo: bool = False) -> Chem.Mol:
    """
    Build a molecule object from a SMILES string.

    Args:
        smiles (str): The SMILES string representing the molecule.
        remove_stereo (bool, optional): If True, stereochemistry is stripped from
            the parsed molecule in place. Defaults to False.

    Returns:
        Chem.Mol: The molecule object, exactly as returned by `Chem.MolFromSmiles`
        (the None case is passed through untouched).
    """
    parsed = Chem.MolFromSmiles(smiles)
    # Nothing more to do when parsing yielded None or stereo must be kept.
    if parsed is None or not remove_stereo:
        return parsed
    Chem.rdmolops.RemoveStereochemistry(parsed)
    return parsed
def canonize_smarts(smarts: str) -> Optional[str]:
    """
    Cleans a SMARTS string by converting it to canonical SMARTS representation.

    The pattern is written out as SMILES, re-read without sanitization, and
    written back as SMARTS.

    NOTE: It might not work for complex patterns: https://github.com/rdkit/rdkit/discussions/6929

    Args:
        smarts (str): The input SMARTS string.

    Returns:
        Optional[str]: The cleaned SMARTS string, or None if the input is None,
        cannot be parsed as SMARTS, or does not survive the SMILES round-trip.
    """
    if smarts is None:
        return None
    pattern = Chem.MolFromSmarts(smarts)
    if pattern is None:
        return None
    # The SMILES written from a query pattern is not guaranteed to be readable
    # again; guard against None instead of passing it to MolToSmarts.
    roundtrip = Chem.MolFromSmiles(Chem.MolToSmiles(pattern), sanitize=False)
    if roundtrip is None:
        return None
    return Chem.MolToSmarts(roundtrip)
def smiles2mol(smiles: str) -> Chem.Mol:
    """ Parse a SMILES string into an RDKit molecule object.

    Args:
        smiles (str): The input SMILES string.

    Returns:
        Chem.Mol: Whatever `Chem.MolFromSmiles` returns for the input; no
        validation is performed here.
    """
    mol = Chem.MolFromSmiles(smiles)
    return mol
def mol2smiles(mol: Chem.Mol) -> str:
    """ Write an RDKit molecule object out as a SMILES string.

    Args:
        mol (Chem.Mol): The RDKit molecule object.

    Returns:
        str: The SMILES string produced by `Chem.MolToSmiles` with its default settings.
    """
    smiles = Chem.MolToSmiles(mol)
    return smiles
def canonize_smiles(smiles: str) -> Optional[str]:
    """ Canonizes a SMILES string by converting it to canonical SMILES representation.

    Args:
        smiles (str): The input SMILES string.

    Returns:
        Optional[str]: The canonized SMILES string, or None if the input is None,
        cannot be parsed, or the conversion raised (a warning is logged in the
        latter case).
    """
    if smiles is None:
        return None
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        return Chem.MolToSmiles(mol, canonical=True)
    except Exception as e:
        # Catch Exception rather than using a bare `except`, so that
        # KeyboardInterrupt/SystemExit still propagate; report through logging
        # like the other helpers in this module.
        logging.warning(f"Error canonizing SMILES: {e}")
        return None
def canonize(x: Union[str, Chem.Mol]) -> Union[str, Chem.Mol]:
    """ Canonize either a SMILES string or an RDKit molecule object.

    Args:
        x: The input SMILES string or RDKit molecule object.

    Returns:
        str | Chem.Mol: A canonized SMILES string when a string was given,
        otherwise a molecule rebuilt from the canonical SMILES of the input.
        None is returned for a None input.
    """
    if x is None:
        return None
    if not isinstance(x, str):
        # Molecule input: round-trip through canonical SMILES.
        canonical_smiles = Chem.MolToSmiles(x, canonical=True)
        return Chem.MolFromSmiles(canonical_smiles)
    return canonize_smiles(x)
def compute_RDKitFP(
    smiles: Union[str, List[str], List[Chem.Mol]],
    maxPath: int = 7,
    fpSize: int = 2048,
) -> List[Chem.RDKFingerprint]:
    """
    Compute RDKit count fingerprints for SMILES strings or RDKit molecules.

    Args:
        smiles (Union[str, List[str], List[Chem.Mol]]): A single SMILES string or a list of SMILES strings
            or a list of RDKit molecules. A single molecule is accepted as well.
        maxPath (int, optional): The maximum path length for the fingerprints. Defaults to 7.
        fpSize (int, optional): The size of the fingerprint vector. Defaults to 2048.

    Returns:
        List[Chem.RDKFingerprint]: One count fingerprint per input, in input order.
        An empty input yields an empty list.
    """
    # A lone SMILES string (or molecule) must be wrapped; iterating a string
    # directly would treat every character as a separate SMILES.
    if isinstance(smiles, (str, Chem.Mol)):
        smiles = [smiles]
    # Strings are parsed; anything else is assumed to already be a molecule.
    mols = [get_mol(item) if isinstance(item, str) else item for item in smiles]
    if not mols:
        return []
    rdgen = rdFingerprintGenerator.GetRDKitFPGenerator(
        maxPath=maxPath, fpSize=fpSize)
    return [rdgen.GetCountFingerprint(mol) for mol in mols]
def remove_dummy_atoms(mol: Union[str, Chem.Mol], canonical=True) -> Union[str, Chem.Mol]:
    """
    Removes all dummy atoms (attachment points) from a molecule.

    Args:
        mol: RDKit Mol object with dummy atoms, or a SMILES string describing one.
        canonical: Passed to `Chem.MolToSmiles` when a SMILES string is returned.
            Ignored when a Mol object is returned.

    Returns:
        The molecule without dummy atoms, in the same form as the input: a SMILES
        string if a string was given, a new RDKit Mol object otherwise. None if
        the input is None or the SMILES string cannot be parsed.
    """
    return_smiles = isinstance(mol, str)
    if return_smiles:
        mol = Chem.MolFromSmiles(mol)
    if mol is None:
        return None
    # Remove all dummy atoms with a query
    mol_no_dummy = Chem.DeleteSubstructs(mol, Chem.MolFromSmarts('[#0]'))
    if mol_no_dummy is None:
        # Fallback: edit the molecule and drop the dummy atoms (atomic number 0)
        # one by one, from the highest index down to avoid index shifts.
        editable_mol = Chem.EditableMol(mol)
        dummy_atoms = [atom.GetIdx() for atom in mol.GetAtoms() if atom.GetAtomicNum() == 0]
        for atom_idx in sorted(dummy_atoms, reverse=True):
            editable_mol.RemoveAtom(atom_idx)
        mol_no_dummy = editable_mol.GetMol()
        if not return_smiles:
            mol_no_dummy.UpdatePropertyCache()
    # Both paths share the same exit, so `canonical` is honored in the fallback too.
    if return_smiles:
        return Chem.MolToSmiles(mol_no_dummy, canonical=canonical)
    return mol_no_dummy
def dummy2query(mol: Chem.Mol) -> Chem.Mol:
    """ Turn the dummy atoms of a molecule into query atoms, so that a molecule with attachment points can be used in HasSubstructMatch.

    Args:
        mol: The molecule to convert.

    Returns:
        The molecule with dummy atoms converted to query atoms, or None when
        the input is None.
    """
    if mol is not None:
        # Start from "no adjustments" and enable only the dummy -> query conversion.
        params = Chem.AdjustQueryParameters.NoAdjustments()
        params.makeDummiesQueries = True
        return Chem.AdjustQueryProperties(mol, params)
    return None
def get_substr_match(
    protac_mol: Chem.Mol,
    substr: Chem.Mol,
    max_allowed_fragments: int = 1,
    replace: Literal['core', 'sidechains'] = 'core',
    useChirality: bool = True,
) -> bool:
    """ Check if a molecule contains a substructure match with a given molecule.

    Compared to RDKit HasSubstructMatch, this function also checks the number of fragments when replacing the substr in the PROTAC.

    Args:
        protac_mol (Chem.Mol): The PROTAC molecule.
        substr (Chem.Mol): The substructure molecule. Its dummy atoms are converted to query atoms before matching.
        max_allowed_fragments (int, optional): The number of fragments that must remain when replacing the substr in the PROTAC. Defaults to 1. Example when equal to 1: if removing the warhead, a single fragment should remain.
        replace (Literal['core', 'sidechains'], optional): Whether to use `Chem.ReplaceCore` or `Chem.ReplaceSidechains` with the substructure. Defaults to 'core'.
        useChirality (bool, optional): Whether to use chirality in the replacement. Defaults to True.

    Returns:
        bool: True if the replacement succeeded and the resulting fragments count is exactly `max_allowed_fragments`, False otherwise (including when either molecule is None).

    Raises:
        ValueError: If `replace` is neither 'core' nor 'sidechains'.
    """
    if replace not in ('core', 'sidechains'):
        raise ValueError(f"replace argument should be either 'core' or 'sidechains', provided: {replace}")
    # A missing molecule cannot match anything; avoid passing None to RDKit.
    if protac_mol is None or substr is None:
        return False

    query = dummy2query(substr)
    if replace == 'core':
        fragments = Chem.ReplaceCore(protac_mol, query, useChirality=useChirality)
    else:
        fragments = Chem.ReplaceSidechains(protac_mol, query, useChirality=useChirality)
    if fragments is None:
        return False

    # Count the fragments left after the replacement
    try:
        fragments = Chem.GetMolFrags(fragments, sanitizeFrags=False)
    except Exception as e:
        # Reported through logging, like the other helpers in this module.
        logging.warning(f"Error while getting fragments: {e}")
        return False
    return len(fragments) == max_allowed_fragments
def remove_attach_atom(mol: Chem.Mol, attach_id: int, sanitize: bool = False) -> Chem.Mol:
    """ Delete the dummy atom(s) carrying the given attachment id from the molecule.

    Example:
        >>> remove_attach_atom(Chem.MolFromSmiles('CC[*:1]'), 1)
        CC

    There are no checks on the molecule, so it is assumed it is not None.

    Args:
        mol (Chem.Mol): The molecule.
        attach_id (int): The attachment id (atom map number) of the dummy atom to remove.
        sanitize (bool, optional): Whether to sanitize the molecule after removing the atom. When used in `fix_prediction` function, it is used to "remove" substructures, so there is no need to have them sanitized. Default: False.

    Returns:
        (Chem.Mol) The molecule with the atom removed.
    """
    # Only dummy atoms (atomic number 0) labelled [*:attach_id] are targeted.
    targets = [
        atom.GetIdx()
        for atom in mol.GetAtoms()
        if atom.GetAtomicNum() == 0 and atom.GetAtomMapNum() == attach_id
    ]
    editor = Chem.EditableMol(mol)
    # Highest index first, so earlier removals do not shift later indices.
    for idx in sorted(targets, reverse=True):
        editor.RemoveAtom(idx)
    result = editor.GetMol()
    if sanitize:
        Chem.SanitizeMol(result)
    return result
def get_bond_idx(smi: str, bonds_start_end_atoms: List[List[int]]) -> List[int]:
    """
    Get the indices of bonds in a molecule that match the given start and end atom indices.

    The direction of a pair does not matter: [a, b] matches a bond that goes
    from a to b as well as one that goes from b to a.

    Args:
        smi (str): The SMILES representation of the molecule.
        bonds_start_end_atoms (List[List[int]]): A list of pairs (lists or tuples) containing the start and end atom indices of the bonds to search for.

    Returns:
        List[int]: A list of bond indices that match the given start and end atom indices, in the molecule's bond order. Empty if the SMILES cannot be parsed (a warning is logged).
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        logging.warning(f"Could not parse SMILES when looking up bond indices: {smi}")
        return []
    # Normalize every pair to a tuple once, so lists and tuples are handled
    # alike and each membership test is a set lookup.
    wanted = {tuple(pair) for pair in bonds_start_end_atoms}
    bond_indices = []
    for bond in mol.GetBonds():
        begin_idx = bond.GetBeginAtomIdx()
        end_idx = bond.GetEndAtomIdx()
        if (begin_idx, end_idx) in wanted or (end_idx, begin_idx) in wanted:
            bond_indices.append(bond.GetIdx())
    return bond_indices
def get_mol_id(smiles: str) -> str | None:
    """ Compute a hash identifier for a given SMILES string.

    The molecule is parsed, its stereochemistry is removed, and the SHA-256 hex
    digest of its InChIKey concatenated with its canonical SMILES is returned.

    Args:
        smiles (str): The SMILES string to hash.

    Returns:
        str | None: The Hash of the SMILES string. None if the SMILES cannot be parsed or removing stereochemistry raised.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            Chem.RemoveStereochemistry(mol)
    except Exception as e:
        logging.warning(f"Error while removing stereochemistry: {e}")
        logging.warning(f"SMILES: {smiles}")
        return None
    if mol is None:
        return None
    # The identifier combines the InChIKey and the canonical SMILES.
    inchi_key = Chem.MolToInchiKey(mol)
    canonical_smiles = Chem.MolToSmiles(mol, canonical=True)
    digest_input = (inchi_key + canonical_smiles).encode()
    return sha256(digest_input).hexdigest()
def get_atom_idx_at_attachment(
    protac: Chem.Mol,
    substruct: Chem.Mol,
    linker: Optional[Chem.Mol] = None,
    timeout: Optional[Union[int, float]] = None,
    return_dict: bool = False,
    verbose: int = 0,
) -> Optional[Union[List[int], dict]]:
    """ Get the atom index of the attachment point of a substructure in the PROTAC molecule.

    The attachment atom is the single atom shared by the match of `substruct`
    (dummy atoms turned into queries) and a match of the linker (dummy atoms
    removed) in the PROTAC.

    Args:
        protac: The PROTAC molecule.
        substruct: The substructure of the PROTAC that contains the attachment point, e.g., the POI or E3 ligase.
        linker: The linker molecule. If None, it is derived by deleting `substruct` (without dummy atoms) from the PROTAC.
        timeout: Timeout in seconds for each linker substructure search. If None and a linker is given, the search runs in-process without a timeout. If None and no linker is given, 60 seconds are used and a warning is logged.
        return_dict: If True, return a dictionary instead of a list.
        verbose: Verbosity level; any truthy value prints progress information.

    Returns:
        None if no single shared atom could be found. Otherwise, a list holding
        the shared atom index followed by the index of its first neighbor lying
        outside the linker match (if there is one); or, with `return_dict`, a
        dictionary with the shared atom under 'substruct' and that neighbor
        under 'linker'.
        NOTE(review): the 'linker' key holds the neighbor that is NOT part of
        the linker match; confirm the key naming against callers.
    """
    if linker is None:
        # Get the "other" substructure, i.e., replace side chain of PROTAC using the substruct
        linker = Chem.DeleteSubstructs(protac, remove_dummy_atoms(substruct), useChirality=True)
        if timeout is None:
            timeout = 60
            logging.warning(f'No timeout set when linker is not provided, using default value of {timeout} seconds.')

    substruct_match = set(protac.GetSubstructMatch(dummy2query(substruct), useChirality=True))
    if verbose:
        print(f'Substruct match: {substruct_match}')

    linker_no_dummy = remove_dummy_atoms(linker)
    if linker_no_dummy is None:
        # Nothing to search with.
        if verbose:
            print('No shared atoms found.')
        return None
    if verbose:
        print('Linker without dummy atoms found.')

    max_matches = 2
    linker_match = ()
    shared_atoms = set()
    # NOTE: The following is a hacky way to speed up the search for linker
    # matches. In fact, the linker can be quite short, so it might match in
    # multiple places of the PROTAC molecule.
    # If the number of max matches in GetSubstructMatches is low, then the
    # search tends to be faster, but imprecise. However, we are interested in
    # the intersection of the matches, so we can progressively increase the
    # number of max matches until we find a single atom in common.
    # Without a substructure match there is nothing to intersect with, so the
    # search is skipped altogether.
    while substruct_match and not shared_atoms and max_matches <= 50:
        if timeout is None:
            linker_matches = list(protac.GetSubstructMatches(linker_no_dummy, useChirality=True, maxMatches=max_matches))
        else:
            linker_matches = GetSubstructMatchesWithTimeout(protac, linker_no_dummy, useChirality=True, maxMatches=max_matches, timeout=timeout)
        if verbose:
            print(f'Linker matches: {linker_matches}')
        if not linker_matches:
            # Empty result: the linker does not match at all. None: the search
            # timed out. Asking for more matches cannot change either outcome
            # (it can only take longer), so stop instead of retrying.
            break
        for match in linker_matches:
            candidate = substruct_match.intersection(match)
            if len(candidate) == 1:
                shared_atoms = candidate
                linker_match = match
                if verbose:
                    print(f'Shared atoms: {list(shared_atoms)}')
                break
        max_matches += 1

    if not shared_atoms:
        if verbose:
            print('No shared atoms found.')
        return None

    attachment_idx = list(shared_atoms)
    attachments = {'substruct': attachment_idx[0]}
    linker_atoms = set(linker_match)
    # Get the other atom at the attachment point that is NOT in the linker
    for neighbor in protac.GetAtomWithIdx(attachment_idx[0]).GetNeighbors():
        if neighbor.GetIdx() not in linker_atoms:
            attachment_idx.append(neighbor.GetIdx())
            attachments['linker'] = neighbor.GetIdx()
            break
    if return_dict:
        return attachments
    return attachment_idx