# ribesstefano's picture
# Setup the spaces app
# 9dd777e
from rdkit import Chem
import pandas as pd
from protac_splitter.chemoinformatics import (
canonize_smiles,
remove_stereo,
get_mol_id,
)
def update_dictionary(
    dictionary: pd.DataFrame,
    substr_to_add: list,
    morgan_fp_generator=None,
    verbose: int = 0,
) -> pd.DataFrame:
    """Update a substructure dictionary with additional substructures.

    The dictionary is a dataframe with columns 'SMILES', 'Molecule', 'ID',
    and 'FP'. Each new substructure is canonized, parsed with RDKit and, if
    it parses, appended together with its stereochemistry-free variant (when
    that variant differs). SMILES already present in the dictionary are
    skipped, and the result holds at most one row per SMILES.

    Args:
        dictionary: The input dictionary dataframe. It is not modified; a new
            dataframe is returned.
        substr_to_add: The list of additional substructures as SMILES
            strings. `None` items are ignored.
        morgan_fp_generator: Object exposing `GetFingerprint(mol)`, used to
            fill the 'FP' column of the new rows. If None, a Morgan generator
            (radius=2, fpSize=2048, bond types and chirality included) is
            created on demand.
        verbose: If non-zero, prints parsing errors and the number of added
            substructures; if greater than 1, also prints a warning for each
            SMILES that is already in the dictionary.

    Returns:
        The updated dictionary dataframe, de-duplicated on 'SMILES' and with
        a fresh 0..n-1 index.
    """
    # Canonize the SMILES strings. dict.fromkeys removes duplicates while
    # keeping the input order, so the order of the new rows is reproducible
    # (a plain set() would shuffle it between runs).
    candidates = list(dict.fromkeys(
        canonize_smiles(smiles) for smiles in substr_to_add if smiles is not None
    ))

    # SMILES already in the dictionary, collected once for fast membership
    # tests instead of rebuilding a list on every iteration.
    known = set() if dictionary.empty else set(dictionary['SMILES'])
    # SMILES queued for insertion during this call.
    added = set()

    new_entries = []
    for smiles in candidates:
        # Skip entries already in the dictionary. This is a `continue` rather
        # than an in-place `remove()`: mutating the list being iterated makes
        # the loop skip the element that follows each removal.
        if smiles in known:
            if verbose > 1:
                print(f'\tWARNING. SMILES already in the dictionary: {smiles}')
            continue

        try:
            mol = Chem.MolFromSmiles(smiles)
        except Exception as e:
            # Best effort: a SMILES that cannot even be passed to the parser
            # is treated like an invalid one.
            if verbose:
                print(e)
            mol = None
        # Remove entries that result in invalid molecules
        if mol is None:
            continue

        if smiles not in added:
            added.add(smiles)
            new_entries.append({
                'SMILES': smiles,
                'Molecule': mol,
                'ID': get_mol_id(smiles),
            })

        # Try adding its no-stereochemistry version as well
        smiles_nostereo = remove_stereo(smiles)
        if smiles_nostereo is None or smiles_nostereo == smiles:
            continue
        mol_nostereo = Chem.MolFromSmiles(smiles_nostereo)
        if mol_nostereo is None:
            continue
        canonical_nostereo = canonize_smiles(smiles_nostereo)
        if canonical_nostereo in known or canonical_nostereo in added:
            continue
        added.add(canonical_nostereo)
        new_entries.append({
            'SMILES': canonical_nostereo,
            'Molecule': mol_nostereo,
            # NOTE(review): the ID is derived from the non-canonized
            # no-stereo SMILES while the 'SMILES' column stores the canonized
            # one - confirm that get_mol_id is insensitive to this.
            'ID': get_mol_id(smiles_nostereo),
        })

    if new_entries:
        new_rows = pd.DataFrame(new_entries)
        # Add fingerprints to the new entries
        if morgan_fp_generator is None:
            # Explicit import: the submodule is not guaranteed to be reachable
            # as an attribute of `Chem` unless something imported it before.
            from rdkit.Chem import rdFingerprintGenerator
            morgan_fp_generator = rdFingerprintGenerator.GetMorganGenerator(
                radius=2,
                fpSize=2048,
                useBondTypes=True,
                includeChirality=True,
            )
        new_rows['FP'] = new_rows['Molecule'].apply(
            lambda mol: morgan_fp_generator.GetFingerprint(mol) if mol is not None else None
        )
        updated = pd.concat([dictionary, new_rows], axis=0)
    else:
        updated = dictionary

    if verbose:
        print(f'Number of substructures added to the dictionary: {len(new_entries)}')

    # Return the updated dictionary. The guard covers an empty, column-less
    # input dictionary when nothing was added.
    if 'SMILES' in updated.columns:
        updated = updated.drop_duplicates(subset='SMILES')
    return updated.reset_index(drop=True)