Spaces:
Sleeping
Sleeping
File size: 2,889 Bytes
9dd777e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
from rdkit import Chem
import pandas as pd
from protac_splitter.chemoinformatics import (
canonize_smiles,
remove_stereo,
get_mol_id,
)
def update_dictionary(
    dictionary: pd.DataFrame,
    substr_to_add: list,
    morgan_fp_generator = None,
    verbose: int = 0,
) -> pd.DataFrame:
    """Update a substructure dictionary with a list of additional substructures.

    The dictionary is a dataframe with columns 'SMILES', 'Molecule', 'ID', and 'FP'.
    Each new substructure is canonized, parsed with RDKit and, when its
    stereochemistry-free form differs, that form is added as a separate entry.

    Args:
        dictionary: The input dictionary dataframe. It is not modified in place.
        substr_to_add: SMILES strings of the substructures to add. ``None``
            items are ignored. The list is not modified.
        morgan_fp_generator: Object exposing ``GetFingerprint(mol)`` used to
            fill the 'FP' column. When None, an RDKit Morgan generator
            (radius=2, fpSize=2048, bond types and chirality included) is
            created on demand.
        verbose: If non-zero, print the number of added substructures and any
            parsing exception; if greater than 1, also print a warning for
            each SMILES that is already in the dictionary.

    Returns:
        A new dataframe containing the rows of ``dictionary`` followed by the
        new entries, de-duplicated on 'SMILES' (first occurrence kept) and
        with a fresh integer index.
    """
    # Build the lookup of known SMILES once; a set gives O(1) membership tests.
    if not dictionary.empty and 'SMILES' in dictionary.columns:
        existing_smiles = set(dictionary['SMILES'])
    else:
        existing_smiles = set()

    # Canonize the SMILES strings and drop duplicates. dict.fromkeys keeps the
    # input order, so the output row order is deterministic.
    # NOTE(review): canonize_smiles presumably returns None for unparsable
    # input; such results are skipped here -- confirm against its definition.
    canonical = (canonize_smiles(s) for s in substr_to_add if s is not None)
    candidates = list(dict.fromkeys(s for s in canonical if s is not None))

    new_entries = []
    queued_smiles = set()  # SMILES already scheduled for insertion in this call
    for smiles in candidates:
        # Skip entries already in the dictionary. Filtering with `continue`
        # (instead of removing from the list being iterated) guarantees that
        # no candidate is skipped.
        if smiles in existing_smiles:
            if verbose > 1:
                print(f'\tWARNING. SMILES already in the dictionary: {smiles}')
            continue
        if smiles in queued_smiles:
            continue

        try:
            mol = Chem.MolFromSmiles(smiles)
        except Exception as e:
            if verbose:
                print(e)
            mol = None
        # Skip entries that result in invalid molecules
        if mol is None:
            continue

        new_entries.append({
            'SMILES': smiles,
            'Molecule': mol,
            'ID': get_mol_id(smiles),
        })
        queued_smiles.add(smiles)

        # Try adding its no-stereochemistry version as well
        smiles_nostereo = remove_stereo(smiles)
        if smiles_nostereo is None or smiles_nostereo == smiles:
            continue
        canonical_nostereo = canonize_smiles(smiles_nostereo)
        if (
            canonical_nostereo is None
            or canonical_nostereo in existing_smiles
            or canonical_nostereo in queued_smiles
        ):
            continue
        mol_nostereo = Chem.MolFromSmiles(smiles_nostereo)
        if mol_nostereo is None:
            continue
        new_entries.append({
            'SMILES': canonical_nostereo,
            'Molecule': mol_nostereo,
            # NOTE(review): the ID is derived from the non-canonized string,
            # as before; confirm get_mol_id is canonicalization-invariant.
            'ID': get_mol_id(smiles_nostereo),
        })
        queued_smiles.add(canonical_nostereo)

    new_entries = pd.DataFrame(new_entries)
    if len(new_entries) > 0:
        # Add fingerprints to the new entries
        if morgan_fp_generator is None:
            # Import the submodule explicitly: it is not guaranteed to be
            # reachable as an attribute of `Chem` unless imported somewhere.
            from rdkit.Chem import rdFingerprintGenerator
            morgan_fp_generator = rdFingerprintGenerator.GetMorganGenerator(
                radius=2,
                fpSize=2048,
                useBondTypes=True,
                includeChirality=True,
            )
        new_entries['FP'] = [
            morgan_fp_generator.GetFingerprint(mol)
            for mol in new_entries['Molecule']
        ]
        if verbose:
            print(f'Number of substructures added to the dictionary: {len(new_entries)}')
        updated = pd.concat([dictionary, new_entries], axis=0)
    else:
        updated = dictionary

    # Return the updated dictionary. A column-less empty dictionary with
    # nothing to add has no 'SMILES' column to de-duplicate on.
    if 'SMILES' in updated.columns:
        updated = updated.drop_duplicates(subset='SMILES')
    return updated.reset_index(drop=True)