Spaces:

ailab-bio
/

PROTAC-Splitter-App

Sleeping

App Files Files Community

PROTAC-Splitter-App / protac_splitter /data /curation /mapping_utils.py

ribesstefano

Setup the spaces app

9dd777e 2 months ago

raw

history blame contribute delete

2.89 kB

	from rdkit import Chem
	import pandas as pd

	from protac_splitter.chemoinformatics import (
	canonize_smiles,
	remove_stereo,
	get_mol_id,
	)

	def update_dictionary(
	    dictionary: pd.DataFrame,
	    substr_to_add: list,
	    morgan_fp_generator = None,
	    verbose: int = 0,
	) -> pd.DataFrame:
	    """Add new substructures to a substructure dictionary.

	    The dictionary is a dataframe with columns 'SMILES', 'Molecule', 'ID', and 'FP'.
	    Each new SMILES is canonicalized, parsed with RDKit and, when it differs,
	    its stereochemistry-free variant is added as a separate entry.

	    Args:
	        dictionary: The input dictionary dataframe. It is not modified.
	        substr_to_add: SMILES strings of the substructures to add. `None`
	            items are ignored. The list is not modified.
	        morgan_fp_generator: Object exposing `GetFingerprint(mol)`, used to
	            fill the 'FP' column of the new rows. When None, an RDKit Morgan
	            generator (radius=2, fpSize=2048, bond types and chirality
	            included) is created.
	        verbose: If non-zero, print parsing errors and the number of added
	            rows; if greater than 1, also report SMILES that are already in
	            the dictionary.

	    Returns:
	        A new dataframe holding the existing rows followed by the new ones,
	        de-duplicated on 'SMILES' (first occurrence wins) and re-indexed
	        from 0.
	    """
	    # Canonicalize the inputs. dict.fromkeys de-duplicates while keeping the
	    # caller's order, so the order of the added rows is deterministic.
	    candidates = dict.fromkeys(
	        canonize_smiles(smiles) for smiles in substr_to_add if smiles is not None
	    )

	    # Build the lookup of known SMILES once instead of once per candidate.
	    # The emptiness guard also covers a column-less, empty dictionary.
	    existing = set(dictionary['SMILES']) if not dictionary.empty else set()
	    added = set()

	    new_entries = []
	    for smiles in candidates:
	        # NOTE(review): assumes canonize_smiles may return None for input it
	        # cannot handle -- confirm against its implementation.
	        if smiles is None or smiles in added:
	            continue
	        # Skip (rather than remove from the list being iterated) entries
	        # that are already in the dictionary.
	        if smiles in existing:
	            if verbose > 1:
	                print(f'\tWARNING. SMILES already in the dictionary: {smiles}')
	            continue

	        try:
	            mol = Chem.MolFromSmiles(smiles)
	        except Exception as e:
	            # Best effort: an unparsable entry must not abort the update.
	            if verbose:
	                print(e)
	            mol = None
	        # Drop entries that result in invalid molecules
	        if mol is None:
	            continue
	        added.add(smiles)
	        new_entries.append({
	            'SMILES': smiles,
	            'Molecule': mol,
	            'ID': get_mol_id(smiles),
	        })

	        # Try adding its no-stereochemistry version as well
	        smiles_nostereo = remove_stereo(smiles)
	        if smiles_nostereo is None:
	            continue
	        # Canonicalize before comparing and before computing the ID, so the
	        # stored 'SMILES' and 'ID' are derived from the same string, exactly
	        # as for the primary entry above.
	        smiles_nostereo = canonize_smiles(smiles_nostereo)
	        if (
	            smiles_nostereo is None
	            or smiles_nostereo in added
	            or smiles_nostereo in existing
	        ):
	            continue
	        mol_nostereo = Chem.MolFromSmiles(smiles_nostereo)
	        if mol_nostereo is None:
	            continue
	        added.add(smiles_nostereo)
	        new_entries.append({
	            'SMILES': smiles_nostereo,
	            'Molecule': mol_nostereo,
	            'ID': get_mol_id(smiles_nostereo),
	        })

	    if not new_entries:
	        # Nothing to append: skip concatenating an empty, column-less frame
	        # and only normalize the existing rows.
	        return dictionary.drop_duplicates(subset='SMILES').reset_index(drop=True)

	    new_entries = pd.DataFrame(new_entries)

	    # Add fingerprints to the new entries
	    if morgan_fp_generator is None:
	        # Import the submodule explicitly: `from rdkit import Chem` alone
	        # does not guarantee that `Chem.rdFingerprintGenerator` is loaded.
	        from rdkit.Chem import rdFingerprintGenerator
	        morgan_fp_generator = rdFingerprintGenerator.GetMorganGenerator(
	            radius=2,
	            fpSize=2048,
	            useBondTypes=True,
	            includeChirality=True,
	        )
	    new_entries['FP'] = new_entries['Molecule'].apply(
	        lambda mol: morgan_fp_generator.GetFingerprint(mol)
	    )
	    if verbose:
	        print(f'Number of substructures added to the dictionary: {len(new_entries)}')

	    # Return the updated dictionary
	    updated = pd.concat([dictionary, new_entries], axis=0)
	    return updated.drop_duplicates(subset='SMILES').reset_index(drop=True)