File size: 2,889 Bytes
9dd777e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
from rdkit import Chem
import pandas as pd

from protac_splitter.chemoinformatics import (
    canonize_smiles,
    remove_stereo,
    get_mol_id,
)

def update_dictionary(
        dictionary: pd.DataFrame,
        substr_to_add: list,
        morgan_fp_generator = None,
        verbose: int = 0,
) -> pd.DataFrame:
    """ Updates a dictionary with a list of additional substructures.
    
    The dictionary is a dataframe with columns 'SMILES', 'Molecule', 'ID', and 'FP'.

    Each new substructure is canonized and, if it is not already present in
    the dictionary and can be parsed into a molecule, appended as a new row.
    When removing stereochemistry yields a different SMILES, that
    no-stereochemistry variant is appended as well.

    Args:
        dictionary: The input dictionary dataframe. It is not modified in place.
        substr_to_add: The list of additional substructures, as SMILES strings.
            None items are ignored.
        morgan_fp_generator: Object exposing `GetFingerprint(mol)`, used to fill
            the 'FP' column of the new rows. If None, an RDKit Morgan generator
            (radius=2, fpSize=2048, bond types and chirality enabled) is created.
        verbose: If non-zero, prints parsing errors and the number of added
            substructures. If greater than 1, also prints a warning for every
            SMILES that is already in the dictionary.

    Returns:
        The updated dictionary dataframe, de-duplicated on 'SMILES' (the first
        occurrence is kept, so existing rows win) and with a fresh index.
    """
    # Canonize the SMILES strings. dict.fromkeys() de-duplicates while keeping
    # the first-seen order, so the order of the added rows is reproducible.
    candidates = list(dict.fromkeys(
        canonize_smiles(smiles) for smiles in substr_to_add if smiles is not None
    ))

    # Build the lookup of existing SMILES once, instead of once per candidate.
    known_smiles = set() if dictionary.empty else set(dictionary['SMILES'])
    # SMILES queued for insertion during this call (prevents duplicated rows).
    queued_smiles = set()

    new_entries = []
    for smiles in candidates:
        # Guard in case canonization produced no usable SMILES for an input.
        if smiles is None:
            continue
        # Skip entries already in the dictionary. This is done with `continue`
        # rather than by removing items from the list being iterated, which
        # would make the loop skip the element following each removal.
        if smiles in known_smiles:
            if verbose > 1:
                print(f'\tWARNING. SMILES already in the dictionary: {smiles}')
            continue
        if smiles in queued_smiles:
            continue
        try:
            mol = Chem.MolFromSmiles(smiles)
        except Exception as e:
            if verbose:
                print(e)
            mol = None
        # Remove entries that result in invalid molecules
        if mol is None:
            continue
        new_entries.append({
            'SMILES': smiles,
            'Molecule': mol,
            'ID': get_mol_id(smiles),
        })
        queued_smiles.add(smiles)

        # Try adding its no-stereochemistry version as well
        smiles_nostereo = remove_stereo(smiles)
        if smiles_nostereo is None or smiles_nostereo == smiles:
            continue
        mol_nostereo = Chem.MolFromSmiles(smiles_nostereo)
        if mol_nostereo is None:
            continue
        canonical_nostereo = canonize_smiles(smiles_nostereo)
        # Only queue the variant if it is usable and genuinely new; duplicates
        # would be discarded by the final de-duplication anyway.
        if (
            canonical_nostereo is None
            or canonical_nostereo in known_smiles
            or canonical_nostereo in queued_smiles
        ):
            continue
        new_entries.append({
            'SMILES': canonical_nostereo,
            'Molecule': mol_nostereo,
            # NOTE(review): the ID is derived from the non-canonized string
            # while 'SMILES' stores the canonized one - confirm get_mol_id
            # is insensitive to that difference.
            'ID': get_mol_id(smiles_nostereo),
        })
        queued_smiles.add(canonical_nostereo)

    new_entries = pd.DataFrame(new_entries)

    if new_entries.empty:
        updated = dictionary
    else:
        # Add fingerprints to the new entries
        if morgan_fp_generator is None:
            # Import the submodule explicitly instead of relying on it being
            # reachable as an attribute of `Chem`, which requires that some
            # other module has already imported it.
            from rdkit.Chem import rdFingerprintGenerator
            morgan_fp_generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048, useBondTypes=True, includeChirality=True)

        new_entries['FP'] = new_entries['Molecule'].apply(lambda x: morgan_fp_generator.GetFingerprint(x) if x is not None else None)
        if verbose:
            print(f'Number of substructures added to the dictionary: {len(new_entries)}')
        updated = pd.concat([dictionary, new_entries], axis=0)

    # Return the updated dictionary. An empty, column-less dictionary with
    # nothing added has no 'SMILES' column to de-duplicate on.
    if 'SMILES' in updated.columns:
        updated = updated.drop_duplicates(subset='SMILES')
    return updated.reset_index(drop=True)