File size: 15,725 Bytes
9dd777e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
import logging
from typing import Optional

from rdkit import Chem

from protac_splitter.chemoinformatics import (
    canonize,
    dummy2query,
    remove_attach_atom,
    remove_dummy_atoms,
)
from protac_splitter.evaluation import (
    split_prediction,
    check_reassembly,
)
from protac_splitter.data.curation.substructure_extraction import get_attachment_bonds

def fix_tetrahedral_centers_ligand(
        protac_mol: Chem.Mol,
        ligand_smiles: str,
        attachment_id: int = 1,
) -> Optional[str]:
    """ Re-extracts a ligand from the PROTAC so that it inherits the PROTAC's stereocenters.

    The ligand is parsed, passed through `remove_dummy_atoms`, and located in
    the PROTAC with a chirality-agnostic substructure match. The single bond
    reported by `get_attachment_bonds` is then broken, and the resulting
    fragment that contains the ligand is returned as SMILES, with its dummy
    atom rewritten from the isotope form `[<id>*]` to the atom-map form
    `[*:<id>]`.

    Args:
        protac_mol (Chem.Mol): The RDKit molecule object of the PROTAC.
        ligand_smiles (str): The SMILES of the ligand to fix.
        attachment_id (int): The attachment point id of the ligand. Default is 1.

    Returns:
        A string containing the fixed ligand SMILES, or None if the fixing
        process failed (invalid SMILES, ligand not found in the PROTAC, not
        exactly one attachment bond, fragment sanitization failure, or no
        fragment containing the ligand).
    """
    ligand_mol = Chem.MolFromSmiles(ligand_smiles)
    if ligand_mol is None:
        logging.error(f"Invalid ligand SMILES: {ligand_smiles}")
        return None

    ligand_mol = remove_dummy_atoms(ligand_mol)
    # Chirality is deliberately ignored: the ligand's stereocenters may be
    # wrong, which is exactly what this function is meant to repair.
    ligand_match = protac_mol.GetSubstructMatch(ligand_mol, useChirality=False)
    if not ligand_match:
        logging.error(f"Ligand not found in PROTAC: {ligand_smiles}")
        return None

    # Get bonds to break to separate the ligand
    bonds_to_break = get_attachment_bonds(protac_mol, ligand_match)

    # A ligand must be connected to the rest of the PROTAC by exactly one bond
    if len(bonds_to_break) != 1:
        logging.error(f'ERROR: Expected exactly one attachment bond, found {len(bonds_to_break)}')
        return None

    # Break the bond to isolate the ligand; both new dummy atoms get the same label
    frag_ligand_mol = Chem.FragmentOnBonds(
        protac_mol,
        bonds_to_break,
        addDummies=True,
        dummyLabels=[(attachment_id, attachment_id)],
    )

    # Get the fragments resulting from bond breaking
    try:
        frags = Chem.GetMolFrags(frag_ligand_mol, asMols=True, sanitizeFrags=True)
    except Exception as e:
        # Sanitization can fail in several RDKit-specific ways; treat any of
        # them as "could not fix" rather than crashing the caller.
        logging.error(e)
        return None

    # Identify the ligand fragment (first fragment that contains the ligand)
    ligand_fragment = next(
        (frag for frag in frags if frag.HasSubstructMatch(ligand_mol)),
        None,
    )
    if ligand_fragment is None:
        logging.error('ERROR: Ligand fragment not found')
        return None

    ligand_fixed = Chem.MolToSmiles(ligand_fragment)
    ligand_fixed = canonize(ligand_fixed.replace(f'[{attachment_id}*]', f'[*:{attachment_id}]'))
    return ligand_fixed


def fix_prediction(
        protac_smiles: str,
        pred_smiles: str,
        poi_attachment_id: int = 1,
        e3_attachment_id: int = 2,
        remove_stereochemistry: bool = False,
        verbose: int = 0,
) -> Optional[str]:
    """ Fixes a prediction by replacing the substructure that does not match the PROTAC with the rest of the PROTAC.

    The following strategies are tried in order, returning as soon as one
    passes `check_reassembly` against the PROTAC:

    1. Renaming the attachment points in the POI, the E3, or both, optionally
       combined with swapping the two attachment points in the linker.
    2. Re-assembling the prediction as is (after stripping stereochemistry
       from the PROTAC and the substructures if `remove_stereochemistry`).
    3. Identifying the single wrong substructure and rebuilding it as what
       is left of the PROTAC once the other two substructures are removed.
    4. If the rebuilt prediction still fails, re-extracting the E3 and POI
       ligands from the PROTAC via `fix_tetrahedral_centers_ligand`.

    Args:
        protac_smiles (str): The SMILES of the PROTAC.
        pred_smiles (str): The SMILES of the prediction.
        poi_attachment_id (int): The attachment point id of the POI. Default is 1.
        e3_attachment_id (int): The attachment point id of the E3 ligase. Default is 2.
        remove_stereochemistry (bool): Whether to strip stereochemistry from
            the PROTAC and the substructures before the later fixing steps.
            Default is False.
        verbose (int): The verbosity level. Currently has no effect; kept for
            backward compatibility. Default is 0.

    Returns:
        A string containing the fixed predictions, formatted as
        "<e3>.<linker>.<poi>", or None if the fixing process failed.
    """
    protac_mol = Chem.MolFromSmiles(protac_smiles)
    if protac_mol is None:
        logging.warning(f"Invalid PROTAC SMILES: {protac_smiles}")
        return None

    substructs = split_prediction(pred_smiles)

    # If there are at least two None values, there's nothing we can do to fix it
    if sum(v is None for v in substructs.values()) >= 2:
        logging.warning(f'Unable to continue, more than two substructures are not valid for given input: "{pred_smiles}"')
        return None

    # Get molecules of PROTAC and substructures
    substructs = {k: {'smiles': v, 'mol': Chem.MolFromSmiles(v) if v is not None else v} for k, v in substructs.items()}

    # Check if renaming the attachment points might already fix the prediction
    for sub in ['poi', 'e3', 'both']:
        # NOTE: these branches must be mutually exclusive (if/elif/else),
        # otherwise the 'e3' attempt is overwritten by the 'both' attempt.
        if sub == 'e3':
            if substructs['e3']['smiles'] is None:
                continue
            e3_attempt = substructs['e3']['smiles'].replace(f'[*:{poi_attachment_id}]', f'[*:{e3_attachment_id}]')
            poi_attempt = substructs['poi']['smiles']
        elif sub == 'poi':
            if substructs['poi']['smiles'] is None:
                continue
            e3_attempt = substructs['e3']['smiles']
            poi_attempt = substructs['poi']['smiles'].replace(f'[*:{e3_attachment_id}]', f'[*:{poi_attachment_id}]')
        else:
            if substructs['e3']['smiles'] is None or substructs['poi']['smiles'] is None:
                continue
            e3_attempt = substructs['e3']['smiles'].replace(f'[*:{e3_attachment_id}]', f'[*:{poi_attachment_id}]')
            poi_attempt = substructs['poi']['smiles'].replace(f'[*:{poi_attachment_id}]', f'[*:{e3_attachment_id}]')

        protac_attempt = f"{e3_attempt}.{substructs['linker']['smiles']}.{poi_attempt}"
        if check_reassembly(protac_smiles, protac_attempt):
            logging.info(f'Input works when renaming attachment points in {sub.title()} substruct. SMILES: "{protac_attempt}"')
            return protac_attempt

        # Check if swapping the POI and E3 attachments in the linker might already fix the prediction
        if substructs['linker']['smiles'] is None:
            continue
        # Swap the two ids via a temporary placeholder so that the first
        # replacement is not undone by the second one.
        linker_attempt = substructs['linker']['smiles']
        linker_attempt = linker_attempt.replace(f'[*:{poi_attachment_id}]', '[*:DUMMY]')
        linker_attempt = linker_attempt.replace(f'[*:{e3_attachment_id}]', f'[*:{poi_attachment_id}]')
        linker_attempt = linker_attempt.replace('[*:DUMMY]', f'[*:{e3_attachment_id}]')

        # Try with the original POI and E3 substructures
        protac_attempt = f"{substructs['e3']['smiles']}.{linker_attempt}.{substructs['poi']['smiles']}"
        if check_reassembly(protac_smiles, protac_attempt):
            logging.info(f'Input works when swapping POI and E3 attachment points in the linker. Fixed SMILES: "{protac_attempt}"')
            return protac_attempt

        # Try with the swapped POI and E3 substructures
        protac_attempt = f"{e3_attempt}.{linker_attempt}.{poi_attempt}"
        if check_reassembly(protac_smiles, protac_attempt):
            logging.info(f'Input works when swapping POI and E3 attachment points in the linker and in {sub.title()} substruct. Fixed SMILES: "{protac_attempt}"')
            return protac_attempt

    # Check if removing stereochemistry results in a valid prediction
    if remove_stereochemistry:
        # From here on, both the PROTAC and the substructures are stereo-free
        Chem.RemoveStereochemistry(protac_mol)
        protac_smiles = Chem.MolToSmiles(protac_mol, canonical=True)
        for k, v in substructs.items():
            if v['mol'] is not None:
                Chem.RemoveStereochemistry(v['mol'])
                substructs[k]['smiles'] = Chem.MolToSmiles(v['mol'], canonical=True)

    # NOTE: this re-assembly check runs regardless of `remove_stereochemistry`
    if all(v['mol'] is not None for v in substructs.values()):
        if check_reassembly(
            protac_smiles,
            '.'.join([v['smiles'] for v in substructs.values()]),
        ):
            logging.info(f'Input works when removing stereochemistry. SMILES: "{pred_smiles}"')
            return f"{substructs['e3']['smiles']}.{substructs['linker']['smiles']}.{substructs['poi']['smiles']}"

    # Check if any of the substructures is NOT a substructure of the PROTAC, if
    # so, we mark it as the wrong substructure to fix.
    num_matches = 0
    wrong_substruct = None
    for sub in ['poi', 'linker', 'e3']:
        if substructs[sub]['mol'] is None:
            substructs[sub]['match'] = False
            wrong_substruct = sub
        elif protac_mol.HasSubstructMatch(dummy2query(substructs[sub]['mol'])):
            substructs[sub]['match'] = True
            num_matches += 1
        else:
            substructs[sub]['match'] = False
            wrong_substruct = sub

    if num_matches < 2:
        logging.warning(f'Prediction does not contain at least two matching substructures of the PROTAC. Num matches: {num_matches}. Prediction SMILES: "{pred_smiles}"')
        return None

    # If the wrong substructure is still matching in the PROTAC, we need a
    # more complex approach to fix the prediction (see below).
    def remove_substructure(mol, substructure, attachment_id, replaceDummies=False):
        """ Removes `substructure` from `mol` and relabels the resulting dummies with `attachment_id`.

        Returns the result of `canonize` on the remaining molecule, or None if
        either input is None or `Chem.ReplaceCore` fails.
        """
        if mol is None or substructure is None:
            return None
        smaller_mol = Chem.ReplaceCore(
            mol,
            substructure,
            labelByIndex=False,
            replaceDummies=replaceDummies,
        )
        if smaller_mol is None:
            logging.warning(f'Failed to remove substructure from prediction SMILES: "{pred_smiles}"')
            return None
        smaller_smiles = Chem.MolToSmiles(smaller_mol, canonical=True)
        # Both dummy labels that can appear here are mapped to the same id
        smaller_smiles = smaller_smiles.replace('[1*]', f'[*:{attachment_id}]')
        smaller_smiles = smaller_smiles.replace('[2*]', f'[*:{attachment_id}]')
        smaller_mol = canonize(Chem.MolFromSmiles(smaller_smiles))
        return smaller_mol

    # If we still have 3 matches: for each substructure, we progressively remove
    # the other substructures, then we check if the resulting molecule is valid
    # and has only one fragment.
    if num_matches == 3:
        wrong_substruct = None
        for sub in ['poi', 'linker', 'e3']:
            removed_mol = Chem.MolFromSmiles(protac_smiles)

            # Put the current substructure at the end of the list [poi, e3, linker]
            sub_names = ['poi', 'e3', 'linker']
            sub_names.remove(sub)
            sub_names.append(sub)
            # The linker often matches in many parts of the PROTAC, so we remove
            # it when checking the E3 and POI substructures.
            if sub != 'linker':
                sub_names.remove('linker')

            for s in sub_names:
                attachment_id = poi_attachment_id if s == 'poi' else e3_attachment_id
                removed_mol = remove_substructure(
                    removed_mol,
                    dummy2query(substructs[s]['mol']),
                    attachment_id=attachment_id,
                )

            # Check if resulting molecule is None, if so, it is the wrong one
            if removed_mol is None:
                substructs[sub]['match'] = False
                wrong_substruct = sub
                num_matches -= 1
                break

            # More than one leftover fragment also marks this substructure as wrong
            leftover_frags = Chem.GetMolFrags(removed_mol, asMols=True, sanitizeFrags=False)
            if len(leftover_frags) > 1:
                substructs[sub]['match'] = False
                wrong_substruct = sub
                num_matches -= 1
                break

    if num_matches == 3:
        logging.warning(f'Prediction already contains all matching substructures of the PROTAC. Prediction SMILES: "{pred_smiles}"')
        return None

    # Get the order in which to remove the substructures and get the final one
    # as the fixed molecule.
    if wrong_substruct == 'linker':
        # Remove the bigger ligand first
        poi_atoms = substructs['poi']['mol'].GetNumAtoms()
        e3_atoms = substructs['e3']['mol'].GetNumAtoms()
        order = ['poi', 'e3'] if poi_atoms > e3_atoms else ['e3', 'poi']
    elif wrong_substruct == 'poi':
        order = ['e3', 'linker']
    else:
        order = ['poi', 'linker']

    logging.debug(f'Wrong substructure: {wrong_substruct.upper()}. Order: {order}')

    fixed_mol = protac_mol
    for sub in order:
        logging.debug(f'Removing substructure {sub.upper()} from PROTAC.')

        if 'linker' not in order:
            fixed_attach_id = poi_attachment_id if sub == 'poi' else e3_attachment_id
        else:
            fixed_attach_id = poi_attachment_id if 'e3' in order else e3_attachment_id

        if sub == 'linker':
            attach_id = poi_attachment_id if wrong_substruct == 'poi' else e3_attachment_id
            fixed_attach_id = poi_attachment_id if wrong_substruct == 'poi' else e3_attachment_id
            query_mol = remove_attach_atom(substructs[sub]['mol'], attach_id)
            replaceDummies = True
        else:
            query_mol = dummy2query(substructs[sub]['mol'])
            replaceDummies = False

        fixed_mol_tmp = remove_substructure(
            fixed_mol,
            query_mol,
            attachment_id=fixed_attach_id,
            replaceDummies=replaceDummies,
        )
        if fixed_mol_tmp is None:
            # Best effort: keep the molecule as it was and try the next removal
            logging.debug(f'Failed to replace substructure "{sub}" in prediction SMILES: "{pred_smiles}"')
            continue

        fixed_mol = fixed_mol_tmp

        # If there are multiple fragments, keep the biggest one
        fragments = Chem.GetMolFrags(fixed_mol, asMols=True)
        if len(fragments) > 1:
            logging.debug('Fixed molecule contains more than one fragment. Keeping the biggest one.')
            fixed_mol = max(fragments, key=lambda x: x.GetNumAtoms())

    # Get the SMILES of the fixed molecule
    fixed_smiles = Chem.MolToSmiles(canonize(fixed_mol), canonical=True)
    substructs[wrong_substruct]['smiles'] = fixed_smiles

    # Concatenate the substructures and check if the re-assembly is correct
    fixed_pred_smiles = f"{substructs['e3']['smiles']}.{substructs['linker']['smiles']}.{substructs['poi']['smiles']}"

    if not check_reassembly(
        protac_smiles,
        fixed_pred_smiles,
    ):
        # Check if by flipping the tetrahedral centers of the ligands we can
        # still fix the prediction.
        protac_mol = canonize(Chem.MolFromSmiles(protac_smiles))
        chiral_centers = Chem.FindMolChiralCenters(
            protac_mol,
            includeUnassigned=True,
            useLegacyImplementation=False,
        )
        if not chiral_centers:
            # No stereocenters at all, so stereochemistry cannot be the culprit
            logging.warning(f"Failed to fix prediction, re-assembly check failed. Generated fixed prediction (failing): {fixed_pred_smiles}")
            return None

        # Attempt to fix the tetrahedral centers of the ligands
        e3_fixed = fix_tetrahedral_centers_ligand(protac_mol, substructs['e3']['smiles'], attachment_id=e3_attachment_id)
        poi_fixed = fix_tetrahedral_centers_ligand(protac_mol, substructs['poi']['smiles'], attachment_id=poi_attachment_id)
        if e3_fixed is None or poi_fixed is None:
            logging.warning(f"Failed to fix prediction, re-assembly check failed. Generated fixed prediction (failing): {fixed_pred_smiles}")
            return None

        # Update the substructures with the fixed ligands and check re-assembly
        substructs['e3']['smiles'] = e3_fixed
        substructs['poi']['smiles'] = poi_fixed
        fixed_pred_smiles = f"{substructs['e3']['smiles']}.{substructs['linker']['smiles']}.{substructs['poi']['smiles']}"
        if not check_reassembly(
            protac_smiles,
            fixed_pred_smiles,
        ):
            logging.warning(f"Failed to fix prediction, re-assembly check failed. Generated fixed prediction (failing): {fixed_pred_smiles}")
            return None

    return fixed_pred_smiles