File size: 2,333 Bytes
9dd777e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from typing import Any, List, Optional, Union

import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import rdFingerprintGenerator

def get_fp(
    smiles: str,
    fp_generator: Optional[Any] = None,
    return_np: bool = True,
) -> Optional[Union[np.ndarray, "DataStructs.ExplicitBitVect"]]:
    """
    Get the Morgan fingerprint of a molecule from its SMILES representation.

    Parameters:
        smiles (str): The SMILES string of the molecule.
        fp_generator (Any, optional): The fingerprint generator to use. It must
            provide ``GetFingerprintAsNumPy(mol)`` and ``GetFingerprint(mol)``.
            If None, a Morgan generator with radius=16, fpSize=1024,
            useBondTypes=True and includeChirality=True is created for this call.
        return_np (bool): Whether to return the fingerprint as a NumPy array.
            Defaults to True.

    Returns:
        The fingerprint of the molecule, or None if RDKit cannot parse the
        SMILES. With ``return_np=True`` this is the result of
        ``fp_generator.GetFingerprintAsNumPy`` (a NumPy array); with
        ``return_np=False`` it is the result of ``fp_generator.GetFingerprint``
        (an RDKit bit vector suitable for ``DataStructs`` similarity functions).
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit signals an unparsable SMILES by returning None; pass that on
        # so the caller decides whether it is an error.
        return None

    if fp_generator is None:
        # NOTE(review): radius=16 is much larger than the radius 2-3 commonly
        # used for Morgan/ECFP fingerprints -- confirm this is intentional.
        fp_generator = rdFingerprintGenerator.GetMorganGenerator(
            radius=16,
            fpSize=1024,
            useBondTypes=True,
            includeChirality=True,
        )

    if return_np:
        return fp_generator.GetFingerprintAsNumPy(mol)
    return fp_generator.GetFingerprint(mol)

def average_tanimoto_distance(
    smiles: str,
    fingerprints: List[DataStructs.ExplicitBitVect],
    morgan_fp_generator: Optional[Any] = None,
) -> float:
    """
    Compute the average Tanimoto distance between a query SMILES and a list of RDKit fingerprints.

    Parameters:
        smiles (str): SMILES string of the query molecule.
        fingerprints (list): List of RDKit fingerprint objects (e.g., ExplicitBitVect).
        morgan_fp_generator: RDKit Morgan fingerprint generator used for the
            query molecule. If None, the default generator of ``get_fp`` is used.
            NOTE(review): presumably this must be the same generator
            configuration that produced ``fingerprints`` so the bit vectors
            are comparable -- confirm against callers.

    Returns:
        float: Average Tanimoto distance (1 - similarity) between the query and
        the fingerprints. NaN if ``fingerprints`` is empty.

    Raises:
        ValueError: If ``smiles`` cannot be parsed into a molecule.
    """
    query_fp = get_fp(smiles, morgan_fp_generator, return_np=False)
    if query_fp is None:
        raise ValueError(f"Invalid SMILES string: {smiles}")

    # The mean over zero distances is undefined. Return NaN explicitly instead
    # of letting NumPy emit a "mean of empty slice" RuntimeWarning.
    if len(fingerprints) == 0:
        return float("nan")

    distances = DataStructs.BulkTanimotoSimilarity(query_fp, fingerprints, returnDistance=True)

    # Coerce to a builtin float so the return value matches the annotation.
    return float(np.mean(distances))

def numpy_to_rdkit_fp(arr: np.ndarray) -> DataStructs.ExplicitBitVect:
    """
    Convert a NumPy array to an RDKit ExplicitBitVect.

    Parameters:
        arr (np.ndarray): One-dimensional numeric or boolean array with one
            entry per bit. Any non-zero entry is treated as a set bit.

    Returns:
        DataStructs.ExplicitBitVect: Bit vector built from a '0'/'1' string
        with one character per entry of ``arr``.

    Raises:
        ValueError: If ``arr`` is not one-dimensional.
    """
    bits = np.asarray(arr)
    if bits.ndim != 1:
        raise ValueError(
            f"Expected a 1-D fingerprint array, got an array with shape {bits.shape}"
        )

    # Binarise explicitly instead of stringifying the raw values: bool arrays
    # would stringify to "True"/"False", floats to "1.0"/"0.0", and values
    # >= 10 to several digits, none of which is one character per bit.
    chars = np.where(bits.astype(bool), "1", "0")
    return DataStructs.CreateFromBitString("".join(chars.tolist()))