# MoL-MoE Foundation Models - Multi Output (K=4)

In [None]:
# System
import warnings
import sys
# Add the parent directory and its `experts` and `moe` sub-directories to the
# module search path (paths are relative to the current working directory).
# NOTE(review): presumably this is what lets the `moe`, `models` and
# `experts.*` imports in this notebook resolve — confirm against the repo layout.
sys.path.insert(1, '../')
sys.path.insert(2, '../experts')
sys.path.insert(3, '../moe')
# Install a global filter that ignores all warnings from here on.
warnings.filterwarnings("ignore")

# Deep learning
import torch.nn.functional as F
import torch
from torch import nn
from moe import MoE, train
from models import Net

# Machine learning
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Data
import pandas as pd
import numpy as np

# Chemistry
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem import Descriptors
# Session-wide switch: changes how pandas renders DataFrames so that RDKit
# molecule images can be displayed in any DataFrame (global side effect).
PandasTools.RenderImagesInAllDataFrames(True)

def normalize_smiles(smi, canonical=True, isomeric=False):
    """Return a normalized SMILES string for *smi*, or ``None`` on failure.

    Args:
        smi: Input SMILES string. Non-string values (for example ``None`` or
            the ``NaN`` pandas uses for missing cells) yield ``None``.
        canonical: Forwarded to ``Chem.MolToSmiles`` as ``canonical``.
        isomeric: Forwarded to ``Chem.MolToSmiles`` as ``isomericSmiles``.

    Returns:
        The string produced by ``Chem.MolToSmiles``, or ``None`` when the
        input cannot be parsed or converted.
    """
    # Anything that is not a string can never be parsed; answer directly
    # instead of relying on an exception raised inside RDKit.
    if not isinstance(smi, str):
        return None
    try:
        return Chem.MolToSmiles(
            Chem.MolFromSmiles(smi), canonical=canonical, isomericSmiles=isomeric
        )
    except Exception:
        # Best-effort by design: an unparsable SMILES maps to None so callers
        # can drop the row. `Exception` (not a bare `except`) keeps
        # KeyboardInterrupt / SystemExit propagating.
        return None

# Seed PyTorch's random number generator with 0.
torch.manual_seed(0)

# Use CUDA when PyTorch reports it as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

## Load Foundation Models

In [None]:
from experts.selfies_ted.load import SELFIES

# Construct the SELFIES expert wrapper, then call its `load()` method.
# NOTE(review): presumably `load()` fetches/restores the pretrained weights —
# confirm in experts/selfies_ted/load.py.
model_selfies = SELFIES()
model_selfies.load()

In [None]:
from experts.mhg_model.load import load

# Expert returned by the project's `load()` helper; it is used as the
# "MHG-GNN" expert when the MoE model list is built.
mhg_gnn = load()

In [None]:
from experts.smi_ted_light.load import load_smi_ted, MolTranBertTokenizer

# SMI-TED expert. Besides serving as an expert, its `encoder.tok_emb` and
# `_init_weights` attributes are used in the MoE training cell.
smi_ted = load_smi_ted()

## Load datasets

In [None]:
# Read the train / validation / test CSV files of the BBBP dataset, in that order.
train_df, valid_df, test_df = map(
    pd.read_csv,
    (
        "../data/moleculenet/bbbp/train.csv",
        "../data/moleculenet/bbbp/valid.csv",
        "../data/moleculenet/bbbp/test.csv",
    ),
)

In [None]:
# Add the normalized SMILES as `canon_smiles` and drop every row for which
# normalization returned None.
train_df = train_df.assign(
    canon_smiles=train_df['smiles'].apply(normalize_smiles)
).dropna(subset='canon_smiles')
print(train_df.shape)
train_df.head()

In [None]:
# Add the normalized SMILES as `canon_smiles` and drop every row for which
# normalization returned None.
valid_df = valid_df.assign(
    canon_smiles=valid_df['smiles'].apply(normalize_smiles)
).dropna(subset='canon_smiles')
print(valid_df.shape)
valid_df.head()

In [None]:
# Add the normalized SMILES as `canon_smiles` and drop every row for which
# normalization returned None.
test_df = test_df.assign(
    canon_smiles=test_df['smiles'].apply(normalize_smiles)
).dropna(subset='canon_smiles')
print(test_df.shape)
test_df.head()

In [None]:
# Column with the model input (normalized SMILES) and column with the label.
smiles_col = 'canon_smiles'
target = 'p_np'

# Inputs become plain Python lists of SMILES strings; labels stay pandas Series.
X_train, y_train = train_df[smiles_col].to_list(), train_df[target]  # training
X_valid, y_valid = valid_df[smiles_col].to_list(), valid_df[target]  # validation
X_test, y_test = test_df[smiles_col].to_list(), test_df[target]      # test

## Training MoE

In [None]:
# arguments
input_size = 768
output_size = 2048
num_experts = 12
k = 4
batch_size = 16
learning_rate = 3e-5
epochs = 100

# experts: each loaded foundation model is listed four times (12 entries,
# matching `num_experts`)
models = [
    smi_ted, smi_ted, smi_ted, smi_ted,                          # SMI-TED
    model_selfies, model_selfies, model_selfies, model_selfies,  # SELFIES-BART
    mhg_gnn, mhg_gnn, mhg_gnn, mhg_gnn,                          # MHG-GNN
]

# Auxiliary prediction head. It consumes the MoE output, so its input width is
# tied to `output_size` instead of repeating the literal 2048.
# NOTE(review): unlike `moe_model`, `net` is not moved to DEVICE in this cell —
# confirm that `train()` takes care of device placement.
net = Net(smiles_embed_dim=output_size, dropout=0.2, output_dim=2)

# instantiate the MoE layer
tokenizer = MolTranBertTokenizer('../experts/smi_ted_light/bert_vocab_curated.txt')
moe_model = MoE(
    input_size,
    output_size,
    num_experts,
    models=models,
    tokenizer=tokenizer,
    tok_emb=smi_ted.encoder.tok_emb,
    k=k,
    noisy_gating=False,
    verbose=False,
).to(DEVICE)

# Initialise the head with the SMI-TED model's weight-initialisation routine.
net.apply(smi_ted._init_weights)

# One optimizer updates the parameters of both the MoE and the head.
loss_fn = nn.CrossEntropyLoss()
params = list(moe_model.parameters()) + list(net.parameters())
optim = torch.optim.AdamW(params, lr=learning_rate)

# Each dataset item is a (SMILES string, label) pair.
train_loader = torch.utils.data.DataLoader(
    list(zip(X_train, y_train)),
    batch_size=batch_size,
    shuffle=True,
    num_workers=1,
)

# train
moe_model, net = train(train_loader, moe_model, net, loss_fn, optim, epochs)

## Evaluate (using auxiliary Net)

In [None]:
# Put both modules into evaluation mode before predicting.
moe_model.eval()
net.eval()

# No gradients are needed for inference.
with torch.no_grad():
    out, _ = moe_model(X_test, verbose=False)
    preds = net(out)
    # Softmax over the class dimension, keeping the class-1 column. The
    # explicit `.cpu()` makes the name `preds_cpu` true even when the model
    # ran on a GPU (it is a no-op for tensors already on the CPU), so the
    # later NumPy conversion cannot fail on a CUDA tensor.
    preds_cpu = F.softmax(preds, dim=1)[:, 1].cpu()
    print('Prediction probabilities:', preds_cpu[:30])

In [None]:
# `.numpy()` only works on CPU tensors; `.cpu()` is a no-op when the tensor is
# already there and prevents a failure when predictions live on a GPU.
roc_auc = roc_auc_score(y_test, preds_cpu.detach().cpu().numpy())
print(f"ROC-AUC Score: {roc_auc:.4f}")

## Training XGBoost from MoE

In [None]:
# extract embeddings
moe_model.eval()
net.eval()

# Run the trained MoE over each split without tracking gradients; only the
# first element of the returned pair is kept as the feature matrix.
with torch.no_grad():
    xgb_train, _ = moe_model(X_train, verbose=True)
    xgb_valid, _ = moe_model(X_valid, verbose=True)
    xgb_test, _ = moe_model(X_test, verbose=True)

# Convert to NumPy arrays. `.cpu()` is required before `.numpy()` when the
# tensors live on a GPU and is a no-op when they are already on the CPU.
xgb_train = xgb_train.detach().cpu().numpy()
xgb_valid = xgb_valid.detach().cpu().numpy()
xgb_test = xgb_test.detach().cpu().numpy()

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
import numpy as np

# Seeds 0, 10, ..., 90. Defined once so the search loop and the plot's x-axis
# cannot drift apart.
seeds = list(range(0, 91, 10))

# Hyperparameter candidates. The grid is identical for every seed; the seed
# only changes which combinations RandomizedSearchCV samples.
xgb_params = {
    'learning_rate': [0.01, 0.4, 0.6, 0.8],
    'max_depth': [6, 8, 10, 12],
    'n_estimators': [1500, 2000, 2200],
}

# Test-set ROC-AUC of the best estimator found for each seed.
roc_auc_scores = []

for seed in seeds:
    # Fresh classifier with default settings for every search.
    xgb_classifier = XGBClassifier()

    # Randomized search: 10 sampled candidates, 3-fold CV on the training
    # embeddings, ranked by ROC-AUC.
    random_search = RandomizedSearchCV(
        estimator=xgb_classifier,
        param_distributions=xgb_params,
        n_iter=10,
        scoring='roc_auc',
        cv=3,
        random_state=seed,
    )
    random_search.fit(xgb_train, y_train)

    # Score the refit best estimator on the held-out test embeddings.
    best_estimator = random_search.best_estimator_
    y_prob = best_estimator.predict_proba(xgb_test)[:, 1]

    roc_auc = roc_auc_score(y_test, y_prob)
    roc_auc_scores.append(roc_auc)

    print(f"Seed {seed}: ROC-AUC Score: {roc_auc:.4f}")

# Spread and mean of the per-seed scores.
std_dev = np.std(roc_auc_scores)
avg_roc_auc = np.mean(roc_auc_scores)

# Plot each seed's score with the overall standard deviation as its error bar,
# plus a dashed line at the mean.
plt.figure(figsize=(8, 6))
plt.errorbar(seeds, roc_auc_scores, yerr=std_dev, fmt='o', color='b')
plt.hlines(avg_roc_auc, xmin=-1, xmax=91, colors='r', linestyles='dashed', label=f'Average ROC-AUC: {avg_roc_auc:.4f}')
plt.xlabel('Seed')
plt.ylabel('ROC-AUC Score')
plt.title('ROC-AUC Scores with Standard Deviation')
plt.legend()
plt.grid(True)
plt.show()