In [None]:
import torch
from tqdm import tqdm

def generate_from_extrapolated_embeddings(model, tokenizer, base_embeddings,
                                          target_properties, extrapolation_factor=0.2):
    """
    Generate one SMILES string from an embedding shifted along a property direction.

    The samples are split at the median of ``target_properties``. The unit vector
    pointing from the mean embedding of the below-median samples to the mean
    embedding of the above-median samples is taken as the direction of
    increasing property. The mean of all base embeddings is then moved along
    that direction and handed to ``model.generate``.

    Args:
        model: Model exposing ``eval()``, ``parameters()`` and ``generate(embeddings)``.
        tokenizer: Tokenizer exposing ``decode(ids, skip_special_tokens=True)``.
        base_embeddings: Tensor of base embeddings, one sample per index along dim 0.
        target_properties: One property value per sample (tensor or array-like).
        extrapolation_factor: Length of the step taken along the *unit* property
            direction. This is an absolute distance in embedding space, not a
            percentage of the embedding norm.

    Returns:
        Tuple ``(generated_smiles, extrapolated_embeddings)``: the decoded first
        generated sequence, and the shifted embedding with a leading batch
        dimension of 1, placed on the model's device.

    Raises:
        ValueError: If the number of property values does not match the number
            of embeddings, if either median group is empty (e.g. all property
            values are tied or NaN), or if the two group means coincide so that
            no direction can be derived.
    """
    model.eval()
    device = next(model.parameters()).device

    target_properties = torch.as_tensor(target_properties).reshape(-1)
    if target_properties.numel() == 0:
        raise ValueError("target_properties is empty")
    if target_properties.shape[0] != base_embeddings.shape[0]:
        raise ValueError(
            f"Got {target_properties.shape[0]} property values for "
            f"{base_embeddings.shape[0]} embeddings"
        )

    # Calculate property gradient direction in embedding space
    # This is a simplified approach - you might want to use more sophisticated methods
    mean_embedding = torch.mean(base_embeddings, dim=0)

    # Find direction of increasing properties. Samples exactly at the median
    # belong to neither group. The masks are moved next to the embeddings so
    # the boolean indexing below works regardless of where the properties live.
    median_value = torch.median(target_properties)
    high_prop_mask = (target_properties > median_value).to(base_embeddings.device)
    low_prop_mask = (target_properties < median_value).to(base_embeddings.device)

    # The mean of an empty selection is NaN and would silently poison generation.
    if not high_prop_mask.any() or not low_prop_mask.any():
        raise ValueError(
            "Cannot derive a property direction: no samples strictly above "
            "or strictly below the median of target_properties"
        )

    high_prop_embeddings = base_embeddings[high_prop_mask].mean(dim=0)
    low_prop_embeddings = base_embeddings[low_prop_mask].mean(dim=0)

    property_direction = high_prop_embeddings - low_prop_embeddings
    direction_norm = torch.norm(property_direction)
    if not torch.isfinite(direction_norm) or direction_norm == 0:
        raise ValueError(
            "Cannot derive a property direction: high- and low-property "
            "mean embeddings coincide or are not finite"
        )
    property_direction = property_direction / direction_norm

    # Generate extrapolated embeddings
    extrapolated_embeddings = mean_embedding + extrapolation_factor * property_direction
    extrapolated_embeddings = extrapolated_embeddings.unsqueeze(0).to(device)

    # Generate SMILES from extrapolated embeddings
    with torch.no_grad():
        generated_ids = model.generate(extrapolated_embeddings)
    generated_smiles = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    return generated_smiles, extrapolated_embeddings

def generate_slerp(model, tokenizer, base_embeddings,
                   df, target_col, extrapolation_factor=0.2):
    """
    Generate one molecule from a SLERP-extrapolated embedding and validate it.

    The target property is the sum of the z-scored ``'CO2'`` and ``'CH4'``
    columns of ``df`` (equal weighting). It is passed together with
    ``base_embeddings`` to ``slerp_extrapolation``; the result is decoded with
    the model and the decoded SMILES is checked with RDKit.

    Args:
        model: Model exposing ``eval()``, ``parameters()`` and ``generate(embeddings)``.
        tokenizer: Tokenizer exposing ``decode(ids, skip_special_tokens=True)``.
        base_embeddings: Base embeddings forwarded to ``slerp_extrapolation``.
        df: Table providing the ``'CO2'`` and ``'CH4'`` columns.
        target_col: Currently unused; the target is always built from the
            ``'CO2'`` and ``'CH4'`` columns. Kept so existing callers keep working.
        extrapolation_factor: Forwarded to ``slerp_extrapolation`` as ``factor``.

    Returns:
        A list holding a single dict with the keys ``generated_smiles``,
        ``canonical_smiles`` (``"INVALID"`` when RDKit cannot parse the SMILES),
        ``is_valid``, ``target_property`` and ``extrapolation_factor``.
    """
    model.eval()
    device = next(model.parameters()).device

    co2_properties = torch.tensor(df['CO2'].values, dtype=torch.float32)
    ch4_properties = torch.tensor(df['CH4'].values, dtype=torch.float32)

    # Normalize properties to same scale before combining
    co2_norm = (co2_properties - co2_properties.mean()) / co2_properties.std()
    ch4_norm = (ch4_properties - ch4_properties.mean()) / ch4_properties.std()

    # Combined property (equal weighting)
    target_properties = co2_norm + ch4_norm

    extrapolated_embeddings = slerp_extrapolation(base_embeddings, target_properties, factor=extrapolation_factor)
    generated_molecules = []

    # Generate SMILES from extrapolated embeddings. The embedding follows the
    # model's own device instead of assuming CUDA, so CPU-only runs work too.
    with torch.no_grad():
        generated_ids = model.generate(extrapolated_embeddings.to(device))
    generated_smiles = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    # Validate generated molecule
    mol = Chem.MolFromSmiles(generated_smiles)
    is_valid = mol is not None

    if is_valid:
        canonical_smiles = MolToSmiles(mol, canonical=True)
    else:
        canonical_smiles = "INVALID"

    # canonical_smiles is reported alongside the raw SMILES, matching the
    # record layout produced by generate_dual_enhanced_molecules.
    generated_molecules.append({
        'generated_smiles': generated_smiles,
        'canonical_smiles': canonical_smiles,
        'is_valid': is_valid,
        'target_property': 'slerp',
        'extrapolation_factor': extrapolation_factor
    })

    print(f"Generation: {generated_smiles} (Valid: {is_valid})")
    return generated_molecules




def generate_dual_enhanced_molecules(model, tokenizer, val_df, base_embeddings,
                                     extrapolation_factor=0.2, num_generations=10):
    """
    Generate molecules steered towards higher CO₂ and CH₄ permeability together.

    The ``'CO2'`` and ``'CH4'`` columns of ``val_df`` are z-scored and summed
    with equal weight; that combined score is the target handed to
    ``generate_from_extrapolated_embeddings`` on every iteration. Each decoded
    SMILES is parsed with RDKit to decide validity.

    Args:
        model: Model forwarded to ``generate_from_extrapolated_embeddings``.
        tokenizer: Tokenizer forwarded to ``generate_from_extrapolated_embeddings``.
        val_df: Table providing the ``'CO2'`` and ``'CH4'`` columns.
        base_embeddings: Base embeddings forwarded to the generator.
        extrapolation_factor: Extrapolation strength forwarded to the generator.
        num_generations: Number of generation attempts.

    Returns:
        A list with one dict per attempt, holding ``generation_id`` (1-based),
        ``generated_smiles``, ``canonical_smiles`` (``"INVALID"`` when parsing
        fails), ``is_valid``, ``target_property`` and ``extrapolation_factor``.
    """

    def _zscore(column):
        # Standardise one property column so both live on the same scale.
        values = torch.tensor(val_df[column].values, dtype=torch.float32)
        return (values - values.mean()) / values.std()

    # Equal-weight combination of the two standardised properties.
    dual_target = _zscore('CO2') + _zscore('CH4')

    print(f"Generating {num_generations} molecules with enhanced dual permeability...")
    print(f"Extrapolation factor: {extrapolation_factor}")

    records = []
    for generation_id in range(1, num_generations + 1):
        smiles, _ = generate_from_extrapolated_embeddings(
            model, tokenizer, base_embeddings, dual_target, extrapolation_factor
        )

        # RDKit returns None for SMILES it cannot parse.
        parsed = Chem.MolFromSmiles(smiles)
        valid = parsed is not None
        canonical = MolToSmiles(parsed, canonical=True) if valid else "INVALID"

        records.append({
            'generation_id': generation_id,
            'generated_smiles': smiles,
            'canonical_smiles': canonical,
            'is_valid': valid,
            'target_property': 'DUAL_enhanced',
            'extrapolation_factor': extrapolation_factor
        })

        print(f"Generation {generation_id}: {smiles} (Valid: {valid})")

    return records

# NOTE(review): `factor` and `generate_enhanced_molecules_ch4` are not defined in
# the visible part of this notebook; they must already exist in the kernel
# before this runs — confirm they are defined in an earlier cell.

# CH4 enhanced
ch4_results = generate_enhanced_molecules_ch4(
    model, tokenizer, val_df, base_embeddings,
    extrapolation_factor=factor, num_generations=1
)

# Dual enhanced
dual_results = generate_dual_enhanced_molecules(
    model, tokenizer, val_df, base_embeddings,
    extrapolation_factor=factor, num_generations=1
)

In [None]:
from safetensors import safe_open

# Decoder checkpoint (safetensors file) whose weights are restored into `model`.
checkpoint_path = '/home/jovyan/simson_training_bolgov/regression/decoder_checkpoints/checkpoint-110000/model.safetensors'

# Read every tensor stored in the checkpoint onto the CPU, keyed by its stored name.
with safe_open(checkpoint_path, framework="pt", device="cpu") as checkpoint:
    state_dict = {name: checkpoint.get_tensor(name) for name in checkpoint.keys()}

model.load_state_dict(state_dict)

In [None]:
# Debug readout: the property loss next to the regularization term after it is
# weighted by lambda_reg and multiplied by scale_factor.
print(
    property_loss.detach().item(),
    scale_factor * (lambda_reg * regularization_loss).detach().item(),
)