{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "73c4a5d6-a444-43c0-9812-298113480923", "metadata": {}, "outputs": [], "source": [ "import torch\n", "from tqdm import tqdm\n", "\n", "def generate_from_extrapolated_embeddings(model, tokenizer, base_embeddings,\n", " target_properties, extrapolation_factor=0.2):\n", " \"\"\"\n", " Generate molecules from extrapolated embeddings\n", " \n", " Args:\n", " model: Trained encoder-decoder model\n", " tokenizer: SMILES tokenizer\n", " base_embeddings: Base embeddings to extrapolate from\n", " target_properties: Target property values for extrapolation direction\n", " extrapolation_factor: How much to extrapolate (0.2 = 20% increase)\n", " \"\"\"\n", " model.eval()\n", " device = next(model.parameters()).device\n", " \n", " # Calculate property gradient direction in embedding space\n", " # This is a simplified approach - you might want to use more sophisticated methods\n", " mean_embedding = torch.mean(base_embeddings, dim=0)\n", " \n", " # Find direction of increasing properties\n", " high_prop_mask = target_properties > torch.median(target_properties)\n", " low_prop_mask = target_properties < torch.median(target_properties)\n", " \n", " high_prop_embeddings = base_embeddings[high_prop_mask].mean(dim=0)\n", " low_prop_embeddings = base_embeddings[low_prop_mask].mean(dim=0)\n", " \n", " property_direction = high_prop_embeddings - low_prop_embeddings\n", " property_direction = property_direction / torch.norm(property_direction)\n", " \n", " # Generate extrapolated embeddings\n", " extrapolated_embeddings = mean_embedding + extrapolation_factor * property_direction\n", " extrapolated_embeddings = extrapolated_embeddings.unsqueeze(0).to(device)\n", " \n", " # Generate SMILES from extrapolated embeddings\n", " with torch.no_grad():\n", " generated_ids = model.generate(extrapolated_embeddings)\n", " generated_smiles = tokenizer.decode(generated_ids[0], skip_special_tokens=True)\n", " \n", " return generated_smiles, extrapolated_embeddings\n", "\n", "def generate_slerp(model, tokenizer, base_embeddings,\n", " df, target_col, extrapolation_factor=0.2):\n", " \"\"\"\n", " Generate molecules from extrapolated embeddings\n", " \n", " Args:\n", " model: Trained encoder-decoder model\n", " tokenizer: SMILES tokenizer\n", " base_embeddings: Base embeddings to extrapolate from\n", " target_properties: Target property values for extrapolation direction\n", " extrapolation_factor: How much to extrapolate (0.2 = 20% increase)\n", " \"\"\"\n", " model.eval()\n", " device = next(model.parameters()).device\n", "\n", " co2_properties = torch.tensor(df['CO2'].values, dtype=torch.float32)\n", " ch4_properties = torch.tensor(df['CH4'].values, dtype=torch.float32)\n", " \n", " # Normalize properties to same scale before combining\n", " co2_norm = (co2_properties - co2_properties.mean()) / co2_properties.std()\n", " ch4_norm = (ch4_properties - ch4_properties.mean()) / ch4_properties.std()\n", " \n", " # Combined property (equal weighting)\n", " target_properties = co2_norm + ch4_norm\n", " \n", " extrapolated_embeddings = slerp_extrapolation(base_embeddings, target_properties, factor=extrapolation_factor)\n", " generated_molecules = []\n", " # Generate SMILES from extrapolated embeddings\n", " with torch.no_grad():\n", " generated_ids = model.generate(extrapolated_embeddings.cuda())\n", " generated_smiles = tokenizer.decode(generated_ids[0], skip_special_tokens=True)\n", "\n", " mol = Chem.MolFromSmiles(generated_smiles)\n", " is_valid = mol is not 
None\n", " \n", " if is_valid:\n", " canonical_smiles = MolToSmiles(mol, canonical=True)\n", " else:\n", " canonical_smiles = \"INVALID\"\n", " \n", " generated_molecules.append({\n", " 'generated_smiles': generated_smiles,\n", " 'is_valid': is_valid,\n", " 'target_property': 'slerp',\n", " 'extrapolation_factor': extrapolation_factor\n", " })\n", " \n", " print(f\"Generation: {generated_smiles} (Valid: {is_valid})\")\n", " return generated_molecules\n", "\n", "\n", "\n", "\n", "def generate_dual_enhanced_molecules(model, tokenizer, val_df, base_embeddings, \n", " extrapolation_factor=0.2, num_generations=10):\n", " \"\"\"Generate molecules with enhanced both CO₂ and CH₄ permeability\"\"\"\n", " \n", " # Combine both properties (you can weight them differently if needed)\n", " co2_properties = torch.tensor(val_df['CO2'].values, dtype=torch.float32)\n", " ch4_properties = torch.tensor(val_df['CH4'].values, dtype=torch.float32)\n", " \n", " # Normalize properties to same scale before combining\n", " co2_norm = (co2_properties - co2_properties.mean()) / co2_properties.std()\n", " ch4_norm = (ch4_properties - ch4_properties.mean()) / ch4_properties.std()\n", " \n", " # Combined property (equal weighting)\n", " combined_properties = co2_norm + ch4_norm\n", " \n", " generated_molecules = []\n", " \n", " print(f\"Generating {num_generations} molecules with enhanced dual permeability...\")\n", " print(f\"Extrapolation factor: {extrapolation_factor}\")\n", " \n", " for i in range(num_generations):\n", " generated_smiles, extrapolated_embedding = generate_from_extrapolated_embeddings(\n", " model, tokenizer, base_embeddings, combined_properties, extrapolation_factor\n", " )\n", " \n", " # Validate generated molecule\n", " mol = Chem.MolFromSmiles(generated_smiles)\n", " is_valid = mol is not None\n", " \n", " if is_valid:\n", " canonical_smiles = MolToSmiles(mol, canonical=True)\n", " else:\n", " canonical_smiles = \"INVALID\"\n", " \n", " generated_molecules.append({\n", " 'generation_id': i + 1,\n", " 'generated_smiles': generated_smiles,\n", " 'canonical_smiles': canonical_smiles,\n", " 'is_valid': is_valid,\n", " 'target_property': 'DUAL_enhanced',\n", " 'extrapolation_factor': extrapolation_factor\n", " })\n", " \n", " print(f\"Generation {i+1}: {generated_smiles} (Valid: {is_valid})\")\n", " \n", " return generated_molecules\n", "\n", "ch4_results = generate_enhanced_molecules_ch4(\n", " model, tokenizer, val_df, base_embeddings, \n", " extrapolation_factor=factor, num_generations=1\n", " )\n", " \n", " # Dual enhanced\n", " dual_results = generate_dual_enhanced_molecules(\n", " model, tokenizer, val_df, base_embeddings, \n", " extrapolation_factor=factor, num_generations=1\n", " )" ] }, { "cell_type": "code", "execution_count": null, "id": "b088097d-26b1-45c8-848f-b02784092e75", "metadata": {}, "outputs": [], "source": [ "from safetensors import safe_open\n", "\n", "checkpoint_path = '/home/jovyan/simson_training_bolgov/regression/decoder_checkpoints/checkpoint-110000/model.safetensors'\n", "\n", "state_dict = {}\n", "with safe_open(checkpoint_path, framework=\"pt\", device=\"cpu\") as f:\n", " for k in f.keys():\n", " state_dict[k] = f.get_tensor(k)\n", "\n", "model.load_state_dict(state_dict)" ] }, { "cell_type": "code", "execution_count": null, "id": "b9abe169-0a17-4728-be0c-4befb2439102", "metadata": {}, "outputs": [], "source": [ "print(property_loss.detach().item(), scale_factor * (lambda_reg * regularization_loss).detach().item())" ] } ], "metadata": { "kernelspec": { "display_name": 
"Python [conda env:.mlspace-bolgov_simson_training]", "language": "python", "name": "conda-env-.mlspace-bolgov_simson_training-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.11" } }, "nbformat": 4, "nbformat_minor": 5 }