import os | |
from typing import Optional, Dict, Any, Callable, Tuple, Union | |
from functools import partial | |
import subprocess | |
import copy | |
import datetime | |
import logging | |
import math | |
import json | |
import torch | |
import numpy as np | |
import huggingface_hub as hf | |
from transformers import ( | |
Trainer, | |
TrainingArguments, | |
Seq2SeqTrainer, | |
Seq2SeqTrainingArguments, | |
DataCollatorForSeq2Seq, | |
DataCollatorForLanguageModeling, | |
AutoTokenizer, | |
GenerationConfig, | |
TrainerCallback, | |
set_seed, | |
)
from transformers.trainer_utils import BestRun
from accelerate.utils import write_basic_config | |
from accelerate import Accelerator | |
import optuna | |
from optuna.samplers import QMCSampler | |
from optuna.pruners import ( | |
BasePruner, | |
HyperbandPruner, | |
ThresholdPruner, | |
PatientPruner, | |
MedianPruner, | |
) | |
from optuna.study._study_direction import StudyDirection | |
from .data_utils import load_tokenized_dataset | |
from .evaluation import decode_and_get_metrics | |
from .hf_utils import ( | |
create_hf_repository, | |
delete_hf_repository, | |
repo_exists, | |
upload_single_file, | |
) | |
from .model_utils import get_encoder_decoder_model, get_causal_model | |
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Use GPU with index 0 | |
# logging.basicConfig(level=logging.DEBUG) | |
class PrintStepCallback(TrainerCallback): | |
def on_init_end(self, args, state, control, **kwargs): | |
print(f"[{datetime.datetime.now()}] Initialization complete. Training is starting.") | |
def on_step_begin(self, args, state, control, **kwargs): | |
if state.global_step % args.logging_steps == 0: | |
print(f"[{datetime.datetime.now()}] Global step: {state.global_step:,}") | |
class ScoreMetric: | |
def __init__(self): | |
self.batch_scores = [] | |
def update(self, scores): | |
self.batch_scores.append(scores) | |
def compute(self): | |
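        # NOTE: Metrics missing from a given batch are filled with NaN and then
        # ignored by np.nanmean, so partial batches do not skew the aggregate.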
all_labels = set() | |
for scores in self.batch_scores: | |
all_labels.update(scores.keys()) | |
aggregate_scores = {} | |
for k in all_labels: | |
scores = [s.get(k, np.nan) for s in self.batch_scores] | |
print(f"{k}: {np.nanmean(scores):.4f}") | |
aggregate_scores[k] = np.nanmean(scores) | |
self.batch_scores = [] | |
return aggregate_scores | |
score_metric = ScoreMetric() | |
hp_score_metric = ScoreMetric() | |
class WrappedEarlyStoppingPruner(BasePruner): | |
""" | |
Pruner that wraps another pruner and checks if the trial should be pruned. | |
    It first evaluates the wrapped pruner and, if that pruner suggests pruning,
    prunes immediately. Otherwise, it applies a patience-based check with a
    tolerance (min_delta) and prunes once the reported values stop improving.
Args: | |
wrapped_pruner: | |
Wrapped pruner to check first. Pruning is only applied if this pruner recommends it. | |
patience: | |
Number of steps to wait for an improvement before pruning. | |
min_delta: | |
Minimum improvement required to reset patience. | |
n_warmup_steps: | |
Number of initial steps to skip the patience check. | |
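    Example (illustrative sketch, not part of the original code):
        pruner = WrappedEarlyStoppingPruner(
            MedianPruner(n_startup_trials=0),
            patience=3,
            min_delta=0.01,
            n_warmup_steps=100,
        )
        study = optuna.create_study(direction="maximize", pruner=pruner)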
""" | |
def __init__( | |
self, | |
wrapped_pruner: BasePruner, | |
patience: int, | |
min_delta: float = 0.0, | |
n_warmup_steps: int = 0, | |
) -> None: | |
if wrapped_pruner is None or not isinstance(wrapped_pruner, BasePruner): | |
raise ValueError(f"wrapped_pruner must be an instance of BasePruner but got {wrapped_pruner}.") | |
if patience < 0: | |
raise ValueError(f"patience cannot be negative but got {patience}.") | |
if min_delta < 0: | |
raise ValueError(f"min_delta cannot be negative but got {min_delta}.") | |
if n_warmup_steps < 0: | |
raise ValueError(f"n_warmup_steps cannot be negative but got {n_warmup_steps}.") | |
self._wrapped_pruner = wrapped_pruner | |
self._patience = patience | |
self._min_delta = min_delta | |
self._n_warmup_steps = n_warmup_steps | |
def prune(self, study: "optuna.study.Study", trial: "optuna.trial.FrozenTrial") -> bool: | |
step = trial.last_step | |
if step is None: | |
return False | |
intermediate_values = trial.intermediate_values | |
steps = np.asarray(list(intermediate_values.keys())) | |
# If there are insufficient steps or we are still in the warmup phase, do not prune. | |
if steps.size <= self._patience + 1 or step < self._n_warmup_steps: | |
return False | |
# First, check the wrapped pruner. If it suggests pruning, prune. | |
if self._wrapped_pruner.prune(study, trial): | |
return True | |
steps.sort() | |
        # Scores reported before the patience window (all but the last `patience` + 1 reports)
steps_before_patience = steps[: -self._patience - 1] | |
scores_before_patience = np.asarray( | |
list(intermediate_values[step] for step in steps_before_patience) | |
) | |
        # ...and the scores inside the patience window (the last `patience` + 1 reports)
steps_after_patience = steps[-self._patience - 1 :] | |
scores_after_patience = np.asarray( | |
list(intermediate_values[step] for step in steps_after_patience) | |
) | |
direction = study.direction | |
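        # NOTE: This mirrors the improvement test used by optuna.pruners.PatientPruner:
        # prune only if the best value inside the patience window is worse than the
        # best value seen before it by more than min_delta.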
if direction == StudyDirection.MINIMIZE: | |
should_prune = np.nanmin(scores_before_patience) + self._min_delta < np.nanmin( | |
scores_after_patience | |
) | |
else: | |
should_prune = np.nanmax(scores_before_patience) - self._min_delta > np.nanmax( | |
scores_after_patience | |
) | |
return should_prune | |
def get_lr_scheduler_kwargs(lr_scheduler_type: str) -> Dict[str, Any]: | |
""" Returns the default learning rate scheduler kwargs for a given type. | |
Reference: https://huggingface.co/docs/timm/en/reference/schedulers | |
Args: | |
lr_scheduler_type (str): The type of the learning rate scheduler. | |
Returns: | |
Dict[str, Any]: The default learning rate scheduler kwargs. | |
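    Example:
        >>> get_lr_scheduler_kwargs("polynomial")
        {'power': 1.0}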
""" | |
if lr_scheduler_type == "cosine": | |
return {} | |
elif lr_scheduler_type == "cosine_with_restarts": | |
return {"num_cycles": 3} | |
elif lr_scheduler_type == "cosine_with_min_lr": | |
return {} | |
elif lr_scheduler_type == "polynomial": | |
return {"power": 1.0} | |
elif lr_scheduler_type == "reduce_lr_on_plateau": | |
return {"min_lr": 1e-6} | |
else: | |
raise ValueError(f"Unknown learning rate scheduler type: '{lr_scheduler_type}'") | |
def get_best_hyperparameters( | |
model_init: Callable, | |
tokenizer: AutoTokenizer, | |
data_collator: Union[DataCollatorForSeq2Seq, DataCollatorForLanguageModeling], | |
compute_metrics: Callable, | |
dataset_tokenized: Dict[str, Any], | |
training_args: Dict[str, Any], | |
num_optuna_trials: int, | |
lr_scheduler_type: Optional[str] = None, | |
causal_language_modeling: bool = False, | |
all_fragments_as_labels: bool = True, | |
linkers_only_as_labels: bool = False, | |
) -> Tuple[BestRun, Dict[str, Any]]:
    """Runs an Optuna hyperparameter search to find the best hyperparameters.

    Args:
        model_init (Callable): The model initialization function.
        tokenizer (AutoTokenizer): The tokenizer.
        data_collator (Union[DataCollatorForSeq2Seq, DataCollatorForLanguageModeling]): The data collator.
        compute_metrics (Callable): The compute metrics function.
        dataset_tokenized (Dict[str, Any]): The tokenized dataset.
        training_args (Dict[str, Any]): The training arguments.
        num_optuna_trials (int): The number of Optuna trials.
        lr_scheduler_type (Optional[str], optional): If set, only the kwargs of this scheduler type are tuned; otherwise the scheduler type itself is suggested. Defaults to None.
        causal_language_modeling (bool, optional): If True, use a causal-LM Trainer and optimize the negative perplexity derived from eval_loss. Defaults to False.
        all_fragments_as_labels (bool, optional): If True, the objective is the eval_all_ligands_equal metric. Defaults to True.
        linkers_only_as_labels (bool, optional): If True (and all_fragments_as_labels is False), the objective is the eval_linker_equal metric. Defaults to False.

    Returns:
        Tuple[BestRun, Dict[str, Any]]: The best run found by the search and the training arguments used during the search.
    """
def optuna_hp_space(trial): | |
# NOTE: Tuning generation config is not implemented yet, please refer to this issue: https://github.com/huggingface/transformers/issues/33755 | |
# Suggest hparams "shared" across all scheduler types | |
# learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-3, log=True) | |
# warmup_ratio = trial.suggest_float("warmup_ratio", 0.01, 0.1, step=0.01) | |
# Restrict learning rate closer to best-performing values | |
learning_rate = trial.suggest_float("learning_rate", 5e-6, 2e-4, log=True) # Previously 1e-6 to 1e-3 | |
# Slightly adjust warmup ratio to avoid extreme values | |
warmup_ratio = trial.suggest_float("warmup_ratio", 0.02, 0.06, step=0.01) # Previously 0.01 to 0.1 | |
# NOTE: We might want to use QMCSampler instead of TPESampler, which | |
# doesn't support categorical parameters. Categories can be encoded as | |
# integers and then decoded back to the original categories. | |
# NOTE: According to the GitHub code, the number of training and warmup | |
# steps for the scheduler types are automatically set, we don't need to | |
# pass them in the lr_scheduler_kwargs. | |
if lr_scheduler_type is None: | |
lr_scheduler_types = ["cosine", "cosine_with_restarts", "reduce_lr_on_plateau"] # "cosine_with_min_lr", "polynomial" | |
suggested_lr_sched = trial.suggest_int("lr_scheduler_type", 0, len(lr_scheduler_types) - 1) | |
suggested_lr_sched = lr_scheduler_types[suggested_lr_sched] | |
            lr_scheduler_kwargs = get_lr_scheduler_kwargs(suggested_lr_sched)  # NOTE: lr_scheduler_type is None in this branch
elif lr_scheduler_type == "cosine": | |
lr_scheduler_kwargs = { | |
"num_cycles": trial.suggest_float("num_cycles", 0.5, 10, step=0.5), | |
} | |
elif lr_scheduler_type == "cosine_with_restarts": | |
lr_scheduler_kwargs = { | |
"num_cycles": trial.suggest_int("num_cycles", 1, 10, step=1), | |
} | |
elif lr_scheduler_type == "reduce_lr_on_plateau": | |
lr_scheduler_kwargs = { | |
"min_lr": trial.suggest_float("min_lr", 1e-10, 1e-8, log=True), # Previously 1e-12 to 1e-9 | |
"factor": trial.suggest_float("factor", 0.8, 0.98, step=0.01), # Previously 0.1 to 0.99 | |
} | |
return { | |
"lr_scheduler_kwargs": lr_scheduler_kwargs, | |
"lr_scheduler_type": lr_scheduler_type if lr_scheduler_type is not None else suggested_lr_sched, | |
"learning_rate": learning_rate, | |
"warmup_ratio": warmup_ratio, | |
} | |
if causal_language_modeling: | |
def compute_objective(metrics: Dict[str, float]): | |
# NOTE: We want to minimize the model perplexity, which is the | |
# exponential of the negative log-likelihood loss. Optuna is setup | |
# to maximize the objective, so we return the negative perplexity. | |
return -math.exp(metrics["eval_loss"]) | |
else: | |
if all_fragments_as_labels: | |
def compute_objective(metrics: Dict[str, float]): | |
# NOTE: Having a higher eval_reassembly score should also correspond | |
# to a low eval loss, so we just focus on the reassembly score. | |
return metrics["eval_all_ligands_equal"] | |
else: | |
if linkers_only_as_labels: | |
def compute_objective(metrics: Dict[str, float]): | |
return metrics["eval_linker_equal"] | |
else: | |
def compute_objective(metrics: Dict[str, float]): | |
return metrics["eval_e3_equal"] + metrics["eval_poi_equal"] | |
def hp_name(trial: Any) -> str: | |
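        # Produces names like "trial-number=3-learning_rate=5.0e-05-warmup_ratio=0.040"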
trial_name = f"trial-number={trial.number}" | |
for hparam, value in trial.params.items(): | |
# Check if the value is a float and round it to 3 decimals | |
if hparam == "learning_rate": | |
value = f"{value:.1e}" | |
elif isinstance(value, float): | |
value = f"{value:.3f}" | |
trial_name += f"-{hparam}={value}" | |
return trial_name | |
# Override the training steps | |
hp_training_args = copy.deepcopy(training_args) | |
hp_training_args["num_train_epochs"] = -1 | |
hp_training_args["max_steps"] = 10_000 | |
hp_training_args["eval_steps"] = 2500 | |
hp_training_args["eval_delay"] = 5000 # TODO: Double check if this is needed | |
hp_training_args["logging_steps"] = 500 | |
hp_training_args["save_steps"] = 5000 | |
if not causal_language_modeling: | |
# Use greedy decoding for the evaluation during HP search | |
hp_training_args["generation_config"] = GenerationConfig( | |
max_length=512, | |
max_new_tokens=512, | |
do_sample=False, | |
num_beams=1, | |
) | |
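        # NOTE: Greedy search (num_beams=1, do_sample=False) keeps trial evaluation
        # fast and deterministic, so trials remain comparable.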
print("Hyperparameter search training arguments:") | |
for k, v in hp_training_args.items(): | |
if 'token' in k: | |
continue | |
print(f" - {k}: {v}") | |
if causal_language_modeling: | |
TrainerClass = Trainer | |
TrainingArgumentsClass = TrainingArguments | |
else: | |
TrainerClass = Seq2SeqTrainer | |
TrainingArgumentsClass = Seq2SeqTrainingArguments | |
# Setup a "fake" Trainer for the hyperparameter search | |
trainer = TrainerClass( | |
model_init=model_init, | |
tokenizer=tokenizer, | |
data_collator=data_collator, | |
args=TrainingArgumentsClass(**hp_training_args), | |
compute_metrics=compute_metrics, | |
train_dataset=dataset_tokenized["train"], | |
eval_dataset=dataset_tokenized["validation"], | |
callbacks=[PrintStepCallback], | |
) | |
# Setup the Optuna pruner and sampler | |
max_warmup_ratio = 0.1 | |
pruner = WrappedEarlyStoppingPruner( | |
MedianPruner( | |
n_startup_trials=0, | |
interval_steps=1, | |
n_warmup_steps=int(max_warmup_ratio * hp_training_args["max_steps"]), | |
), | |
        patience=5,  # Wait 5 evaluation reports (eval_steps=2500, i.e., ~12,500 steps) without improvement
min_delta=0.01, | |
n_warmup_steps=int(max_warmup_ratio * hp_training_args["max_steps"]), | |
) | |
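    # NOTE: With n_startup_trials=0 the MedianPruner is active from the first trial,
    # while the warmup above postpones any pruning until ~10% of max_steps (1,000 steps).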
sampler = QMCSampler(scramble=True, seed=42) | |
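    # NOTE: QMCSampler draws low-discrepancy (Sobol) points, covering the search space
    # more evenly than independent random draws; the seed makes the scrambling reproducible.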
# NOTE: The Trainer will return a BestRun object, not the Optuna trial | |
best_run = trainer.hyperparameter_search( | |
direction="maximize", | |
backend="optuna", | |
hp_space=optuna_hp_space, | |
hp_name=hp_name, | |
n_trials=num_optuna_trials, | |
compute_objective=compute_objective, # Default: Will sum over all metrics but loss | |
sampler=sampler, | |
pruner=pruner, | |
) | |
    # Report the best trial and return it along with the HP-search training arguments
try: | |
print("-" * 80) | |
print(f"Best trial objective: {best_run.objective:.4f}. Summary: {best_run.run_summary}") | |
except Exception as e: | |
print(e) | |
print("WARNING. Best trial objective could not be printed.") | |
return best_run, hp_training_args | |
def train_model( | |
model_id: str, | |
ds_name: str, | |
ds_config: str = 'default', | |
learning_rate: float = 5e-5, | |
max_steps: int = -1, | |
num_train_epochs: int = 40, | |
batch_size: int = 128, | |
batch_size_tokenizer: int = 512, | |
gradient_accumulation_steps: int = 4, | |
hub_token: Optional[str] = None, | |
organization: Optional[str] = None, | |
output_dir: str = "./models/", | |
tokenizer: Union[AutoTokenizer, str] = "seyonec/ChemBERTa-zinc-base-v1", | |
pretrained_encoder: str = "seyonec/ChemBERTa-zinc-base-v1", | |
pretrained_decoder: str = "seyonec/ChemBERTa-zinc-base-v1", | |
encoder_max_length: int = 512, | |
decoder_max_length: int = 512, | |
tie_encoder_decoder: bool = False, | |
delete_repo_if_exists: bool = False, | |
delete_local_repo_if_exists: bool = False, | |
training_args: Optional[Dict[str, Any]] = None, | |
resume_from_checkpoint: Optional[str] = None, | |
num_optuna_trials: int = 0, | |
num_proc_map: int = 1, | |
per_device_train_batch_size: Optional[int] = None, | |
per_device_eval_batch_size: Optional[int] = None, | |
lr_scheduler_type: Optional[str] = None, | |
cache_dir: Optional[str] = None, | |
randomize_smiles: bool = False, | |
randomize_smiles_prob: float = 0.0, | |
all_fragments_as_labels: bool = True, | |
linkers_only_as_labels: bool = False, | |
warmup_ratio: Optional[float] = None, | |
num_cycles: Optional[int] = None, | |
warmup_steps: Optional[int] = None, | |
causal_language_modeling: bool = False, | |
train_size_ratio: float = 1.0, | |
training_args_bin: Optional[str] = None, | |
): | |
"""Trains a model on a given dataset. | |
Args: | |
model_id (str): The name of the model to be trained. | |
ds_name (str): The name of the dataset to be used for training. | |
ds_config (str, optional): The name of the dataset configuration to be used for training. Defaults to 'default'. | |
learning_rate (float, optional): The learning rate. Defaults to 5e-5. | |
max_steps (int, optional): The maximum number of training steps. Defaults to -1. | |
num_train_epochs (int, optional): The number of training epochs. Defaults to 40. | |
batch_size (int, optional): The batch size. Defaults to 128. | |
batch_size_tokenizer (int, optional): The batch size for the tokenizer. Defaults to 512. | |
gradient_accumulation_steps (int, optional): The number of gradient accumulation steps. Defaults to 4. | |
hub_token (Optional[str], optional): The Hugging Face token. Defaults to None. | |
organization (Optional[str], optional): The Hugging Face organization. Defaults to None. | |
output_dir (str, optional): The output directory. Defaults to "./models/". | |
tokenizer (AutoTokenizer | str, optional): The tokenizer. Defaults to "seyonec/ChemBERTa-zinc-base-v1". | |
pretrained_encoder (str, optional): The name of the pretrained encoder. Defaults to "seyonec/ChemBERTa-zinc-base-v1". | |
pretrained_decoder (str, optional): The name of the pretrained decoder. Defaults to "seyonec/ChemBERTa-zinc-base-v1". | |
        encoder_max_length (int, optional): The maximum input length of the encoder. Defaults to 512.
        decoder_max_length (int, optional): The maximum output length of the decoder. Defaults to 512.
        delete_repo_if_exists (bool, optional): Whether to delete the Hugging Face Hub repository first. Defaults to False.
        training_args (Optional[Dict[str, Any]], optional): The training arguments used to build the (Seq2Seq)TrainingArguments. Defaults to None.
resume_from_checkpoint (Optional[str], optional): The checkpoint to resume training from. Defaults to None. | |
num_optuna_trials (int, optional): The number of Optuna trials. Defaults to 0, i.e., no Optuna hyperparameter search. | |
""" | |
set_seed(42) | |
# if torch.cuda.is_available(): | |
# write_basic_config(mixed_precision='fp16') | |
accelerator = Accelerator() | |
accelerator.print(f"Accelerator state from the current environment:\n{accelerator.state}") | |
# Check if resume_from_checkpoint exists and it's a file | |
if resume_from_checkpoint is not None: | |
# Check if the checkpoint exists: it can be either a file or a directory | |
if not os.path.exists(resume_from_checkpoint): | |
raise ValueError(f"Checkpoint file '{resume_from_checkpoint}' does not exist.") | |
if hub_token is not None: | |
hf.login(token=hub_token) | |
# Setup output directory and Hugging Face repository | |
output_dir += f"/{model_id}" | |
if organization is not None: | |
hub_model_id = f"{organization}/{model_id}" | |
if delete_local_repo_if_exists and os.path.exists(output_dir): | |
subprocess.run(["rm", "-rf", output_dir]) | |
if not os.path.exists(output_dir): | |
print(f"Local repository '{output_dir}' deleted.") | |
else: | |
print(f"Local repository '{output_dir}' could not be deleted.") | |
return | |
if delete_repo_if_exists and repo_exists(hub_model_id, token=hub_token): | |
delete_hf_repository(repo_id=hub_model_id, token=hub_token, missing_ok=True) | |
print(f"Repository '{hub_model_id}' deleted.") | |
repo_url = create_hf_repository( | |
repo_id=hub_model_id, | |
repo_type="model", | |
exist_ok=True, | |
private=True, | |
token=hub_token, | |
) | |
print(f"Repository '{hub_model_id}' created at URL: {repo_url}") | |
else: | |
hub_model_id = None | |
print(f"Hub model ID: {hub_model_id}") | |
if isinstance(tokenizer, str): | |
tokenizer = AutoTokenizer.from_pretrained(tokenizer) | |
elif tokenizer is None: | |
tokenizer = AutoTokenizer.from_pretrained(pretrained_encoder) | |
# Load the tokenized dataset | |
print("Loading tokenized dataset.") | |
dataset_tokenized = load_tokenized_dataset( | |
ds_name, | |
ds_config, | |
tokenizer, | |
batch_size_tokenizer, | |
encoder_max_length, | |
decoder_max_length, | |
token=hub_token, | |
num_proc_map=num_proc_map, | |
cache_dir=cache_dir, | |
randomize_smiles=randomize_smiles, | |
randomize_smiles_prob=randomize_smiles_prob, | |
all_fragments_as_labels=all_fragments_as_labels, | |
linkers_only_as_labels=linkers_only_as_labels, | |
causal_language_modeling=causal_language_modeling, | |
train_size_ratio=train_size_ratio, | |
) | |
print("Dataset loaded.") | |
if causal_language_modeling: | |
# Setup the model for `model_init` in the Trainer | |
model_lambda = lambda: get_causal_model( | |
pretrained_model=pretrained_decoder, | |
) | |
# Setup the data collator, which will efficiently pad the inputs and targets | |
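        # NOTE: With mlm=False the collator copies input_ids into labels (padding
        # replaced by -100), which is what causal language modeling expects.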
data_collator = DataCollatorForLanguageModeling( | |
tokenizer, | |
mlm=False, | |
pad_to_multiple_of=8, # Default: None, Original: 8 | |
) | |
else: | |
# Precompute a "length" column for the dataset using the map function | |
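        # (group_by_length in the training arguments uses this column to batch
        # similarly sized sequences together and minimize padding.)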
def add_length(x): | |
x["length"] = len(x["input_ids"]) | |
return x | |
dataset_tokenized = dataset_tokenized.map( | |
add_length, | |
num_proc=num_proc_map, | |
) | |
# Setup the model for `model_init` in the Trainer | |
model_lambda = lambda: get_encoder_decoder_model( | |
pretrained_encoder=pretrained_encoder, | |
pretrained_decoder=pretrained_decoder, | |
max_length=encoder_max_length, | |
tie_encoder_decoder=tie_encoder_decoder, | |
) | |
# Setup the data collator, which will efficiently pad the inputs and targets | |
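        # NOTE: The collator only needs the model to build decoder_input_ids from the
        # labels (via prepare_decoder_input_ids_from_labels), hence the throwaway instance.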
data_collator = DataCollatorForSeq2Seq( | |
tokenizer, | |
model=model_lambda(), | |
pad_to_multiple_of=32, # Default: None, Original: 8 | |
) | |
# Setup the training arguments | |
if per_device_train_batch_size is None: | |
per_device_train_batch_size = batch_size // gradient_accumulation_steps | |
if per_device_eval_batch_size is None: | |
per_device_eval_batch_size = batch_size // gradient_accumulation_steps | |
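    # Effective batch size per device = per_device_train_batch_size * gradient_accumulation_steps
    # (with the defaults: 32 * 4 = 128, matching `batch_size`).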
if training_args is None: | |
training_args = { | |
"output_dir": output_dir, | |
# Optimizer-related configs | |
"learning_rate": learning_rate, | |
"optim": "adamw_torch", | |
"lr_scheduler_type": "cosine" if lr_scheduler_type is None else lr_scheduler_type, | |
"lr_scheduler_kwargs": get_lr_scheduler_kwargs(lr_scheduler_type), | |
# "warmup_steps": int(0.08 * 10_000), # NOTE: ChemFormer: 8000 | |
# "warmup_ratio": warmup_ratio, | |
"adam_beta1": 0.9, # NOTE: ChemFormer: 0.9 | |
"adam_beta2": 0.999, # NOTE: ChemFormer: 0.999 | |
"adam_epsilon": 1e-8, # Default: 1e-8 | |
# Batch size, device, and performance optimizations configs | |
"batch_eval_metrics": False, # Default: False | |
"group_by_length": True, | |
"per_device_train_batch_size": per_device_train_batch_size, | |
"per_device_eval_batch_size": per_device_eval_batch_size, | |
"gradient_accumulation_steps": gradient_accumulation_steps, | |
"auto_find_batch_size": True, | |
"fp16": True if torch.cuda.is_available() else False, | |
"fp16_full_eval" : True, # Enable full BF16 evaluation for efficiency | |
"half_precision_backend" : "auto", # Let Hugging Face decide the best backend. Default: "auto" | |
"use_cpu": False, # Default: False | |
"dataloader_num_workers": 8, # Default: 0 (main process only) | |
"dataloader_prefetch_factor": None, # Default: None | |
# Evaluation and checkpointing configs | |
"max_steps": max_steps, | |
"num_train_epochs": num_train_epochs, | |
"save_steps": 20_000, # NOTE: 200 | |
"save_strategy": "steps", | |
"eval_steps": 20_000, # NOTE: 500 | |
"eval_delay": max(int(max(max_steps, num_train_epochs) * 0.7), 0), # Default: None | |
"eval_strategy": "steps", # NOTE: "evaluation_strategy" is deprecated. | |
"save_total_limit": 2, # This will save both the best and the last trainer checkpoint | |
"load_best_model_at_end": True, | |
"metric_for_best_model": "all_ligands_equal", | |
"include_inputs_for_metrics": True, | |
"eval_on_start": False, # Default: False | |
# Logging configs | |
"log_level": "debug", | |
"logging_steps": 5000, | |
"disable_tqdm": True, | |
"report_to": ["tensorboard"], | |
"save_only_model": False, # Default: False | |
# Hub information configs | |
"push_to_hub": hub_model_id is not None, # NOTE: Also manually done further down | |
"push_to_hub_model_id": model_id, | |
"push_to_hub_organization": organization, | |
"hub_model_id": hub_model_id, | |
"hub_token": hub_token, | |
"hub_strategy": "checkpoint", # NOTE: Allows to resume training from last checkpoint | |
"hub_private_repo": True, | |
# Other configs | |
"seed": 42, | |
"data_seed": 42, | |
} | |
if 'num_cycles' in training_args["lr_scheduler_kwargs"] and num_cycles is not None: | |
training_args["lr_scheduler_kwargs"]["num_cycles"] = num_cycles | |
if warmup_ratio is not None: | |
training_args["warmup_ratio"] = warmup_ratio | |
if warmup_steps is not None: | |
training_args["warmup_steps"] = warmup_steps | |
# Add Generation configs | |
if causal_language_modeling: | |
training_args["metric_for_best_model"] = "eval_loss" | |
else: | |
generation_config = GenerationConfig( | |
max_length=512, | |
max_new_tokens=512, | |
do_sample=True, | |
num_beams=5, | |
temperature=1.0, | |
) | |
training_args["generation_config"] = generation_config | |
training_args["predict_with_generate"] = True | |
training_args["generation_config"] = generation_config | |
training_args["generation_max_length"] = 512 | |
print("Training arguments:") | |
for k, v in training_args.items(): | |
if 'token' in k: | |
continue | |
print(f" - {k}: {v}") | |
# Modify the training arguments with Optuna hyperparameter search | |
if num_optuna_trials > 0: | |
# Setup the compute_metrics function for the hyperparameter search | |
hp_compute_metrics = partial( | |
decode_and_get_metrics, | |
tokenizer=tokenizer, | |
compute_rdkit_metrics=False, | |
compute_graph_metrics=False, | |
num_proc=num_proc_map, | |
causal_language_modeling=causal_language_modeling, | |
) | |
# Run the HP search (and update the training_args accordingly) | |
best_run, hp_training_args = get_best_hyperparameters( | |
model_init=model_lambda, | |
tokenizer=tokenizer, | |
data_collator=data_collator, | |
compute_metrics=hp_compute_metrics, | |
dataset_tokenized=dataset_tokenized, | |
training_args=copy.deepcopy(training_args), | |
lr_scheduler_type=lr_scheduler_type, | |
num_optuna_trials=num_optuna_trials, | |
causal_language_modeling=causal_language_modeling, | |
all_fragments_as_labels=all_fragments_as_labels, | |
linkers_only_as_labels=linkers_only_as_labels, | |
) | |
best_objective = best_run.objective | |
best_trial_number = best_run.run_id | |
best_hparams = best_run.hyperparameters | |
        # Save the best hyperparameters to the output directory
        os.makedirs(output_dir, exist_ok=True)  # The Trainer may not have created it yet at this point
        with open(f"{output_dir}/best_hyperparameters.md", "w") as f:
f.write(f"Number of Optuna trials: {num_optuna_trials}\n\n") | |
f.write(f"Best trial objective: {best_objective:.4f} (best trial number: {best_trial_number})\n\n") | |
f.write("Best hyperparameters:\n") | |
for hparam, value in best_hparams.items(): | |
f.write(f"- {hparam}: {value}\n") | |
f.write("\n") | |
f.write("Training arguments:\n") | |
for hparam, value in hp_training_args.items(): | |
if "token" in hparam: | |
continue | |
elif isinstance(value, str): | |
if 'hf_' in value: | |
continue | |
f.write(f"- {hparam}: {value}\n") | |
# Open the file and remove any line that might contain the token | |
with open(f"{output_dir}/best_hyperparameters.md", "r") as f: | |
lines = f.readlines() | |
with open(f"{output_dir}/best_hyperparameters.md", "w") as f: | |
for line in lines: | |
if "hf_" in line: | |
continue | |
f.write(line) | |
print(f"Best hyperparameters saved to '{output_dir}/best_hyperparameters.md'.") | |
if hub_model_id is not None: | |
upload_single_file( | |
path_or_fileobj=f"{output_dir}/best_hyperparameters.md", | |
path_in_repo="best_hyperparameters.md", | |
repo_id=hub_model_id, | |
token=hub_token, | |
) | |
# Save the best_hparams to a JSON file | |
with open(f"{output_dir}/best_hyperparameters.json", "w") as f: | |
json.dump(best_hparams, f, indent=4) | |
print(f"Best hyperparameters saved to '{output_dir}/best_hyperparameters.json'.") | |
if hub_model_id is not None: | |
upload_single_file( | |
path_or_fileobj=f"{output_dir}/best_hyperparameters.json", | |
path_in_repo="best_hyperparameters.json", | |
repo_id=hub_model_id, | |
token=hub_token, | |
) | |
# Update the training arguments with the best hyperparameters | |
hp_specific_args = [ | |
"num_train_epochs", | |
"max_steps", | |
"eval_steps", | |
"eval_delay", | |
"logging_steps", | |
"save_steps", | |
"generation_config", | |
] | |
for k, v in hp_training_args.items(): | |
            # Skip the arguments that were set/modified specifically for the HP search
if k in hp_specific_args: | |
continue | |
training_args[k] = v | |
# Update the num_cycles according to the original max_steps | |
lr_scheduler_kwargs = hp_training_args["lr_scheduler_kwargs"] | |
if "num_cycles" in lr_scheduler_kwargs: | |
hp_num_cycles = lr_scheduler_kwargs["num_cycles"] | |
hp_max_steps = hp_training_args["max_steps"] | |
            # Adjust/scale num_cycles according to the final number of training steps
            if hp_max_steps > 0 and max_steps > 0:
                hp_cycle_ratio = hp_num_cycles / hp_max_steps
                num_cycles = int(hp_cycle_ratio * max_steps)
                training_args["lr_scheduler_kwargs"]["num_cycles"] = num_cycles
                print(f"Adjusted number of cycles: {num_cycles}")
# Adjust the warmup steps according to the original max_steps | |
if "warmup_ratio" in hp_training_args: | |
hp_warmup_ratio = hp_training_args["warmup_ratio"] | |
hp_max_steps = hp_training_args["max_steps"] | |
            warmup_steps = int(hp_warmup_ratio * hp_max_steps)
            training_args["warmup_steps"] = warmup_steps  # NOTE: warmup_steps takes precedence over warmup_ratio
            if max_steps > 0:
                training_args["warmup_ratio"] = warmup_steps / max_steps
print("Training arguments updated with the best hyperparameters:") | |
for k, v in training_args.items(): | |
if 'token' in k: | |
continue | |
print(f" - {k}: {v}") | |
print("-" * 80) | |
print("Starting training with the best hyperparameters.") | |
print("-" * 80) | |
# rouge = evaluate.load("rouge") # , cache_dir="/mimer/NOBACKUP/groups/naiss2023-6-290/stefano/.cache/huggingface/evaluate/") | |
# fpgen = Chem.rdFingerprintGenerator.GetMorganGenerator( | |
# radius=11, | |
# fpSize=1024, | |
# ) | |
rouge = None | |
fpgen = None | |
compute_metrics = partial( | |
decode_and_get_metrics, | |
tokenizer=tokenizer, | |
rouge=rouge, | |
fpgen=fpgen, | |
compute_rdkit_metrics=False, | |
compute_graph_metrics=True, | |
        num_proc=max(1, num_proc_map - 2),  # NOTE: Use two fewer processes for the metrics, since they include timeout logic
causal_language_modeling=causal_language_modeling, | |
) | |
if training_args_bin is not None: | |
print(f"Loading training arguments from: {training_args_bin}.") | |
# Load training arguments from a binary file and update model-specific arguments | |
args = torch.load(training_args_bin) | |
args.output_dir = output_dir | |
args.overwrite_output_dir = True if delete_local_repo_if_exists else False | |
args.push_to_hub_model_id = model_id | |
args.push_to_hub_organization = organization | |
args.hub_model_id = hub_model_id | |
args.hub_token = hub_token | |
# Print all the training arguments | |
print("Training arguments loaded:") | |
for k, v in args.__dict__.items(): | |
if 'token' in k: | |
continue | |
print(f" - {k}: {v}") | |
else: | |
if causal_language_modeling: | |
args = TrainingArguments(**training_args) | |
else: | |
args = Seq2SeqTrainingArguments(**training_args) | |
if causal_language_modeling: | |
TrainerClass = Trainer | |
else: | |
TrainerClass = Seq2SeqTrainer | |
# Setup the Trainer and start training (no Optuna hyperparameter search) | |
trainer = TrainerClass( | |
model_init=model_lambda, | |
tokenizer=tokenizer, | |
data_collator=data_collator, | |
args=args, | |
compute_metrics=compute_metrics, | |
train_dataset=dataset_tokenized["train"], | |
eval_dataset=dataset_tokenized["test"], | |
) | |
if resume_from_checkpoint is not None: | |
trainer.train( | |
resume_from_checkpoint=resume_from_checkpoint, | |
) | |
else: | |
trainer.train() | |
print("-" * 80) | |
print("Training completed.") | |
print("-" * 80) | |
if causal_language_modeling: | |
tasks = ["Text Generation"] | |
else: | |
tasks = ["Text2Text Generation", "question-answering"] | |
tokenizer.save_pretrained(output_dir) | |
if hub_model_id is not None: | |
print("Pushing model to Hugging Face Hub.") | |
print("-" * 80) | |
trainer.push_to_hub( | |
commit_message="Initial version", | |
model_name=hub_model_id, | |
license="mit", | |
finetuned_from=f"{pretrained_encoder}", | |
tasks=tasks, | |
tags=["PROTAC", "cheminformatics"], | |
dataset=[ds_name], | |
dataset_args=[ds_config], | |
) | |
tokenizer.push_to_hub( | |
repo_id=hub_model_id, | |
commit_message="Upload tokenizer", | |
private=True, | |
token=hub_token, | |
tags=["PROTAC", "cheminformatics"], | |
) | |
else: | |
print("Pushing model to local directory.") | |
print("-" * 80) | |
trainer.save_model(output_dir) | |
tokenizer.save_pretrained(output_dir) | |
print(f"Model saved to '{output_dir}'.") | |
print("All done.") | |