""" |
|
|
PyPilot Training Manager - Advanced distributed training with monitoring |
|
|
""" |
|
|
import torch |
|
|
import torch.nn as nn |
|
|
from torch.utils.data import DataLoader, Dataset |
|
|
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback |
|
|
import wandb |
|
|
import numpy as np |
|
|
import time |
|
|
from datetime import datetime |
|
|
import os |
|
|
|
|
|
class CodeDataset(Dataset):
    """Thin Dataset wrapper exposing pre-tokenized examples to the Trainer."""

    def __init__(self, tokenized_data):
        self.data = tokenized_data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

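# A minimal sketch of what each item in `tokenized_data` is assumed to look like.
# The Trainer below is constructed without a custom data_collator, so the default
# collator is used; it stacks fields as-is and does not pad, which means every
# example must already be padded/truncated to the same length. The field names
# are the usual causal-LM ones (not defined anywhere in this file), and a
# "labels" field is assumed so the model can return a loss to the Trainer:
#
#   {"input_ids": [...], "attention_mask": [...], "labels": [...]}
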
class PyPilotTrainingManager:
    def __init__(self, model, model_name="PyPilot"):
        self.model = model
        self.model_name = model_name
        self.training_history = []
        self.best_loss = float('inf')

    def setup_distributed_training(self, use_fp16=True, use_gradient_checkpointing=True):
        """Configure distributed training options."""
        training_args = TrainingArguments(
            output_dir="./pypilot-checkpoints",
            overwrite_output_dir=True,
            num_train_epochs=10,
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            gradient_accumulation_steps=8,
            learning_rate=5e-5,
            weight_decay=0.01,
            warmup_steps=1000,
            logging_dir="./logs",
            logging_steps=500,
            # Step-wise evaluation is required for eval_steps and early stopping
            # to take effect (`eval_strategy` on newer transformers releases).
            evaluation_strategy="steps",
            eval_steps=1000,
            save_steps=2000,
            save_total_limit=5,
            load_best_model_at_end=True,   # required by EarlyStoppingCallback
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            # Must be False so compute_metrics actually receives the logits.
            prediction_loss_only=False,
            remove_unused_columns=False,
            fp16=use_fp16,
            dataloader_pin_memory=False,
            gradient_checkpointing=use_gradient_checkpointing,
            report_to=["wandb"],
            run_name=f"pypilot-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",
        )
        return training_args

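    # With the defaults above, the effective batch size per device is
    # per_device_train_batch_size (4) x gradient_accumulation_steps (8) = 32,
    # which appears to match the "batch_size" reported to wandb below.
    # A typical call, assuming a `manager` instance as created in the
    # __main__ block at the bottom of this file:
    #
    #   args = manager.setup_distributed_training(use_fp16=torch.cuda.is_available())
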
    def setup_wandb_monitoring(self, project_name="pypilot"):
        """Initialize Weights & Biases for experiment tracking."""
        wandb.init(
            project=project_name,
            name=f"pypilot-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",
            config={
                "architecture": "Transformer",
                "dataset": "GitHub Code",
                "epochs": 10,
                "batch_size": 32,
            },
        )

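    # Note: wandb.init() assumes the environment is already authenticated,
    # e.g. via `wandb login` or the WANDB_API_KEY environment variable;
    # otherwise it prompts interactively. Setting WANDB_MODE=offline is an
    # option for logging locally without an account.
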
    def create_advanced_callbacks(self):
        """Create callbacks for training optimization."""
        callbacks = [
            # Stops training if the tracked eval metric fails to improve for
            # 3 consecutive evaluations (requires load_best_model_at_end=True,
            # which setup_distributed_training sets).
            EarlyStoppingCallback(early_stopping_patience=3),
        ]
        return callbacks

    def compute_metrics(self, eval_pred):
        """Compute loss, perplexity, and token accuracy for code generation."""
        predictions, labels = eval_pred
        # The Trainer hands over numpy arrays: logits of shape
        # (num_examples, seq_len, vocab_size) and labels of shape
        # (num_examples, seq_len), with -100 marking positions to ignore
        # (the usual padding convention).
        predictions = torch.tensor(predictions).float()
        labels = torch.tensor(labels).long()

        # CrossEntropyLoss skips -100 positions by default (ignore_index=-100).
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(predictions.view(-1, predictions.size(-1)), labels.view(-1))
        perplexity = torch.exp(loss)

        # Token-level accuracy, masking out the ignored (-100) positions.
        preds = torch.argmax(predictions, dim=-1)
        mask = labels != -100
        accuracy = (preds[mask] == labels[mask]).float().mean()

        return {
            "perplexity": perplexity.item(),
            "accuracy": accuracy.item(),
            "loss": loss.item(),
        }

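    # Caveat: because the logits for the entire eval set are gathered in memory,
    # large vocabularies or long eval sets can exhaust RAM. If that happens, one
    # option is to set eval_accumulation_steps in TrainingArguments so tensors
    # are offloaded to CPU in chunks, or to drop compute_metrics and rely on
    # eval_loss alone.
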
    def train_with_advanced_features(self, train_dataset, eval_dataset=None):
        """Start advanced training with all features enabled."""
        print("Starting advanced PyPilot training...")

        self.setup_wandb_monitoring()

        training_args = self.setup_distributed_training()
        callbacks = self.create_advanced_callbacks()

        # An eval_dataset is effectively required here: setup_distributed_training
        # enables step-wise evaluation, which early stopping and best-model
        # selection depend on.
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=self.compute_metrics,
            callbacks=callbacks,
        )

        print("Training started with advanced features:")
        print(" - FP16 precision: enabled")
        print(" - Gradient checkpointing: enabled")
        print(" - Early stopping: enabled")
        print(" - W&B monitoring: enabled")

        trainer.train()

        trainer.save_model("./pypilot-final-model")
        print("Training completed and model saved!")

        return trainer

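    # The "distributed" part comes from the Trainer itself: launching this script
    # under torchrun makes the Trainer wrap the model in DistributedDataParallel
    # and split batches across processes, e.g. (script name is an assumption,
    # adjust to however this module is actually invoked):
    #
    #   torchrun --nproc_per_node=4 training_manager.py
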
    def hyperparameter_search(self, train_dataset, param_combinations):
        """Try each hyperparameter combination for one epoch and keep the best."""
        best_params = None

        for i, params in enumerate(param_combinations):
            print(f"Testing hyperparameter combination {i + 1}/{len(param_combinations)}")

            # update_model_hyperparams is not defined in this class; call it only
            # if a subclass provides it (e.g. to adjust model-level settings such
            # as dropout). batch_size and learning_rate are consumed below via
            # TrainingArguments, so nothing breaks if the hook is absent.
            if hasattr(self, "update_model_hyperparams"):
                self.update_model_hyperparams(params)

            quick_trainer = Trainer(
                model=self.model,
                args=TrainingArguments(
                    output_dir=f"./hparam-search-{i}",
                    num_train_epochs=1,
                    per_device_train_batch_size=params['batch_size'],
                    learning_rate=params['learning_rate'],
                ),
                train_dataset=train_dataset,
            )

            # trainer.train() returns a TrainOutput whose training_loss field is
            # the average training loss over the run.
            results = quick_trainer.train()

            if results.training_loss < self.best_loss:
                self.best_loss = results.training_loss
                best_params = params

        print(f"Best hyperparameters: {best_params}")
        return best_params

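    # Expected shape of `param_combinations`, inferred from the keys used above;
    # the specific values are only an illustration:
    #
    #   param_combinations = [
    #       {"batch_size": 4, "learning_rate": 5e-5},
    #       {"batch_size": 8, "learning_rate": 3e-5},
    #   ]
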
if __name__ == "__main__":
    from modeling_pypilot import PyPilotModel, PyPilotConfig

    config = PyPilotConfig()
    model = PyPilotModel(config)

    manager = PyPilotTrainingManager(model)
    print("Training Manager ready!")
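    # From here, an end-to-end run would look roughly like the sketch below.
    # `load_tokenized_examples` and the data paths are hypothetical stand-ins
    # for whatever tokenization pipeline produces the pre-padded examples that
    # CodeDataset expects (see the note under CodeDataset above):
    #
    #   train_examples = load_tokenized_examples("data/train")  # hypothetical helper
    #   eval_examples = load_tokenized_examples("data/eval")    # hypothetical helper
    #   train_ds = CodeDataset(train_examples)
    #   eval_ds = CodeDataset(eval_examples)
    #   trainer = manager.train_with_advanced_features(train_ds, eval_dataset=eval_ds)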