# Fine-tune the Salesforce CodeGen-350M-multi model on a custom JSONL dataset (CPU-only run).
import os

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    TrainerCallback,
)
from datasets import load_dataset
import matplotlib.pyplot as plt

# Hugging Face Hub configuration. The token below is a placeholder; substitute your
# own token or export HF_TOKEN in the shell instead of hardcoding it here.
os.environ["HF_TOKEN"] = "hf_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "600"  # allow up to 10 minutes per download
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"  # use the hf_transfer backend for faster downloads

model_name = "Salesforce/codegen-350M-multi"
local_model_path = "./codegen_model"  # cache directory for the downloaded files

print(f"Attempting to download/load tokenizer from {model_name} to {local_model_path}...")
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=local_model_path)
print("Tokenizer loaded.")

print(f"Attempting to download/load model from {model_name} to {local_model_path}...")
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=local_model_path)
print("Model loaded.")

# CodeGen has no dedicated pad token, so reuse the EOS token for padding.
tokenizer.pad_token = tokenizer.eos_token

device = torch.device("cpu")
model.to(device)
print(f"Model moved to {device}.")

# Load the training data: a JSON Lines file with one example per line.
dataset_file = "custom_dataset.jsonl"
print(f"Loading dataset from {dataset_file}...")
dataset = load_dataset("json", data_files=dataset_file, split="train")
print("Dataset loaded.")
print(f"Dataset size: {len(dataset)} examples.")
print(f"First example of dataset: {dataset[0]}")
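
# A hypothetical illustration of the assumed on-disk format (not taken from the
# original script): each line of custom_dataset.jsonl is a JSON object with
# "prompt" and "code" fields, the column names consumed by tokenize_function below.
# {"prompt": "Write a function that adds two numbers.", "code": "def add(a, b):\n    return a + b"}
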

def tokenize_function(examples):
    # Concatenate each prompt with its target code so the model learns to complete
    # a prompt with the corresponding implementation.
    inputs = [f"{prompt}\n{code}" for prompt, code in zip(examples["prompt"], examples["code"])]
    return tokenizer(inputs, truncation=True, padding="max_length", max_length=64)


print("Tokenizing dataset...")
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["prompt", "code"])
print("Dataset tokenized.")
print(f"First tokenized example: {tokenized_dataset[0]}")

# Causal language modeling: with mlm=False the collator copies input_ids into labels.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
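
# Optional sanity check (a minimal sketch, commented out so it does not change the
# run): a collated batch should contain input_ids, attention_mask and labels.
# sample_batch = data_collator([tokenized_dataset[i] for i in range(2)])
# print(sample_batch.keys())
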
training_args = TrainingArguments(
    output_dir="./finetuned_codegen",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    save_steps=500,
    save_total_limit=2,            # keep only the two most recent checkpoints
    logging_steps=10,
    learning_rate=5e-5,
    fp16=False,                    # mixed precision brings no benefit on CPU
    use_cpu=True,
    dataloader_pin_memory=False,   # pinned memory only helps GPU transfers
    report_to="none",
    gradient_checkpointing=True,   # trade extra compute for lower memory use
    max_grad_norm=1.0,
)


class LossCallback(TrainerCallback):
    """Record the training loss at every logging step so it can be plotted later."""

    def __init__(self):
        self.losses = []
        self.log_steps = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs and "loss" in logs:
            self.losses.append(logs["loss"])
            self.log_steps.append(state.global_step)
            print(f"Step {state.global_step}: Loss = {logs['loss']:.4f}")


loss_callback = LossCallback()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    callbacks=[loss_callback],
)

print("Starting fine-tuning...")
print("WARNING: Training on CPU will be extremely slow. The 0% progress bar might take a very long time to update.")
print("Please monitor your system's RAM and CPU usage.")
trainer.train()
print("Fine-tuning finished.")
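
# Note (an assumption about typical usage, not part of the original script): because
# save_steps=500 writes checkpoints under output_dir, an interrupted run can usually
# be resumed from the latest checkpoint with:
# trainer.train(resume_from_checkpoint=True)
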
# Persist the fine-tuned weights and tokenizer together so they can be reloaded later.
model.save_pretrained("./finetuned_codegen")
tokenizer.save_pretrained("./finetuned_codegen")
print("Model fine-tuned and saved to ./finetuned_codegen.")

# Plot the loss values recorded by LossCallback, if any logging steps were captured.
if loss_callback.losses:
    plt.figure(figsize=(10, 6))
    plt.plot(loss_callback.log_steps, loss_callback.losses, label="Training Loss")
    plt.xlabel("Steps")
    plt.ylabel("Loss")
    plt.title("Fine-Tuning Loss Curve")
    plt.legend()
    plt.grid(True)
    plot_path = "./finetuned_codegen/loss_plot.png"
    plt.savefig(plot_path)
    print(f"Loss plot saved to {plot_path}")
    plt.show()
else:
    print("No training losses recorded to plot.")

print("Fine-tuning script finished execution.")
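
# Optional quick generation check (a minimal sketch, not part of the original script):
# sample a short completion from the freshly fine-tuned model still in memory.
# The prompt below is purely illustrative.
model.eval()
sample_prompt = "Write a Python function that reverses a string"
sample_inputs = tokenizer(sample_prompt, return_tensors="pt").to(device)
with torch.no_grad():
    sample_output = model.generate(
        **sample_inputs,
        max_new_tokens=64,
        pad_token_id=tokenizer.eos_token_id,
    )
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))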