import os
import torch
import json
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, TrainerCallback
from datasets import Dataset
import matplotlib.pyplot as plt
# Set Hugging Face token (replace with your actual token)
os.environ["HF_TOKEN"] = "hf_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" # Replace with your HF_TOKEN
# Download model and tokenizer
model_name = "Salesforce/codegen-350M-multi"
local_model_path = "./codegen_model"
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=local_model_path)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, cache_dir=local_model_path)
# Set padding token
tokenizer.pad_token = tokenizer.eos_token
# Move model to CPU
device = torch.device("cpu")
model.to(device)
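# Optional sketch (assumption: a CUDA GPU may be available): auto-select the
# device instead of pinning to CPU. If you switch, also remove no_cuda=True
# from the TrainingArguments below so Trainer and this script agree.
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")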
# Load custom dataset from JSONL
dataset_path = "./custom_dataset.jsonl"
data = []
with open(dataset_path, 'r', encoding='utf-8') as f:
    for line in f:
        data.append(json.loads(line.strip()))
dataset = Dataset.from_list(data)
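# Expected schema: each line of custom_dataset.jsonl is a JSON object with
# "prompt" and "code" keys (the fields consumed by tokenize_function below).
# Hypothetical example line:
# {"prompt": "Write a function that adds two numbers", "code": "def add(a, b):\n    return a + b"}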
# Tokenize dataset
def tokenize_function(examples):
inputs = [f"{prompt}\n{code}" for prompt, code in zip(examples["prompt"], examples["code"])]
return tokenizer(inputs, truncation=True, padding="max_length", max_length=128)
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["prompt", "code"])
# Data collator for causal language modeling (mlm=False copies input_ids to labels)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# Define training arguments
training_args = TrainingArguments(
    output_dir="./finetuned_codegen",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # effective batch size of 4
    save_steps=500,
    save_total_limit=2,
    logging_steps=10,  # log every 10 steps so the loss curve has enough points
    learning_rate=5e-5,
    fp16=False,
    no_cuda=True,  # keep training on CPU, matching the device chosen above
    dataloader_pin_memory=False,
)
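# Optional sanity check (a minimal sketch): estimate the total number of
# optimization steps so logging_steps/save_steps can be judged against the run
# length. This is a rough single-process estimate, not Trainer's exact schedule.
import math
effective_batch = training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
estimated_steps = math.ceil(len(tokenized_dataset) / effective_batch) * int(training_args.num_train_epochs)
print(f"Estimated optimization steps: {estimated_steps}")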
# Custom callback to store training loss
class LossCallback(TrainerCallback):
    def __init__(self):
        self.losses = []
        self.steps = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        # Record each logged loss with its global step for plotting later
        if logs and "loss" in logs:
            self.losses.append(logs["loss"])
            self.steps.append(state.global_step)
loss_callback = LossCallback()
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    callbacks=[loss_callback],
)
# Start fine-tuning
print("Starting fine-tuning...")
trainer.train()
# Save fine-tuned model
model.save_pretrained("./finetuned_codegen")
tokenizer.save_pretrained("./finetuned_codegen")
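# Optional sketch: the saved artifacts can be reloaded later for inference in
# a fresh session with the same Auto classes used above (the path matches
# output_dir); uncomment to use:
# tokenizer = AutoTokenizer.from_pretrained("./finetuned_codegen")
# model = AutoModelForCausalLM.from_pretrained("./finetuned_codegen").to(device)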
# Plot training loss
plt.plot(loss_callback.steps, loss_callback.losses, label="Training Loss")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.title("Fine-Tuning Loss Curve")
plt.legend()
plt.savefig("./finetuned_codegen/loss_plot.png")
plt.show()
print("Fine-tuning completed. Model saved to ./finetuned_codegen. Loss plot saved to ./finetuned_codegen/loss_plot.png")
# Test fine-tuned model
print("\nTesting fine-tuned model...")
prompts = [
"Write a Python program to print 'Hello, guys how are you!'"
]
model.eval()  # disable dropout so sampling reflects the trained weights
for prompt in prompts:
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    outputs = model.generate(
        **inputs,
        max_length=200,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Prompt: {prompt}\nGenerated Code:\n{generated_code}\n{'-' * 50}")