import os
import torch
import json
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, TrainerCallback
from datasets import Dataset
import matplotlib.pyplot as plt

# Set Hugging Face token (replace with your actual token; only required for gated or private models)
os.environ["HF_TOKEN"] = "hf_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"  # Replace with your HF_TOKEN

# Download model and tokenizer
model_name = "Salesforce/codegen-350M-multi"
local_model_path = "./codegen_model"
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=local_model_path)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, cache_dir=local_model_path)

# Set padding token (CodeGen's tokenizer has no dedicated pad token, so reuse EOS)
tokenizer.pad_token = tokenizer.eos_token

# Move model to CPU
device = torch.device("cpu")
model.to(device)

# Load custom dataset from JSONL
dataset_path = "./custom_dataset.jsonl"
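# Each line should be a JSON object with "prompt" and "code" fields, for example
# (hypothetical sample line):
# {"prompt": "Write a function that adds two numbers", "code": "def add(a, b):\n    return a + b"}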
data = []
with open(dataset_path, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:  # skip blank lines
            data.append(json.loads(line))
dataset = Dataset.from_list(data)

# Tokenize dataset: join each prompt with its target code into a single causal-LM sequence
def tokenize_function(examples):
    inputs = [f"{prompt}\n{code}" for prompt, code in zip(examples["prompt"], examples["code"])]
    # Fixed-length padding keeps CPU batching simple; sequences beyond 128 tokens are truncated
    return tokenizer(inputs, truncation=True, padding="max_length", max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["prompt", "code"])

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
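# With mlm=False the collator copies input_ids into labels and masks padding
# positions with -100 so they are ignored by the loss. Caveat: because pad_token
# is set to eos_token above, EOS positions are masked out of the loss as well.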

# Define training arguments
training_args = TrainingArguments(
    output_dir="./finetuned_codegen",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    save_steps=500,
    save_total_limit=2,
    logging_steps=10,  # log every 10 steps so the loss curve has enough points
    learning_rate=5e-5,
    fp16=False,
    no_cuda=True,  # force CPU training (deprecated in newer transformers versions in favor of use_cpu=True)
    dataloader_pin_memory=False,
)
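# Effective batch size = per_device_train_batch_size * gradient_accumulation_steps
# = 1 * 4 = 4 sequences per optimizer step, which keeps CPU memory usage low.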

# Custom callback to store training loss
class LossCallback(TrainerCallback):
    def __init__(self):
        self.losses = []
        self.steps = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs and "loss" in logs:
            self.losses.append(logs["loss"])
            self.steps.append(state.global_step)

loss_callback = LossCallback()

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    callbacks=[loss_callback],
)

# Start fine-tuning
print("Starting fine-tuning...")
trainer.train()

# Save fine-tuned model
model.save_pretrained("./finetuned_codegen")
tokenizer.save_pretrained("./finetuned_codegen")

# Plot training loss
plt.plot(loss_callback.steps, loss_callback.losses, label="Training Loss")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.title("Fine-Tuning Loss Curve")
plt.legend()
plt.savefig("./finetuned_codegen/loss_plot.png")
plt.show()

print("Fine-tuning completed. Model saved to ./finetuned_codegen. Loss plot saved to ./finetuned_codegen/loss_plot.png")

# Test fine-tuned model
print("\nTesting fine-tuned model...")
prompts = [
    "Write a Python program to print 'Hello, guys how are you!'"
]

for prompt in prompts:
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    outputs = model.generate(
        **inputs,
        max_length=200,  # total length including the prompt; max_new_tokens would bound only the completion
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.7,
        top_p=0.9
    )
    generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Prompt: {prompt}\nGenerated Code:\n{generated_code}\n{'-'*50}")