import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, TrainerCallback
from datasets import load_dataset
import matplotlib.pyplot as plt

# Set Hugging Face token (replace with your actual token)
os.environ["HF_TOKEN"] = "hf_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"

# Recommended for download stability if you have had issues with slow or interrupted downloads
os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "600" # 10-minute timeout
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" # Faster downloader (requires `pip install hf_transfer`)

# Download model and tokenizer
model_name = "Salesforce/codegen-350M-multi"
local_model_path = "./codegen_model"

print(f"Attempting to download/load tokenizer from {model_name} to {local_model_path}...")
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=local_model_path)
print("Tokenizer loaded.")

print(f"Attempting to download/load model from {model_name} to {local_model_path}...")
# Removed torch_dtype=torch.float16 as it's typically for GPU and might not help on CPU
# and could even cause unexpected behavior on some CPU setups.
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=local_model_path)
print("Model loaded.")

# Set padding token
tokenizer.pad_token = tokenizer.eos_token

# Move model to CPU
device = torch.device("cpu")
model.to(device)
print(f"Model moved to {device}.")

# Load custom dataset from JSONL file
dataset_file = "custom_dataset.jsonl"
print(f"Loading dataset from {dataset_file}...")
dataset = load_dataset('json', data_files=dataset_file, split='train')
print("Dataset loaded.")
print(f"Dataset size: {len(dataset)} examples.")
print(f"First example of dataset: {dataset[0]}") # Print first example to check data format

# Tokenize dataset
def tokenize_function(examples):
    inputs = [f"{prompt}\n{code}" for prompt, code in zip(examples["prompt"], examples["code"])]
    # Reduced max_length to save memory; try 32 if 64 is still too much, or 128 if RAM allows
    return tokenizer(inputs, truncation=True, padding="max_length", max_length=64)

print("Tokenizing dataset...")
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["prompt", "code"])
print("Dataset tokenized.")
print(f"First tokenized example: {tokenized_dataset[0]}")

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
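# Note: with mlm=False the collator copies input_ids into labels for causal LM training and
# masks padded positions with -100; because pad_token was set to eos_token above, EOS
# positions are excluded from the loss as well (a side effect of this common setup).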

# Define training arguments
training_args = TrainingArguments(
    output_dir="./finetuned_codegen",
    overwrite_output_dir=True,
    num_train_epochs=3,
    # --- AGGRESSIVELY REDUCED BATCH SIZE AND GRADIENT ACCUMULATION FOR CPU ---
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1, # No accumulation, true batch size of 1
    save_steps=500,
    save_total_limit=2,
    logging_steps=10, # Log more frequently to see if it starts moving
    learning_rate=5e-5,
    fp16=False, # Keep False for CPU
    use_cpu=True, # Use this instead of no_cuda=True
    dataloader_pin_memory=False, # Disable pin_memory for CPU
    report_to="none", # Disable reporting to avoid potential hangs
    gradient_checkpointing=True, # Keep this, it helps with memory on CPU too
    max_grad_norm=1.0,
)

# Custom callback to store training loss
class LossCallback(TrainerCallback):
    def __init__(self):
        self.losses = []
        self.log_steps = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs and "loss" in logs:
            self.losses.append(logs["loss"])
            self.log_steps.append(state.global_step)
            print(f"Step {state.global_step}: Loss = {logs['loss']:.4f}")

loss_callback = LossCallback()

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    callbacks=[loss_callback],
)

# Start fine-tuning
print("Starting fine-tuning...")
print("WARNING: Training on CPU will be extremely slow. The 0% progress bar might take a very long time to update.")
print("Please monitor your system's RAM and CPU usage.")
trainer.train()
print("Fine-tuning finished.")

# Save fine-tuned model
model.save_pretrained("./finetuned_codegen")
tokenizer.save_pretrained("./finetuned_codegen")
print("Model fine-tuned and saved to ./finetuned_codegen.")

# Plot training loss
if loss_callback.losses:
    plt.figure(figsize=(10, 6))
    plt.plot(loss_callback.log_steps, loss_callback.losses, label="Training Loss")
    plt.xlabel("Steps")
    plt.ylabel("Loss")
    plt.title("Fine-Tuning Loss Curve")
    plt.legend()
    plt.grid(True)
    plot_path = "./finetuned_codegen/loss_plot.png"
    plt.savefig(plot_path)
    print(f"Loss plot saved to {plot_path}")
else:
    print("No training losses recorded to plot.")
plt.show()

print("Fine-tuning script finished execution.")