import torch, json
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import get_peft_model, LoraConfig, TaskType

# Load your dataset (one JSON object per line)
with open("data/sft_data.jsonl") as f:
    data = [json.loads(line) for line in f]
dataset = Dataset.from_list(data)

# Load model & tokenizer
base_model = "meta-llama/Llama-2-7b-hf"  # Or use Mistral, Falcon, etc.
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)
if tokenizer.pad_token is None:
    # Llama-2 tokenizers ship without a pad token; reuse EOS so padding below works
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(base_model, torch_dtype=torch.float16)

# Add LoRA (optional)
lora_config = LoraConfig(task_type=TaskType.CAUSAL_LM, r=8, lora_alpha=32, lora_dropout=0.05,
                         target_modules=["q_proj", "v_proj"])
model = get_peft_model(model, lora_config)
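# Sanity check: with LoRA attached, only the adapter weights should be trainable
model.print_trainable_parameters()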

# Preprocessing: format each example and tokenize to a fixed length.
# Appending EOS teaches the model where a response ends.
def tokenize(example):
    prompt = f"### Instruction:\n{example['prompt']}\n\n### Response:\n{example['output']}{tokenizer.eos_token}"
    return tokenizer(prompt, truncation=True, max_length=512, padding="max_length")

dataset = dataset.map(tokenize, remove_columns=dataset.column_names)

# Training setup
args = TrainingArguments(
    output_dir="./sft-model",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    fp16=True,
    evaluation_strategy="no",  # renamed to `eval_strategy` in newer transformers releases
    save_strategy="epoch",
    logging_steps=20,
    learning_rate=2e-5,
    report_to="tensorboard",
)

# mlm=False -> causal LM: the collator copies input_ids into labels for next-token prediction
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(model=model, args=args, train_dataset=dataset, data_collator=data_collator)
trainer.train()
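
# Optionally persist the final LoRA adapter and tokenizer (the output path here is illustrative)
model.save_pretrained("./sft-model/final")
tokenizer.save_pretrained("./sft-model/final")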