import torch, json
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import get_peft_model, LoraConfig, TaskType

# Load your dataset
data = [json.loads(l) for l in open("data/sft_data.jsonl")]
dataset = Dataset.from_list(data)
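# Each line of data/sft_data.jsonl is expected to carry a "prompt" and an "output"
# field, since those are the keys the tokenize() step below reads. An illustrative
# (made-up) record:
#   {"prompt": "Explain what LoRA is.", "output": "LoRA adds small trainable low-rank matrices ..."}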
# Load model & tokenizer | |
base_model = "meta-llama/Llama-2-7b-hf" # Or use Mistral, Falcon, etc. | |
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True) | |
model = AutoModelForCausalLM.from_pretrained(base_model, torch_dtype=torch.float16) | |
# Add LoRA (optional) | |
lora_config = LoraConfig(task_type=TaskType.CAUSAL_LM, r=8, lora_alpha=32, lora_dropout=0.05, | |
target_modules=["q_proj", "v_proj"]) | |
model = get_peft_model(model, lora_config) | |
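# Optional sanity check (not in the original script): report how many parameters
# LoRA actually trains, which should be a small fraction of the 7B base model.
model.print_trainable_parameters()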
# Preprocessing
def tokenize(example):
    prompt = f"### Instruction:\n{example['prompt']}\n\n### Response:\n{example['output']}"
    return tokenizer(prompt, truncation=True, max_length=512, padding="max_length")

dataset = dataset.map(tokenize, remove_columns=dataset.column_names)
# Training setup
args = TrainingArguments(
    output_dir="./sft-model",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    fp16=True,
    evaluation_strategy="no",
    save_strategy="epoch",
    logging_steps=20,
    learning_rate=2e-5,
    report_to="tensorboard",
)

# mlm=False makes the collator produce causal-LM labels from input_ids
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(model=model, args=args, train_dataset=dataset, data_collator=data_collator)
trainer.train()
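# ---------------------------------------------------------------------------
# Follow-up sketch (assumption: you want to save the adapter and try inference).
# The output path and sample prompt below are illustrative, not part of the
# original script.
# ---------------------------------------------------------------------------
trainer.save_model("./sft-model/final")         # with a PEFT model this saves only the adapter weights
tokenizer.save_pretrained("./sft-model/final")

from peft import PeftModel

device = "cuda" if torch.cuda.is_available() else "cpu"
base = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
)
tuned = PeftModel.from_pretrained(base, "./sft-model/final").to(device)
tuned.eval()

prompt = "### Instruction:\nExplain LoRA in one sentence.\n\n### Response:\n"
inputs = tokenizer(prompt, return_tensors="pt").to(device)
with torch.no_grad():
    output_ids = tuned.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))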