import json

import torch
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from peft import LoraConfig, TaskType, get_peft_model

# Load your dataset (one JSON object per line with "prompt" and "output" fields)
with open("data/sft_data.jsonl") as f:
    data = [json.loads(line) for line in f]
dataset = Dataset.from_list(data)

# Load model & tokenizer
base_model = "meta-llama/Llama-2-7b-hf"  # or use Mistral, Falcon, etc.
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)
# Llama tokenizers ship without a pad token; reuse EOS so padding works
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(base_model, torch_dtype=torch.float16)

# Add LoRA (optional): train small adapter matrices on the attention projections
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
)
model = get_peft_model(model, lora_config)

# Preprocessing: format each example as an instruction/response pair and tokenize
def tokenize(example):
    text = f"### Instruction:\n{example['prompt']}\n\n### Response:\n{example['output']}"
    return tokenizer(text, truncation=True, max_length=512, padding="max_length")

dataset = dataset.map(tokenize, remove_columns=dataset.column_names)

# Training setup
args = TrainingArguments(
    output_dir="./sft-model",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    fp16=True,
    evaluation_strategy="no",
    save_strategy="epoch",
    logging_steps=20,
    learning_rate=2e-5,
    report_to="tensorboard",
)

# mlm=False -> causal LM objective; the collator copies input_ids into labels
# and ignores padded positions in the loss
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset,
    data_collator=data_collator,
)
trainer.train()
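
# A minimal sketch of post-training usage: save the LoRA adapter and reload it
# on top of the base model for a quick generation check. The "./sft-model" path
# matches output_dir above; the prompt string and the assumption of a CUDA GPU
# are hypothetical.
from peft import PeftModel

trainer.save_model("./sft-model")  # writes the adapter weights + adapter_config.json

inference_base = AutoModelForCausalLM.from_pretrained(
    base_model, torch_dtype=torch.float16
).to("cuda")
sft_model = PeftModel.from_pretrained(inference_base, "./sft-model")
sft_model.eval()

prompt = "### Instruction:\nExplain LoRA in one sentence.\n\n### Response:\n"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
with torch.no_grad():
    output_ids = sft_model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))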