from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("csv", data_files="qa_dataset.csv")

# Load the tokenizer and model
model_name = "gpt2"  # Base model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# GPT-2 has no padding token, so reuse the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)

# Prepare the dataset: for causal language modeling, the question prompt and the
# answer are concatenated into a single sequence. The data collator below builds
# the labels from the input IDs, so no separate target tokenization is needed.
def preprocess_function(examples):
    texts = [
        f"Q: {q} A: {a}{tokenizer.eos_token}"
        for q, a in zip(examples["question"], examples["answer"])
    ]
    return tokenizer(texts, max_length=512, truncation=True)

tokenized_dataset = dataset["train"].map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

# Dynamic padding plus label creation (labels = input IDs, padding masked out)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to="tensorboard",
    run_name="gpt2-finetuning",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

# Save the model
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")
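
# Usage sketch (not part of the original script; the prompt text is an
# illustrative assumption): reload the saved model from "./fine_tuned_model"
# and generate an answer for a new question in the same "Q: ... A:" format.
from transformers import AutoTokenizer, AutoModelForCausalLM

ft_tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_model")
ft_model = AutoModelForCausalLM.from_pretrained("./fine_tuned_model")

prompt = "Q: What is the capital of France? A:"  # hypothetical example question
inputs = ft_tokenizer(prompt, return_tensors="pt")

# Greedy decoding; pad_token_id is set explicitly because GPT-2 reuses EOS for padding
output_ids = ft_model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=False,
    pad_token_id=ft_tokenizer.eos_token_id,
)
print(ft_tokenizer.decode(output_ids[0], skip_special_tokens=True))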