from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
# Load the dataset
dataset = load_dataset("csv", data_files="qa_dataset.csv")
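# Note: the code below assumes "qa_dataset.csv" has two text columns named
# "question" and "answer" (the column names used in preprocess_function), e.g.:
#   question,answer
#   What is the capital of France?,Paris is the capital of France.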
# Load the tokenizer and model
model_name = "gpt2"  # Base model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Add padding token to the tokenizer
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token, so reuse the end-of-sequence token
model = AutoModelForCausalLM.from_pretrained(model_name)
# Prepare the dataset: GPT-2 is a causal LM, so each question and answer are
# concatenated into a single training sequence instead of separate input/target texts
def preprocess_function(examples):
    texts = [
        f"Q: {q} A: {a}{tokenizer.eos_token}"
        for q, a in zip(examples["question"], examples["answer"])
    ]
    return tokenizer(texts, max_length=512, truncation=True)

tokenized_dataset = dataset["train"].map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,  # drop the raw text columns
)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to="tensorboard",
    run_name="gpt2-finetuning"
)
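# Since report_to="tensorboard" and logging_dir="./logs", training curves can be
# inspected during or after training with:
#   tensorboard --logdir ./logs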
# The data collator pads each batch and builds the labels from input_ids,
# which is the standard setup for causal language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)
# Fine-tune the model
trainer.train()
# Save the model
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")
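# A minimal sketch of how the saved model could be loaded back and queried.
# The example question is hypothetical, not from the dataset; the prompt
# mirrors the "Q: ... A:" template used in preprocess_function above.
ft_tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_model")
ft_model = AutoModelForCausalLM.from_pretrained("./fine_tuned_model")

prompt = "Q: What is the capital of France? A:"
inputs = ft_tokenizer(prompt, return_tensors="pt")
output_ids = ft_model.generate(
    **inputs,
    max_new_tokens=50,
    pad_token_id=ft_tokenizer.eos_token_id,
)
print(ft_tokenizer.decode(output_ids[0], skip_special_tokens=True))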