prelington committed on
Commit
8ea3de0
·
verified ·
1 Parent(s): 1814385

Create train.py

Browse files
Files changed (1) hide show
  1. train.py +56 -0
train.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import (
2
+ AutoTokenizer,
3
+ AutoModelForSequenceClassification,
4
+ TrainingArguments,
5
+ Trainer,
6
+ DataCollatorWithPadding
7
+ )
8
+ from datasets import load_dataset
9
+ import torch
10
+
11
def train_model(
    model_name="your-username/your-model-name",
    dataset_name="imdb",
    output_dir="./results",
    save_dir="./fine-tuned-model",
):
    """Fine-tune a sequence-classification checkpoint and save the result.

    Loads the tokenizer and model for ``model_name``, tokenizes the ``train``
    and ``test`` splits of ``dataset_name`` (the ``"text"`` column is used as
    input), trains for three epochs with per-epoch evaluation and
    checkpointing, then writes the model and tokenizer to ``save_dir``.

    Args:
        model_name: Hub id or local path of the checkpoint to fine-tune.
            The default is a placeholder and must be replaced with a real
            checkpoint before running.
        dataset_name: Name passed to ``load_dataset``; the dataset must
            expose ``train`` and ``test`` splits with a ``"text"`` column.
        output_dir: Directory handed to ``TrainingArguments`` as
            ``output_dir``.
        save_dir: Directory where the fine-tuned model and tokenizer are
            saved.

    Returns:
        None. The results are the files written to ``output_dir`` and
        ``save_dir``.
    """
    # Local import: only needed for the keyword-compatibility checks below.
    import inspect

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    dataset = load_dataset(dataset_name)

    def tokenize_function(examples):
        # Truncate only; padding is applied per batch by the data collator.
        return tokenizer(examples["text"], truncation=True)

    # Tokenize just the two splits that are actually used. Mapping the whole
    # DatasetDict would also process any extra splits (e.g. an unlabeled one),
    # which is wasted work.
    train_split = dataset["train"].map(tokenize_function, batched=True)
    eval_split = dataset["test"].map(tokenize_function, batched=True)

    # Newer transformers releases renamed ``evaluation_strategy`` to
    # ``eval_strategy``; pick whichever keyword the installed version accepts.
    args_params = inspect.signature(TrainingArguments.__init__).parameters
    if "eval_strategy" in args_params:
        eval_key = "eval_strategy"
    else:
        eval_key = "evaluation_strategy"

    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        save_strategy="epoch",
        # Requires the evaluation and save strategies to match ("epoch").
        load_best_model_at_end=True,
        **{eval_key: "epoch"},
    )

    # Likewise, ``Trainer(tokenizer=...)`` was superseded by
    # ``processing_class`` in newer releases.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    if "processing_class" in trainer_params:
        tokenizer_key = "processing_class"
    else:
        tokenizer_key = "tokenizer"

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_split,
        eval_dataset=eval_split,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        **{tokenizer_key: tokenizer},
    )

    trainer.train()

    # Persist both artifacts side by side so the directory can be reloaded
    # with ``from_pretrained``.
    trainer.save_model(save_dir)
    tokenizer.save_pretrained(save_dir)
54
+
55
# Script entry point: run the fine-tuning pipeline only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    train_model()