# name-extraction / finetune.py
# Fine-tune t5-small with LoRA for name extraction, then merge the adapter
# into the base model and save the result.
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration
# Both splits point at the same CSV; there is no separate held-out test file.
file_dict = {
    "train": "name_dataset.csv",
    "test": "name_dataset.csv",
}

# Column names are supplied explicitly, so skiprows=1 drops the header row.
dataset = load_dataset(
    "csv",
    data_files=file_dict,
    delimiter=",",
    column_names=["text", "label"],
    skiprows=1,
)
print(f"Train dataset size: {len(dataset['train'])}")
print(f"Test dataset size: {len(dataset['test'])}")
from transformers import AutoTokenizer

model_id = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)
def tokenize_function(example):
    # Tokenize the input text; padding is left to the data collator.
    model_inputs = tokenizer(example["text"], truncation=True)
    # Tokenize the target names and use their token ids as labels.
    targets = tokenizer(example["label"], truncation=True)
    model_inputs["labels"] = targets["input_ids"]
    return model_inputs
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["text", "label"])
from transformers import DataCollatorForSeq2Seq

model = T5ForConditionalGeneration.from_pretrained(model_id)

from peft import LoraConfig, get_peft_model, TaskType
# Apply LoRA to the query and value projections of T5's attention blocks.
lora_config = LoraConfig(
    r=16,            # rank of the low-rank update matrices
    lora_alpha=32,   # scaling factor for the LoRA updates
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # only the LoRA matrices are trainable
# Pad labels with -100 so padding positions are ignored by the loss.
label_pad_token_id = -100
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8,
)
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

output_dir = "lora-t5"
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,   # back off the batch size automatically on OOM
    learning_rate=1e-3,          # relatively high; only the LoRA weights train
    num_train_epochs=100,
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=500,
    save_strategy="no",          # only the final model is saved, below
    # report_to="tensorboard",
)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
)
# The KV cache is only useful at inference time; disable it during training.
model.config.use_cache = False
trainer.train()
# Save only the LoRA adapter weights plus the tokenizer.
peft_model_id = "name-peft"
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)
# Merge the LoRA adapter back into the base model for standalone inference.
from peft import PeftModel

base_model = T5ForConditionalGeneration.from_pretrained(model_id)
peft_model = PeftModel.from_pretrained(base_model, peft_model_id)
peft_model = peft_model.merge_and_unload()
peft_model.save_pretrained("name-extraction")
tokenizer.save_pretrained("name-extraction")
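
# A minimal inference sketch with the merged model. It assumes inputs follow
# the same format as the training "text" column; the exact prompt format
# depends on name_dataset.csv, which is not shown here, so the sample string
# below is hypothetical.
merged_model = T5ForConditionalGeneration.from_pretrained("name-extraction")
merged_tokenizer = AutoTokenizer.from_pretrained("name-extraction")
sample = "Alice went to the market."  # hypothetical input
inputs = merged_tokenizer(sample, return_tensors="pt")
output_ids = merged_model.generate(**inputs, max_new_tokens=16)
print(merged_tokenizer.decode(output_ids[0], skip_special_tokens=True))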