from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch, csv

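# Both splits point at the same CSV; the header row is skipped and columns are named explicitly.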
file_dict = {
    "train": "name_dataset.csv",
    "test": "name_dataset.csv"
}

dataset = load_dataset(
    'csv',
    data_files=file_dict,
    delimiter=',',
    column_names=['text', 'label'],
    skiprows=1
)

print(f"Train dataset size: {len(dataset['train'])}")
print(f"Test dataset size: {len(dataset['test'])}")

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import concatenate_datasets

model_id = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)

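# Tokenize the input text and the label; the label token ids become the decoder targets.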
def tokenize_function(example):
    model_inputs = tokenizer(example["text"], truncation=True)
    targets = tokenizer(example["label"], truncation=True)
    model_inputs['labels'] = targets['input_ids']
    return model_inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["text", "label"])

from transformers import DataCollatorForSeq2Seq

model = T5ForConditionalGeneration.from_pretrained(model_id)

from peft import LoraConfig, get_peft_model, TaskType

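# Apply LoRA to the query and value projection matrices of each attention block.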
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

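# Label positions padded with -100 are ignored by the loss.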
label_pad_token_id = -100
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

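# Checkpoint saving is disabled; the LoRA adapter is saved manually after training.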
output_dir = "lora-t5" |
|
training_args = Seq2SeqTrainingArguments( |
|
output_dir=output_dir, |
|
auto_find_batch_size=True, |
|
learning_rate=1e-3, |
|
num_train_epochs=100, |
|
logging_dir=f"{output_dir}/logs", |
|
logging_strategy="steps", |
|
logging_steps=500, |
|
save_strategy="no", |
|
|
|
) |
|
|
|
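# Only a train split is passed, so no evaluation runs during training.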
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
)

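# The KV cache is only useful for generation, so disable it during training.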
model.config.use_cache = False
trainer.train()

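# Save just the LoRA adapter weights and the tokenizer.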
peft_model_id = "name-peft" |
|
trainer.model.save_pretrained(peft_model_id) |
|
tokenizer.save_pretrained(peft_model_id) |
|
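# Reload the base model, attach the trained adapter, and merge it into the base
# weights so the result loads like a regular T5 checkpoint.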
from transformers import T5ForConditionalGeneration, AutoTokenizer
from peft import PeftModel

base_model = T5ForConditionalGeneration.from_pretrained(model_id)
peft_model = PeftModel.from_pretrained(base_model, "name-peft")
peft_model = peft_model.merge_and_unload()
peft_model.save_pretrained("name-extraction")

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.save_pretrained("name-extraction")
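
# Optional sanity check: run the merged model on a sample sentence. This is a sketch;
# the sample input below is an assumption, so match whatever format the "text" column
# of the training CSV actually uses (including any task prefix, if one was used).
merged_model = T5ForConditionalGeneration.from_pretrained("name-extraction")
merged_tokenizer = AutoTokenizer.from_pretrained("name-extraction")

sample = "My friend Alice moved to Berlin last year."  # hypothetical example input
inputs = merged_tokenizer(sample, return_tensors="pt")
output_ids = merged_model.generate(**inputs, max_new_tokens=16)
print(merged_tokenizer.decode(output_ids[0], skip_special_tokens=True))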