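"""Fine-tune mT5 or mBART50 for abstractive summarization.

The script loads English, French (multilingual), and cross-lingual CSV splits,
fine-tunes the selected model (with a LoRA adapter in the mT5 case), and pushes
the resulting checkpoint and tokenizer to the Hugging Face Hub.
"""
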
import os
import csv
import torch
import argparse
import numpy as np
import pandas as pd
import huggingface_hub
from datasets import Dataset
from download_model import download_model
from huggingface_hub import HfApi, HfFolder
from transformers import (AutoTokenizer, BitsAndBytesConfig, MBart50TokenizerFast,
                          AutoModelForSeq2SeqLM, AutoModelForCausalLM, Trainer,
                          MBartForConditionalGeneration, TrainingArguments,
                          DataCollatorForSeq2Seq, EarlyStoppingCallback)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
MODELS = {
    "mT5": "mT5",
    "mBART50": "mBART50"
}
def summarize_text_mt5(texts, model, tokenizer):
    """Generates summaries for a batch of texts with mT5 (512-token inputs)."""
    inputs = tokenizer(texts, return_tensors="pt",
                       max_length=512, truncation=True,
                       padding=True).to(model.device)
    summary_ids = model.generate(inputs.input_ids,
                                 max_length=60,
                                 num_beams=4, length_penalty=2.0,
                                 early_stopping=True)
    summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
    return summaries

def summarize_text_mbart50(texts, model, tokenizer):
    """Generates summaries for a batch of texts with mBART50 (1024-token inputs)."""
    inputs = tokenizer(texts, return_tensors="pt",
                       max_length=1024, truncation=True,
                       padding=True).to(model.device)
    summary_ids = model.generate(inputs.input_ids, max_length=60,
                                 num_beams=4, length_penalty=2.0,
                                 early_stopping=True)
    summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
    return summaries
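
# Standalone usage sketch (assumes download_model places the model on the desired
# device and that "mT5" resolves to a seq2seq checkpoint with its tokenizer):
#
#   model, tokenizer = download_model("mT5")
#   print(summarize_text_mt5(["A long article to be summarized ..."], model, tokenizer))
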
def experiments(model_name, finetune_type):
    """Runs an experiment with the given model and dataset."""
    # Authenticate with the Hugging Face Hub so that checkpoints can be pushed.
    # The access token is read from the HF_TOKEN environment variable rather
    # than being hard-coded in the source.
    huggingface_hub.login(token=os.environ["HF_TOKEN"])
    print(f"Starting experiment on {model_name}")
    # Load the English, French, and cross-lingual train/validation/test splits.
    train = pd.read_csv(os.path.join(BASE_DIR, "datasets/train.csv"))
    train_fr = pd.read_csv(os.path.join(BASE_DIR, "datasets/train_fr.csv"))
    train_cross = pd.read_csv(os.path.join(BASE_DIR, "datasets/train_cross.csv"))
    val = pd.read_csv(os.path.join(BASE_DIR, "datasets/val.csv"))
    val_fr = pd.read_csv(os.path.join(BASE_DIR, "datasets/val_fr.csv"))
    val_cross = pd.read_csv(os.path.join(BASE_DIR, "datasets/val_cross.csv"))
    test = pd.read_csv(os.path.join(BASE_DIR, "datasets/test.csv"))
    test_fr = pd.read_csv(os.path.join(BASE_DIR, "datasets/test_fr.csv"))
    test_cross = pd.read_csv(os.path.join(BASE_DIR, "datasets/test_cross.csv"))
    model, tokenizer = download_model(model_name)
    print(f"Model {model_name} loaded successfully.")

    # Pick the summarization helper that matches the model's input length limit.
    if model_name == "mT5":
        summarize_text = summarize_text_mt5
    elif model_name == "mBART50":
        summarize_text = summarize_text_mbart50

    # Fine-tune on the split that matches the requested setting.
    if finetune_type == "english":
        fine_tune(model_name, "english", model, tokenizer, summarize_text, train, val)
    elif finetune_type == "multilingual":
        fine_tune(model_name, "multilingual", model, tokenizer, summarize_text, train_fr, val_fr)
    else:
        fine_tune(model_name, "crosslingual", model, tokenizer, summarize_text, train_cross, val_cross)

def fine_tune(model_name, finetune_type, model, tokenizer, summarize_text, train, val):
    print("Starting Fine-tuning...")
    # mT5 is trained on 512-token inputs, mBART50 on 1024-token inputs.
    if model_name == "mT5":
        max_input = 512
    else:
        max_input = 1024
    max_output = 60

    # Subsample the splits to keep the runs short; the English setting uses a
    # slightly larger sample than the multilingual/cross-lingual settings.
    if finetune_type == "english":
        train_dataset = Dataset.from_pandas(train.sample(1500))
        eval_dataset = Dataset.from_pandas(val.sample(200))
    else:
        train_dataset = Dataset.from_pandas(train.sample(1200))
        eval_dataset = Dataset.from_pandas(val.sample(150))

    def preprocess_function(examples):
        inputs = [f"Summarize the text: {ex}" for ex in examples["source"]]
        targets = [f"Summary: {ex}" for ex in examples["target"]]
        model_inputs = tokenizer(inputs, max_length=max_input, truncation=True)
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(targets, max_length=max_output, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        # Print the first example of each batch as a quick sanity check.
        print("Input:", inputs[0])
        print("Output:", targets[0])
        return model_inputs

    tokenized_train = train_dataset.map(preprocess_function, batched=True)
    tokenized_eval = eval_dataset.map(preprocess_function, batched=True)

    # QLoRA-style adapter fine-tuning for mT5; mBART50 is fully fine-tuned.
    if model_name == "mT5":
        lora_config = LoraConfig(
            r=8,
            lora_alpha=32,
            lora_dropout=0.05,
            bias="none",
            task_type=TaskType.SEQ_2_SEQ_LM
        )
        model = prepare_model_for_kbit_training(model)
        model = get_peft_model(model, lora_config)

    # Pads inputs and labels dynamically to the longest sequence in each batch.
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

    training_args = TrainingArguments(
        output_dir=f"./{model_name}-{finetune_type}-finetuned",
        evaluation_strategy="epoch",
        save_total_limit=1,
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=3,
        weight_decay=0.01,
        push_to_hub=True,
        fp16=True,
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        data_collator=data_collator,
    )
    trainer.train()

    print("Saving model to Hugging Face Hub...")
    tokenizer.save_pretrained(training_args.output_dir)
    tokenizer.push_to_hub(f"{model_name}-{finetune_type}-finetuned")
def main():
    parser = argparse.ArgumentParser(description="Run experiments with different models.")
    parser.add_argument("--model", type=str, required=True, choices=MODELS.values(), help="The model to use.")
    parser.add_argument("--finetune_type", type=str, required=True, choices=["english", "multilingual", "crosslingual"], help="The type of fine-tuning to apply.")
    args = parser.parse_args()
    experiments(args.model, args.finetune_type)

if __name__ == "__main__":
    main()
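
# Example invocation (assuming this file is saved as experiments.py and HF_TOKEN
# is set in the environment):
#
#   python experiments.py --model mT5 --finetune_type english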