Spaces:
No application file
No application file
File size: 9,321 Bytes
4fe7b26 7c1bf97 4fe7b26 64c7578 4fe7b26 64c7578 4fe7b26 7c1bf97 4fe7b26 7c1bf97 4fe7b26 7c1bf97 4fe7b26 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 |
import os
import csv
import torch
import argparse
import numpy as np
import pandas as pd
import huggingface_hub
from datasets import Dataset
from load_finetuned import load_model
from metrics import compute_scores, save_scores
from transformers import (AutoTokenizer, BitsAndBytesConfig, MBart50TokenizerFast,
AutoModelForSeq2SeqLM, AutoModelForCausalLM, Trainer,
MBartForConditionalGeneration, TrainingArguments,
DataCollatorForSeq2Seq)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
MODELS = {
"mT5": "mT5",
"mBART50": "mBART50"
}
LANGUAGE_CODES = {
"English": "en_XX",
"French": "fr_XX"
}
EXPERIMENTS = {
"mT5": ["zero-shot"],
"mBART50": ["zero-shot", "1-shot"]
}
def summarize_text_mt5(texts, model, tokenizer):
inputs = tokenizer(texts, return_tensors="pt",
max_length=512, truncation=True,
padding=True).to(model.device)
summary_ids = model.generate(input_ids = inputs.input_ids,
max_length=60,
num_beams=4, length_penalty=2.0,
early_stopping=True)
summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
return summaries
def summarize_text_mbart50(texts, model, tokenizer):
inputs = tokenizer(texts, return_tensors="pt",
max_length=1024, truncation=True,
padding=True).to(model.device)
summary_ids = model.generate(inputs.input_ids, max_length=60,
num_beams=4, length_penalty=2.0,
early_stopping=True)
summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
return summaries
def experiments(model_name, experiment_type, num_examples, finetune_type):
"""Runs an experiment with the given model and dataset."""
print(f"Starting Experiment: on {model_name}")
# Construct dataset paths dynamically
train = pd.read_csv(os.path.join(BASE_DIR, "datasets/train.csv"))
train_fr = pd.read_csv(os.path.join(BASE_DIR, "datasets/train_fr.csv"))
train_cross = pd.read_csv(os.path.join(BASE_DIR, "datasets/train_cross.csv"))
val = pd.read_csv(os.path.join(BASE_DIR, "datasets/val.csv"))
val_fr = pd.read_csv(os.path.join(BASE_DIR, "datasets/val_fr.csv"))
val_cross = pd.read_csv(os.path.join(BASE_DIR, "datasets/val_cross.csv"))
test = pd.read_csv(os.path.join(BASE_DIR, "datasets/test.csv"))
test_fr = pd.read_csv(os.path.join(BASE_DIR, "datasets/test_fr.csv"))
test_cross = pd.read_csv(os.path.join(BASE_DIR, "datasets/test_cross.csv"))
test = test.sample(num_examples) if num_examples else test
test_fr = test_fr.sample(num_examples) if num_examples else test_fr
test_cross = test_cross.sample(num_examples) if num_examples else test_cross
model, tokenizer = load_model(model_name, finetune_type)
print(f"Model {model_name} loaded successfully.")
if model_name == "mT5":
summarize_text = summarize_text_mt5
elif model_name == "mBART50":
summarize_text = summarize_text_mbart50
if experiment_type == "zero-shot":
run_zero_shot(model_name, model, tokenizer, summarize_text, test, test_fr, test_cross)
elif experiment_type == "1-shot":
run_1_shot(model_name, model, tokenizer, summarize_text, train, train_fr, train_cross, test, test_fr, test_cross)
elif experiment_type == "2-shot":
run_2_shot(model_name, model, tokenizer, summarize_text, train, train_fr, train_cross, test, test_fr, test_cross)
else:
raise ValueError("Invalid experiment type.")
def run_zero_shot(model_name, model, tokenizer, summarize_text, test, test_fr, test_cross, batch_size=16):
print("Running Zero-Shot Evaluation...")
for dataset, name in [(test, "English"), (test_fr, "French"), (test_cross, "Cross-lingual")]:
if model_name == "mBART50":
if name == "English":
tokenizer.src_lang = "en_XX"
else:
tokenizer.src_lang = "fr_XX"
prefix = "Summarize in English: " if name == "Cross-lingual" else "Summarize the text: "
texts = [f"{prefix}{row['source']}\n Summary: " for _, row in dataset.iterrows()]
reference_summaries = dataset["target"].tolist()
# Process in batches
generated_summaries = []
for i in range(0, len(texts), batch_size):
batch_texts = texts[i:i + batch_size]
print(f"Processing batch {i//batch_size + 1}")
batch_summaries = summarize_text(batch_texts, model, tokenizer)
generated_summaries.extend(batch_summaries)
scores = compute_scores(generated_summaries, reference_summaries)
save_scores(scores, model_name, "zero-shot", name)
print(f"{name} Scores:", scores)
def run_1_shot(model_name, model, tokenizer, summarize_text, train, train_fr, train_cross, test, test_fr, test_cross, batch_size=16):
print("Running 1-Shot Evaluation...")
for dataset, train_data, name in [(test, train, "English"), (test_fr, train_fr, "French"), (test_cross, train_cross, "Cross-lingual")]:
if model_name == "mBART50":
if name == "English":
tokenizer.src_lang = "en_XX"
else:
tokenizer.src_lang = "fr_XX"
generated_summaries = []
reference_summaries = []
texts = []
for _, sample in dataset.iterrows():
one_shot = train_data.sample(1)
source = one_shot["source"].iloc[0]
target = one_shot["target"].iloc[0]
prefix = "Summarize in English: " if name == "Cross-lingual" else "Summarize the text: "
prompt = (
f"{prefix}{source}\n Summary: {target}\n\n"
f"{prefix}{sample['source']}\n Summary: "
)
texts.append(prompt)
reference_summaries.append(sample["target"])
# Process in batches
for i in range(0, len(texts), batch_size):
batch_texts = texts[i:i + batch_size]
print(f"Processing batch {i//batch_size + 1}")
batch_summaries = summarize_text(batch_texts, model, tokenizer)
generated_summaries.extend(batch_summaries)
scores = compute_scores(generated_summaries, reference_summaries)
save_scores(scores, model_name, "1-shot", name)
print(f"{name} Scores:", scores)
def run_2_shot(model_name, model, tokenizer, summarize_text, train, train_fr, train_cross, test, test_fr, test_cross, batch_size=16):
print("Running 2-Shot Evaluation...")
for dataset, train_data, name in [(test, train, "English"), (test_fr, train_fr, "French"), (test_cross, train_cross, "Cross-lingual")]:
if model_name == "mBART50":
if name == "English":
tokenizer.src_lang = "en_XX"
else:
tokenizer.src_lang = "fr_XX"
generated_summaries = []
reference_summaries = []
texts = []
for _, sample in dataset.iterrows():
two_shots = train_data.sample(2)
two_shot1, two_shot2 = two_shots.iloc[0], two_shots.iloc[1]
source1, source2 = two_shot1["source"].iloc[0] , two_shot2["source"].iloc[0]
target1, target2 = two_shot1["target"].iloc[0] , two_shot2["target"].iloc[0]
prefix = "Summarize in English: " if name == "Cross-lingual" else "Summarize the text: "
prompt = (
f"{prefix}{two_shot1['source']}\n Summary: {two_shot1['target']}\n\n"
f"{prefix}{two_shot2['source']}\n Summary: {two_shot2['target']}\n\n"
f"{prefix}{sample['source']}\n Summary: "
)
texts.append(prompt)
reference_summaries.append(sample["target"])
# Process in batches
for i in range(0, len(texts), batch_size):
batch_texts = texts[i:i + batch_size]
batch_summaries = summarize_text(batch_texts, model, tokenizer)
print(f"Processing batch {i//batch_size + 1}")
generated_summaries.extend(batch_summaries)
scores = compute_scores(generated_summaries, reference_summaries)
save_scores(scores, model_name, "2-shot", name)
print(f"{name} Scores:", scores)
def main():
parser = argparse.ArgumentParser(description="Run experiments with different models.")
parser.add_argument("--model", type=str, required=True, choices=MODELS.values(), help="The model to use.")
parser.add_argument("--experiment", type=str, required=True, choices=sum(EXPERIMENTS.values(), []), help="The experiment to run.")
parser.add_argument("--num_examples", type=int, default=None, help="Number of examples to generate summaries on (optional).")
parser.add_argument("--finetune_type", type=str, required=True, choices=["english", "multilingual", "crosslingual"], help="The type of fine-tuning to apply.")
args = parser.parse_args()
experiments(args.model, args.experiment, args.num_examples, args.finetune_type)
if __name__ == "__main__":
main() |