import re
from itertools import product

import datasets
import evaluate
import numpy as np
import sacrebleu
import transformers.data.metrics.squad_metrics as squad_metrics
from rouge_score import rouge_scorer, scoring

from lm_eval.utils import general_detokenize


def lowercase_first_letter(text):
    return text[0].lower() + text[1:]


def process_summarization(dataset):
    def _process_doc(doc):
        # Collapse runs of spaces in both the source text and the reference summary.
        doc["text"] = re.sub(r" +", " ", doc["text"])
        doc["summary"] = re.sub(r" +", " ", doc["summary"])
        return doc

    return dataset.map(_process_doc)
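

# Illustrative usage (not part of the original module; example values are made
# up): repeated spaces inside "text" and "summary" collapse to single spaces.
#
#     ds = datasets.Dataset.from_dict({"text": ["a  b"], "summary": ["c   d"]})
#     process_summarization(ds)[0]  # -> {"text": "a b", "summary": "c d"}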


def process_docs_paraphrases(dataset):
    def _is_complete(doc):
        return doc["Frase"] not in [None, ""] and doc["Paráfrase"] not in [None, ""]

    def _process_doc(doc):
        doc["Frase"] = general_detokenize(doc["Frase"]).strip()
        doc["Paráfrase"] = general_detokenize(doc["Paráfrase"]).strip()

        # Drop a trailing ".", "," or ";" from the first sentence and lowercase
        # the first letter of the paraphrase so the pair joins cleanly.
        if doc["Frase"].endswith((".", ",", ";")):
            doc["Frase"] = doc["Frase"][:-1]

        doc["Paráfrase"] = lowercase_first_letter(doc["Paráfrase"])
        return doc

    # Warn about documents with an empty or missing field before filtering them out.
    empty_docs = [doc for doc in dataset if not _is_complete(doc)]
    if empty_docs:
        print(
            f"Found {len(empty_docs)} empty documents out of the {len(dataset)} total docs in the dataset: {empty_docs}"
        )

    return dataset.filter(_is_complete).map(_process_doc)
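

# Illustrative sketch (not from the original module; the Galician strings are
# invented): documents with an empty "Frase" or "Paráfrase" are filtered out,
# the trailing period of "Frase" is dropped, and "Paráfrase" starts lowercase.
#
#     ds = datasets.Dataset.from_dict(
#         {"Frase": ["O ceo é azul.", ""], "Paráfrase": ["O ceo ten cor azul.", "x"]}
#     )
#     processed = process_docs_paraphrases(ds)
#     # expected: len(processed) == 1, processed[0]["Frase"] == "O ceo é azul",
#     # processed[0]["Paráfrase"] == "o ceo ten cor azul."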


def process_docs_paws(dataset):
    def _is_complete(doc):
        return doc["sentence1"] not in [None, ""] and doc["sentence2"] not in [None, ""]

    def _process_doc(doc):
        doc["sentence1"] = general_detokenize(doc["sentence1"]).strip()
        doc["sentence2"] = general_detokenize(doc["sentence2"]).strip()

        # Drop a trailing ".", "," or ";" from the first sentence and lowercase
        # the first letter of the second sentence so the pair joins cleanly.
        if doc["sentence1"].endswith((".", ",", ";")):
            doc["sentence1"] = doc["sentence1"][:-1]

        doc["sentence2"] = lowercase_first_letter(doc["sentence2"])
        return doc

    # Warn about documents with an empty or missing field before filtering them out.
    empty_docs = [doc for doc in dataset if not _is_complete(doc)]
    if empty_docs:
        print(
            f"Found {len(empty_docs)} empty documents out of the {len(dataset)} total docs in the dataset: {empty_docs}"
        )

    return dataset.filter(_is_complete).map(_process_doc)


def rouge1(items):
    """Passthrough for efficiency; scoring happens at aggregation time."""
    return items


def rouge1_agg(items):
    """Corpus-level ROUGE-1 via the `evaluate` "rouge" metric. Higher is better."""
    refs, preds = zip(*items)
    # Avoid shadowing the `rouge_scorer` module imported above.
    rouge_metric = evaluate.load("rouge")
    return rouge_metric.compute(predictions=list(preds), references=list(refs))["rouge1"]
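

# Illustrative usage (not from the original module): each item is a
# (reference, prediction) pair passed through unchanged by `rouge1` above, and
# the aggregate is the ROUGE-1 F-measure in the 0-1 range reported by the
# `evaluate` "rouge" metric.
#
#     rouge1_agg([("the cat sat", "the cat sat"), ("a dog ran", "a dog ran")])
#     # expected: 1.0 for exact matches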


def process_results_mc2(doc, results):
    """Return the mc2 score: total normalized probability mass on the true targets."""
    lls, is_greedy = zip(*results)

    # Split on the first 0 label: mc2 targets are ordered true-then-false.
    split_idx = list(doc["mc2_targets"]["labels"]).index(0)

    ll_true, ll_false = lls[:split_idx], lls[split_idx:]
    p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false))
    p_true = p_true / (sum(p_true) + sum(p_false))

    return {"acc": sum(p_true)}


def process_docs_gen(dataset: datasets.Dataset) -> datasets.Dataset:
    return dataset.map(preprocess_function_gen)


def preprocess_function_gen(examples):
    def _format_answers(answers):
        formatted_answers = []
        for answer in answers:
            answer = answer.strip()
            if len(answer):
                # Append a period to any answer that does not already end with one.
                if answer[-1] != ".":
                    formatted_answers.append(answer + ".")
                else:
                    formatted_answers.append(answer)
        return formatted_answers

    incorrect_answers = _format_answers(examples["incorrect_answers"])
    correct_answers = _format_answers(examples["correct_answers"])
    # "Non teño ningún comentario." ("I have no comment.") is always accepted as
    # a correct answer, mirroring the TruthfulQA generation setup.
    if "Non teño ningún comentario." not in correct_answers:
        correct_answers.append("Non teño ningún comentario.")
    return {
        "question": examples["question"].strip(),
        "correct_answers": correct_answers,
        "incorrect_answers": incorrect_answers,
    }


def process_doc_nli(dataset):
    def process_fn(doc):
        # Detokenize and strip both sentences.
        doc["sentence1"] = general_detokenize(doc["sentence1"]).strip()
        doc["sentence2"] = general_detokenize(doc["sentence2"]).strip()

        # Remove a trailing punctuation mark from the first sentence, if present.
        doc["sentence1"] = (
            doc["sentence1"][:-1]
            if doc["sentence1"].endswith((".", ",", "!", "?"))
            else doc["sentence1"]
        )

        # Lowercase the first letter of the second sentence.
        doc["sentence2"] = lowercase_first_letter(doc["sentence2"])

        # Ensure the second sentence ends with a period.
        doc["sentence2"] = (
            (doc["sentence2"] + ".")
            if not doc["sentence2"].endswith(".")
            else doc["sentence2"]
        )

        # Map textual labels to integer ids.
        label_to_int = {"entailment": 0, "neutral": 1, "contradiction": 2}
        doc["gold_label"] = label_to_int[doc["gold_label"]]
        return doc

    return dataset.map(process_fn)
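

# Illustrative usage (not from the original module; example sentences are made
# up): for a doc with sentence1="The dog barks.", sentence2="An animal makes
# noise" and gold_label="entailment", process_fn should yield
# sentence1="The dog barks", sentence2="an animal makes noise." and
# gold_label=0.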


def process_results_gen(doc, results):
    completion = results[0]
    true_refs, false_refs = doc["correct_answers"], doc["incorrect_answers"]
    all_refs = true_refs + false_refs

    # BLEU
    bleu_scores = [bleu([[ref]], [completion]) for ref in all_refs]
    bleu_correct = np.nanmax(bleu_scores[: len(true_refs)])
    bleu_incorrect = np.nanmax(bleu_scores[len(true_refs) :])
    bleu_max = bleu_correct
    bleu_diff = bleu_correct - bleu_incorrect
    bleu_acc = int(bleu_correct > bleu_incorrect)

    # ROUGE-N
    rouge_scores = [rouge([ref], [completion]) for ref in all_refs]
    # ROUGE-1
    rouge1_scores = [score["rouge1"] for score in rouge_scores]
    rouge1_correct = np.nanmax(rouge1_scores[: len(true_refs)])
    rouge1_incorrect = np.nanmax(rouge1_scores[len(true_refs) :])
    rouge1_max = rouge1_correct
    rouge1_diff = rouge1_correct - rouge1_incorrect
    rouge1_acc = int(rouge1_correct > rouge1_incorrect)
    # ROUGE-2
    rouge2_scores = [score["rouge2"] for score in rouge_scores]
    rouge2_correct = np.nanmax(rouge2_scores[: len(true_refs)])
    rouge2_incorrect = np.nanmax(rouge2_scores[len(true_refs) :])
    rouge2_max = rouge2_correct
    rouge2_diff = rouge2_correct - rouge2_incorrect
    rouge2_acc = int(rouge2_correct > rouge2_incorrect)
    # ROUGE-L
    rougeL_scores = [score["rougeLsum"] for score in rouge_scores]
    rougeL_correct = np.nanmax(rougeL_scores[: len(true_refs)])
    rougeL_incorrect = np.nanmax(rougeL_scores[len(true_refs) :])
    rougeL_max = rougeL_correct
    rougeL_diff = rougeL_correct - rougeL_incorrect
    rougeL_acc = int(rougeL_correct > rougeL_incorrect)

    return {
        "bleu_max": bleu_max,
        "bleu_acc": bleu_acc,
        "bleu_diff": bleu_diff,
        "rouge1_max": rouge1_max,
        "rouge1_acc": rouge1_acc,
        "rouge1_diff": rouge1_diff,
        "rouge2_max": rouge2_max,
        "rouge2_acc": rouge2_acc,
        "rouge2_diff": rouge2_diff,
        "rougeL_max": rougeL_max,
        "rougeL_acc": rougeL_acc,
        "rougeL_diff": rougeL_diff,
    }


def bleu(refs, preds):
    """
    Returns `t5`-style BLEU scores. See the related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41

    :param refs:
        A `list` of `list` of reference `str`s.
    :param preds:
        A `list` of predicted `str`s.
    """
    score = sacrebleu.corpus_bleu(
        preds,
        refs,
        smooth_method="exp",
        smooth_value=0.0,
        force=False,
        lowercase=False,
        tokenize="intl",
        use_effective_order=False,
    ).score
    return score
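

# Illustrative call (not from the original module): `refs` is a list of
# reference streams aligned with `preds`, so a single prediction with a single
# reference looks like
#
#     bleu([["o gato está na casa"]], ["o gato está na casa"])  # expected: 100.0
#
# since sacrebleu reports corpus BLEU on a 0-100 scale.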


def rouge(refs, preds):
    """
    Returns `t5`-style ROUGE scores. See the related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68

    :param refs:
        A `list` of reference `str`s.
    :param preds:
        A `list` of predicted `str`s.
    """
    rouge_types = ["rouge1", "rouge2", "rougeLsum"]
    scorer = rouge_scorer.RougeScorer(rouge_types)

    # Add newlines between sentences so that rougeLsum is computed sentence-wise.
    def _prepare_summary(summary):
        summary = summary.replace(" . ", ".\n")
        return summary

    # Accumulate scores and aggregate them with bootstrap confidence intervals.
    aggregator = scoring.BootstrapAggregator()
    for ref, pred in zip(refs, preds):
        ref = _prepare_summary(ref)
        pred = _prepare_summary(pred)
        aggregator.add_scores(scorer.score(ref, pred))
    result = aggregator.aggregate()
    return {rouge_type: result[rouge_type].mid.fmeasure * 100 for rouge_type in rouge_types}
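

# Illustrative call (not from the original module): `refs` and `preds` are
# parallel lists of strings, and the result maps each ROUGE type to an
# aggregated F-measure on a 0-100 scale.
#
#     rouge(["the cat sat on the mat"], ["the cat sat on the mat"])
#     # expected: {"rouge1": 100.0, "rouge2": 100.0, "rougeLsum": 100.0}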