import re
from itertools import product

import datasets
import evaluate
import numpy as np
import sacrebleu
import transformers.data.metrics.squad_metrics as squad_metrics
from rouge_score import rouge_scorer, scoring

from lm_eval.utils import general_detokenize


def lowercase_first_letter(text):
    return text[0].lower() + text[1:]


def process_summarization(dataset):
    def _process_doc(doc):
        # Remove double spaces
        doc["text"] = re.sub(r" +", " ", doc["text"])
        doc["summary"] = re.sub(r" +", " ", doc["summary"])
        return doc

    return dataset.map(_process_doc)


def process_docs_paraphrases(dataset):
    def _process_doc(doc):
        doc["Frase"] = general_detokenize(doc["Frase"]).strip()
        doc["Paráfrase"] = general_detokenize(doc["Paráfrase"]).strip()
        # Remove the final punctuation mark from the first sentence
        if doc["Frase"].endswith((".", ",", ";")):
            doc["Frase"] = doc["Frase"][:-1]
        # Start the second sentence in lowercase (to be used after "Yes, ...")
        doc["Paráfrase"] = lowercase_first_letter(doc["Paráfrase"])
        return doc

    # Collect empty documents up front so they are reported before filtering.
    empty_docs = [
        doc
        for doc in dataset
        if doc["Frase"] in [None, ""] or doc["Paráfrase"] in [None, ""]
    ]
    if empty_docs:
        print(
            f"Found {len(empty_docs)} empty documents out of the {len(dataset)} total docs in the dataset: {empty_docs}"
        )
    return dataset.filter(
        lambda doc: doc["Frase"] not in [None, ""]
        and doc["Paráfrase"] not in [None, ""]
    ).map(_process_doc)


def process_docs_paws(dataset):
    def _process_doc(doc):
        doc["sentence1"] = general_detokenize(doc["sentence1"]).strip()
        doc["sentence2"] = general_detokenize(doc["sentence2"]).strip()
        # Remove the final punctuation mark from the first sentence
        if doc["sentence1"].endswith((".", ",", ";")):
            doc["sentence1"] = doc["sentence1"][:-1]
        # Start the second sentence in lowercase (to be used after "Yes, ...")
        doc["sentence2"] = lowercase_first_letter(doc["sentence2"])
        return doc

    # Collect empty documents up front so they are reported before filtering.
    empty_docs = [
        doc
        for doc in dataset
        if doc["sentence1"] in [None, ""] or doc["sentence2"] in [None, ""]
    ]
    if empty_docs:
        print(
            f"Found {len(empty_docs)} empty documents out of the {len(dataset)} total docs in the dataset: {empty_docs}"
        )
    return dataset.filter(
        lambda doc: doc["sentence1"] not in [None, ""]
        and doc["sentence2"] not in [None, ""]
    ).map(_process_doc)


def rouge1(items):
    """Passthrough for efficiency."""
    return items


def rouge1_agg(items):
    """Higher is better."""
    refs = list(zip(*items))[0]
    preds = list(zip(*items))[1]
    # Avoid shadowing the `rouge_scorer` module imported above.
    rouge_metric = evaluate.load("rouge")
    return rouge_metric.compute(predictions=preds, references=refs)["rouge1"]


def process_results_mc2(doc, results):
    lls, is_greedy = zip(*results)

    # Split on the first `0` as everything before it is true (`1`).
    split_idx = list(doc["mc2_targets"]["labels"]).index(0)

    # Compute the normalized probability mass for the correct answer.
    ll_true, ll_false = lls[:split_idx], lls[split_idx:]
    p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false))
    p_true = p_true / (sum(p_true) + sum(p_false))

    return {"acc": sum(p_true)}
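

# Worked example for process_results_mc2 (toy numbers invented for illustration,
# not taken from any dataset): with doc["mc2_targets"]["labels"] == [1, 1, 0]
# and log-likelihoods lls == (-1.0, -2.0, -3.0), split_idx is 2, so
# p_true = exp([-1.0, -2.0]) and p_false = exp([-3.0]). The returned accuracy is
# the normalized mass on the true answers:
# (e**-1 + e**-2) / (e**-1 + e**-2 + e**-3) ≈ 0.910.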
if answer[-1] != ".": formatted_answers.append(answer + ".") else: formatted_answers.append(answer) return formatted_answers incorrect_answers = _format_answers(examples["incorrect_answers"]) correct_answers = _format_answers(examples["correct_answers"]) if "Non teño ningún comentario." not in correct_answers: correct_answers.append("Non teño ningún comentario.") return { "question": examples["question"].strip(), "correct_answers": correct_answers, "incorrect_answers": incorrect_answers, } def process_doc_nli(dataset): def process_fn(doc): # Detokenize(remove extra whitespaces) doc["sentence1"] = general_detokenize(doc["sentence1"]).strip() doc["sentence2"] = general_detokenize(doc["sentence2"]).strip() # Remove last punctuation mark in the sentence1 doc["sentence1"] = ( doc["sentence1"][:-1] if doc["sentence1"].endswith((".", ",", "!", "?")) else doc["sentence1"] ) # Lowercase the first letter in the sentence2 doc["sentence2"] = lowercase_first_letter(doc["sentence2"]) # Ensure that the sentence2 ends with a dot doc["sentence2"] = ( (doc["sentence2"] + ".") if not doc["sentence2"].endswith(".") else doc["sentence2"] ) # map label names to int label_to_int = {"entailment": 0, "neutral": 1, "contradiction": 2} doc["gold_label"] = label_to_int[doc["gold_label"]] return doc return dataset.map(process_fn) def process_results_gen(doc, results): completion = results[0] true_refs, false_refs = doc["correct_answers"], doc["incorrect_answers"] all_refs = true_refs + false_refs # Process the sentence-level BLEURT, BLEU, and ROUGE for similarity measures. # # BLEURT # bleurt_scores_true = self.bleurt.compute( # predictions=[completion] * len(true_refs), references=true_refs # )["scores"] # bleurt_scores_false = self.bleurt.compute( # predictions=[completion] * len(false_refs), references=false_refs # )["scores"] # bleurt_correct = max(bleurt_scores_true) # bleurt_incorrect = max(bleurt_scores_false) # bleurt_max = bleurt_correct # bleurt_diff = bleurt_correct - bleurt_incorrect # bleurt_acc = int(bleurt_correct > bleurt_incorrect) # BLEU bleu_scores = [bleu([[ref]], [completion]) for ref in all_refs] bleu_correct = np.nanmax(bleu_scores[: len(true_refs)]) bleu_incorrect = np.nanmax(bleu_scores[len(true_refs) :]) bleu_max = bleu_correct bleu_diff = bleu_correct - bleu_incorrect bleu_acc = int(bleu_correct > bleu_incorrect) # ROUGE-N rouge_scores = [rouge([ref], [completion]) for ref in all_refs] # ROUGE-1 rouge1_scores = [score["rouge1"] for score in rouge_scores] rouge1_correct = np.nanmax(rouge1_scores[: len(true_refs)]) rouge1_incorrect = np.nanmax(rouge1_scores[len(true_refs) :]) rouge1_max = rouge1_correct rouge1_diff = rouge1_correct - rouge1_incorrect rouge1_acc = int(rouge1_correct > rouge1_incorrect) # ROUGE-2 rouge2_scores = [score["rouge2"] for score in rouge_scores] rouge2_correct = np.nanmax(rouge2_scores[: len(true_refs)]) rouge2_incorrect = np.nanmax(rouge2_scores[len(true_refs) :]) rouge2_max = rouge2_correct rouge2_diff = rouge2_correct - rouge2_incorrect rouge2_acc = int(rouge2_correct > rouge2_incorrect) # ROUGE-L rougeL_scores = [score["rougeLsum"] for score in rouge_scores] rougeL_correct = np.nanmax(rougeL_scores[: len(true_refs)]) rougeL_incorrect = np.nanmax(rougeL_scores[len(true_refs) :]) rougeL_max = rougeL_correct rougeL_diff = rougeL_correct - rougeL_incorrect rougeL_acc = int(rougeL_correct > rougeL_incorrect) return { # "bleurt_max": bleurt_max, # "bleurt_acc": bleurt_acc, # "bleurt_diff": bleurt_diff, "bleu_max": bleu_max, "bleu_acc": bleu_acc, "bleu_diff": bleu_diff, 
"rouge1_max": rouge1_max, "rouge1_acc": rouge1_acc, "rouge1_diff": rouge1_diff, "rouge2_max": rouge2_max, "rouge2_acc": rouge2_acc, "rouge2_diff": rouge2_diff, "rougeL_max": rougeL_max, "rougeL_acc": rougeL_acc, "rougeL_diff": rougeL_diff, } def bleu(refs, preds): """ Returns `t5` style BLEU scores. See the related implementation: https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41 :param refs: A `list` of `list` of reference `str`s. :param preds: A `list` of predicted `str`s. """ score = sacrebleu.corpus_bleu( preds, refs, smooth_method="exp", smooth_value=0.0, force=False, lowercase=False, tokenize="intl", use_effective_order=False, ).score return score def rouge(refs, preds): """ Returns `t5` style ROUGE scores. See the related implementation: https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68 :param refs: A `list` of reference `strs`. :param preds: A `list` of predicted `strs`. """ rouge_types = ["rouge1", "rouge2", "rougeLsum"] scorer = rouge_scorer.RougeScorer(rouge_types) # Add newlines between sentences to correctly compute `rougeLsum`. def _prepare_summary(summary): summary = summary.replace(" . ", ".\n") return summary # Accumulate confidence intervals. aggregator = scoring.BootstrapAggregator() for ref, pred in zip(refs, preds): ref = _prepare_summary(ref) pred = _prepare_summary(pred) aggregator.add_scores(scorer.score(ref, pred)) result = aggregator.aggregate() return {type: result[type].mid.fmeasure * 100 for type in rouge_types}