"""Evaluate Hugging Face language models on a fixed suite of benchmark tasks.

For each model, predictions and metrics are logged to Weights & Biases and
saved under ./results.
"""

import argparse
import gc
import logging
from datetime import datetime

import torch
import wandb
from tqdm import tqdm

from predictions.all_llms import llms
from src.evaluation.model_evaluator import ModelEvaluator
from src.model.hugging_face_model import HFLLMModel
from src.task.task_factory import tasks_factory
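
# Basic logging configuration so the logging.info()/logging.error() calls below
# are emitted; the root logger defaults to WARNING. Adjust level/format as needed.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)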

parser = argparse.ArgumentParser()
parser.add_argument(
    "--test",
    help="If set, run a quick test using only a small model and a few examples.",
    action="store_true",
)
parser.add_argument(
    "--max_examples",
    "-m",
    help="The maximum number of examples to evaluate; defaults to None (use all examples).",
    type=int,
    default=None,
)
parser.add_argument(
    "--token",
    "-t",
    help="Hugging Face access token used to fetch the models.",
    type=str,
    default=None,
)
parser.add_argument(
    "--models_name",
    "-mn",
    help="Model(s) to evaluate: a group name defined in predictions.all_llms or a comma-separated list of model names.",
    type=str,
    default=None,
)

parser.add_argument(
    "--batch_size",
    help="The batch size to use during the evaluation.",
    type=int,
    default=16,
)

args = parser.parse_args()
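
# Optional Hub authentication (a sketch, assuming huggingface_hub is available
# and that HFLLMModel does not already consume the token itself): log in so
# gated or private models can be fetched with the provided token.
if args.token is not None:
    from huggingface_hub import login

    login(token=args.token)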

tasks_names = [
    "piaf",
    "qfrblimp",
    "allocine",
    "qfrcola",
    "gqnli",
    "opus_parcus",
    "paws_x",
    "fquad",
    "sickfr",
    "sts22",
    "xnli",
]
tasks = tasks_factory(tasks_names)

models = []
if args.models_name is not None:
    if args.models_name in llms:
        # A predefined group name from predictions.all_llms.
        models = llms[args.models_name]
    else:
        # Otherwise, treat the value as a comma-separated list of model names.
        models = args.models_name.split(",")
else:
    models = llms["all"]

logging.info("Starting Evaluation")

time_start = datetime.now()

for model_name in tqdm(
    models, total=len(models), desc="Processing LLM inference on tasks."
):
    try:
        logging.info("Creating model")
        model = HFLLMModel(model_name=model_name, batch_size=args.batch_size)
        evaluator = ModelEvaluator()

        exp_name = model_name
        wandb.init(
            project="COLLE",
            config={"model_name": model_name, "tasks": "; ".join(tasks_names)},
            name=exp_name,
        )

        logging.info("Evaluating model")
        predictions_payload = evaluator.evaluate_subset(model, tasks, args.max_examples)
        wandb.log(predictions_payload)

        logging.info("Saving results")
        evaluator.save_results("./results")

        metrics_payload = evaluator.compute_metrics()
        evaluator.save_metrics("./results")
        wandb.log(metrics_payload)

        wandb.finish(exit_code=0)
    except Exception as e:
        error_message = f"Evaluation failed for model {model_name}: {e}"
        logging.error(error_message)
        wandb.finish(exit_code=1)
        continue
    finally:
        # Release the model and evaluator and free GPU memory before the next run.
        if "model" in locals():
            del model
        if "evaluator" in locals():
            del evaluator
        gc.collect()
        torch.cuda.empty_cache()

time_end = datetime.now()
logging.info(f"End time: {time_end}")

elapsed_time = time_end - time_start
logging.info(f"Elapsed time: {elapsed_time}")