# turing-space/turing/evaluate_model.py
import time
from datasets import DatasetDict
from loguru import logger
import numpy as np
import pandas as pd
import torch
import turing.config as config
def calculate_submission_score(avg_f1: float, avg_runtime: float, avg_flops: float) -> float:
"""
Calculates the final competition score.
The score is a weighted sum of F1 score, runtime, and GFLOPS.
Weights:
- F1 Score: 60%
- Runtime: 20%
- GFLOPS: 20%
Args:
avg_f1 (float): Average F1 score across all categories.
avg_runtime (float): Average runtime in seconds.
avg_flops (float): Average GFLOPS.
Returns:
float: Final submission score.
"""
score_f1 = 0.6 * avg_f1
runtime_ratio = (config.MAX_AVG_RUNTIME - avg_runtime) / config.MAX_AVG_RUNTIME
score_runtime = 0.2 * max(runtime_ratio, 0)
flops_ratio = (config.MAX_AVG_FLOPS - avg_flops) / config.MAX_AVG_FLOPS
score_flops = 0.2 * max(flops_ratio, 0)
total_score = score_f1 + score_runtime + score_flops
logger.info(f" F1 Score (60%): {score_f1:.4f} (avg_f1: {avg_f1:.4f})")
logger.info(
f" Runtime Score (20%): {score_runtime:.4f} (avg_runtime: {avg_runtime:.4f}s / {config.MAX_AVG_RUNTIME}s)"
)
logger.info(
f" GFLOPS Score (20%): {score_flops:.4f} (avg_flops: {avg_flops:.4f} / {config.MAX_AVG_FLOPS})"
)
logger.info(" ====================")
logger.info(f" Final Score: {total_score:.4f}")
return total_score
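# A worked example of the formula above (a sketch only; the 60 s and 100 GFLOPS
# limits are hypothetical stand-ins for config.MAX_AVG_RUNTIME / config.MAX_AVG_FLOPS,
# not the real competition values). With avg_f1=0.80, avg_runtime=15 s, avg_flops=25:
#   F1 term:      0.6 * 0.80                     = 0.48
#   runtime term: 0.2 * max((60 - 15) / 60, 0)   = 0.15
#   GFLOPS term:  0.2 * max((100 - 25) / 100, 0) = 0.15
#   total score:  0.48 + 0.15 + 0.15             = 0.78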
def evaluate_models(models: dict, dataset: DatasetDict):
"""
Evaluates the provided models on the test datasets for each language.
Computes precision, recall, and F1 score for each category and language.
Also measures average runtime and GFLOPS for model inference.
Args:
models (dict): A dictionary mapping language codes to their respective models.
dataset (DatasetDict): A DatasetDict containing test datasets for each language.
Returns:
pd.DataFrame: DataFrame containing precision, recall, and F1 scores for each category and language.
float: Final submission score calculated from average F1, runtime, and GFLOPS.
"""
total_flops = 0
total_time = 0
scores = []
for lan in config.LANGS:
logger.info(f"\n--- Evaluating Language: {lan.upper()} ---")
model = models[lan]
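# Profile the whole inference pass with FLOP counting enabled; the per-operator
# FLOP counts are summed from p.key_averages() below.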
with torch.profiler.profile(with_flops=True) as p:
test_data = dataset[f"{lan}_test"]
x = test_data[config.INPUT_COLUMN]
x = list(x) if hasattr(x, 'tolist') else x  # materialize array-like columns (e.g. a pandas Series) as a plain list
y_true = np.array(test_data[config.LABEL_COLUMN]).T
begin = time.time()
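# Run prediction 10 times so runtime (and the profiled FLOPs) can be averaged
# over repeated runs when the final metrics are computed below.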
for i in range(10):
y_pred = model.predict(x)
y_pred = np.asarray(y_pred).T
total = time.time() - begin
total_time = total_time + total
total_flops = total_flops + (sum(k.flops for k in p.key_averages()) / 1e9)
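# Per-category metrics: after the transposes above, row i of y_true / y_pred
# holds the labels / predictions for category i of this language.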
for i in range(len(y_pred)):
assert len(y_pred[i]) == len(y_true[i])
tp = sum([true == pred == 1 for (true, pred) in zip(y_true[i], y_pred[i])])
#tn = sum([true == pred == 0 for (true, pred) in zip(y_true[i], y_pred[i])])
fp = sum([true == 0 and pred == 1 for (true, pred) in zip(y_true[i], y_pred[i])])
fn = sum([true == 1 and pred == 0 for (true, pred) in zip(y_true[i], y_pred[i])])
# Guard against zero denominators (e.g. a category the model never predicts).
precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
f1 = (2 * tp) / (2 * tp + fp + fn) if (2 * tp + fp + fn) > 0 else 0.0
scores.append({
"lan": lan,
"cat": config.LABELS_MAP[lan][i],
"precision": precision,
"recall": recall,
"f1": f1,
})
logger.info(f"Compute in GFLOPs: {total_flops / 10}")
logger.info(f"Avg runtime in seconds: {total_time / 10}")
scores = pd.DataFrame(scores)
print(scores)
avg_f1 = scores["f1"].mean()
avg_runtime = total_time / 10
avg_flops = total_flops / 10
final_score = calculate_submission_score(avg_f1, avg_runtime, avg_flops)
logger.info(f"Final Score for {lan.upper()}: {final_score:.4f}")
return scores, final_score
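if __name__ == "__main__":
    # A minimal, self-contained sanity check of the scoring formula with made-up
    # numbers (not real evaluation results): an F1 of 0.8 with runtime and FLOPs
    # at a quarter of the configured maxima should score 0.6*0.8 + 2*0.2*0.75 = 0.78.
    # Running evaluate_models() itself needs real per-language models and a
    # DatasetDict with "<lang>_test" splits, which are not constructed here.
    demo_score = calculate_submission_score(
        avg_f1=0.80,
        avg_runtime=0.25 * config.MAX_AVG_RUNTIME,
        avg_flops=0.25 * config.MAX_AVG_FLOPS,
    )
    assert abs(demo_score - 0.78) < 1e-6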