import time

from datasets import DatasetDict
from loguru import logger
import numpy as np
import pandas as pd
import torch

import turing.config as config

# Number of repeated inference runs used to average runtime and GFLOPs.
N_RUNS = 10


def calculate_submission_score(avg_f1: float, avg_runtime: float, avg_flops: float) -> float:
    """
    Calculates the final competition score.
    The score is a weighted sum of the average F1 score and of the remaining
    runtime and GFLOPS headroom relative to the configured caps (clamped at 0).
    Weights:
    - F1 Score: 60%
    - Runtime: 20%
    - GFLOPS: 20%

    Args:
        avg_f1 (float): Average F1 score across all categories.
        avg_runtime (float): Average runtime in seconds.
        avg_flops (float): Average GFLOPS.

    Returns:
        float: Final submission score.
    """

    score_f1 = 0.6 * avg_f1

    runtime_ratio = (config.MAX_AVG_RUNTIME - avg_runtime) / config.MAX_AVG_RUNTIME
    score_runtime = 0.2 * max(runtime_ratio, 0)

    flops_ratio = (config.MAX_AVG_FLOPS - avg_flops) / config.MAX_AVG_FLOPS
    score_flops = 0.2 * max(flops_ratio, 0)

    total_score = score_f1 + score_runtime + score_flops

    logger.info(f"  F1 Score (60%): {score_f1:.4f} (avg_f1: {avg_f1:.4f})")
    logger.info(
        f"  Runtime Score (20%): {score_runtime:.4f} (avg_runtime: {avg_runtime:.4f}s / {config.MAX_AVG_RUNTIME}s)"
    )
    logger.info(
        f"  GFLOPS Score (20%): {score_flops:.4f} (avg_flops: {avg_flops:.4f} / {config.MAX_AVG_FLOPS})"
    )
    logger.info("  ====================")
    logger.info(f"  Final Score: {total_score:.4f}")

    return total_score

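# Worked example of the weighting (hypothetical numbers; the actual caps live
# in turing.config): with MAX_AVG_RUNTIME = 60 s and MAX_AVG_FLOPS = 100 GFLOPs,
# a run with avg_f1 = 0.80, avg_runtime = 30 s, and avg_flops = 25 GFLOPs scores
#   0.6 * 0.80 + 0.2 * (60 - 30) / 60 + 0.2 * (100 - 25) / 100
#   = 0.48 + 0.10 + 0.15 = 0.73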

def evaluate_models(models: dict, dataset: DatasetDict) -> tuple[pd.DataFrame, float]:
    """
    Evaluates the provided models on the test datasets for each language.
    Computes precision, recall, and F1 score for each category and language.
    Also measures average runtime and GFLOPS for model inference.

    Args:
        models (dict): A dictionary mapping language codes to their respective models.
        dataset (DatasetDict): A DatasetDict containing test datasets for each language.

    Returns:
        pd.DataFrame: DataFrame containing precision, recall, and F1 scores for each category and language.
        float: Final submission score calculated based on average F1, runtime, and GFLOPS.
    """

    total_flops = 0
    total_time = 0
    scores = []

    for lan in config.LANGS:
        logger.info(f"\n--- Evaluating Language: {lan.upper()} ---")
        model = models[lan]

        test_data = dataset[f"{lan}_test"]
        x = test_data[config.INPUT_COLUMN]
        x = list(x) if hasattr(x, "tolist") else x  # normalize array-likes (e.g. pandas Series) to a plain list
        y_true = np.array(test_data[config.LABEL_COLUMN]).T  # shape: (n_categories, n_samples)

        # Profile only the inference loop so data preparation does not
        # contribute to the FLOP count; runtime is measured over the same runs.
        with torch.profiler.profile(with_flops=True) as p:
            begin = time.time()
            for _ in range(N_RUNS):
                y_pred = model.predict(x)
                y_pred = np.asarray(y_pred).T
            total_time += time.time() - begin

        total_flops += sum(k.flops for k in p.key_averages()) / 1e9  # convert raw FLOPs to GFLOPs

        for i in range(len(y_pred)):
            assert len(y_pred[i]) == len(y_true[i])
            tp = sum(true == pred == 1 for (true, pred) in zip(y_true[i], y_pred[i]))
            fp = sum(true == 0 and pred == 1 for (true, pred) in zip(y_true[i], y_pred[i]))
            fn = sum(true == 1 and pred == 0 for (true, pred) in zip(y_true[i], y_pred[i]))
            # True negatives are not needed for precision, recall, or F1.
            # Guard the denominators: a category that is never predicted or
            # never occurs in the test split would otherwise divide by zero.
            precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
            recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
            f1 = (2 * tp) / (2 * tp + fp + fn) if (2 * tp + fp + fn) > 0 else 0.0
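            # Example with hypothetical values: y_true[i] = [1, 0, 1, 1] and
            # y_pred[i] = [1, 1, 0, 1] give tp = 2, fp = 1, fn = 1, hence
            # precision = 2/3, recall = 2/3, and f1 = 4/6 ≈ 0.667.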
            scores.append({
                "lan": lan,
                "cat": config.LABELS_MAP[lan][i],
                "precision": precision,
                "recall": recall,
                "f1": f1,
            })

    logger.info(f"Compute in GFLOPs: {total_flops / 10}")
    logger.info(f"Avg runtime in seconds: {total_time / 10}")
    scores = pd.DataFrame(scores)
    print(scores)

    avg_f1 = scores["f1"].mean()
    avg_runtime = total_time / 10
    avg_flops = total_flops / 10

    final_score = calculate_submission_score(avg_f1, avg_runtime, avg_flops)

    logger.info(f"Final Score for {lan.upper()}: {final_score:.4f}")

    return scores, final_score
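

# Minimal usage sketch (the loaders below are hypothetical; this module only
# defines the evaluation itself):
#
#     models = {lan: load_model(lan) for lan in config.LANGS}   # hypothetical helper
#     dataset = DatasetDict.load_from_disk("path/to/splits")    # needs "<lan>_test" keys
#     scores_df, final_score = evaluate_models(models, dataset)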