import numpy as np

EPS = 1e-10  # tolerance when comparing probabilities for ties at the maximum
NLLEPS = 1e-6  # clipping constant that keeps np.log() finite in the NLL

def compute_maximum_metrics(predicts, n_bins=10):
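    """Compute ECE, Brier score, and NLL based on the highest-probability answer of each problem.

    `predicts` is assumed to be a list of problems, each a list of
    (answer, probability, is_correct) tuples; ties at the maximum
    probability are weighted evenly across the tied answers.
    """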
    n = len(predicts)
    acc, cnf, siz = np.zeros(n_bins), np.zeros(n_bins), np.zeros(n_bins)
    brier_score = []
    negative_ll = []

    for idx in range(n):
        m = len(predicts[idx])

        # Find the maximum probability and the number of answers tied at it within this problem
        max_prob, max_prob_counts = -1e6, 0
        for i in range(m):
            ans, prob, flag = predicts[idx][i]
            if prob > max_prob:
                max_prob, max_prob_counts = prob, 0
            if prob >= max_prob - EPS:
                max_prob_counts += 1

        # Compute the accuracy of the maximum-probability answer and accumulate per-bin ECE statistics
        vote_acc = 0
        for i in range(m):
            ans, prob, flag = predicts[idx][i]
            if prob < max_prob:
                continue
            if np.isnan(prob):
                continue
            if flag:
                vote_acc += 1.0 / max_prob_counts
            # Compute Expected Calibration Error
            for cur in range(n_bins):
                lower, upper = cur / n_bins, (cur + 1) / n_bins
                if lower < max_prob <= upper:
                    if flag:
                        acc[cur] += 1.0 / max_prob_counts
                    cnf[cur] += prob / max_prob_counts
                    siz[cur] += 1.0 / max_prob_counts

        # Compute Brier Score
        brier_score.append((vote_acc - max_prob) ** 2)

        # Compute the negative log-likelihood, clipping to keep log() finite
        clipped_max_prob = max(min(max_prob, 1 - NLLEPS), NLLEPS)
        clipped_vote_acc = max(min(vote_acc, 1 - NLLEPS), NLLEPS)
        negative_ll.append(
            -np.log(clipped_max_prob) * clipped_vote_acc
            - np.log(1 - clipped_max_prob) * (1 - clipped_vote_acc)
        )

    # Aggregate the per-bin statistics into the final metrics
    ece = 0
    for cur in range(n_bins):
        if siz[cur] > 0:
            acc[cur] = acc[cur] / siz[cur]
            cnf[cur] = cnf[cur] / siz[cur]
        ece += siz[cur] * np.abs(acc[cur] - cnf[cur])
    ece = ece / sum(siz)
    bs = np.mean(brier_score)
    nll = np.mean(negative_ll)

    return (ece, bs, nll), (acc, cnf, siz)


def compute_average_metrics(predicts, n_bins=10):
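    """Compute ECE, Brier score, and NLL averaged over every sampled answer of each problem.

    `predicts` is assumed to have the same structure as in
    compute_maximum_metrics: a list of problems, each a list of
    (answer, probability, is_correct) tuples.
    """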
    n = len(predicts)
    acc, cnf, siz = np.zeros(n_bins), np.zeros(n_bins), np.zeros(n_bins)
    brier_score = []
    negative_ll = []

    for idx in range(n):
        m = len(predicts[idx])

        problem_brier_score = []
        problem_negative_ll = []
        for i in range(m):
            ans, prob, flag = predicts[idx][i]
            # Compute Expected Calibration Error
            for cur in range(n_bins):
                lower, upper = cur / n_bins, (cur + 1) / n_bins
                if lower < prob <= upper:
                    if flag:
                        acc[cur] += 1.0 / m
                    cnf[cur] += prob / m
                    siz[cur] += 1.0 / m

            # Compute Brier Score
            problem_brier_score.append(((1 if flag else 0) - prob) ** 2)

            # Compute the negative log-likelihood, clipping to keep log() finite
            clipped_prob = max(min(prob, 1 - NLLEPS), NLLEPS)
            clipped_label = max(min(1 if flag else 0, 1 - NLLEPS), NLLEPS)
            problem_negative_ll.append(
                -np.log(clipped_prob) * clipped_label
                - np.log(1 - clipped_prob) * (1 - clipped_label)
            )

        brier_score.append(np.mean(problem_brier_score))
        negative_ll.append(np.mean(problem_negative_ll))

    # Aggregate the per-bin statistics into the final metrics
    ece = 0
    for cur in range(n_bins):
        if siz[cur] > 0:
            acc[cur] = acc[cur] / siz[cur]
            cnf[cur] = cnf[cur] / siz[cur]
        ece += siz[cur] * np.abs(acc[cur] - cnf[cur])
    ece = ece / sum(siz)
    bs = np.mean(brier_score)
    nll = np.mean(negative_ll)

    return (ece, bs, nll), (acc, cnf, siz)
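

# A minimal usage sketch, not part of the original module: it assumes `predicts`
# is a list of problems, each holding (answer, probability, is_correct) tuples,
# which is the structure the functions above unpack. The numbers are illustrative.
if __name__ == "__main__":
    predicts = [
        # Problem 1: three sampled answers; the two tied at 0.7 are correct
        [("A", 0.7, True), ("B", 0.2, False), ("A", 0.7, True)],
        # Problem 2: the most confident answer is wrong
        [("X", 0.9, False), ("Y", 0.55, True)],
    ]

    (ece_max, bs_max, nll_max), _ = compute_maximum_metrics(predicts, n_bins=10)
    (ece_avg, bs_avg, nll_avg), _ = compute_average_metrics(predicts, n_bins=10)

    print(f"max-prob  ECE={ece_max:.4f}  Brier={bs_max:.4f}  NLL={nll_max:.4f}")
    print(f"average   ECE={ece_avg:.4f}  Brier={bs_avg:.4f}  NLL={nll_avg:.4f}")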