|
import numpy as np |
|
|
|
# Tolerance used when deciding whether a confidence ties with the maximum.
EPS = 1e-10

# Margin used to clip values into [NLLEPS, 1 - NLLEPS] before taking logs.
NLLEPS = 1e-6
|
|
|
def compute_maximum_metrics(predicts, n_bins=10):
    """Compute calibration metrics for each problem's most confident answer.

    For every problem, the answers whose confidence is within ``EPS`` of the
    problem's maximum confidence share one unit of weight equally.  Their
    weighted correctness is compared against the maximum confidence.

    Answers with a NaN confidence are ignored.  A problem with no remaining
    answers is skipped entirely, so it contributes to none of the metrics.

    Args:
        predicts: Sequence of problems; each problem is a sequence of
            ``(ans, prob, flag)`` triples.  ``ans`` is not used, ``prob`` is
            the confidence and ``flag`` is truthy when the answer counts as
            correct.
        n_bins: Number of equal-width confidence bins.  Bins are
            ``(lower, upper]``; the first bin also accepts exactly ``0``.
            Confidences outside ``[0, 1]`` are not binned.

    Returns:
        ``((ece, bs, nll), (acc, cnf, siz))`` where ``ece`` is the
        weight-averaged ``|acc - cnf|`` over bins, ``bs`` and ``nll`` are the
        mean Brier score and mean clipped negative log-likelihood over
        problems, and ``acc``/``cnf``/``siz`` are per-bin arrays holding mean
        correctness, mean confidence and total weight.  A scalar is NaN when
        nothing contributed to it.
    """
    acc, cnf, siz = np.zeros(n_bins), np.zeros(n_bins), np.zeros(n_bins)
    brier_score = []
    negative_ll = []

    for candidates in predicts:
        # A NaN confidence can never be the maximum, so drop it up front.
        scored = [
            (prob, flag) for _, prob, flag in candidates if not np.isnan(prob)
        ]
        if not scored:
            # Nothing to select; skipping keeps sentinels out of the averages.
            continue

        max_prob = max(prob for prob, _ in scored)
        # Count and select ties with the same tolerance so the shared weights
        # of one problem always add up to one.
        tied = [(prob, flag) for prob, flag in scored if prob >= max_prob - EPS]
        n_tied = len(tied)
        weight = 1.0 / n_tied
        vote_acc = sum(weight for _, flag in tied if flag)

        # The bin depends only on max_prob, so look it up once per problem.
        bin_idx = None
        for cur in range(n_bins):
            lower, upper = cur / n_bins, (cur + 1) / n_bins
            if lower < max_prob <= upper or (cur == 0 and max_prob == 0):
                bin_idx = cur
                break

        if bin_idx is not None:
            for prob, flag in tied:
                if flag:
                    acc[bin_idx] += weight
                cnf[bin_idx] += prob / n_tied
                siz[bin_idx] += weight

        brier_score.append((vote_acc - max_prob) ** 2)

        # Clip both terms away from 0 and 1 so the logarithms stay finite.
        clipped_prob = max(min(max_prob, 1 - NLLEPS), NLLEPS)
        clipped_acc = max(min(vote_acc, 1 - NLLEPS), NLLEPS)
        negative_ll.append(
            -np.log(clipped_prob) * clipped_acc
            - np.log(1 - clipped_prob) * (1 - clipped_acc)
        )

    # Turn the per-bin sums into means and accumulate the weighted gap.
    ece = 0.0
    for cur in range(n_bins):
        if siz[cur] > 0:
            acc[cur] = acc[cur] / siz[cur]
            cnf[cur] = cnf[cur] / siz[cur]
            ece += siz[cur] * np.abs(acc[cur] - cnf[cur])

    nan = float("nan")
    total_weight = sum(siz)
    ece = ece / total_weight if total_weight > 0 else nan
    bs = np.mean(brier_score) if brier_score else nan
    nll = np.mean(negative_ll) if negative_ll else nan

    return (ece, bs, nll), (acc, cnf, siz)
|
|
|
|
|
def compute_average_metrics(predicts, n_bins=10):
    """Compute calibration metrics averaged over every answer of each problem.

    Each problem carries one unit of weight that is split equally among its
    answers.  Every answer is binned by its own confidence and scored against
    its own correctness flag; per-problem means of the Brier score and the
    clipped negative log-likelihood are then averaged over problems.

    Answers with a NaN confidence are ignored, matching
    ``compute_maximum_metrics``.  A problem with no remaining answers is
    skipped so it cannot turn the dataset-level means into NaN.

    Args:
        predicts: Sequence of problems; each problem is a sequence of
            ``(ans, prob, flag)`` triples.  ``ans`` is not used, ``prob`` is
            the confidence and ``flag`` is truthy when the answer counts as
            correct.
        n_bins: Number of equal-width confidence bins.  Bins are
            ``(lower, upper]``; the first bin also accepts exactly ``0``.
            Confidences outside ``[0, 1]`` are not binned.

    Returns:
        ``((ece, bs, nll), (acc, cnf, siz))`` where ``ece`` is the
        weight-averaged ``|acc - cnf|`` over bins, ``bs`` and ``nll`` are the
        means over problems of the per-problem mean Brier score and negative
        log-likelihood, and ``acc``/``cnf``/``siz`` are per-bin arrays holding
        mean correctness, mean confidence and total weight.  A scalar is NaN
        when nothing contributed to it.
    """
    acc, cnf, siz = np.zeros(n_bins), np.zeros(n_bins), np.zeros(n_bins)
    brier_score = []
    negative_ll = []

    for candidates in predicts:
        # NaN confidences cannot be binned or scored; drop them up front.
        scored = [
            (prob, flag) for _, prob, flag in candidates if not np.isnan(prob)
        ]
        if not scored:
            continue
        m = len(scored)

        problem_brier_score = []
        problem_negative_ll = []
        for prob, flag in scored:
            target = 1 if flag else 0

            for cur in range(n_bins):
                lower, upper = cur / n_bins, (cur + 1) / n_bins
                if lower < prob <= upper or (cur == 0 and prob == 0):
                    if flag:
                        acc[cur] += 1.0 / m
                    cnf[cur] += prob / m
                    siz[cur] += 1.0 / m
                    break  # bins are disjoint, so at most one can match

            problem_brier_score.append((target - prob) ** 2)

            # Clip both terms away from 0 and 1 so the logarithms stay finite.
            clipped_prob = max(min(prob, 1 - NLLEPS), NLLEPS)
            clipped_target = max(min(target, 1 - NLLEPS), NLLEPS)
            problem_negative_ll.append(
                -np.log(clipped_prob) * clipped_target
                - np.log(1 - clipped_prob) * (1 - clipped_target)
            )

        brier_score.append(np.mean(problem_brier_score))
        negative_ll.append(np.mean(problem_negative_ll))

    # Turn the per-bin sums into means and accumulate the weighted gap.
    ece = 0.0
    for cur in range(n_bins):
        if siz[cur] > 0:
            acc[cur] = acc[cur] / siz[cur]
            cnf[cur] = cnf[cur] / siz[cur]
            ece += siz[cur] * np.abs(acc[cur] - cnf[cur])

    nan = float("nan")
    total_weight = sum(siz)
    ece = ece / total_weight if total_weight > 0 else nan
    bs = np.mean(brier_score) if brier_score else nan
    nll = np.mean(negative_ll) if negative_ll else nan

    return (ece, bs, nll), (acc, cnf, siz)
|
|