""" | |
Elo Rating Calculation Module for BigCodeArena | |
Contains Bradley-Terry Model with confidence intervals and traditional Elo calculation | |
""" | |
import math | |
import numpy as np | |
import pandas as pd | |
from collections import defaultdict | |
from tqdm import tqdm | |
from sklearn.linear_model import LogisticRegression | |
import yaml | |
import os | |
# Minimum number of votes required for a model to be included in rankings | |
MIN_VOTES_THRESHOLD = 10 | |


def load_model_metadata():
    """Load model metadata from api_config.yaml"""
    try:
        config_path = os.path.join(os.path.dirname(__file__), "api_config.yaml")
        with open(config_path, "r", encoding="utf-8") as file:
            config = yaml.safe_load(file)

        metadata = {}
        for model_key, model_config in config.items():
            if isinstance(model_config, dict):
                model_name = model_config.get("model", model_key)
                metadata[model_name] = {
                    "organization": model_config.get("organization", "Unknown"),
                    "license": model_config.get("license", "Unknown"),
                }
                # Also store under the config key name for lookup
                metadata[model_key] = {
                    "organization": model_config.get("organization", "Unknown"),
                    "license": model_config.get("license", "Unknown"),
                }
        return metadata
    except Exception as e:
        print(f"Warning: Could not load model metadata: {e}")
        return {}
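
# Illustrative sketch of the api_config.yaml shape that load_model_metadata() expects;
# the concrete key and values below are assumptions, and only "model", "organization",
# and "license" are actually read above:
#
#   gpt-4o-example:
#     model: gpt-4o
#     organization: OpenAI
#     license: Proprietary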


def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None):
    """Compute Elo ratings using the Bradley-Terry model with maximum likelihood estimation"""
    # Get all unique models to ensure consistent indexing
    all_models = pd.Index(sorted(set(df["model_a"].unique()) | set(df["model_b"].unique())))

    ptbl_a_win = pd.pivot_table(
        df[df["winner"] == "model_a"],
        index="model_a",
        columns="model_b",
        aggfunc="size",
        fill_value=0,
    )
    # Reindex to include all models
    ptbl_a_win = ptbl_a_win.reindex(index=all_models, columns=all_models, fill_value=0)

    # If there are no ties, use a zero matrix
    if sum(df["winner"].isin(["tie", "tie (bothbad)"])) == 0:
        ptbl_tie = pd.DataFrame(0, index=all_models, columns=all_models)
    else:
        ptbl_tie = pd.pivot_table(
            df[df["winner"].isin(["tie", "tie (bothbad)"])],
            index="model_a",
            columns="model_b",
            aggfunc="size",
            fill_value=0,
        )
        ptbl_tie = ptbl_tie.reindex(index=all_models, columns=all_models, fill_value=0)
        ptbl_tie = ptbl_tie + ptbl_tie.T

    ptbl_b_win = pd.pivot_table(
        df[df["winner"] == "model_b"],
        index="model_a",
        columns="model_b",
        aggfunc="size",
        fill_value=0,
    )
    ptbl_b_win = ptbl_b_win.reindex(index=all_models, columns=all_models, fill_value=0)

    # Wins count double relative to ties (a tie contributes one count to each direction)
    ptbl_win = ptbl_a_win * 2 + ptbl_b_win.T * 2 + ptbl_tie

    models = pd.Series(np.arange(len(ptbl_win.index)), index=ptbl_win.index)
    p = len(models)
    X = np.zeros([p * (p - 1) * 2, p])
    Y = np.zeros(p * (p - 1) * 2)

    cur_row = 0
    sample_weights = []
    for m_a in ptbl_win.index:
        for m_b in ptbl_win.columns:
            if m_a == m_b:
                continue
            # Skip pairs with missing counts
            if math.isnan(ptbl_win.loc[m_a, m_b]) or math.isnan(ptbl_win.loc[m_b, m_a]):
                continue
            X[cur_row, models[m_a]] = +math.log(BASE)
            X[cur_row, models[m_b]] = -math.log(BASE)
            Y[cur_row] = 1.0
            sample_weights.append(ptbl_win.loc[m_a, m_b])

            X[cur_row + 1, models[m_a]] = math.log(BASE)
            X[cur_row + 1, models[m_b]] = -math.log(BASE)
            Y[cur_row + 1] = 0.0
            sample_weights.append(ptbl_win.loc[m_b, m_a])
            cur_row += 2
    X = X[:cur_row]
    Y = Y[:cur_row]

    lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-6)
    lr.fit(X, Y, sample_weight=sample_weights)

    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
    return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)
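
# Minimal usage sketch for compute_mle_elo (model names here are illustrative):
#
#   battles = pd.DataFrame(
#       {
#           "model_a": ["model_x", "model_x", "model_y"],
#           "model_b": ["model_y", "model_y", "model_x"],
#           "winner": ["model_a", "tie", "model_b"],
#       }
#   )
#   ratings = compute_mle_elo(battles)  # pd.Series of ratings, highest first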


def get_bootstrap_result(battles, func_compute_elo, num_round):
    """Get bootstrap results for confidence interval calculation"""
    rows = []
    for _ in tqdm(range(num_round), desc="bootstrap"):
        rows.append(func_compute_elo(battles.sample(frac=1.0, replace=True)))
    df = pd.DataFrame(rows)
    return df[df.median().sort_values(ascending=False).index]
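
# Each bootstrap row is one rating run on a resampled battle set, so per-model column
# quantiles give the confidence bounds used below, e.g. (illustrative model name):
#
#   boot = get_bootstrap_result(battles, compute_mle_elo, num_round=100)
#   lower, upper = boot["model_x"].quantile(0.025), boot["model_x"].quantile(0.975)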


def compute_online_elo(battles, K=4, SCALE=400, BASE=10, INIT_RATING=1000):
    """Compute Elo ratings for models based on battle results (legacy function for compatibility)"""
    rating = defaultdict(lambda: INIT_RATING)

    for _, model_a, model_b, winner in battles[
        ["model_a", "model_b", "winner"]
    ].itertuples():
        ra = rating[model_a]
        rb = rating[model_b]
        ea = 1 / (1 + BASE ** ((rb - ra) / SCALE))
        eb = 1 / (1 + BASE ** ((ra - rb) / SCALE))
        if winner == "model_a":
            sa = 1
        elif winner == "model_b":
            sa = 0
        elif winner == "tie" or winner == "tie (bothbad)":
            sa = 0.5
        else:
            raise Exception(f"unexpected vote {winner}")
        rating[model_a] += K * (sa - ea)
        rating[model_b] += K * (1 - sa - eb)

    # Calibrate llama-13b to 800 if it exists, shifting every rated model by the same delta
    if "llama-13b" in rating:
        delta = 800 - rating["llama-13b"]
        for model in list(rating.keys()):
            rating[model] += delta

    return rating
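
# Worked example of a single online Elo update with the defaults above
# (K=4, SCALE=400, BASE=10, INIT_RATING=1000): for two unrated models,
# ra = rb = 1000, so ea = eb = 1 / (1 + 10 ** 0) = 0.5; if model_a wins (sa = 1),
# rating[model_a] gains K * (1 - 0.5) = 2 points and rating[model_b] loses 2.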


def calculate_elo_with_confidence_intervals(battles_df, vote_counts):
    """
    Main function to calculate Elo ratings with confidence intervals

    Args:
        battles_df (pd.DataFrame): DataFrame with columns ['model_a', 'model_b', 'winner']
        vote_counts (dict): Dictionary with vote counts for each model
            (not used in the calculation itself; kept for API compatibility)

    Returns:
        tuple: (elo_ratings, confidence_intervals)
    """
    confidence_intervals = {}  # Initialize to avoid an uninitialized-variable error

    # Check whether we have sufficient data for the Bradley-Terry model.
    # Since we only display models with >= MIN_VOTES_THRESHOLD votes, we need enough battles.
    if len(battles_df) < MIN_VOTES_THRESHOLD:
        # Not enough battles for a reliable ranking
        all_models = set(
            battles_df["model_a"].tolist() + battles_df["model_b"].tolist()
        )
        elo_ratings = pd.Series({model: 1000 for model in all_models})
        confidence_intervals = {model: 0 for model in all_models}
    else:
        try:
            # Use the Bradley-Terry model
            elo_ratings = compute_mle_elo(battles_df)

            # Calculate confidence intervals using the bootstrap
            try:
                np.random.seed(42)
                bootstrap_df = get_bootstrap_result(
                    battles_df, compute_mle_elo, num_round=100
                )

                # Calculate 95% confidence intervals
                if not bootstrap_df.empty:
                    # Initialize CI for all models first
                    for model in elo_ratings.index:
                        confidence_intervals[model] = 0
                    # Update with bootstrap results
                    for model in bootstrap_df.columns:
                        scores = bootstrap_df[model].dropna()
                        if len(scores) > 0:
                            lower = scores.quantile(0.025)
                            upper = scores.quantile(0.975)
                            ci_margin = (upper - lower) / 2
                            confidence_intervals[model] = ci_margin
                else:
                    # Fallback: no confidence intervals
                    for model in elo_ratings.index:
                        confidence_intervals[model] = 0
            except Exception:
                # Bootstrap failed: report zero-width intervals
                for model in elo_ratings.index:
                    confidence_intervals[model] = 0
        except Exception:
            # Fall back to the online Elo method if Bradley-Terry fails
            old_elo_ratings = compute_online_elo(battles_df)
            elo_ratings = pd.Series(old_elo_ratings)
            confidence_intervals = {model: 0 for model in elo_ratings.index}

    return elo_ratings, confidence_intervals
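
# Hedged sketch: vote_counts is assumed to map each model name to the number of battles
# it appears in; one way to derive it from the same battles DataFrame would be:
#
#   vote_counts = (
#       battles_df["model_a"].value_counts()
#       .add(battles_df["model_b"].value_counts(), fill_value=0)
#       .astype(int)
#       .to_dict()
#   )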


def create_ranking_dataframe(elo_ratings, confidence_intervals, vote_counts):
    """
    Create ranking DataFrame with all necessary columns
    Only includes models with at least MIN_VOTES_THRESHOLD battles

    Args:
        elo_ratings (pd.Series): Elo ratings for each model
        confidence_intervals (dict): Confidence interval margins for each model
        vote_counts (dict): Vote counts for each model

    Returns:
        pd.DataFrame: Ranking table with columns [Rank, Model, Score, 95% CI (±), Votes, Organization, License]
            Empty DataFrame if no models have >= MIN_VOTES_THRESHOLD votes
    """
    # Load model metadata
    metadata = load_model_metadata()

    # Create ranking list with Elo ratings and confidence intervals
    # Only include models with at least MIN_VOTES_THRESHOLD battles
    ranking_list = []
    for model in elo_ratings.index:
        # Skip models with fewer than MIN_VOTES_THRESHOLD votes
        if vote_counts.get(model, 0) < MIN_VOTES_THRESHOLD:
            continue

        ci_margin = confidence_intervals.get(model, 0)

        # Get metadata for this model
        model_metadata = metadata.get(model, {})
        organization = model_metadata.get("organization", "Unknown")
        license_type = model_metadata.get("license", "Unknown")

        ranking_list.append(
            {
                "Model": model,
                "Score": round(elo_ratings[model], 1),
                "95% CI (±)": round(ci_margin, 1) if ci_margin > 0 else "-",
                "Votes": vote_counts[model],
                "Organization": organization,
                "License": license_type,
            }
        )

    # Return empty DataFrame if no models meet the minimum vote threshold
    if not ranking_list:
        return pd.DataFrame()

    # Sort by Elo rating (highest first)
    ranking_df = pd.DataFrame(ranking_list).sort_values("Score", ascending=False)
    ranking_df["Rank"] = range(1, len(ranking_df) + 1)

    # Reorder columns
    ranking_df = ranking_df[
        ["Rank", "Model", "Score", "95% CI (±)", "Votes", "Organization", "License"]
    ]
    return ranking_df
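

# Hedged end-to-end sketch with synthetic battles and illustrative model names; running
# this module directly prints the ranking table that the functions above would produce.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    demo_models = ["model_x", "model_y", "model_z"]
    records = []
    for _ in range(60):
        # Draw a random pair of distinct models and a random outcome
        a, b = rng.choice(demo_models, size=2, replace=False)
        records.append(
            {
                "model_a": a,
                "model_b": b,
                "winner": rng.choice(["model_a", "model_b", "tie"]),
            }
        )
    demo_battles = pd.DataFrame(records)
    # Count how often each model appears on either side of a battle
    demo_votes = (
        demo_battles["model_a"].value_counts()
        .add(demo_battles["model_b"].value_counts(), fill_value=0)
        .astype(int)
        .to_dict()
    )
    ratings, intervals = calculate_elo_with_confidence_intervals(demo_battles, demo_votes)
    print(create_ranking_dataframe(ratings, intervals, demo_votes))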