"""
Elo Rating Calculation Module for BigCodeArena
Contains Bradley-Terry Model with confidence intervals and traditional Elo calculation
"""
import math
import numpy as np
import pandas as pd
from collections import defaultdict
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
import yaml
import os
# Minimum number of votes required for a model to be included in rankings
MIN_VOTES_THRESHOLD = 10
def load_model_metadata():
"""Load model metadata from api_config.yaml"""
try:
config_path = os.path.join(os.path.dirname(__file__), "api_config.yaml")
with open(config_path, "r", encoding="utf-8") as file:
config = yaml.safe_load(file)
metadata = {}
for model_key, model_config in config.items():
if isinstance(model_config, dict):
model_name = model_config.get("model", model_key)
metadata[model_name] = {
"organization": model_config.get("organization", "Unknown"),
"license": model_config.get("license", "Unknown"),
}
# Also store with the key name for lookup
metadata[model_key] = {
"organization": model_config.get("organization", "Unknown"),
"license": model_config.get("license", "Unknown"),
}
return metadata
except Exception as e:
print(f"Warning: Could not load model metadata: {e}")
return {}
def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None):
"""Compute Elo ratings using Bradley-Terry Model with Maximum Likelihood Estimation"""
# Get all unique models to ensure consistent indexing
all_models = pd.Index(sorted(set(df["model_a"].unique()) | set(df["model_b"].unique())))
ptbl_a_win = pd.pivot_table(
df[df["winner"] == "model_a"],
index="model_a",
columns="model_b",
aggfunc="size",
fill_value=0,
)
# Reindex to include all models
ptbl_a_win = ptbl_a_win.reindex(index=all_models, columns=all_models, fill_value=0)
    # If there are no ties, use an all-zero tie matrix
if sum(df["winner"].isin(["tie", "tie (bothbad)"])) == 0:
ptbl_tie = pd.DataFrame(0, index=all_models, columns=all_models)
else:
ptbl_tie = pd.pivot_table(
df[df["winner"].isin(["tie", "tie (bothbad)"])],
index="model_a",
columns="model_b",
aggfunc="size",
fill_value=0,
)
ptbl_tie = ptbl_tie.reindex(index=all_models, columns=all_models, fill_value=0)
ptbl_tie = ptbl_tie + ptbl_tie.T
ptbl_b_win = pd.pivot_table(
df[df["winner"] == "model_b"],
index="model_a",
columns="model_b",
aggfunc="size",
fill_value=0,
)
ptbl_b_win = ptbl_b_win.reindex(index=all_models, columns=all_models, fill_value=0)
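    # Combined pairwise count matrix: each decisive win counts twice and each tie counts
    # once for both orderings, so a tie is weighted as half a win for both models.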
ptbl_win = ptbl_a_win * 2 + ptbl_b_win.T * 2 + ptbl_tie
models = pd.Series(np.arange(len(ptbl_win.index)), index=ptbl_win.index)
p = len(models)
X = np.zeros([p * (p - 1) * 2, p])
Y = np.zeros(p * (p - 1) * 2)
cur_row = 0
sample_weights = []
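    # Each ordered model pair contributes two logistic-regression rows: one labeled 1
    # ("m_a beats m_b") and one labeled 0 ("m_b beats m_a"), weighted by the counts in
    # ptbl_win. The fitted coefficients are model strengths on a log-BASE scale.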
for m_a in ptbl_win.index:
for m_b in ptbl_win.columns:
if m_a == m_b:
continue
            # Skip pairs with missing counts (defensive; reindexing with fill_value=0 should prevent NaNs)
if math.isnan(ptbl_win.loc[m_a, m_b]) or math.isnan(ptbl_win.loc[m_b, m_a]):
continue
X[cur_row, models[m_a]] = +math.log(BASE)
X[cur_row, models[m_b]] = -math.log(BASE)
Y[cur_row] = 1.0
sample_weights.append(ptbl_win.loc[m_a, m_b])
X[cur_row + 1, models[m_a]] = math.log(BASE)
X[cur_row + 1, models[m_b]] = -math.log(BASE)
Y[cur_row + 1] = 0.0
sample_weights.append(ptbl_win.loc[m_b, m_a])
cur_row += 2
X = X[:cur_row]
Y = Y[:cur_row]
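    # penalty=None (an unregularized fit) requires scikit-learn >= 1.2; older releases
    # spell this penalty="none".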
lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-6)
lr.fit(X, Y, sample_weight=sample_weights)
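    # Convert coefficients to the Elo scale: a SCALE-point gap corresponds to BASE:1
    # odds, anchored at INIT_RATING.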
elo_scores = SCALE * lr.coef_[0] + INIT_RATING
return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)
def get_bootstrap_result(battles, func_compute_elo, num_round):
"""Get bootstrap results for confidence interval calculation"""
rows = []
for i in tqdm(range(num_round), desc="bootstrap"):
rows.append(func_compute_elo(battles.sample(frac=1.0, replace=True)))
df = pd.DataFrame(rows)
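    # Order columns by median bootstrap score so models come out ranked strongest first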
return df[df.median().sort_values(ascending=False).index]
def compute_online_elo(battles, K=4, SCALE=400, BASE=10, INIT_RATING=1000):
"""Compute Elo ratings for models based on battle results (legacy function for compatibility)"""
rating = defaultdict(lambda: INIT_RATING)
for rd, model_a, model_b, winner in battles[
["model_a", "model_b", "winner"]
].itertuples():
ra = rating[model_a]
rb = rating[model_b]
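        # Expected scores under the Elo model (ea + eb == 1): a SCALE-point rating gap
        # corresponds to BASE:1 expected odds in favor of the higher-rated model.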
ea = 1 / (1 + BASE ** ((rb - ra) / SCALE))
eb = 1 / (1 + BASE ** ((ra - rb) / SCALE))
if winner == "model_a":
sa = 1
elif winner == "model_b":
sa = 0
elif winner == "tie" or winner == "tie (bothbad)":
sa = 0.5
else:
raise Exception(f"unexpected vote {winner}")
rating[model_a] += K * (sa - ea)
rating[model_b] += K * (1 - sa - eb)
    # Calibrate llama-13b to 800 if it exists
    if "llama-13b" in rating:
        delta = 800 - rating["llama-13b"]
        # Shift every rated model, not only those that appeared as model_a
        for model in list(rating.keys()):
            rating[model] += delta
return rating
def calculate_elo_with_confidence_intervals(battles_df, vote_counts):
"""
Main function to calculate Elo ratings with confidence intervals
Args:
battles_df (pd.DataFrame): DataFrame with columns ['model_a', 'model_b', 'winner']
        vote_counts (dict): Vote counts per model (not used in this function; threshold filtering happens in create_ranking_dataframe)
Returns:
tuple: (elo_ratings, confidence_intervals)
"""
confidence_intervals = {} # Initialize to avoid uninitialized variable error
# Check if we have sufficient data for Bradley-Terry model
# Since we only display models with >= MIN_VOTES_THRESHOLD votes, we need enough battles
if len(battles_df) < MIN_VOTES_THRESHOLD:
# Not enough battles for reliable ranking
all_models = set(
battles_df["model_a"].tolist() + battles_df["model_b"].tolist()
)
elo_ratings = pd.Series({model: 1000 for model in all_models})
confidence_intervals = {model: 0 for model in all_models}
else:
try:
# Use the new Bradley-Terry Model
elo_ratings = compute_mle_elo(battles_df)
# Calculate confidence intervals using bootstrap
            if len(battles_df) >= MIN_VOTES_THRESHOLD:  # Always true given the outer check; kept as a safeguard before bootstrapping
try:
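                    # Fix the RNG seed so the bootstrap confidence intervals are reproducible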
np.random.seed(42)
bootstrap_df = get_bootstrap_result(
battles_df, compute_mle_elo, num_round=100
)
# Calculate 95% confidence intervals
if not bootstrap_df.empty:
# Initialize CI for all models first
for model in elo_ratings.index:
confidence_intervals[model] = 0
# Update with bootstrap results
for model in bootstrap_df.columns:
scores = bootstrap_df[model].dropna()
if len(scores) > 0:
                                lower = scores.quantile(0.025)
                                upper = scores.quantile(0.975)
                                # CI margin is half the width of the 2.5th-97.5th percentile interval
                                ci_margin = (upper - lower) / 2
                                confidence_intervals[model] = ci_margin
else:
# Fallback: no confidence intervals
for model in elo_ratings.index:
confidence_intervals[model] = 0
                except Exception as bootstrap_error:
                    print(f"Warning: Bootstrap CI calculation failed: {bootstrap_error}")
                    for model in elo_ratings.index:
                        confidence_intervals[model] = 0
else:
# Not enough data for bootstrap, set CI to 0
for model in elo_ratings.index:
confidence_intervals[model] = 0
        except Exception as e:
            # Fall back to the online Elo method if Bradley-Terry fitting fails
            print(f"Warning: Bradley-Terry fitting failed, falling back to online Elo: {e}")
            old_elo_ratings = compute_online_elo(battles_df)
            elo_ratings = pd.Series(old_elo_ratings)
            confidence_intervals = {model: 0 for model in elo_ratings.index}
return elo_ratings, confidence_intervals
def create_ranking_dataframe(elo_ratings, confidence_intervals, vote_counts):
"""
Create ranking DataFrame with all necessary columns
Only includes models with at least MIN_VOTES_THRESHOLD battles
Args:
elo_ratings (pd.Series): Elo ratings for each model
confidence_intervals (dict): Confidence interval margins for each model
vote_counts (dict): Vote counts for each model
Returns:
pd.DataFrame: Ranking table with columns [Rank, Model, Score, 95% CI (±), Votes, Organization, License]
Empty DataFrame if no models have >= MIN_VOTES_THRESHOLD votes
"""
# Load model metadata
metadata = load_model_metadata()
# Create ranking list with Elo ratings and confidence intervals
# Only include models with at least MIN_VOTES_THRESHOLD battles
ranking_list = []
for model in elo_ratings.index:
# Skip models with fewer than MIN_VOTES_THRESHOLD votes
if vote_counts.get(model, 0) < MIN_VOTES_THRESHOLD:
continue
ci_margin = confidence_intervals.get(model, 0)
# Get metadata for this model
model_metadata = metadata.get(model, {})
organization = model_metadata.get("organization", "Unknown")
license_type = model_metadata.get("license", "Unknown")
ranking_list.append(
{
"Model": model,
"Score": round(elo_ratings[model], 1),
"95% CI (±)": round(ci_margin, 1) if ci_margin > 0 else "-",
"Votes": vote_counts[model],
"Organization": organization,
"License": license_type,
}
)
# Return empty DataFrame if no models meet the minimum vote threshold
if not ranking_list:
return pd.DataFrame()
# Sort by Elo rating (highest first)
ranking_df = pd.DataFrame(ranking_list).sort_values("Score", ascending=False)
ranking_df["Rank"] = range(1, len(ranking_df) + 1)
# Reorder columns
ranking_df = ranking_df[
["Rank", "Model", "Score", "95% CI (±)", "Votes", "Organization", "License"]
]
return ranking_df
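

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only): the model names and battle outcomes
# below are made up. It simply shows how the functions above chain together;
# the real app passes battles and vote counts collected from the arena UI.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    demo_battles = pd.DataFrame(
        {
            "model_a": ["model-x", "model-y", "model-z", "model-x"] * 5,
            "model_b": ["model-y", "model-z", "model-x", "model-z"] * 5,
            "winner": ["model_a", "model_a", "model_a", "tie"] * 5,
        }
    )
    demo_votes = {
        model: int(
            (demo_battles["model_a"] == model).sum()
            + (demo_battles["model_b"] == model).sum()
        )
        for model in set(demo_battles["model_a"]) | set(demo_battles["model_b"])
    }
    ratings, cis = calculate_elo_with_confidence_intervals(demo_battles, demo_votes)
    print(create_ranking_dataframe(ratings, cis, demo_votes))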