""" Elo Rating Calculation Module for BigCodeArena Contains Bradley-Terry Model with confidence intervals and traditional Elo calculation """ import math import numpy as np import pandas as pd from collections import defaultdict from tqdm import tqdm from sklearn.linear_model import LogisticRegression import yaml import os # Minimum number of votes required for a model to be included in rankings MIN_VOTES_THRESHOLD = 10 def load_model_metadata(): """Load model metadata from api_config.yaml""" try: config_path = os.path.join(os.path.dirname(__file__), "api_config.yaml") with open(config_path, "r", encoding="utf-8") as file: config = yaml.safe_load(file) metadata = {} for model_key, model_config in config.items(): if isinstance(model_config, dict): model_name = model_config.get("model", model_key) metadata[model_name] = { "organization": model_config.get("organization", "Unknown"), "license": model_config.get("license", "Unknown"), } # Also store with the key name for lookup metadata[model_key] = { "organization": model_config.get("organization", "Unknown"), "license": model_config.get("license", "Unknown"), } return metadata except Exception as e: print(f"Warning: Could not load model metadata: {e}") return {} def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None): """Compute Elo ratings using Bradley-Terry Model with Maximum Likelihood Estimation""" # Get all unique models to ensure consistent indexing all_models = pd.Index(sorted(set(df["model_a"].unique()) | set(df["model_b"].unique()))) ptbl_a_win = pd.pivot_table( df[df["winner"] == "model_a"], index="model_a", columns="model_b", aggfunc="size", fill_value=0, ) # Reindex to include all models ptbl_a_win = ptbl_a_win.reindex(index=all_models, columns=all_models, fill_value=0) # if no tie, create a zero matrix if sum(df["winner"].isin(["tie", "tie (bothbad)"])) == 0: ptbl_tie = pd.DataFrame(0, index=all_models, columns=all_models) else: ptbl_tie = pd.pivot_table( df[df["winner"].isin(["tie", "tie (bothbad)"])], index="model_a", columns="model_b", aggfunc="size", fill_value=0, ) ptbl_tie = ptbl_tie.reindex(index=all_models, columns=all_models, fill_value=0) ptbl_tie = ptbl_tie + ptbl_tie.T ptbl_b_win = pd.pivot_table( df[df["winner"] == "model_b"], index="model_a", columns="model_b", aggfunc="size", fill_value=0, ) ptbl_b_win = ptbl_b_win.reindex(index=all_models, columns=all_models, fill_value=0) ptbl_win = ptbl_a_win * 2 + ptbl_b_win.T * 2 + ptbl_tie models = pd.Series(np.arange(len(ptbl_win.index)), index=ptbl_win.index) p = len(models) X = np.zeros([p * (p - 1) * 2, p]) Y = np.zeros(p * (p - 1) * 2) cur_row = 0 sample_weights = [] for m_a in ptbl_win.index: for m_b in ptbl_win.columns: if m_a == m_b: continue # if nan skip if math.isnan(ptbl_win.loc[m_a, m_b]) or math.isnan(ptbl_win.loc[m_b, m_a]): continue X[cur_row, models[m_a]] = +math.log(BASE) X[cur_row, models[m_b]] = -math.log(BASE) Y[cur_row] = 1.0 sample_weights.append(ptbl_win.loc[m_a, m_b]) X[cur_row + 1, models[m_a]] = math.log(BASE) X[cur_row + 1, models[m_b]] = -math.log(BASE) Y[cur_row + 1] = 0.0 sample_weights.append(ptbl_win.loc[m_b, m_a]) cur_row += 2 X = X[:cur_row] Y = Y[:cur_row] lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-6) lr.fit(X, Y, sample_weight=sample_weights) elo_scores = SCALE * lr.coef_[0] + INIT_RATING return pd.Series(elo_scores, index=models.index).sort_values(ascending=False) def get_bootstrap_result(battles, func_compute_elo, num_round): """Get bootstrap results for confidence interval calculation""" 
def get_bootstrap_result(battles, func_compute_elo, num_round):
    """Get bootstrap results for confidence interval calculation"""
    rows = []
    for i in tqdm(range(num_round), desc="bootstrap"):
        rows.append(func_compute_elo(battles.sample(frac=1.0, replace=True)))
    df = pd.DataFrame(rows)
    return df[df.median().sort_values(ascending=False).index]


def compute_online_elo(battles, K=4, SCALE=400, BASE=10, INIT_RATING=1000):
    """Compute Elo ratings for models based on battle results (legacy function for compatibility)"""
    rating = defaultdict(lambda: INIT_RATING)

    for rd, model_a, model_b, winner in battles[
        ["model_a", "model_b", "winner"]
    ].itertuples():
        ra = rating[model_a]
        rb = rating[model_b]
        ea = 1 / (1 + BASE ** ((rb - ra) / SCALE))
        eb = 1 / (1 + BASE ** ((ra - rb) / SCALE))
        if winner == "model_a":
            sa = 1
        elif winner == "model_b":
            sa = 0
        elif winner == "tie" or winner == "tie (bothbad)":
            sa = 0.5
        else:
            raise Exception(f"unexpected vote {winner}")
        rating[model_a] += K * (sa - ea)
        rating[model_b] += K * (1 - sa - eb)

    # Calibrate llama-13b to 800 if it exists
    if "llama-13b" in rating:
        delta = 800 - rating["llama-13b"]
        # Shift every rated model, not only those that appeared as model_a
        for model in list(rating.keys()):
            rating[model] += delta

    return rating


def calculate_elo_with_confidence_intervals(battles_df, vote_counts):
    """
    Main function to calculate Elo ratings with confidence intervals

    Args:
        battles_df (pd.DataFrame): DataFrame with columns ['model_a', 'model_b', 'winner']
        vote_counts (dict): Dictionary with vote counts for each model

    Returns:
        tuple: (elo_ratings, confidence_intervals)
    """
    confidence_intervals = {}  # Initialize to avoid an uninitialized variable error

    # Check if we have sufficient data for the Bradley-Terry model
    # Since we only display models with >= MIN_VOTES_THRESHOLD votes, we need enough battles
    if len(battles_df) < MIN_VOTES_THRESHOLD:
        # Not enough battles for a reliable ranking
        all_models = set(
            battles_df["model_a"].tolist() + battles_df["model_b"].tolist()
        )
        elo_ratings = pd.Series({model: 1000 for model in all_models})
        confidence_intervals = {model: 0 for model in all_models}
    else:
        try:
            # Use the Bradley-Terry model
            elo_ratings = compute_mle_elo(battles_df)

            # Calculate confidence intervals using bootstrap
            if len(battles_df) >= MIN_VOTES_THRESHOLD:
                # Only calculate CI if we have enough data
                try:
                    np.random.seed(42)
                    bootstrap_df = get_bootstrap_result(
                        battles_df, compute_mle_elo, num_round=100
                    )

                    # Calculate 95% confidence intervals
                    if not bootstrap_df.empty:
                        # Initialize CI for all models first
                        for model in elo_ratings.index:
                            confidence_intervals[model] = 0
                        # Update with bootstrap results
                        for model in bootstrap_df.columns:
                            scores = bootstrap_df[model].dropna()
                            if len(scores) > 0:
                                lower = scores.quantile(0.025)
                                upper = scores.quantile(0.975)
                                ci_margin = (upper - lower) / 2
                                confidence_intervals[model] = ci_margin
                    else:
                        # Fallback: no confidence intervals
                        for model in elo_ratings.index:
                            confidence_intervals[model] = 0
                except Exception:
                    # Bootstrap failed; report ratings without confidence intervals
                    for model in elo_ratings.index:
                        confidence_intervals[model] = 0
            else:
                # Not enough data for bootstrap, set CI to 0
                for model in elo_ratings.index:
                    confidence_intervals[model] = 0
        except Exception:
            # Fall back to the online Elo method if Bradley-Terry fails
            old_elo_ratings = compute_online_elo(battles_df)
            elo_ratings = pd.Series(old_elo_ratings)
            confidence_intervals = {model: 0 for model in elo_ratings.index}

    return elo_ratings, confidence_intervals
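
# Note on the confidence interval margin computed above: for each model the
# bootstrap yields a distribution of scores, and the reported "95% CI (±)" is
# half the width of its central 95% interval:
#   margin = (quantile(0.975) - quantile(0.025)) / 2
# Illustrative sketch (made-up numbers): if a model's bootstrap scores have a
# 2.5% quantile of 985 and a 97.5% quantile of 1015, the margin is
# (1015 - 985) / 2 = 15, displayed alongside the point estimate in the ranking.
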
def create_ranking_dataframe(elo_ratings, confidence_intervals, vote_counts):
    """
    Create ranking DataFrame with all necessary columns
    Only includes models with at least MIN_VOTES_THRESHOLD battles

    Args:
        elo_ratings (pd.Series): Elo ratings for each model
        confidence_intervals (dict): Confidence interval margins for each model
        vote_counts (dict): Vote counts for each model

    Returns:
        pd.DataFrame: Ranking table with columns
            [Rank, Model, Score, 95% CI (±), Votes, Organization, License]
            Empty DataFrame if no models have >= MIN_VOTES_THRESHOLD votes
    """
    # Load model metadata
    metadata = load_model_metadata()

    # Create ranking list with Elo ratings and confidence intervals
    # Only include models with at least MIN_VOTES_THRESHOLD battles
    ranking_list = []
    for model in elo_ratings.index:
        # Skip models with fewer than MIN_VOTES_THRESHOLD votes
        if vote_counts.get(model, 0) < MIN_VOTES_THRESHOLD:
            continue

        ci_margin = confidence_intervals.get(model, 0)

        # Get metadata for this model
        model_metadata = metadata.get(model, {})
        organization = model_metadata.get("organization", "Unknown")
        license_type = model_metadata.get("license", "Unknown")

        ranking_list.append(
            {
                "Model": model,
                "Score": round(elo_ratings[model], 1),
                "95% CI (±)": round(ci_margin, 1) if ci_margin > 0 else "-",
                "Votes": vote_counts[model],
                "Organization": organization,
                "License": license_type,
            }
        )

    # Return empty DataFrame if no models meet the minimum vote threshold
    if not ranking_list:
        return pd.DataFrame()

    # Sort by Elo rating (highest first)
    ranking_df = pd.DataFrame(ranking_list).sort_values("Score", ascending=False)
    ranking_df["Rank"] = range(1, len(ranking_df) + 1)

    # Reorder columns
    ranking_df = ranking_df[
        ["Rank", "Model", "Score", "95% CI (±)", "Votes", "Organization", "License"]
    ]

    return ranking_df
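
# --- Illustrative smoke test -------------------------------------------------
# A minimal sketch of how the pieces fit together, assuming the module is run
# directly (nothing here executes on import). The model names, battle outcomes,
# and vote counts below are entirely made up for demonstration purposes.
if __name__ == "__main__":
    # 15 synthetic battles: model-x vs model-y (5), model-x vs model-z (5),
    # model-y vs model-z (5); each model therefore appears in 10 battles,
    # which meets MIN_VOTES_THRESHOLD.
    demo_battles = pd.DataFrame(
        {
            "model_a": ["model-x"] * 10 + ["model-y"] * 5,
            "model_b": ["model-y"] * 5 + ["model-z"] * 10,
            "winner": [
                "model_a", "model_a", "model_a", "model_b", "tie",
                "model_a", "model_a", "model_a", "model_a", "model_b",
                "model_a", "model_a", "model_b", "tie (bothbad)", "model_a",
            ],
        }
    )
    demo_votes = {"model-x": 10, "model-y": 10, "model-z": 10}

    ratings, intervals = calculate_elo_with_confidence_intervals(
        demo_battles, demo_votes
    )
    print(create_ranking_dataframe(ratings, intervals, demo_votes))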