import pandas as pd
from datasets import get_dataset_config_names, load_dataset
from datasets.exceptions import DatasetNotFoundError
from tqdm.auto import tqdm

from src.display.utils import AutoEvalColumn
from src.envs import TOKEN
from src.logger import get_logger

logger = get_logger(__name__)


def get_leaderboard_df(results_dataset_name: str) -> pd.DataFrame:
    """
    @brief Creates a dataframe from all the individual experiment results.

    @param results_dataset_name Name of the Hugging Face dataset that stores
           one config per submission.
    @return Leaderboard dataframe with one row per (system, organization) pair,
            sorted by overall success rate.
    """
    empty_df = pd.DataFrame(
        columns=[
            AutoEvalColumn.system.name,
            AutoEvalColumn.organization.name,
            AutoEvalColumn.success_rate_overall.name,
            AutoEvalColumn.success_rate_tier1.name,
            AutoEvalColumn.success_rate_tier2.name,
            AutoEvalColumn.submitted_on.name,
        ]
    )
    try:
        configs = get_dataset_config_names(
            results_dataset_name,
            token=TOKEN,
        )
    except (DatasetNotFoundError, FileNotFoundError):
        # Return an empty DataFrame with the expected columns
        logger.warning("Failed to load configuration", exc_info=True)
        return empty_df

    if configs == ["default"]:
        logger.info("Dataset has only the default config; treating as empty")
        return empty_df

    rows = []
    for submission_id in tqdm(
        configs,
        total=len(configs),
        desc="Processing Submission Results",
    ):
        submission_ds = load_dataset(
            results_dataset_name,
            submission_id,
            split="train",
            token=TOKEN,
        )
        submission_df = pd.DataFrame(submission_ds)

        # Skip submissions with no rows, a missing did_pass column, or null did_pass values.
        if (
            submission_df.empty
            or "did_pass" not in submission_df.columns
            or submission_df["did_pass"].isna().any()
        ):
            logger.warning(f"Skipping {submission_id} due to invalid did_pass values")
            continue

        assert submission_df["tier"].isin([1, 2]).all(), "Invalid tier values found in submission_df"

        # Per-tier means are NaN if a tier is absent; round() below preserves NaN.
        success_rate = 100 * submission_df["did_pass"].mean()
        tier1_success_rate = 100 * submission_df[submission_df["tier"] == 1]["did_pass"].mean()
        tier2_success_rate = 100 * submission_df[submission_df["tier"] == 2]["did_pass"].mean()

        first_row = submission_df.iloc[0]
        rows.append(
            {
                AutoEvalColumn.system.name: first_row["system_name"],
                AutoEvalColumn.organization.name: first_row["organization"],
                AutoEvalColumn.success_rate_overall.name: success_rate,
                AutoEvalColumn.success_rate_tier1.name: tier1_success_rate,
                AutoEvalColumn.success_rate_tier2.name: tier2_success_rate,
                AutoEvalColumn.submitted_on.name: pd.to_datetime(first_row["submission_ts"]).strftime(
                    "%Y-%m-%d %H:%M"
                ),
            }
        )

    # Guard against an all-skipped run: pd.DataFrame([]) has no columns, so the
    # sort_values call below would raise a KeyError.
    if not rows:
        logger.info("No valid submissions found; returning empty leaderboard")
        return empty_df

    full_df = pd.DataFrame(rows)
    logger.info(f"Loaded results df with {len(full_df)} entries")

    # Keep only the latest entry per unique (System, Organization) pair. Lexicographic
    # sort on the "%Y-%m-%d %H:%M" strings matches chronological order.
    final_df = (
        full_df.sort_values(AutoEvalColumn.submitted_on.name, ascending=False)
        .drop_duplicates(subset=[AutoEvalColumn.system.name, AutoEvalColumn.organization.name], keep="first")
        .sort_values(by=[AutoEvalColumn.success_rate_overall.name], ascending=False)
        .reset_index(drop=True)
    )

    cols_to_round = [
        AutoEvalColumn.success_rate_overall.name,
        AutoEvalColumn.success_rate_tier1.name,
        AutoEvalColumn.success_rate_tier2.name,
    ]
    final_df[cols_to_round] = final_df[cols_to_round].round(decimals=2)

    return final_df
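

# --- Usage sketch (illustrative only) ---
# A minimal way to exercise get_leaderboard_df locally, assuming HF credentials
# are available via src.envs.TOKEN. The dataset name below is a hypothetical
# placeholder, not the actual results dataset of this project.
if __name__ == "__main__":
    # Hypothetical dataset name; substitute the real results dataset.
    leaderboard_df = get_leaderboard_df("my-org/leaderboard-results")
    print(leaderboard_df.head())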