# Alvinn-aai's picture
# 24 hours wait time and enable validation
# ee85d80
import pandas as pd
from datasets import get_dataset_config_names, load_dataset
from datasets.exceptions import DatasetNotFoundError
from tqdm.auto import tqdm
from src.display.utils import AutoEvalColumn
from src.envs import TOKEN
from src.logger import get_logger
logger = get_logger(__name__)
def get_leaderboard_df(results_dataset_name: str) -> pd.DataFrame:
    """
    @brief Creates a dataframe from all the individual experiment results.

    Each dataset config (other than the bare "default" config) is treated as one
    submission; per-submission success rates are computed overall and per tier,
    then only the most recent entry per (System, Organization) pair is kept.

    @param results_dataset_name Hub id of the dataset holding per-submission results.
    @return Leaderboard dataframe sorted by overall success rate (descending);
            an empty dataframe with the expected columns when no results exist.
    """
    empty_df = pd.DataFrame(
        columns=[
            AutoEvalColumn.system.name,
            AutoEvalColumn.organization.name,
            AutoEvalColumn.success_rate_overall.name,
            AutoEvalColumn.success_rate_tier1.name,
            AutoEvalColumn.success_rate_tier2.name,
            AutoEvalColumn.submitted_on.name,
        ]
    )
    try:
        configs = get_dataset_config_names(
            results_dataset_name,
            token=TOKEN,
        )
    except (DatasetNotFoundError, FileNotFoundError):
        # Return an empty DataFrame with expected columns
        logger.warning("Failed to load configuration", exc_info=True)
        return empty_df
    # A lone "default" config means no submission has ever been pushed.
    if configs == ["default"]:
        logger.info("Dataset has only default config — treating as empty")
        return empty_df
    rows = []
    for submission_id in tqdm(
        configs,
        total=len(configs),
        desc="Processing Submission Results",
    ):
        submission_ds = load_dataset(
            results_dataset_name,
            submission_id,
            split="train",
            token=TOKEN,
        )
        submission_df = pd.DataFrame(submission_ds)
        # Skip submissions with missing/invalid pass data rather than crashing the load.
        if submission_df.empty or "did_pass" not in submission_df.columns or submission_df["did_pass"].isna().any():
            logger.warning(f"Skipping {submission_id} due to invalid did_pass values")
            continue
        # Explicit validation instead of `assert` (asserts are stripped under -O,
        # and one bad submission should not take down the whole leaderboard).
        if not submission_df["tier"].isin([1, 2]).all():
            logger.warning(f"Skipping {submission_id} due to invalid tier values")
            continue
        # mean() of a boolean column is the pass fraction; scale to a percentage.
        success_rate = 100 * submission_df["did_pass"].mean()
        tier1_success_rate = 100 * submission_df[submission_df["tier"] == 1]["did_pass"].mean()
        tier2_success_rate = 100 * submission_df[submission_df["tier"] == 2]["did_pass"].mean()
        # Metadata (system name, organization, timestamp) is constant per submission.
        first_row = submission_df.iloc[0]
        rows.append(
            {
                AutoEvalColumn.system.name: first_row["system_name"],
                AutoEvalColumn.organization.name: first_row["organization"],
                AutoEvalColumn.success_rate_overall.name: success_rate,
                AutoEvalColumn.success_rate_tier1.name: tier1_success_rate,
                AutoEvalColumn.success_rate_tier2.name: tier2_success_rate,
                AutoEvalColumn.submitted_on.name: pd.to_datetime(first_row["submission_ts"]).strftime("%Y-%m-%d %H:%M"),
            }
        )
    # Guard: if every config was skipped, sort_values below would raise KeyError
    # on the missing "Submitted On" column of an empty frame.
    if not rows:
        logger.warning("No valid submission results found — returning empty leaderboard")
        return empty_df
    full_df = pd.DataFrame(rows)
    logger.info(f"Loaded results df with {len(full_df)} entries")
    # Keep only the latest entry per unique (System Name, System Type, Organization) triplet
    # ("%Y-%m-%d %H:%M" strings sort chronologically, so a lexicographic sort is correct).
    final_df = (
        full_df.sort_values(AutoEvalColumn.submitted_on.name, ascending=False)
        .drop_duplicates(subset=[AutoEvalColumn.system.name, AutoEvalColumn.organization.name], keep="first")
        .sort_values(by=[AutoEvalColumn.success_rate_overall.name], ascending=False)
        .reset_index(drop=True)
    )
    cols_to_round = [
        AutoEvalColumn.success_rate_overall.name,
        AutoEvalColumn.success_rate_tier1.name,
        AutoEvalColumn.success_rate_tier2.name,
    ]
    final_df[cols_to_round] = final_df[cols_to_round].round(decimals=2)
    return final_df