import pandas as pd
from datasets import get_dataset_config_names, load_dataset
from datasets.exceptions import DatasetNotFoundError
from tqdm.auto import tqdm
from src.display.utils import AutoEvalColumn
from src.envs import TOKEN
from src.logger import get_logger
logger = get_logger(__name__)
def get_leaderboard_df(results_dataset_name: str) -> pd.DataFrame:
"""
@brief Creates a dataframe from all the individual experiment results.
"""
empty_df = pd.DataFrame(
columns=[
AutoEvalColumn.system.name,
AutoEvalColumn.organization.name,
AutoEvalColumn.success_rate_overall.name,
AutoEvalColumn.success_rate_tier1.name,
AutoEvalColumn.success_rate_tier2.name,
AutoEvalColumn.submitted_on.name,
]
)
try:
configs = get_dataset_config_names(
results_dataset_name,
token=TOKEN,
)
except (DatasetNotFoundError, FileNotFoundError):
# Return an empty DataFrame with expected columns
logger.warning("Failed to load configuration", exc_info=True)
return empty_df
if configs == ["default"]:
logger.info("Dataset has only default config — treating as empty")
return empty_df
rows = []
for submission_id in tqdm(
configs,
total=len(configs),
desc="Processing Submission Results",
):
submission_ds = load_dataset(
results_dataset_name,
submission_id,
split="train",
token=TOKEN,
)
submission_df = pd.DataFrame(submission_ds)
if submission_df.empty or "did_pass" not in submission_df.columns or submission_df.did_pass.isna().any():
logger.warning(f"Skipping {submission_id} due to invalid did_pass values")
continue
assert submission_df["tier"].isin([1, 2]).all(), "Invalid tier values found in submission_df"
success_rate = 100 * submission_df["did_pass"].mean()
tier1_success_rate = 100 * submission_df[submission_df["tier"] == 1]["did_pass"].mean()
tier2_success_rate = 100 * submission_df[submission_df["tier"] == 2]["did_pass"].mean()
first_row = submission_df.iloc[0]
rows.append(
{
AutoEvalColumn.system.name: first_row["system_name"],
AutoEvalColumn.organization.name: first_row["organization"],
AutoEvalColumn.success_rate_overall.name: success_rate,
AutoEvalColumn.success_rate_tier1.name: tier1_success_rate,
AutoEvalColumn.success_rate_tier2.name: tier2_success_rate,
AutoEvalColumn.submitted_on.name: pd.to_datetime(first_row["submission_ts"]).strftime("%Y-%m-%d %H:%M"),
}
)
full_df = pd.DataFrame(rows)
logger.info(f"Loaded results df with {len(full_df)} entries")
    # Keep only the latest entry per unique (System, Organization) pair
    final_df = (
        full_df.sort_values(AutoEvalColumn.submitted_on.name, ascending=False)
        .drop_duplicates(subset=[AutoEvalColumn.system.name, AutoEvalColumn.organization.name], keep="first")
        .sort_values(by=[AutoEvalColumn.success_rate_overall.name], ascending=False)
        .reset_index(drop=True)
    )
    cols_to_round = [
        AutoEvalColumn.success_rate_overall.name,
        AutoEvalColumn.success_rate_tier1.name,
        AutoEvalColumn.success_rate_tier2.name,
    ]
    final_df[cols_to_round] = final_df[cols_to_round].round(decimals=2)
    return final_df
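

# Illustrative usage sketch (not part of the module): the dataset id below is a
# hypothetical placeholder; in the Space the results dataset name would come from
# configuration such as src.envs.
if __name__ == "__main__":
    leaderboard_df = get_leaderboard_df("my-org/leaderboard-results")  # hypothetical dataset id
    print(leaderboard_df.head())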