import json
import os

import pandas as pd
from datasets import load_dataset, get_dataset_config_names
from datasets.exceptions import DatasetNotFoundError
from tqdm.auto import tqdm

from src.display.formatting import make_clickable_model
from src.display.utils import AutoEvalColumn, EvalQueueColumn
from src.envs import TOKEN
from src.logger import get_logger

logger = get_logger(__name__)
def get_leaderboard_df(results_dataset_name: str) -> pd.DataFrame:
    """Creates a dataframe from all the individual experiment results."""
    try:
        configs = get_dataset_config_names(results_dataset_name, token=TOKEN)
    except (DatasetNotFoundError, FileNotFoundError):
        # Results dataset not found yet: return an empty DataFrame with the expected columns
        return pd.DataFrame(
            columns=[
                "System Name",
                "System Type",
                "Organization",
                "Success Rate (%)",
                "Problems Solved",
                "Submitted On",
            ]
        )
    rows = []
    for submission_id in tqdm(configs, total=len(configs), desc="Processing Submission Results"):
        submission_ds = load_dataset(results_dataset_name, submission_id, split="train", token=TOKEN)
        submission_df = pd.DataFrame(submission_ds)

        # Skip submissions with no rows, a missing did_pass column, or NaN did_pass values
        if submission_df.empty or "did_pass" not in submission_df.columns or submission_df["did_pass"].isna().any():
            logger.warning(f"Skipping {submission_id}: empty results or missing/NaN did_pass values")
            continue

        success_rate = 100 * submission_df["did_pass"].mean()
        num_solved = submission_df["did_pass"].sum()
        first_row = submission_df.iloc[0]
        rows.append(
            {
                "System Name": first_row["system_name"],
                "System Type": first_row["system_type"],
                "Organization": first_row["organization"],
                "Success Rate (%)": success_rate,
                "Problems Solved": num_solved,
                "Submitted On": pd.to_datetime(first_row["submission_ts"]).strftime("%Y-%m-%d %H:%M"),
            }
        )
    full_df = pd.DataFrame(rows)

    # TODO: forbid multiple submissions under the same name?
    # Keep only the latest entry per unique (System Name, System Type, Organization) triplet.
    # The string sort on "Submitted On" is chronological because the "%Y-%m-%d %H:%M"
    # format orders lexicographically the same way it orders in time.
    final_df = (
        full_df.sort_values("Submitted On", ascending=False)
        .drop_duplicates(subset=["System Name", "System Type", "Organization"], keep="first")
        .sort_values(by=[AutoEvalColumn.success_rate.name], ascending=False)
        .reset_index(drop=True)
    )

    cols_to_round = ["Success Rate (%)"]
    final_df[cols_to_round] = final_df[cols_to_round].round(decimals=2)
    return final_df
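
# Illustrative call only; the dataset id below is a hypothetical placeholder,
# not necessarily the results repo this Space actually reads from:
#
#   leaderboard_df = get_leaderboard_df("my-org/my-results-dataset")
#   leaderboard_df.head()
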
def get_evaluation_queue_df(save_path: str, cols: list) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Creates the different dataframes for the requested evaluation queues."""

    def _load_request(file_path: str) -> dict:
        # Read a single eval-request JSON file and normalize its display fields
        with open(file_path) as fp:
            data = json.load(fp)
        data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
        data[EvalQueueColumn.revision.name] = data.get("revision", "main")
        return data

    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
    all_evals = []
    for entry in entries:
        entry_path = os.path.join(save_path, entry)
        if entry.endswith(".json"):
            all_evals.append(_load_request(entry_path))
        elif os.path.isdir(entry_path):
            # This is a folder of request files (e.g. one per organization);
            # note isfile must be checked against the full path, not the bare name.
            sub_entries = [
                e
                for e in os.listdir(entry_path)
                if os.path.isfile(os.path.join(entry_path, e)) and not e.startswith(".")
            ]
            for sub_entry in sub_entries:
                all_evals.append(_load_request(os.path.join(entry_path, sub_entry)))
    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]

    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
    df_running = pd.DataFrame.from_records(running_list, columns=cols)
    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
    return df_finished[cols], df_running[cols], df_pending[cols]
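

if __name__ == "__main__":
    # Minimal smoke test, a sketch only: the results dataset id, the local queue
    # directory, and the column names below are hypothetical placeholders standing
    # in for whatever src.envs / src.display.utils actually define.
    demo_leaderboard = get_leaderboard_df("example-org/example-results")
    print(demo_leaderboard.head())

    finished_df, running_df, pending_df = get_evaluation_queue_df(
        "eval-queue",  # hypothetical local checkout of the requests dataset
        cols=["model", "revision", "status"],  # hypothetical subset of EvalQueueColumn names
    )
    print(f"finished={len(finished_df)} running={len(running_df)} pending={len(pending_df)}")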