Spaces:

McGill-NLP
/

msteb_leaderboard

Running

File size: 5,305 Bytes

3b3db42
 
fed47e0
8b1f7a0
 
8fc70f8
3b3db42
8fc70f8
 
8b1f7a0
54eae7e
 
8b1f7a0
fed47e0
 
 
 
 
8b1f7a0
fed47e0
 
 
 
54eae7e
fed47e0
54eae7e
8b1f7a0
 
fed47e0
 
 
8b1f7a0
fed47e0
8fc70f8
8b1f7a0
fed47e0
 
8b1f7a0
fed47e0
8b1f7a0
fed47e0
 
8b1f7a0
fed47e0
 
8b1f7a0
 
8fc70f8

import json
import os
import pandas as pd
from datetime import datetime, timezone

from src.about import Tasks, SpeechTasks
from src.display.formatting import styled_error, styled_message, styled_warning
from src.display.utils import REGION_MAP
from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO, RESULTS_REPO, EVAL_RESULTS_PATH

# Module-level placeholders, both initialised to None. Neither is read or
# written by the functions visible in this section of the file.
# NOTE(review): presumably caches of already-requested models and of
# per-user submission dates — confirm against the rest of the project.
REQUESTED_MODELS = None
USERS_TO_SUBMISSION_DATES = None


def handle_csv_submission(
    model_name: str,
    csv_file,  # uploaded file path
    result_type: str,
):
    """Queue an uploaded results CSV and publish its JSON conversion.

    The CSV is parsed, re-saved under ``EVAL_REQUESTS_PATH/<result_type>``,
    uploaded to ``QUEUE_REPO`` and finally handed to
    ``convert_csv_to_json_and_upload``. The local copy is always removed,
    even when saving or uploading fails.

    Args:
        model_name: Name of the submitted model; must not be blank.
            Surrounding whitespace is stripped.
        csv_file: Uploaded CSV (anything ``pd.read_csv`` accepts).
        result_type: Result category; used in the directory name, the file
            name and the commit message.

    Returns:
        The value of ``styled_message`` on success, or of ``styled_error``
        when an input is missing, the CSV cannot be parsed, or the
        conversion step raises ``ValueError``.
    """
    if model_name is None or not model_name.strip():
        return styled_error("Please provide a model name.")
    if csv_file is None:
        return styled_error("Please provide a CSV file with results.")
    model_name = model_name.strip()

    try:
        df = pd.read_csv(csv_file)
    except (ValueError, OSError) as e:
        # pandas' EmptyDataError / ParserError and UnicodeDecodeError are all
        # ValueError subclasses; OSError covers an unreadable upload.
        return styled_error(f"Could not read the CSV file: {e}")

    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Save uploaded CSV
    subdir = os.path.join(EVAL_REQUESTS_PATH, result_type)
    os.makedirs(subdir, exist_ok=True)

    # A model name such as "org/model" would otherwise turn the file name
    # into a path through a directory that does not exist.
    safe_model_name = model_name.replace("/", "_").replace("\\", "_")
    filename = f"{current_time}_{safe_model_name}_{result_type}_results.csv"
    remote_path = f"msteb_{result_type}_requests/{filename}"

    csv_save_path = os.path.join(subdir, filename)
    try:
        df.to_csv(csv_save_path, index=False)

        print(f"Uploading to {QUEUE_REPO}/{remote_path}")
        API.upload_file(
            path_or_fileobj=csv_save_path,
            path_in_repo=remote_path,
            repo_id=QUEUE_REPO,
            repo_type="dataset",  # or "model" if you made the repo that way
            commit_message=f"Add {result_type} request for {model_name} at {current_time}",
        )
    finally:
        # Remove the local file whether or not the upload succeeded.
        if os.path.exists(csv_save_path):
            os.remove(csv_save_path)

    # Convert the dataframe to JSON and upload it to the results repo.
    try:
        convert_csv_to_json_and_upload(df, model_name, result_type)
    except ValueError as e:
        return styled_error(f"{str(e)}")
    return styled_message(f"Results CSV successfully submitted for `{model_name}`!")

def find_task_by_col_name(col_name, enum_cls):
    """Return the first member of *enum_cls* whose ``value.col_name`` equals *col_name*.

    Returns ``None`` when no member matches.
    """
    matches = (member for member in enum_cls if member.value.col_name == col_name)
    return next(matches, None)
def _scores_from_row(row, task_enum):
    """Map each numeric task cell of *row* to ``{benchmark: {metric: value / 100}}``."""
    scores = {}
    for col, val in row.items():
        if col == "Region":
            continue
        # Skip blanks (NaN) and non-numeric cells such as stray text.
        if not isinstance(val, (int, float)) or pd.isna(val):
            continue
        task = find_task_by_col_name(col, task_enum)
        scores[task.value.benchmark] = {task.value.metric: val / 100}
    return scores


def convert_csv_to_json_and_upload(df: pd.DataFrame, model_name: str, result_type: str):
    """Validate a results dataframe, convert it to JSON and upload it.

    The dataframe needs a ``Region`` column plus task columns named after the
    ``col_name`` of members of ``Tasks`` (when ``result_type == "text"``) or
    ``SpeechTasks`` (otherwise). The ``"Average (Micro)"`` row fills the
    ``results`` section; every other row fills ``regions`` under the key
    ``REGION_MAP[<Region>]``. Cell values are divided by 100.

    Args:
        df: Parsed results table.
        model_name: Stored under ``config.model_name`` and used in the file
            name and commit message.
        result_type: ``"text"`` selects ``Tasks``; any other value selects
            ``SpeechTasks``. Also used in directory and file names.

    Returns:
        The string ``"Uploaded to <RESULTS_REPO>/<timestamp>"``.

    Raises:
        ValueError: If the ``Region`` column or the average row is missing,
            a column is not a known task, a region is not in ``REGION_MAP``,
            or the average row holds no numeric value.
    """
    task_enum = Tasks if result_type == "text" else SpeechTasks
    task_display_names = {t.value.col_name for t in task_enum}
    average_row = "Average (Micro)"

    # --- Validation ---
    # Raise ValueError (not KeyError) so the caller's handler can report it.
    if "Region" not in df.columns:
        raise ValueError("Missing 'Region' column in CSV.")

    # Exclude Region by name so the check does not depend on column order.
    extra = (set(df.columns) - {"Region"}) - task_display_names
    if extra:
        raise ValueError(f"Extra columns in CSV: {extra}")

    region_names = df["Region"].tolist()
    if average_row not in region_names:
        raise ValueError("Missing row for 'Average (Micro)'")

    for region in region_names:
        if region != average_row and region not in REGION_MAP:
            raise ValueError(f"Region '{region}' not found in REGION_MAP keys.")

    # --- Build JSON ---
    model_json = {
        "config": {"model_name": model_name},
        "results": {},
        "regions": {},
    }
    at_least_one_number = False

    for _, row in df.iterrows():
        region_display = row["Region"]
        scores = _scores_from_row(row, task_enum)

        if region_display == average_row:
            model_json["results"].update(scores)
            if scores:
                at_least_one_number = True
        else:
            model_json["regions"][REGION_MAP[region_display]] = scores

    # Only the average row counts: it must contribute at least one number.
    if not at_least_one_number:
        raise ValueError("No valid numeric results found in the CSV. Please check your input.")

    # --- Save locally ---
    subdir = os.path.join(EVAL_RESULTS_PATH, result_type)
    os.makedirs(subdir, exist_ok=True)
    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    # A model name such as "org/model" would otherwise turn the file name
    # into a path through a directory that does not exist.
    safe_model_name = model_name.replace("/", "_").replace("\\", "_")
    filename = f"{current_time}_{safe_model_name}_{result_type}.json"
    json_save_path = os.path.join(subdir, filename)

    try:
        with open(json_save_path, "w", encoding="utf-8") as f:
            json.dump(model_json, f, indent=2)

        # --- Upload to HF Hub ---
        remote_path = f"msteb_leaderboard/msteb_{result_type}_results/{filename}"
        API.upload_file(
            path_or_fileobj=json_save_path,
            path_in_repo=remote_path,
            repo_id=RESULTS_REPO,
            repo_type="dataset",
            commit_message=f"Upload results for {model_name} ({result_type}) at {current_time}",
        )
    finally:
        # Remove the local file whether or not the upload succeeded.
        if os.path.exists(json_save_path):
            os.remove(json_save_path)

    print(f"Uploaded to {RESULTS_REPO}/{current_time}")

    return f"Uploaded to {RESULTS_REPO}/{current_time}"