import json import os import re import uuid from pathlib import Path import pandas as pd import streamlit as st from datasets import load_dataset from huggingface_hub import CommitScheduler from src.check_validity import validate_model # define page config st.set_page_config(page_title="IVACE Leaderboard", layout="wide") # setup scheduler to upload user requests request_file = Path("user_request/") / f"data_{uuid.uuid4()}.json" request_folder = request_file.parent # columns = [ # "eval_name", # "Model", # "Type", # "Average ⬆️", # "IFEval", # "MMLU-PRO", # "GPQA", # "MUSR", # "CO₂ cost (kg)", # ] # languages lang_list = ["Spain", "Portuguese", "English", "Spanish", "Costa Rica", "Mexico", "Peru", "Uruguay", "Basque", "Catalan", "Galician"] # column order model_columns = ["model_name", "url", "type"] scheduler = CommitScheduler( repo_id="iberbench/ivace-user-request", repo_type="dataset", private=True, folder_path=request_folder, token=st.secrets["HF_TOKEN"], path_in_repo="data", every=10, ) def log_submission(input_dict: dict) -> None: """ Append input/outputs and user feedback to a JSON Lines file using a thread lock to avoid concurrent writes from different users. """ with scheduler.lock: with request_file.open("a") as f: f.write(json.dumps(input_dict)) f.write("\n") # def get_url(html_content: str) -> str: # match = re.search(r'href=["\'](https?://[^\s"\']+)', html_content) # if match: # url = match.group(1) # return url # else: # raise ValueError("Url not found in the link") def get_lang_columns(columns: list, lang: str): """Filter columns per language""" lang_norm = lang.lower().replace(" ", "_") return [col for col in columns if lang_norm in col] @st.cache_data def load_data(lang) -> pd.DataFrame: try: data = ( load_dataset("iberbench/lm-eval-results-ac", token=st.secrets["HF_TOKEN"])["train"] .to_pandas() ) # filter lang columns task_columns = [col for col in data.columns if col not in model_columns] task_lang_columns = get_lang_columns(task_columns, lang) data = data[model_columns + task_lang_columns] # data["Model"] = data["Model"].apply(get_url) # data.sort_values(by="Average ⬆️", ascending=False, inplace=True) # data.reset_index(drop=True, inplace=True) # add column to apply filtering data["Active"] = False return data except FileNotFoundError: st.error("iberbench/lm-eval-results-ac was not found in the hub") return pd.DataFrame() # functions to create filter def active_data(lang) -> pd.DataFrame: """Change all records as active""" return st.session_state[f"leaderboard_data_{lang}"][ st.session_state[f"leaderboard_data_{lang}"]["Active"] == True ].copy() def get_index(lang, row) -> pd.Series: """Get index of the row""" return active_data(lang).iloc[row].name def commit(lang) -> None: """Commit changes to the session state""" for row in st.session_state[f"edited_data_{lang}"]["edited_rows"]: row_index = get_index(lang, row) for key, value in st.session_state[f"edited_data_{lang}"][ "edited_rows" ][row].items(): st.session_state[f"leaderboard_data_{lang}"].at[ row_index, key ] = value def create_search_per_language(lang: str, search_dict: dict): if not st.session_state[f"leaderboard_data_{lang}"].empty: search_dict[lang] = st.text_input( "Search for ...", key=f"search_input_{lang}", on_change=commit, kwargs={"lang": lang}, ) if search_dict[lang] == "": st.session_state[f"leaderboard_data_{lang}"].Active = True else: st.session_state[f"leaderboard_data_{lang}"].Active = False st.session_state[f"leaderboard_data_{lang}"].loc[ st.session_state[f"leaderboard_data_{lang}"][ "model_name" ].str.contains(search_dict[lang], case=False), "Active", ] = True # select columns to display task_columns = [col for col in st.session_state[f"leaderboard_data_{lang}"].columns if col not in model_columns] task_lang_columns = get_lang_columns(task_columns, lang) columns = model_columns + task_lang_columns edited_data = st.data_editor( active_data(lang), column_order=columns, key=f"edited_data_{lang}", hide_index=False, # column_config={"Model": st.column_config.LinkColumn("Model")}, column_config={"url": st.column_config.LinkColumn("url")}, ) else: st.write("No data found to display on leaderboard.") # streamlit UI for lang in lang_list: # todo: load a different dataset per language of load different column per lang leaderboard_data = load_data(lang) if f"leaderboard_data_{lang}" not in st.session_state: st.session_state[f"leaderboard_data_{lang}"] = leaderboard_data tabs = st.tabs(["Leaderboard", "Submit model"]) search_dict = {} with tabs[0]: # logo image cols_logo = st.columns(5, vertical_alignment="center") with cols_logo[2]: st.image("assets/images/hf-logo.png", use_container_width=True) # title st.markdown( """
Comparing Large Language Models in an open and reproducible way