"""Streamlit app for the IVACE LLM leaderboard.

The page has two tabs: a per-language leaderboard with a model-name search
box, and a form that records model-evaluation requests as JSON Lines in a
local folder which a ``CommitScheduler`` uploads to a Hub dataset repo.
"""

import json
import os
import re
import uuid
from collections.abc import Hashable
from pathlib import Path

import pandas as pd
import streamlit as st
from datasets import load_dataset
from huggingface_hub import CommitScheduler

from src.check_validity import validate_model

# define page config
st.set_page_config(page_title="IVACE Leaderboard", layout="wide")

# setup scheduler to upload user requests
request_file = Path("user_request/") / f"data_{uuid.uuid4()}.json"
request_folder = request_file.parent

# columns = [
#     "eval_name",
#     "Model",
#     "Type",
#     "Average ⬆️",
#     "IFEval",
#     "MMLU-PRO",
#     "GPQA",
#     "MUSR",
#     "CO₂ cost (kg)",
# ]

# languages
lang_list = [
    "Spain",
    "Portuguese",
    "English",
    "Spanish",
    "Costa Rica",
    "Mexico",
    "Peru",
    "Uruguay",
    "Basque",
    "Catalan",
    "Galician",
]

# column order
model_columns = ["model_name", "url", "type"]


@st.cache_resource
def _get_scheduler() -> CommitScheduler:
    """Build the upload scheduler once per server process.

    Streamlit re-executes this script on every user interaction; without the
    resource cache each rerun would construct another ``CommitScheduler``
    (and a different ``lock``), so the lock could not serialize writers.
    """
    return CommitScheduler(
        repo_id="iberbench/ivace-user-request",
        repo_type="dataset",
        private=True,
        folder_path=request_folder,
        token=st.secrets["HF_TOKEN"],
        path_in_repo="data",
        every=10,
    )


scheduler = _get_scheduler()


def log_submission(input_dict: dict) -> None:
    """Append one submission to the request file as a JSON line.

    The write happens while holding ``scheduler.lock`` so that it does not
    interleave with other writers using the same scheduler.

    Args:
        input_dict: JSON-serializable mapping describing the request.
    """
    with scheduler.lock:
        with request_file.open("a", encoding="utf-8") as f:
            f.write(json.dumps(input_dict) + "\n")


# def get_url(html_content: str) -> str:
#     match = re.search(r'href=["\'](https?://[^\s"\']+)', html_content)
#     if match:
#         url = match.group(1)
#         return url
#     else:
#         raise ValueError("Url not found in the link")


def get_lang_columns(columns: list, lang: str) -> list:
    """Return the columns whose name contains the normalized language.

    The language is lower-cased and spaces become underscores
    (``"Costa Rica"`` -> ``"costa_rica"``) before the substring test.
    """
    lang_norm = lang.lower().replace(" ", "_")
    return [col for col in columns if lang_norm in col]


@st.cache_data
def load_data(lang) -> pd.DataFrame:
    """Load the results dataset and keep model columns plus ``lang`` columns.

    Returns:
        A frame with ``model_columns``, the task columns matching ``lang``
        and a boolean ``Active`` column initialised to ``False``; an empty
        frame (after showing an error) if loading raises
        ``FileNotFoundError``.
    """
    try:
        data = load_dataset(
            "iberbench/lm-eval-results-ac", token=st.secrets["HF_TOKEN"]
        )["train"].to_pandas()
    except FileNotFoundError:
        st.error("iberbench/lm-eval-results-ac was not found in the hub")
        return pd.DataFrame()

    # filter lang columns
    task_columns = [col for col in data.columns if col not in model_columns]
    task_lang_columns = get_lang_columns(task_columns, lang)
    # Explicit copy: the next line adds a column, which must not be an
    # assignment on a slice of the full results frame.
    data = data[model_columns + task_lang_columns].copy()
    # data["Model"] = data["Model"].apply(get_url)
    # data.sort_values(by="Average ⬆️", ascending=False, inplace=True)
    # data.reset_index(drop=True, inplace=True)

    # add column to apply filtering
    data["Active"] = False
    return data


# functions to create filter
def active_data(lang) -> pd.DataFrame:
    """Return a copy of the rows currently flagged ``Active`` for ``lang``."""
    board = st.session_state[f"leaderboard_data_{lang}"]
    return board[board["Active"]].copy()


def get_index(lang, row) -> Hashable:
    """Map a position in the active (displayed) rows to its index label."""
    return active_data(lang).iloc[row].name


def commit(lang) -> None:
    """Write the data editor's pending cell edits into the session frame.

    ``edited_rows`` is keyed by position within the displayed rows, so each
    position is translated to the stored frame's index label first.
    """
    edited_rows = st.session_state[f"edited_data_{lang}"]["edited_rows"]
    board = st.session_state[f"leaderboard_data_{lang}"]
    for row, changes in edited_rows.items():
        row_index = get_index(lang, row)
        for key, value in changes.items():
            board.at[row_index, key] = value


def create_search_per_language(lang: str, search_dict: dict) -> None:
    """Render the search box and the leaderboard table for one language.

    Args:
        lang: Language whose session-state frame is displayed.
        search_dict: Mapping updated in place with the current search text
            under ``lang``.
    """
    board = st.session_state[f"leaderboard_data_{lang}"]
    if board.empty:
        st.write("No data found to display on leaderboard.")
        return

    search_dict[lang] = st.text_input(
        "Search for ...",
        key=f"search_input_{lang}",
        on_change=commit,
        kwargs={"lang": lang},
    )
    query = search_dict[lang]
    if query == "":
        board["Active"] = True
    else:
        # Literal, case-insensitive match: the query is free user text, so it
        # must not be compiled as a regex, and missing names count as "no".
        board["Active"] = board["model_name"].str.contains(
            query, case=False, regex=False, na=False
        )

    # select columns to display
    task_columns = [col for col in board.columns if col not in model_columns]
    task_lang_columns = get_lang_columns(task_columns, lang)
    columns = model_columns + task_lang_columns
    st.data_editor(
        active_data(lang),
        column_order=columns,
        key=f"edited_data_{lang}",
        hide_index=False,
        # column_config={"Model": st.column_config.LinkColumn("Model")},
        column_config={"url": st.column_config.LinkColumn("url")},
    )


def get_id_number(id_val):
    """Return the snippet placed before each guide section for its number."""
    return f"\n{id_val}\n"


# streamlit UI
for lang in lang_list:
    # todo: load a different dataset per language of load different column per lang
    state_key = f"leaderboard_data_{lang}"
    if state_key not in st.session_state:
        st.session_state[state_key] = load_data(lang)

tabs = st.tabs(["Leaderboard", "Submit model"])
search_dict = {}

with tabs[0]:
    # logo image
    cols_logo = st.columns(5, vertical_alignment="center")
    with cols_logo[2]:
        st.image("assets/images/hf-logo.png", use_container_width=True)

    # title
    # NOTE(review): no HTML tags are visible in this string although
    # unsafe_allow_html is set — confirm the markup was not lost upstream.
    st.markdown(
        """

IVACE LLM Leaderboard

Comparing Large Language Models in an open and reproducible way

""",
        unsafe_allow_html=True,
    )

    # create tabs per language
    lang_tabs = st.tabs(lang_list)
    for lang, lt in zip(lang_list, lang_tabs):
        with lt:
            create_search_per_language(lang, search_dict)

with tabs[1]:
    st.header("Submit model")

    # create guide info
    guide_info_list = []
    html_path = "assets/html"
    # Sorted so the numbered sections appear in a stable order; os.listdir
    # returns entries in arbitrary order.
    for filename in sorted(os.listdir(html_path)):
        file_path = os.path.join(html_path, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            guide_info_list.append(file.read())

    # display adding number id
    for i, info_div in enumerate(guide_info_list):
        st.markdown(get_id_number(i + 1) + info_div, unsafe_allow_html=True)

    with st.form("submit_model_form"):
        model_name = st.text_input(
            "Model Name (format: user_name/model_name)",
            help="Your model should be public on the Hub and follow the username/model-id format (e.g. mistralai/Mistral-7B-v0.1).",
        )
        description = st.text_area(
            "Description",
            help="Add a description of the proposed model for the evaluation to help prioritize its evaluation",
        )
        user_contact = st.text_input(
            "Your Contact Email",
            help="User e-mail to contact when there are updates",
        )
        precision_option = st.selectbox(
            "Choose precision format:",
            help=(
                "Size limits vary by precision: "
                "• FP16/BF16: up to 100B parameters "
                "• 8-bit: up to 280B parameters (2x) "
                "• 4-bit: up to 560B parameters (4x) "
                "Choose carefully as incorrect precision can cause evaluation errors."
            ),
            options=["float16", "bfloat16", "8bit", "4bit", "GPTQ"],
            index=0,
        )
        weight_type_option = st.selectbox(
            "Select what type of weights are being loaded from the checkpoint provided:",
            help=(
                "Original: Complete model weights in safetensors format "
                "Delta: Weight differences from base model (requires base model for size calculation) "
                "Adapter: Lightweight fine-tuning layers (requires base model for size calculation)"
            ),
            options=["Original", "Adapter", "Delta"],
            index=0,
        )
        base_model_name = st.text_input(
            "Base model",
            help=(
                "Required for delta weights or adapters. \n"
                "This information is used to identify the original model and calculate the total parameter count by combining base model and adapter/delta parameters."
            ),
            value="",
        )
        model_type = st.selectbox(
            "Choose model type:",
            help=(
                "🟢 Pretrained: Base models trained on text using masked modeling "
                "🔶 Fine-tuned: Domain-specific optimization "
                "💬 Chat: Models using RLHF, DPO, or IFT for conversation "
                "🤝 Merge: Combined weights without additional training"
            ),
            options=[
                "🟢 Pretrained",
                "🔶 Fine-tuned",
                "💬 Chat",
                "🤝 Merge",
            ],
        )
        submit_button = st.form_submit_button("Submit Request")

    if submit_button:
        # validate model size, license, chat_templates
        use_chat_template = model_type == "💬 Chat"
        validation_error = validate_model(
            model_name,
            precision_option,
            base_model_name,
            weight_type_option,
            use_chat_template,
        )
        if validation_error is not None:
            st.error(validation_error)
        # fullmatch: the whole value must look like an address, not just a
        # prefix of it (re.match would accept "a@b.c@anything").
        elif not re.fullmatch(r"[^@]+@[^@]+\.[^@]+", user_contact):
            st.error("Invalid email address.")
        else:
            input_dict = {
                "model_name": model_name,
                "description": description,
                "user_contact": user_contact,
                "precision_option": precision_option,
                "weight_type_option": weight_type_option,
                "base_model_name": base_model_name,
                "model_type": model_type,
            }
            try:
                log_submission(input_dict)
                st.success("Your request has been sent successfully.")
            except Exception as e:
                # UI boundary: surface any failure to the user instead of
                # letting the script crash.
                st.error(
                    f"Failed to send your request: {e}. Please try again later."
                )