# Spaces: Running  (appears to be status text captured from the hosting page; not part of the program)
import json | |
import os | |
import re | |
import uuid | |
from pathlib import Path | |
import pandas as pd | |
import streamlit as st | |
from datasets import load_dataset | |
from huggingface_hub import CommitScheduler | |
from src.check_validity import validate_model | |
# define page config
st.set_page_config(page_title="IVACE Leaderboard", layout="wide")

# setup scheduler to upload user requests
# Submissions are appended to a uniquely named JSON Lines file; the scheduler
# below is pointed at the folder that contains it.
request_file = Path("user_request/") / f"data_{uuid.uuid4()}.json"
request_folder = request_file.parent

# columns = [
#     "eval_name",
#     "Model",
#     "Type",
#     "Average ⬆️",
#     "IFEval",
#     "MMLU-PRO",
#     "GPQA",
#     "MUSR",
#     "CO₂ cost (kg)",
# ]

# languages
lang_list = [
    "Spain",
    "Portuguese",
    "English",
    "Spanish",
    "Costa Rica",
    "Mexico",
    "Peru",
    "Uruguay",
    "Basque",
    "Catalan",
    "Galician",
]

# column order
model_columns = ["model_name", "url", "type"]


@st.cache_resource
def _get_scheduler() -> CommitScheduler:
    """Build the upload scheduler once per server process.

    Streamlit re-executes this script on every user interaction. Creating the
    ``CommitScheduler`` directly at module level would therefore construct a
    new scheduler on each rerun, and ``scheduler.lock`` would not be shared
    between sessions. ``st.cache_resource`` keeps a single shared instance.
    """
    return CommitScheduler(
        repo_id="iberbench/ivace-user-request",
        repo_type="dataset",
        private=True,
        folder_path=request_folder,
        token=st.secrets["HF_TOKEN"],
        path_in_repo="data",
        every=10,  # upload interval; minutes according to the huggingface_hub docs
    )


scheduler = _get_scheduler()
def log_submission(input_dict: dict) -> None:
    """
    Append one submission to the request file as a single JSON line.

    The file is opened in append mode and written while holding
    ``scheduler.lock`` (presumably the same lock the scheduler takes while
    uploading the folder -- confirm against the huggingface_hub docs).

    Args:
        input_dict: JSON-serializable mapping describing the submission.
    """
    with scheduler.lock, request_file.open("a") as handle:
        # One record per line: serialized payload followed by a newline.
        handle.write(json.dumps(input_dict) + "\n")
# def get_url(html_content: str) -> str: | |
# match = re.search(r'href=["\'](https?://[^\s"\']+)', html_content) | |
# if match: | |
# url = match.group(1) | |
# return url | |
# else: | |
# raise ValueError("Url not found in the link") | |
def get_lang_columns(columns: list, lang: str):
    """Return the entries of ``columns`` that contain the normalized language.

    The language name is lower-cased and its spaces are replaced by
    underscores (``"Costa Rica"`` -> ``"costa_rica"``). A column is kept when
    that token occurs anywhere in it; the column text itself is not
    normalized, so the match is case-sensitive on the column side. The input
    order is preserved.
    """
    token = "_".join(lang.lower().split(" "))
    selected = []
    for column in columns:
        if token in column:
            selected.append(column)
    return selected
def load_data(lang: str) -> pd.DataFrame:
    """Load the evaluation results and keep only the columns for ``lang``.

    Reads the ``train`` split of ``iberbench/lm-eval-results-ac`` from the
    Hub, keeps the ``model_columns`` plus every remaining column selected by
    ``get_lang_columns`` for ``lang``, and adds a boolean ``"Active"`` column
    (initially ``False``) that the search box uses to filter rows.

    Args:
        lang: Language name as listed in ``lang_list``.

    Returns:
        The filtered frame, or an empty ``DataFrame`` (after showing an error
        in the UI) when the dataset lookup raises ``FileNotFoundError``.
    """
    # Only the hub lookup is guarded; errors in the column handling below are
    # programming errors and should surface normally.
    try:
        dataset = load_dataset(
            "iberbench/lm-eval-results-ac", token=st.secrets["HF_TOKEN"]
        )
    except FileNotFoundError:
        st.error("iberbench/lm-eval-results-ac was not found in the hub")
        return pd.DataFrame()

    data = dataset["train"].to_pandas()

    # filter lang columns
    task_columns = [col for col in data.columns if col not in model_columns]
    task_lang_columns = get_lang_columns(task_columns, lang)
    # Explicit copy: a column subset is about to receive a new column, and
    # assigning into an un-copied selection triggers pandas'
    # SettingWithCopyWarning.
    data = data[model_columns + task_lang_columns].copy()

    # data["Model"] = data["Model"].apply(get_url)
    # data.sort_values(by="Average ⬆️", ascending=False, inplace=True)
    # data.reset_index(drop=True, inplace=True)

    # add column to apply filtering
    data["Active"] = False
    return data
# functions to create filter
def active_data(lang) -> pd.DataFrame:
    """Return a copy of the rows currently flagged as active for ``lang``.

    Reads ``leaderboard_data_{lang}`` from the session state and keeps the
    rows whose ``"Active"`` value equals ``True``. The stored frame itself is
    not modified.
    """
    frame = st.session_state[f"leaderboard_data_{lang}"]
    is_active = frame["Active"].eq(True)
    return frame[is_active].copy()
def get_index(lang, row) -> object:
    """Map a row position in the displayed table to its index label.

    Args:
        lang: Language whose active rows are being displayed.
        row: Zero-based position of the row within ``active_data(lang)``.

    Returns:
        The index label (``Series.name`` of the selected row) that the row has
        in the frame returned by ``active_data``. This is a label, not a
        ``pd.Series`` as the previous annotation suggested.
    """
    return active_data(lang).iloc[row].name
def commit(lang) -> None:
    """Write pending table edits for ``lang`` back into the stored frame.

    ``edited_data_{lang}`` in the session state holds an ``"edited_rows"``
    mapping from row position to ``{column: new_value}``. Each position is
    translated to an index label with ``get_index`` and every changed cell is
    written into ``leaderboard_data_{lang}``.
    """
    pending_edits = st.session_state[f"edited_data_{lang}"]["edited_rows"]
    for row_position, changes in pending_edits.items():
        label = get_index(lang, row_position)
        for column, new_value in changes.items():
            st.session_state[f"leaderboard_data_{lang}"].at[label, column] = new_value
def create_search_per_language(lang: str, search_dict: dict):
    """Render the search box and the editable leaderboard table for ``lang``.

    The text typed in the search box is stored in ``search_dict[lang]``. Rows
    of ``leaderboard_data_{lang}`` are flagged ``"Active"`` when the query is
    empty or when ``model_name`` contains the query (case-insensitive,
    literal match). Only active rows are shown in the data editor.

    Args:
        lang: Language name; used to build the session-state and widget keys.
        search_dict: Mutable mapping that receives the current query under
            the key ``lang``.
    """
    frame_key = f"leaderboard_data_{lang}"
    if st.session_state[frame_key].empty:
        st.write("No data found to display on leaderboard.")
        return

    search_dict[lang] = st.text_input(
        "Search for ...",
        key=f"search_input_{lang}",
        # Persist edits made in the table before the filter changes.
        on_change=commit,
        kwargs={"lang": lang},
    )
    query = search_dict[lang]
    frame = st.session_state[frame_key]

    if query == "":
        frame["Active"] = True
    else:
        # regex=False: the query is user input and must be matched literally;
        # treating it as a pattern makes inputs such as "(" raise re.error.
        # na=False: rows without a model name simply do not match instead of
        # producing NaN in the boolean mask.
        frame["Active"] = frame["model_name"].str.contains(
            query, case=False, regex=False, na=False
        )

    # select columns to display
    task_columns = [col for col in frame.columns if col not in model_columns]
    task_lang_columns = get_lang_columns(task_columns, lang)
    columns = model_columns + task_lang_columns

    st.data_editor(
        active_data(lang),
        column_order=columns,
        key=f"edited_data_{lang}",
        hide_index=False,
        # column_config={"Model": st.column_config.LinkColumn("Model")},
        column_config={"url": st.column_config.LinkColumn("url")},
    )
# streamlit UI
for lang in lang_list:
    # todo: load a different dataset per language or load different columns per lang
    state_key = f"leaderboard_data_{lang}"
    # The script re-runs on every interaction; fetch from the Hub only when
    # this session has no frame for the language yet, instead of loading all
    # languages again and discarding the result.
    if state_key not in st.session_state:
        st.session_state[state_key] = load_data(lang)

tabs = st.tabs(["Leaderboard", "Submit model"])
search_dict = {}
with tabs[0]:
    # logo image, shown in the middle one of five equal columns
    logo_slots = st.columns(5, vertical_alignment="center")
    with logo_slots[2]:
        st.image("assets/images/hf-logo.png", use_container_width=True)

    # title
    title_html = """
<div style="text-align: center;">
<h1>IVACE LLM Leaderboard</h1>
<p style="font-size: 1.2rem;">
Comparing Large Language Models in an <span style="font-weight: 600;">open</span>
and <span style="font-weight: 600;">reproducible</span> way
</p>
</div>
"""
    st.markdown(title_html, unsafe_allow_html=True)

    # one sub-tab per language, each with its own search box and table
    lang_tabs = st.tabs(lang_list)
    for lang, lang_tab in zip(lang_list, lang_tabs):
        with lang_tab:
            create_search_per_language(lang, search_dict)
with tabs[1]:
    st.header("Submit model")

    def get_id_number(id_val):
        """Return the HTML prefix that shows ``id_val`` inside a small circle.

        The markup opens an outer flex ``<div>`` and leaves it open; the guide
        fragment appended by the caller is presumably expected to close it
        (TODO confirm against the files in assets/html).
        """
        html_template = f"""
<div style="display: flex; align-items: flex-start; margin-bottom: 1rem;">
<div style="
width: 32px;
height: 32px;
border-radius: 50%;
display: flex;
align-items: center;
justify-content: center;
border: 1px solid #007BFF;
color: #007BFF;
font-size: 0.875rem;
font-weight: 600;
background-color: transparent;">
{id_val}
</div>"""
        return html_template

    # create guide info
    guide_info_list = []
    html_path = "assets/html"
    # os.listdir returns entries in arbitrary order; sort by file name so the
    # step numbers assigned below are stable and follow the file naming.
    for filename in sorted(os.listdir(html_path)):
        file_path = os.path.join(html_path, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            guide_info_list.append(file.read())

    # display adding number id
    for step_number, info_div in enumerate(guide_info_list, start=1):
        st.markdown(get_id_number(step_number) + info_div, unsafe_allow_html=True)

    with st.form("submit_model_form"):
        model_name = st.text_input(
            "Model Name (format: user_name/model_name)",
            help="Your model should be public on the Hub and follow the username/model-id format (e.g. mistralai/Mistral-7B-v0.1).",
        )
        description = st.text_area(
            "Description",
            help="Add a description of the proposed model for the evaluation to help prioritize its evaluation",
        )
        user_contact = st.text_input(
            "Your Contact Email",
            help="User e-mail to contact when there are updates",
        )
        precision_option = st.selectbox(
            "Choose precision format:",
            help="Size limits vary by precision: • FP16/BF16: up to 100B parameters • 8-bit: up to 280B parameters (2x) • 4-bit: up to 560B parameters (4x) Choose carefully as incorrect precision can cause evaluation errors.",
            options=["float16", "bfloat16", "8bit", "4bit", "GPTQ"],
            index=0,
        )
        weight_type_option = st.selectbox(
            "Select what type of weights are being loaded from the checkpoint provided:",
            help="Original: Complete model weights in safetensors format Delta: Weight differences from base model (requires base model for size calculation) Adapter: Lightweight fine-tuning layers (requires base model for size calculation)",
            options=["Original", "Adapter", "Delta"],
            index=0,
        )
        base_model_name = st.text_input(
            "Base model",
            help="Required for delta weights or adapters. This information is used to identify the original model and calculate the total parameter count by combining base model and adapter/delta parameters.",
            value="",
        )
        model_type = st.selectbox(
            "Choose model type:",
            help="🟢 Pretrained: Base models trained on text using masked modeling 🔶 Fine-tuned: Domain-specific optimization 💬 Chat: Models using RLHF, DPO, or IFT for conversation 🤝 Merge: Combined weights without additional training",
            options=[
                "🟢 Pretrained",
                "🔶 Fine-tuned",
                "💬 Chat",
                "🤝 Merge",
            ],
        )
        submit_button = st.form_submit_button("Submit Request")

        if submit_button:
            # validate model size, license, chat_templates
            # Only chat models are validated with a chat template.
            use_chat_template = model_type == "💬 Chat"
            validation_error = validate_model(
                model_name,
                precision_option,
                base_model_name,
                weight_type_option,
                use_chat_template,
            )
            if validation_error is not None:
                st.error(validation_error)
            # fullmatch: the whole input must look like an address; re.match
            # only anchors the start and would accept e.g. "a@b.c@anything".
            elif not re.fullmatch(r"[^@]+@[^@]+\.[^@]+", user_contact):
                st.error("Invalid email address.")
            else:
                input_dict = {
                    "model_name": model_name,
                    "description": description,
                    "user_contact": user_contact,
                    "precision_option": precision_option,
                    "weight_type_option": weight_type_option,
                    "base_model_name": base_model_name,
                    "model_type": model_type,
                }
                # UI boundary: report any failure to the user instead of
                # crashing the page.
                try:
                    log_submission(input_dict)
                    st.success("Your request has been sent successfully.")
                except Exception as e:
                    st.error(
                        f"Failed to send your request: {e}. Please try again later."
                    )