Spaces:

iberbench
/

leaderboard

Running

leaderboard / app.py

Alvaro Romo

Modified lang_list and task columns

f30b2f5 6 months ago

11 kB

	import json
	import os
	import re
	import uuid
	from pathlib import Path

	import pandas as pd
	import streamlit as st
	from datasets import load_dataset
	from huggingface_hub import CommitScheduler

	from src.check_validity import validate_model

	# Page-wide Streamlit settings (browser tab title, wide layout)
	st.set_page_config(layout="wide", page_title="IVACE Leaderboard")

	# Local folder watched by the commit scheduler, and the JSON Lines file inside
	# it that receives submission requests. A fresh UUID is drawn every time this
	# line executes, so the file name is unique per execution.
	request_folder = Path("user_request/")
	request_file = request_folder / f"data_{uuid.uuid4()}.json"


	# columns = [
	# "eval_name",
	# "Model",
	# "Type",
	# "Average ⬆️",
	# "IFEval",
	# "MMLU-PRO",
	# "GPQA",
	# "MUSR",
	# "CO₂ cost (kg)",
	# ]
	# Leaderboard tabs: one entry per language / regional variety. Each name is
	# also used (lower-cased, spaces -> underscores) to select result columns.
	lang_list = [
	    "Spain",
	    "Portuguese",
	    "English",
	    "Spanish",
	    "Costa Rica",
	    "Mexico",
	    "Peru",
	    "Uruguay",
	    "Basque",
	    "Catalan",
	    "Galician",
	]

	# Model-descriptor columns; they are placed before the task columns, in this order
	model_columns = ["model_name", "url", "type"]

	# Background uploader for user requests: commits the content of the local
	# request folder to the "data" directory of a private dataset repo on the Hub.
	# NOTE(review): `every=10` is presumably an interval in minutes — confirm
	# against the huggingface_hub CommitScheduler documentation.
	scheduler = CommitScheduler(
	    # destination repository
	    repo_id="iberbench/ivace-user-request",
	    repo_type="dataset",
	    path_in_repo="data",
	    private=True,
	    # local source folder and upload cadence
	    folder_path=request_folder,
	    every=10,
	    # credentials come from the Streamlit secrets store
	    token=st.secrets["HF_TOKEN"],
	)


	def log_submission(input_dict: dict) -> None:
	    """Append one submission request to the JSON Lines request file.

	    The record is serialized with ``json.dumps`` and written as a single line
	    at the end of ``request_file``. The write is done while holding
	    ``scheduler.lock`` so that concurrent writers (e.g. different users
	    submitting at the same time) do not interleave their output.

	    Args:
	        input_dict: JSON-serializable mapping describing the request.
	    """
	    with scheduler.lock, request_file.open("a") as sink:
	        # one JSON document per line
	        sink.write(f"{json.dumps(input_dict)}\n")


	# def get_url(html_content: str) -> str:
	# match = re.search(r'href=["\'](https?://[^\s"\']+)', html_content)
	# if match:
	# url = match.group(1)
	# return url
	# else:
	# raise ValueError("Url not found in the link")


	def get_lang_columns(columns: list, lang: str):
	    """Select the column names that mention a given language.

	    ``lang`` is normalized by lower-casing it and turning spaces into
	    underscores (``"Costa Rica"`` -> ``"costa_rica"``); a column is kept when
	    the normalized name occurs anywhere inside it (case-sensitive substring
	    test). The relative order of ``columns`` is preserved.

	    Args:
	        columns: candidate column names.
	        lang: language / variety name as shown in the UI.

	    Returns:
	        A new list with the matching column names.
	    """
	    needle = "_".join(lang.lower().split(" "))
	    return list(filter(lambda name: needle in name, columns))


	@st.cache_data
	def load_data(lang) -> pd.DataFrame:
	    """Load the evaluation results restricted to the columns of one language.

	    The "train" split of ``iberbench/lm-eval-results-ac`` is converted to a
	    DataFrame, reduced to the model-descriptor columns plus the task columns
	    that match ``lang`` (see ``get_lang_columns``), and extended with a boolean
	    ``Active`` column (initially ``False``) that the UI uses for filtering.

	    Args:
	        lang: language / variety name used to pick the task columns.

	    Returns:
	        The filtered DataFrame, or an empty DataFrame (after showing an error
	        in the UI) when the dataset cannot be found.
	    """
	    # Only the hub access is guarded: it is the call that raises when the
	    # dataset is missing.
	    try:
	        dataset = load_dataset(
	            "iberbench/lm-eval-results-ac", token=st.secrets["HF_TOKEN"]
	        )
	    except FileNotFoundError:
	        st.error("iberbench/lm-eval-results-ac was not found in the hub")
	        return pd.DataFrame()

	    data = dataset["train"].to_pandas()

	    # filter lang columns
	    task_columns = [col for col in data.columns if col not in model_columns]
	    task_lang_columns = get_lang_columns(task_columns, lang)
	    # Take an explicit copy: adding a column to a column-subset of another
	    # frame otherwise triggers pandas' SettingWithCopyWarning.
	    data = data[model_columns + task_lang_columns].copy()

	    # add column to apply filtering
	    data["Active"] = False

	    return data


	# functions to create filter
	def active_data(lang) -> pd.DataFrame:
	    """Return a copy of the rows currently flagged as active for ``lang``.

	    Reads the language's table from ``st.session_state`` and keeps the rows
	    whose ``Active`` value equals ``True``. The stored table is not modified.
	    """
	    board = st.session_state[f"leaderboard_data_{lang}"]
	    # explicit comparison kept on purpose: it is an element-wise equality test
	    is_active = board["Active"] == True  # noqa: E712
	    return board[is_active].copy()


	def get_index(lang, row) -> pd.Series:
	    """Translate a row position of the displayed table into its index label.

	    ``row`` is a position within the active (displayed) rows for ``lang``; the
	    value returned is the ``name`` of that row, i.e. its label in the index of
	    the active-rows frame.

	    NOTE(review): the ``pd.Series`` return annotation does not match the value
	    returned (a single index label); left unchanged here.
	    """
	    visible_rows = active_data(lang)
	    selected = visible_rows.iloc[row]
	    return selected.name


	def commit(lang) -> None:
	    """Write the pending data-editor edits back into the stored table.

	    ``st.session_state["edited_data_<lang>"]["edited_rows"]`` maps a row
	    position of the displayed table to a ``{column: new value}`` mapping. Each
	    position is converted to an index label with ``get_index`` and every
	    edited cell is assigned into ``st.session_state["leaderboard_data_<lang>"]``.
	    """
	    pending = st.session_state[f"edited_data_{lang}"]["edited_rows"]
	    for row, changes in pending.items():
	        # positions refer to the filtered view, labels to the full table
	        row_index = get_index(lang, row)
	        for column, new_value in changes.items():
	            st.session_state[f"leaderboard_data_{lang}"].at[
	                row_index, column
	            ] = new_value


	def create_search_per_language(lang: str, search_dict: dict):
	    """Render the search box and results table for one language tab.

	    The text typed in the search box is stored in ``search_dict[lang]`` and
	    used to set the ``Active`` flag of every row of the language's table: all
	    rows are active for an empty query, otherwise only rows whose
	    ``model_name`` contains the query (case-insensitive). The active rows are
	    then shown in a data editor. When the table is empty a short notice is
	    written instead.

	    Args:
	        lang: language / variety name; selects the session-state entries
	            ``leaderboard_data_<lang>``, ``search_input_<lang>`` and
	            ``edited_data_<lang>``.
	        search_dict: mapping updated in place with the current query.
	    """
	    board = st.session_state[f"leaderboard_data_{lang}"]
	    if board.empty:
	        st.write("No data found to display on leaderboard.")
	        return

	    # pending table edits are committed before the filter changes
	    search_dict[lang] = st.text_input(
	        "Search for ...",
	        key=f"search_input_{lang}",
	        on_change=commit,
	        kwargs={"lang": lang},
	    )
	    query = search_dict[lang]
	    if query == "":
	        board["Active"] = True
	    else:
	        # regex=False: the query is user input and must be matched literally,
	        # otherwise characters such as "(" or "[" raise re.error.
	        # na=False: rows without a model name simply do not match.
	        board["Active"] = board["model_name"].str.contains(
	            query, case=False, regex=False, na=False
	        )

	    # select columns to display
	    task_columns = [col for col in board.columns if col not in model_columns]
	    task_lang_columns = get_lang_columns(task_columns, lang)
	    columns = model_columns + task_lang_columns

	    # edits are read back from st.session_state["edited_data_<lang>"], so the
	    # return value of the editor is not needed
	    st.data_editor(
	        active_data(lang),
	        column_order=columns,
	        key=f"edited_data_{lang}",
	        hide_index=False,
	        column_config={"url": st.column_config.LinkColumn("url")},
	    )


	# streamlit UI
	# Seed the session state with one results table per language; entries that
	# already exist are left alone.
	for lang in lang_list:
	    # TODO: load a different dataset per language or load different columns per lang
	    state_key = f"leaderboard_data_{lang}"
	    leaderboard_data = load_data(lang)
	    if state_key not in st.session_state:
	        st.session_state[state_key] = leaderboard_data

	# top-level navigation and the per-language search queries
	tabs = st.tabs(["Leaderboard", "Submit model"])
	search_dict = {}

	with tabs[0]:
	    # logo image, shown in the middle one of five columns
	    cols_logo = st.columns(5, vertical_alignment="center")
	    with cols_logo[2]:
	        st.image("assets/images/hf-logo.png", use_container_width=True)

	    # title
	    st.markdown(
	        """
	<div style="text-align: center;">
	<h1>IVACE LLM Leaderboard</h1>
	<p style="font-size: 1.2rem;">
	Comparing Large Language Models in an <span style="font-weight: 600;">open</span>
	and <span style="font-weight: 600;">reproducible</span> way
	</p>
	</div>
	""",
	        unsafe_allow_html=True,
	    )

	    # one nested tab per language, each with its own search box and table
	    lang_tabs = st.tabs(lang_list)
	    for lang, lang_tab in zip(lang_list, lang_tabs):
	        with lang_tab:
	            create_search_per_language(lang, search_dict)


	def get_id_number(id_val):
	    """Return the HTML that opens a guide step and draws its numbered badge.

	    The markup is an outer flex ``<div>`` followed by a circular badge
	    containing ``id_val``.

	    NOTE(review): the outer ``<div>`` is not closed here; presumably the guide
	    fragment appended by the caller closes it — confirm against assets/html.
	    """
	    html_template = f"""
	<div style="display: flex; align-items: flex-start; margin-bottom: 1rem;">
	<div style="
	width: 32px;
	height: 32px;
	border-radius: 50%;
	display: flex;
	align-items: center;
	justify-content: center;
	border: 1px solid #007BFF;
	color: #007BFF;
	font-size: 0.875rem;
	font-weight: 600;
	background-color: transparent;">
	{id_val}
	</div>"""
	    return html_template


	with tabs[1]:
	    st.header("Submit model")

	    # create guide info
	    guide_info_list = []
	    html_path = "assets/html"
	    # os.listdir returns names in arbitrary order; sort them so the step
	    # numbers assigned below are stable across runs and machines.
	    for filename in sorted(os.listdir(html_path)):
	        file_path = os.path.join(html_path, filename)
	        with open(file_path, "r", encoding="utf-8") as file:
	            guide_info_list.append(file.read())

	    # display adding number id
	    for i, info_div in enumerate(guide_info_list):
	        st.markdown(get_id_number(i + 1) + info_div, unsafe_allow_html=True)

	    with st.form("submit_model_form"):
	        model_name = st.text_input(
	            "Model Name (format: user_name/model_name)",
	            help="Your model should be public on the Hub and follow the username/model-id format (e.g. mistralai/Mistral-7B-v0.1).",
	        )
	        description = st.text_area(
	            "Description",
	            help="Add a description of the proposed model for the evaluation to help prioritize its evaluation",
	        )
	        user_contact = st.text_input(
	            "Your Contact Email",
	            help="User e-mail to contact when there are updates",
	        )
	        precision_option = st.selectbox(
	            "Choose precision format:",
	            help="Size limits vary by precision: • FP16/BF16: up to 100B parameters • 8-bit: up to 280B parameters (2x) • 4-bit: up to 560B parameters (4x) Choose carefully as incorrect precision can cause evaluation errors.",
	            options=["float16", "bfloat16", "8bit", "4bit", "GPTQ"],
	            index=0,
	        )
	        weight_type_option = st.selectbox(
	            "Select what type of weights are being loaded from the checkpoint provided:",
	            help="Original: Complete model weights in safetensors format Delta: Weight differences from base model (requires base model for size calculation) Adapter: Lightweight fine-tuning layers (requires base model for size calculation)",
	            options=["Original", "Adapter", "Delta"],
	            index=0,
	        )
	        base_model_name = st.text_input(
	            "Base model",
	            help="Required for delta weights or adapters. This information is used to identify the original model and calculate the total parameter count by combining base model and adapter/delta parameters.",
	            value="",
	        )
	        model_type = st.selectbox(
	            "Choose model type:",
	            help="🟢 Pretrained: Base models trained on text using masked modeling 🔶 Fine-tuned: Domain-specific optimization 💬 Chat: Models using RLHF, DPO, or IFT for conversation 🤝 Merge: Combined weights without additional training",
	            options=[
	                "🟢 Pretrained",
	                "🔶 Fine-tuned",
	                "💬 Chat",
	                "🤝 Merge",
	            ],
	        )
	        submit_button = st.form_submit_button("Submit Request")

	    if submit_button:
	        # surrounding whitespace is a common copy/paste artefact
	        contact_email = user_contact.strip()

	        # validate model size, license, chat_templates
	        use_chat_template = model_type == "💬 Chat"
	        validation_error = validate_model(
	            model_name,
	            precision_option,
	            base_model_name,
	            weight_type_option,
	            use_chat_template,
	        )
	        if validation_error is not None:
	            st.error(validation_error)
	        # fullmatch: the whole value must look like local@domain.tld; a prefix
	        # match would accept inputs such as "a@b.c@d" or "a@b.c trailing text"
	        elif not re.fullmatch(r"[^@\s]+@[^@\s]+\.[^@\s]+", contact_email):
	            st.error("Invalid email address.")
	        else:
	            input_dict = {
	                "model_name": model_name,
	                "description": description,
	                "user_contact": contact_email,
	                "precision_option": precision_option,
	                "weight_type_option": weight_type_option,
	                "base_model_name": base_model_name,
	                "model_type": model_type,
	            }
	            # UI boundary: any failure while recording the request is
	            # reported to the user instead of crashing the page
	            try:
	                log_submission(input_dict)
	                st.success("Your request has been sent successfully.")
	            except Exception as e:
	                st.error(
	                    f"Failed to send your request: {e}. Please try again later."
	                )