import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
from src.data_utils import get_dataframe_category, get_dataframe_language
import src.config as configs
from utils import get_profile_and_organizations, download_with_restart
from vis_utils import load_leaderboard_data, create_domain_radar_chart, create_len_overall_scatter
from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    EVALUATION_QUEUE_TEXT_OPTION1,
    INTRODUCTION_TEXT,
    BANNER,
    TITLE,
    LINK,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    Precision,
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.submission.submit import add_new_eval_option
from ui import create_leaderboard_tab
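

# Restart this Space via the Hub API. Used as a recovery hook by the snapshot
# downloads below and run periodically by the scheduler at the bottom of the file.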
def restart_space():
    API.restart_space(repo_id=REPO_ID)


### Space initialisation
download_with_restart(
    snapshot_download,
    repo_id=QUEUE_REPO,
    local_dir=EVAL_REQUESTS_PATH,
    repo_type="dataset",
    token=TOKEN,
    restart_func=restart_space,
)
download_with_restart(
    snapshot_download,
    repo_id=RESULTS_REPO,
    local_dir=EVAL_RESULTS_PATH,
    repo_type="dataset",
    token=TOKEN,
    restart_func=restart_space,
)
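
# Gray-on-gray base theme; the detailed look and feel comes from custom_css.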
theme = gr.themes.Default(
    primary_hue="gray",
    neutral_hue="gray",
)

demo = gr.Blocks(css=custom_css, theme=theme)
with demo:
    gr.HTML(BANNER + TITLE + LINK)
    user_state = gr.State()          # HF profile of the logged-in user (for submissions)
    organization_state = gr.State()  # organizations the user belongs to

    with gr.Tabs(elem_classes="tab-buttons") as main_tabs:
        with gr.TabItem("TRUEBench", elem_id="llm-benchmark-tab-table", id=2):
            gr.HTML(INTRODUCTION_TEXT)
            gr.HTML("""
            TRUEBench consists of 10 categories and 46 sub-categories that are highly related to productivity assistants.
            """)
            # --- Category Explanation Box (2x5 grid, emoji, desc from about.py) ---
            from src.about import CATEGORY_DESCRIPTIONS
            gr.HTML(f"""
            📝 Content Generation{CATEGORY_DESCRIPTIONS["Content Generation"]}
            ✂️ Editing{CATEGORY_DESCRIPTIONS["Editing"]}
            📊 Data Analysis{CATEGORY_DESCRIPTIONS["Data Analysis"]}
            🧠 Reasoning{CATEGORY_DESCRIPTIONS["Reasoning"]}
            🦄 Hallucination{CATEGORY_DESCRIPTIONS["Hallucination"]}
            🛡️ Safety{CATEGORY_DESCRIPTIONS["Safety"]}
            🔁 Repetition{CATEGORY_DESCRIPTIONS["Repetition"]}
            📝 Summarization{CATEGORY_DESCRIPTIONS["Summarization"]}
            🌐 Translation{CATEGORY_DESCRIPTIONS["Translation"]}
            💬 Multi-Turn{CATEGORY_DESCRIPTIONS["Multi-Turn"]}
            """)
            df = get_dataframe_category()
            gr.HTML("")
            leaderboard_tab_cat = create_leaderboard_tab(
                df,
                "Category",
            )
            gr.HTML("")
            # --- Category Radar Chart Section ---
            initial_df_cat = load_leaderboard_data()
            # Top 5 models based on leaderboard (Overall)
            if "Overall" in initial_df_cat.columns:
                top5_models_cat = initial_df_cat.sort_values("Overall", ascending=False)['Model Name'].tolist()[:5]
            else:
                top5_models_cat = initial_df_cat['Model Name'].tolist()[:5]
            gr.HTML("")
            # Radar chart model selector (up to 5); maps display names back to original names
            from src.display.formatting import get_display_model_name
            display_names_cat = initial_df_cat['Model Name'].apply(get_display_model_name).tolist()
            original_names_cat = initial_df_cat['Model Name'].tolist()
            display_to_original_cat = dict(zip(display_names_cat, original_names_cat))
            top5_display_names_cat = [get_display_model_name(m) for m in top5_models_cat]
            model_selector_cat = gr.Dropdown(
                choices=display_names_cat,
                value=top5_display_names_cat,
                multiselect=True,
                label="🎯 Select Models for Radar Chart",
                info="Choose up to 5 models to visualize",
                elem_classes=["dropdown", "custom-dropdown"],
                interactive=True,
                filterable=True,
                allow_custom_value=False,
            )
            gr.HTML("")
            radar_chart_cat = gr.Plot(
                label="",
                value=create_domain_radar_chart(
                    initial_df_cat,
                    "Average Accuracy",
                    top5_models_cat,
                ),
                elem_classes=["radar-chart", "plot-container"],
            )
            gr.HTML("")
            # Update radar chart when model_selector_cat selection changes
            def update_radar_chart_cat(selected_display_names):
                # If no selection, fall back to the current top-5
                if not selected_display_names or len(selected_display_names) == 0:
                    df = load_leaderboard_data()
                    selected_display_names = [get_display_model_name(m) for m in df['Model Name'].tolist()[:5]]
                selected_models = [display_to_original_cat[name] for name in selected_display_names if name in display_to_original_cat]
                return create_domain_radar_chart(
                    load_leaderboard_data(),
                    "Average Accuracy",
                    selected_models,
                )

            model_selector_cat.change(
                fn=update_radar_chart_cat,
                inputs=model_selector_cat,
                outputs=radar_chart_cat,
            )
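
            # The scatter plot pairs each model's median output length (loaded from
            # src/data/length_data.json) with its score on the selected category.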
            # --- Med. Len. vs Overall Scatter Plot Section ---
            import json
            with open("src/data/length_data.json", "r") as f:
                length_data = json.load(f)
            # --- Create a Gradio State component to hold length_data ---
            length_data_state = gr.State(value=length_data)
            gr.HTML("""
            Explore the relationship between median output length and model performance by category
            """)
            # Category selection buttons (HTML + Gradio Radio for event)
            category_columns = [col for col in configs.ON_LOAD_COLUMNS_CATEGORY if col not in configs.CATEGORY_EXCLUDED_COLUMNS]
            # (cat-btn-radio related style block removed, now handled in custom_css)
            category_selector = gr.Radio(
                choices=category_columns,
                value="Overall",
                label="Select Category for Y-Axis",
                elem_id="cat-btn-radio",
                elem_classes=["cat-btn-radio"],
                interactive=True,
                show_label=False,
            )
            x_axis_selector = gr.Radio(
                choices=["Med. Len.", "Med. Resp. Len."],
                value="Med. Len.",
                label="Select X-Axis Data",
                elem_id="x-axis-btn-radio",
                elem_classes=["x-axis-btn-radio"],
                interactive=True,
                show_label=True,
            )
            gr.HTML("")
            scatter_plot_cat = gr.Plot(
                label="",
                value=create_len_overall_scatter(
                    load_leaderboard_data(),
                    y_col="Overall",
                    length_data=length_data,
                    x_axis_data_source=x_axis_selector.value,
                ),
                elem_classes=["efficiency-chart", "plot-container"],
            )
            gr.HTML("")
            # Update plot when category or x-axis selection changes
            def update_scatter_plot_cat(selected_category, selected_x_source, current_length_data_state):
                return create_len_overall_scatter(
                    load_leaderboard_data(),
                    y_col=selected_category,
                    length_data=current_length_data_state,
                    x_axis_data_source=selected_x_source,
                )

            category_selector.change(
                fn=update_scatter_plot_cat,
                inputs=[category_selector, x_axis_selector, length_data_state],
                outputs=scatter_plot_cat,
            )
            x_axis_selector.change(
                fn=update_scatter_plot_cat,
                inputs=[category_selector, x_axis_selector, length_data_state],
                outputs=scatter_plot_cat,
            )
            # When leaderboard selectors change, synchronize model_selector_cat and radar_chart_cat to the top-5
            def update_model_selector_and_radar_chart_cat_from_leaderboard(types, model_types, thinks, df, sort_col):
                _, _, top5_models = leaderboard_tab_cat["unified_filter"](types, model_types, thinks, df, sort_col)
                top5_display_names = [get_display_model_name(m) for m in top5_models[:5]]
                return gr.update(value=top5_display_names), create_domain_radar_chart(
                    load_leaderboard_data(),
                    "Average Accuracy",
                    top5_models[:5],
                )

            leaderboard_selectors_cat = [
                leaderboard_tab_cat["type_selector"],
                leaderboard_tab_cat["model_type_selector"],
                leaderboard_tab_cat["think_selector"],
                leaderboard_tab_cat["df_state"],
                leaderboard_tab_cat["sort_col_dropdown"],
            ]
            for selector in leaderboard_selectors_cat:
                selector.change(
                    fn=update_model_selector_and_radar_chart_cat_from_leaderboard,
                    inputs=leaderboard_selectors_cat,
                    outputs=[model_selector_cat, radar_chart_cat],
                )
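
            # --- Language Leaderboard Section ---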
gr.HTML("""
As a multilingual benchmark, TRUEBench supports a total of 12 user input languages: Korean (KO), English (EN), Japanese (JA), Chinese (ZH), Polish (PL), German (DE), Portuguese (PT), Spanish (ES), French (FR), Italian (IT), Russian (RU), and Vietnamese (VI).
""")
df = get_dataframe_language()
leaderboard_tab_lang = create_leaderboard_tab(
df,
"Language",
)
# --- Language Radar Chart Section ---
from vis_utils import load_leaderboard_language_data, create_language_radar_chart
initial_df_lang = load_leaderboard_language_data()
# Top 5 models based on leaderboard (Overall)
if "Overall" in initial_df_lang.columns:
top5_models_lang = initial_df_lang.sort_values("Overall", ascending=False)['Model Name'].tolist()[:5]
else:
top5_models_lang = initial_df_lang['Model Name'].tolist()[:5]
gr.HTML('
')
            # Model selector for the language radar chart
            display_names_lang = initial_df_lang['Model Name'].apply(get_display_model_name).tolist()
            original_names_lang = initial_df_lang['Model Name'].tolist()
            display_to_original_lang = dict(zip(display_names_lang, original_names_lang))
            top5_display_names_lang = [get_display_model_name(m) for m in top5_models_lang]
            model_selector_lang = gr.Dropdown(
                choices=display_names_lang,
                value=top5_display_names_lang,
                multiselect=True,
                label="🎯 Select Models for Radar Chart",
                info="Choose up to 5 models to visualize",
                elem_classes=["dropdown", "custom-dropdown"],
                interactive=True,
                filterable=True,
                allow_custom_value=False,
            )
            gr.HTML("")
            radar_chart_lang = gr.Plot(
                label="",
                value=create_language_radar_chart(
                    initial_df_lang,
                    "Average Accuracy",
                    top5_models_lang,
                ),
                elem_classes=["radar-chart", "plot-container"],
            )
            gr.HTML("")
            # Update radar chart when model_selector_lang selection changes
            def update_radar_chart_lang(selected_display_names):
                # If no selection, fall back to the current top-5
                if not selected_display_names or len(selected_display_names) == 0:
                    df = load_leaderboard_language_data()
                    selected_display_names = [get_display_model_name(m) for m in df['Model Name'].tolist()[:5]]
                selected_models = [display_to_original_lang[name] for name in selected_display_names if name in display_to_original_lang]
                return create_language_radar_chart(
                    load_leaderboard_language_data(),
                    "Average Accuracy",
                    selected_models,
                )

            model_selector_lang.change(
                fn=update_radar_chart_lang,
                inputs=model_selector_lang,
                outputs=radar_chart_lang,
            )

            # When leaderboard selectors change, automatically synchronize model_selector_lang and radar_chart_lang to the top-5
            def update_model_selector_and_radar_chart_lang_from_leaderboard(types, model_types, thinks, df, sort_col):
                _, _, top5_models = leaderboard_tab_lang["unified_filter"](types, model_types, thinks, df, sort_col)
                top5_display_names = [get_display_model_name(m) for m in top5_models[:5]]
                return gr.update(value=top5_display_names), create_language_radar_chart(
                    load_leaderboard_language_data(),
                    "Average Accuracy",
                    top5_models[:5],
                )

            leaderboard_selectors_lang = [
                leaderboard_tab_lang["type_selector"],
                leaderboard_tab_lang["model_type_selector"],
                leaderboard_tab_lang["think_selector"],
                leaderboard_tab_lang["df_state"],
                leaderboard_tab_lang["sort_col_dropdown"],
            ]
            for selector in leaderboard_selectors_lang:
                selector.change(
                    fn=update_model_selector_and_radar_chart_lang_from_leaderboard,
                    inputs=leaderboard_selectors_lang,
                    outputs=[model_selector_lang, radar_chart_lang],
                )
with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
with gr.Column():
with gr.Row():
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
with gr.Row():
gr.Markdown(EVALUATION_QUEUE_TEXT_OPTION1, elem_classes="markdown-text")
with gr.Row():
gr.Markdown("## ✉️ Submit your model here!", elem_classes="markdown-text")
login_button = gr.LoginButton()
with gr.Row():
with gr.Column():
contact_email = gr.Textbox(label="Contact Email", placeholder="Your email address", interactive=True)
model_name_textbox = gr.Textbox(label="Model Name")
model_type_dropdown = gr.Dropdown(
choices=["Instruct", "Think", "Hybrid"],
label="Model Type (Instruct, Think, or Hybrid)",
multiselect=False,
value="Instruct",
interactive=True,
)
think_type_dropdown = gr.Dropdown(
choices=["On", "Off"],
label="Think Mode (On/Off)",
multiselect=False,
value="Off",
interactive=False,
)
precision = gr.Dropdown(
choices=[i.value.name for i in Precision if i != Precision.Unknown],
label="Precision",
multiselect=False,
value="float16",
interactive=True,
)
                        # --- Dynamically control think_type based on model_type and connect event ---
                        def update_think_type(model_type_value):
                            if model_type_value == "Instruct":
                                return gr.update(value="Off", interactive=False)
                            elif model_type_value == "Think":
                                return gr.update(value="On", interactive=False)
                            else:  # Hybrid
                                return gr.update(value="On", interactive=True)

                        model_type_dropdown.change(
                            fn=update_think_type,
                            inputs=model_type_dropdown,
                            outputs=think_type_dropdown,
                        )
                        response_prefix_textbox = gr.Textbox(label="Response prefix", placeholder="(e.g., )")
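
                    # Column 2: serving/sampling configuration, entered as YAML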
                    with gr.Column():
                        yml_textbox_placeholder = """# vLLM serving parameters
# Reference: https://docs.vllm.ai/en/latest/cli/serve.html
llm_serve_args:
  max_model_len:
  tensor_parallel_size:
  dtype:
  ...
# OpenAI-compatible API (chat completion)
# Reference: https://platform.openai.com/docs/api-reference/chat
sampling_params:
  top_p:
  temperature:
  presence_penalty:
  ...
# vLLM sampling parameters
# Reference: https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#chat-api_1
extra_body:
  chat_template_kwargs:
    enable_thinking:
  ...
  top_k:
  repetition_penalty:
  ..."""
                        yml_textbox = gr.Textbox(
                            label="Configuration (YAML format)",
                            elem_id="yml-textbox",
                            lines=7,
                            value=yml_textbox_placeholder,
                        )
                        upbox = gr.File(
                            label="Upload configuration file as .yml or .yaml",
                            file_types=[".yml", ".yaml"],
                            type="filepath",
                            height=150,
                        )
                        # Add Translate to JSON button below upbox
                        translate_button = gr.Button(
                            "Translate to JSON",
                            elem_id="translate-to-json-btn",
                            elem_classes=["translate-btn"],
                            scale=None,
                        )
                        # Add custom style for the button
                        gr.HTML(
                            '''
                            '''
                        )
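
                    # Column 3: optional requirements and the parsed-config preview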
                    with gr.Column():
                        requirements_textbox = gr.Textbox(label="(Optional) Requirements", lines=30, elem_id="requirements-textbox")
                        output_dict = gr.Code(label="Translated Python Dictionary", language="json")

                submit_button = gr.Button("Submit Eval")
                submission_result = gr.Markdown()
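
                # Parse the uploaded .yml/.yaml file (preferred) or the YAML textbox,
                # and render the result as pretty-printed JSON for the preview pane.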
                def parse_and_display_yaml_config(upbox_path, yml_textbox_value):
                    import yaml, json
                    if upbox_path:
                        try:
                            with open(upbox_path, "r", encoding="utf-8") as f:
                                data = yaml.safe_load(f)
                            if data is None:
                                return "YAML file is empty."
                            return json.dumps(data, indent=4, ensure_ascii=False)
                        except Exception as e:
                            return f"Error parsing YAML file: {e}"
                    elif yml_textbox_value and yml_textbox_value.strip():
                        try:
                            data = yaml.safe_load(yml_textbox_value)
                            if data is None:
                                return "YAML textbox is empty or invalid."
                            return json.dumps(data, indent=4, ensure_ascii=False)
                        except Exception as e:
                            return f"Error parsing YAML textbox: {e}"
                    else:
                        return ""
                event = submit_button.click(get_profile_and_organizations, inputs=[], outputs=[user_state, organization_state])
                event.then(
                    add_new_eval_option,
                    [
                        contact_email,
                        model_name_textbox,
                        model_type_dropdown,
                        think_type_dropdown,
                        precision,
                        response_prefix_textbox,
                        requirements_textbox,
                        user_state,
                        organization_state,
                        yml_textbox,
                        upbox,
                    ],
                    submission_result,
                ).then(
                    fn=parse_and_display_yaml_config,
                    inputs=[upbox, yml_textbox],
                    outputs=output_dict,
                )
                translate_button.click(
                    fn=parse_and_display_yaml_config,
                    inputs=[upbox, yml_textbox],
                    outputs=output_dict,
                )
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )
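
# Restart the Space every 30 minutes so the dataset snapshots above are re-downloaded.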
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()

demo.queue(default_concurrency_limit=40).launch()