import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download

from src.data_utils import get_dataframe_category, get_dataframe_language
import src.config as configs
from utils import get_profile_and_organizations, download_with_restart
from vis_utils import load_leaderboard_data, create_domain_radar_chart, create_len_overall_scatter
from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    EVALUATION_QUEUE_TEXT_OPTION1,
    INTRODUCTION_TEXT,
    BANNER,
    TITLE,
    LINK,
)
from src.display.css_html_js import custom_css
from src.display.utils import Precision
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.submission.submit import add_new_eval_option
from ui import create_leaderboard_tab


def restart_space():
    API.restart_space(repo_id=REPO_ID)


### Space initialisation
download_with_restart(
    snapshot_download,
    repo_id=QUEUE_REPO,
    local_dir=EVAL_REQUESTS_PATH,
    repo_type="dataset",
    token=TOKEN,
    restart_func=restart_space,
)
download_with_restart(
    snapshot_download,
    repo_id=RESULTS_REPO,
    local_dir=EVAL_RESULTS_PATH,
    repo_type="dataset",
    token=TOKEN,
    restart_func=restart_space,
)

theme = gr.themes.Default(
    primary_hue="gray",
    neutral_hue="gray",
)

demo = gr.Blocks(css=custom_css, theme=theme)
with demo:
    gr.HTML(BANNER + TITLE + LINK)
    user_state = gr.State()
    organization_state = gr.State()

    with gr.Tabs(elem_classes="tab-buttons") as main_tabs:
        with gr.TabItem("TRUEBench", elem_id="llm-benchmark-tab-table", id=2):
            gr.HTML(INTRODUCTION_TEXT)
            gr.HTML("""
            <h2>Category Analysis</h2>
            <p>TRUEBench consists of 10 categories and 46 sub-categories that are highly related to productivity assistants.</p>
            """)
            # --- Category Explanation Box (2x5 grid, emoji, descriptions from about.py) ---
            from src.about import CATEGORY_DESCRIPTIONS
            gr.HTML(f"""
            <div class="category-grid">
                <div class="category-item"><b>📝 Content Generation</b><br>{CATEGORY_DESCRIPTIONS["Content Generation"]}</div>
                <div class="category-item"><b>✂️ Editing</b><br>{CATEGORY_DESCRIPTIONS["Editing"]}</div>
                <div class="category-item"><b>📊 Data Analysis</b><br>{CATEGORY_DESCRIPTIONS["Data Analysis"]}</div>
                <div class="category-item"><b>🧠 Reasoning</b><br>{CATEGORY_DESCRIPTIONS["Reasoning"]}</div>
                <div class="category-item"><b>🦄 Hallucination</b><br>{CATEGORY_DESCRIPTIONS["Hallucination"]}</div>
                <div class="category-item"><b>🛡️ Safety</b><br>{CATEGORY_DESCRIPTIONS["Safety"]}</div>
                <div class="category-item"><b>🔁 Repetition</b><br>{CATEGORY_DESCRIPTIONS["Repetition"]}</div>
                <div class="category-item"><b>📝 Summarization</b><br>{CATEGORY_DESCRIPTIONS["Summarization"]}</div>
                <div class="category-item"><b>🌐 Translation</b><br>{CATEGORY_DESCRIPTIONS["Translation"]}</div>
                <div class="category-item"><b>💬 Multi-Turn</b><br>{CATEGORY_DESCRIPTIONS["Multi-Turn"]}</div>
            </div>
            """)
            df = get_dataframe_category()
            gr.HTML("")
""") leaderboard_tab_cat = create_leaderboard_tab( df, "Category", ) gr.HTML("
") # --- Category Radar Chart Section --- from vis_utils import load_leaderboard_data, create_domain_radar_chart initial_df_cat = load_leaderboard_data() # Top 5 models based on leaderboard (Average Accuracy) if "Overall" in initial_df_cat.columns: top5_models_cat = initial_df_cat.sort_values("Overall", ascending=False)['Model Name'].tolist()[:5] else: top5_models_cat = initial_df_cat['Model Name'].tolist()[:5] gr.HTML('
') # Radar chart model selector (up to 5) from src.display.formatting import get_display_model_name display_names_cat = initial_df_cat['Model Name'].apply(get_display_model_name).tolist() original_names_cat = initial_df_cat['Model Name'].tolist() display_to_original_cat = dict(zip(display_names_cat, original_names_cat)) top5_display_names_cat = [get_display_model_name(m) for m in top5_models_cat] model_selector_cat = gr.Dropdown( choices=display_names_cat, value=top5_display_names_cat, multiselect=True, label="🎯 Select Models for Radar Chart", info="Choose up to 5 models to visualize", elem_classes=["dropdown", "custom-dropdown"], interactive=True, filterable=True, allow_custom_value=False ) gr.HTML(""" """) radar_chart_cat = gr.Plot( label="", value=create_domain_radar_chart( initial_df_cat, "Average Accuracy", top5_models_cat ), elem_classes=["radar-chart", "plot-container"] ) gr.HTML('
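            # get_display_model_name presumably shortens full repo ids for display (e.g.
            # "org/model-7b" -> "model-7b"); display_to_original_cat lets the radar callback
            # map dropdown selections back to the canonical names used in the dataframe.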
            # Update radar chart when model_selector_cat selection changes
            def update_radar_chart_cat(selected_display_names):
                # If no selection, fall back to the top-5 models
                if not selected_display_names:
                    df = load_leaderboard_data()
                    selected_display_names = [get_display_model_name(m) for m in df['Model Name'].tolist()[:5]]
                selected_models = [display_to_original_cat[name] for name in selected_display_names if name in display_to_original_cat]
                return create_domain_radar_chart(
                    load_leaderboard_data(),
                    "Average Accuracy",
                    selected_models,
                )

            model_selector_cat.change(
                fn=update_radar_chart_cat,
                inputs=model_selector_cat,
                outputs=radar_chart_cat,
            )

            # --- Med. Len. vs Overall Scatter Plot Section ---
            import json
            with open("src/data/length_data.json", "r") as f:
                length_data = json.load(f)
            # --- Create a Gradio State component to hold length_data ---
            length_data_state = gr.State(value=length_data)

            gr.HTML("""
            <h2>Output Length vs. Category Score</h2>
            <p>Explore the relationship between median output length and model performance by category.</p>
            """)
            # Category selection buttons (HTML + Gradio Radio for event)
            category_columns = [col for col in configs.ON_LOAD_COLUMNS_CATEGORY if col not in configs.CATEGORY_EXCLUDED_COLUMNS]
            # (cat-btn-radio related style block removed, now handled in custom_css)
            category_selector = gr.Radio(
                choices=category_columns,
                value="Overall",
                label="Select Category for Y-Axis",
                elem_id="cat-btn-radio",
                elem_classes=["cat-btn-radio"],
                interactive=True,
                show_label=False,
            )
            x_axis_selector = gr.Radio(
                choices=["Med. Len.", "Med. Resp. Len."],
                value="Med. Len.",
                label="Select X-Axis Data",
                elem_id="x-axis-btn-radio",
                elem_classes=["x-axis-btn-radio"],
                interactive=True,
                show_label=True,
            )
            gr.HTML("")
            scatter_plot_cat = gr.Plot(
                label="",
                value=create_len_overall_scatter(
                    load_leaderboard_data(),
                    y_col="Overall",
                    length_data=length_data,
                    x_axis_data_source=x_axis_selector.value,
                ),
                elem_classes=["efficiency-chart", "plot-container"],
            )
            gr.HTML("")

            # Update plot when category or x-axis selection changes
            def update_scatter_plot_cat(selected_category, selected_x_source, current_length_data_state):
                return create_len_overall_scatter(
                    load_leaderboard_data(),
                    y_col=selected_category,
                    length_data=current_length_data_state,
                    x_axis_data_source=selected_x_source,
                )

            category_selector.change(
                fn=update_scatter_plot_cat,
                inputs=[category_selector, x_axis_selector, length_data_state],
                outputs=scatter_plot_cat,
            )
            x_axis_selector.change(
                fn=update_scatter_plot_cat,
                inputs=[category_selector, x_axis_selector, length_data_state],
                outputs=scatter_plot_cat,
            )

            # When leaderboard selectors change, synchronize model_selector_cat and radar_chart_cat to the top-5
            def update_model_selector_and_radar_chart_cat_from_leaderboard(types, model_types, thinks, df, sort_col):
                _, _, top5_models = leaderboard_tab_cat["unified_filter"](types, model_types, thinks, df, sort_col)
                top5_display_names = [get_display_model_name(m) for m in top5_models[:5]]
                return gr.update(value=top5_display_names), create_domain_radar_chart(
                    load_leaderboard_data(),
                    "Average Accuracy",
                    top5_models[:5],
                )

            leaderboard_selectors_cat = [
                leaderboard_tab_cat["type_selector"],
                leaderboard_tab_cat["model_type_selector"],
                leaderboard_tab_cat["think_selector"],
                leaderboard_tab_cat["df_state"],
                leaderboard_tab_cat["sort_col_dropdown"],
            ]
            for selector in leaderboard_selectors_cat:
                selector.change(
                    fn=update_model_selector_and_radar_chart_cat_from_leaderboard,
                    inputs=leaderboard_selectors_cat,
                    outputs=[model_selector_cat, radar_chart_cat],
                )

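            # Every selector change re-sends the full selector state, so unified_filter can
            # recompute the filtered ranking from scratch and keep the radar chart pinned
            # to the current top-5 without any incremental bookkeeping.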
            gr.HTML("""
            <h2>Language Analysis</h2>
            <p>As a multilingual benchmark, TRUEBench supports a total of 12 user input languages: Korean (KO), English (EN), Japanese (JA), Chinese (ZH), Polish (PL), German (DE), Portuguese (PT), Spanish (ES), French (FR), Italian (IT), Russian (RU), and Vietnamese (VI).</p>
            """)
            df = get_dataframe_language()
            leaderboard_tab_lang = create_leaderboard_tab(
                df,
                "Language",
            )

            # --- Language Radar Chart Section ---
            from vis_utils import load_leaderboard_language_data, create_language_radar_chart
            initial_df_lang = load_leaderboard_language_data()
            # Top 5 models based on leaderboard (Overall)
            if "Overall" in initial_df_lang.columns:
                top5_models_lang = initial_df_lang.sort_values("Overall", ascending=False)['Model Name'].tolist()[:5]
            else:
                top5_models_lang = initial_df_lang['Model Name'].tolist()[:5]
            gr.HTML("")
            # Add model selector
            display_names_lang = initial_df_lang['Model Name'].apply(get_display_model_name).tolist()
            original_names_lang = initial_df_lang['Model Name'].tolist()
            display_to_original_lang = dict(zip(display_names_lang, original_names_lang))
            top5_display_names_lang = [get_display_model_name(m) for m in top5_models_lang]
            model_selector_lang = gr.Dropdown(
                choices=display_names_lang,
                value=top5_display_names_lang,
                multiselect=True,
                label="🎯 Select Models for Radar Chart",
                info="Choose up to 5 models to visualize",
                elem_classes=["dropdown", "custom-dropdown"],
                interactive=True,
                filterable=True,
                allow_custom_value=False,
            )
            gr.HTML("")
            radar_chart_lang = gr.Plot(
                label="",
                value=create_language_radar_chart(
                    initial_df_lang,
                    "Average Accuracy",
                    top5_models_lang,
                ),
                elem_classes=["radar-chart", "plot-container"],
            )
            gr.HTML("")
            # Update radar chart when model_selector_lang selection changes
            def update_radar_chart_lang(selected_display_names):
                if not selected_display_names:
                    df = load_leaderboard_language_data()
                    selected_display_names = [get_display_model_name(m) for m in df['Model Name'].tolist()[:5]]
                selected_models = [display_to_original_lang[name] for name in selected_display_names if name in display_to_original_lang]
                return create_language_radar_chart(
                    load_leaderboard_language_data(),
                    "Average Accuracy",
                    selected_models,
                )

            model_selector_lang.change(
                fn=update_radar_chart_lang,
                inputs=model_selector_lang,
                outputs=radar_chart_lang,
            )

            # When leaderboard selectors change, automatically synchronize model_selector_lang and radar_chart_lang to the top-5
            def update_model_selector_and_radar_chart_lang_from_leaderboard(types, model_types, thinks, df, sort_col):
                _, _, top5_models = leaderboard_tab_lang["unified_filter"](types, model_types, thinks, df, sort_col)
                top5_display_names = [get_display_model_name(m) for m in top5_models[:5]]
                return gr.update(value=top5_display_names), create_language_radar_chart(
                    load_leaderboard_language_data(),
                    "Average Accuracy",
                    top5_models[:5],
                )

            leaderboard_selectors_lang = [
                leaderboard_tab_lang["type_selector"],
                leaderboard_tab_lang["model_type_selector"],
                leaderboard_tab_lang["think_selector"],
                leaderboard_tab_lang["df_state"],
                leaderboard_tab_lang["sort_col_dropdown"],
            ]
            for selector in leaderboard_selectors_lang:
                selector.change(
                    fn=update_model_selector_and_radar_chart_lang_from_leaderboard,
                    inputs=leaderboard_selectors_lang,
                    outputs=[model_selector_lang, radar_chart_lang],
                )

        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT_OPTION1, elem_classes="markdown-text")
                with gr.Row():
                    gr.Markdown("## ✉️ Submit your model here!", elem_classes="markdown-text")
                    login_button = gr.LoginButton()

                with gr.Row():
                    with gr.Column():
                        contact_email = gr.Textbox(label="Contact Email", placeholder="Your email address", interactive=True)
                        model_name_textbox = gr.Textbox(label="Model Name")
                        model_type_dropdown = gr.Dropdown(
                            choices=["Instruct", "Think", "Hybrid"],
                            label="Model Type (Instruct, Think, or Hybrid)",
                            multiselect=False,
                            value="Instruct",
                            interactive=True,
                        )
                        think_type_dropdown = gr.Dropdown(
                            choices=["On", "Off"],
                            label="Think Mode (On/Off)",
                            multiselect=False,
                            value="Off",
                            interactive=False,
                        )
                        precision = gr.Dropdown(
                            choices=[i.value.name for i in Precision if i != Precision.Unknown],
                            label="Precision",
                            multiselect=False,
                            value="float16",
                            interactive=True,
                        )

                        # --- Dynamically control think_type based on model_type and connect event ---
                        def update_think_type(model_type_value):
                            if model_type_value == "Instruct":
                                return gr.update(value="Off", interactive=False)
                            elif model_type_value == "Think":
                                return gr.update(value="On", interactive=False)
                            else:  # Hybrid
                                return gr.update(value="On", interactive=True)

                        model_type_dropdown.change(
                            fn=update_think_type,
                            inputs=model_type_dropdown,
                            outputs=think_type_dropdown,
                        )

                        response_prefix_textbox = gr.Textbox(label="Response prefix", placeholder="(e.g., )")

                    with gr.Column():
                        yml_textbox_placeholder = """# vLLM serving parameters
# Reference: https://docs.vllm.ai/en/latest/cli/serve.html
llm_serve_args:
  max_model_len:
  tensor_parallel_size:
  dtype:
  ...

# OpenAI-compatible API (chat completion)
# Reference: https://platform.openai.com/docs/api-reference/chat
sampling_params:
  top_p:
  temperature:
  presence_penalty:
  ...

# vLLM sampling parameters
# Reference: https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#chat-api_1
extra_body:
  chat_template_kwargs:
    enable_thinking:
    ...
  top_k:
  repetition_penalty:
  ..."""
                        yml_textbox = gr.Textbox(
                            label="Configuration (YAML format)",
                            elem_id="yml-textbox",
                            lines=7,
                            value=yml_textbox_placeholder,
                        )
                        upbox = gr.File(
                            label="Upload configuration file as .yml or .yaml",
                            file_types=[".yml", ".yaml"],
                            type="filepath",
                            height=150,
                        )
                        # Add Translate to JSON button below upbox
                        translate_button = gr.Button(
                            "Translate to JSON",
                            elem_id="translate-to-json-btn",
                            elem_classes=["translate-btn"],
                        )
                        # Add custom style for the button
                        gr.HTML("")

                    with gr.Column():
                        requirements_textbox = gr.Textbox(label="(Optional) Requirements", lines=30, elem_id="requirements-textbox")

                output_dict = gr.Code(label="Translated Python Dictionary", language="json")
                submit_button = gr.Button("Submit Eval")
                submission_result = gr.Markdown()

                # Parse the uploaded file if present, otherwise the textbox; render as JSON
                def parse_and_display_yaml_config(upbox_path, yml_textbox_value):
                    import json

                    import yaml

                    if upbox_path:
                        try:
                            with open(upbox_path, "r", encoding="utf-8") as f:
                                data = yaml.safe_load(f)
                            if data is None:
                                return "YAML file is empty."
                            return json.dumps(data, indent=4, ensure_ascii=False)
                        except Exception as e:
                            return f"Error parsing YAML file: {e}"
                        elif yml_textbox_value and yml_textbox_value.strip():
                            pass
                    elif yml_textbox_value and yml_textbox_value.strip():
                        try:
                            data = yaml.safe_load(yml_textbox_value)
                            if data is None:
                                return "YAML textbox is empty or invalid."
                            return json.dumps(data, indent=4, ensure_ascii=False)
                        except Exception as e:
                            return f"Error parsing YAML textbox: {e}"
                    else:
                        return ""

                event = submit_button.click(
                    get_profile_and_organizations,
                    inputs=[],
                    outputs=[user_state, organization_state],
                )
                event.then(
                    add_new_eval_option,
                    [
                        contact_email,
                        model_name_textbox,
                        model_type_dropdown,
                        think_type_dropdown,
                        precision,
                        response_prefix_textbox,
                        requirements_textbox,
                        user_state,
                        organization_state,
                        yml_textbox,
                        upbox,
                    ],
                    submission_result,
                ).then(
                    fn=parse_and_display_yaml_config,
                    inputs=[upbox, yml_textbox],
                    outputs=output_dict,
                )
                translate_button.click(
                    fn=parse_and_display_yaml_config,
                    inputs=[upbox, yml_textbox],
                    outputs=output_dict,
                )

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()

demo.queue(default_concurrency_limit=40).launch()
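# Illustrative example of a configuration a submitter might paste into the YAML box.
# The keys mirror the placeholder above; the values are assumptions, not defaults:
#
#   llm_serve_args:
#     max_model_len: 32768
#     tensor_parallel_size: 2
#     dtype: bfloat16
#   sampling_params:
#     temperature: 0.7
#     top_p: 0.95
#   extra_body:
#     chat_template_kwargs:
#       enable_thinking: true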