import base64
from pathlib import Path

import gradio as gr
import pandas as pd
import plotly.express as px
from apscheduler.schedulers.background import BackgroundScheduler
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
from huggingface_hub import snapshot_download

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    BENCHMARK_COLS,
    COLS,
    EVAL_COLS,
    EVAL_TYPES,
    AutoEvalColumn,
    ModelType,
    fields,
    WeightType,
    Precision,
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval


def restart_space():
    API.restart_space(repo_id=REPO_ID)


def make_rate_chart(df: pd.DataFrame):
    """Return a grouped Plotly bar chart of RAG vs. non-RAG hallucination rates."""
    # Melt to long form so both benchmarks can be drawn as grouped bars.
    df_long = df.melt(
        id_vars="Models",
        value_vars=["RAG Hallucination Rate (%)", "Non-RAG Hallucination Rate (%)"],
        var_name="Benchmark",
        value_name="Rate",
    )
    fig = px.bar(
        df_long,
        x="Models",
        y="Rate",
        color="Benchmark",
        barmode="group",
        title="Hallucination Rates by Model",
        height=400,
    )
    fig.update_layout(xaxis_title="", yaxis_title="%")
    return fig


def make_leaderboard_plot(df: pd.DataFrame, col: str, title: str, bar_color: str):
    """
    Return a horizontal bar chart sorted by `col` so that the lowest
    (best) value appears at the top of the axis.
    """
    # Plotly draws the first DataFrame row at the bottom of a horizontal bar
    # chart, so sort descending to place the lowest (best) rate at the top.
    df_sorted = df.sort_values(col, ascending=False)
    fig = px.bar(
        df_sorted,
        x=col,
        y="Models",
        orientation="h",
        title=title,
        text_auto=".2f",
        height=400,
        color_discrete_sequence=[bar_color],
    )
    fig.update_traces(textposition="outside", cliponaxis=False)
    fig.update_layout(
        xaxis_title="Hallucination Rate (%)",
        yaxis_title="",
        yaxis=dict(dtick=1),  # ensure every model is shown
        margin=dict(l=140, r=60, t=60, b=40),
    )
    return fig


def color_scale(s, cmap):
    """
    Return background-colour styles for a numeric Series
    (lower = greener, higher = redder). Works with any palette length.
    """
    colours = px.colors.sequential.__dict__[cmap]
    n = len(colours) - 1  # max valid index
    rng = s.max() - s.min()
    norm = (s - s.min()) / (rng if rng else 1)
    # Invert so that lower (better) values map to the first colours of the palette.
    return [f"background-color:{colours[int(v * n)]}" for v in 1 - norm]


# Usage sketch (assumption: this helper is not wired into the UI below). With
# pandas Styler.apply, each column arrives as a Series and one CSS string is
# expected per cell, e.g.:
#     LEADERBOARD_DF.style.apply(lambda s: color_scale(s, "Reds"),
#                                subset=["Average Hallucination Rate (%)"])


### Space initialisation
try:
    print(EVAL_REQUESTS_PATH)
    snapshot_download(
        repo_id=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        token=TOKEN,
    )
except Exception as e:
    # restart_space()
    print(f"[WARN] Skipping REQUESTS sync: {e}")

try:
    print(EVAL_RESULTS_PATH)
    snapshot_download(
        repo_id=RESULTS_REPO,
        local_dir=EVAL_RESULTS_PATH,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        token=TOKEN,
    )
except Exception as e:
    # restart_space()
    print(f"[WARN] Skipping RESULTS sync: {e}")

# LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
LEADERBOARD_DF = get_leaderboard_df("leaderboard/data/leaderboard.csv")

# (
#     finished_eval_queue_df,
#     running_eval_queue_df,
#     pending_eval_queue_df,
# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)


def init_leaderboard(df: pd.DataFrame):
    if df is None or df.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")
    return Leaderboard(
        value=df,
        datatype=["markdown", "markdown", "number", "number", "number"],
        select_columns=SelectColumns(
            default_selection=[
                "Rank",
                "Models",
                "Average Hallucination Rate (%)",
                "RAG Hallucination Rate (%)",
                "Non-RAG Hallucination Rate (%)",
            ],
            cant_deselect=["Models", "Rank"],
            label="Select Columns to Display:",
        ),
        search_columns=["Models"],
        # column_widths=["3%"],
        bool_checkboxgroup_label=None,
        interactive=False,
    )


# Embed the kluster.ai logo as base64 so it can be inlined in the header HTML.
image_path = "static/kluster-color.png"
with open(image_path, "rb") as img_file:
    b64_string = base64.b64encode(img_file.read()).decode("utf-8")

# print("CUSTOM CSS\n", custom_css[-1000:], "\n---------")
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(f"""
        <div style="text-align: center;">
            <img src="data:image/png;base64,{b64_string}" alt="kluster.ai logo" style="height: 60px; margin-bottom: 12px;">
            <h1>LLM Hallucination Detection Leaderboard</h1>
            <p>Evaluating factual accuracy and faithfulness of LLMs in both RAG and real-world knowledge settings with Verify by kluster.ai</p>
        </div>
    """)

    # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Hallucination Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
            # ---------- Charts ----------
            with gr.Row():
                gr.Plot(
                    make_leaderboard_plot(
                        LEADERBOARD_DF,
                        "RAG Hallucination Rate (%)",
                        "RAG Hallucination Rate (lower is better)",
                        bar_color="#4CAF50",
                    ),
                    show_label=False,
                )
                gr.Plot(
                    make_leaderboard_plot(
                        LEADERBOARD_DF,
                        "Non-RAG Hallucination Rate (%)",
                        "Non-RAG Hallucination Rate (lower is better)",
                        bar_color="#FF7043",
                    ),
                    show_label=False,
                )

            # ---------- Leaderboard ----------
            leaderboard = init_leaderboard(LEADERBOARD_DF)

        with gr.TabItem("📝 Details", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown((Path(__file__).parent / "docs.md").read_text())

        with gr.TabItem("🚀 Submit Here! ", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown((Path(__file__).parent / "submit.md").read_text())

            # with gr.Column():
            #     with gr.Row():
            #         gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
            #     with gr.Column():
            #         with gr.Accordion(
            #             f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
            #             open=False,
            #         ):
            #             with gr.Row():
            #                 finished_eval_table = gr.components.Dataframe(
            #                     value=finished_eval_queue_df,
            #                     headers=EVAL_COLS,
            #                     datatype=EVAL_TYPES,
            #                     row_count=5,
            #                 )
            #         with gr.Accordion(
            #             f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
            #             open=False,
            #         ):
            #             with gr.Row():
            #                 running_eval_table = gr.components.Dataframe(
            #                     value=running_eval_queue_df,
            #                     headers=EVAL_COLS,
            #                     datatype=EVAL_TYPES,
            #                     row_count=5,
            #                 )
            #         with gr.Accordion(
            #             f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
            #             open=False,
            #         ):
            #             with gr.Row():
            #                 pending_eval_table = gr.components.Dataframe(
            #                     value=pending_eval_queue_df,
            #                     headers=EVAL_COLS,
            #                     datatype=EVAL_TYPES,
            #                     row_count=5,
            #                 )
            #     with gr.Row():
            #         gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
            #     with gr.Row():
            #         with gr.Column():
            #             model_name_textbox = gr.Textbox(label="Model name")
            #             revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
            #             model_type = gr.Dropdown(
            #                 choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
            #                 label="Model type",
            #                 multiselect=False,
            #                 value=None,
            #                 interactive=True,
            #             )
            #         with gr.Column():
            #             precision = gr.Dropdown(
            #                 choices=[i.value.name for i in Precision if i != Precision.Unknown],
            #                 label="Precision",
            #                 multiselect=False,
            #                 value="float16",
            #                 interactive=True,
            #             )
            #             weight_type = gr.Dropdown(
            #                 choices=[i.value.name for i in WeightType],
            #                 label="Weights type",
            #                 multiselect=False,
            #                 value="Original",
            #                 interactive=True,
            #             )
            #             base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
            #     submit_button = gr.Button("Submit Eval")
            #     submission_result = gr.Markdown()
            #     submit_button.click(
            #         add_new_eval,
            #         [
            #             model_name_textbox,
            #             base_model_name_textbox,
            #             revision_name_textbox,
            #             precision,
            #             weight_type,
            #             model_type,
            #         ],
            #         submission_result,
            #     )

    # with gr.Row():
    #     with gr.Accordion("📙 Citation", open=False):
    #         citation_button = gr.Textbox(
    #             value=CITATION_BUTTON_TEXT,
    #             label=CITATION_BUTTON_LABEL,
    #             lines=20,
    #             elem_id="citation-button",
    #             show_copy_button=True,
    #         )

# Restart the Space every 30 minutes so the leaderboard data stays fresh.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()

demo.queue(default_concurrency_limit=40).launch()