import gradio as gr import pandas as pd from pathlib import Path import plotly.express as px from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns from apscheduler.schedulers.background import BackgroundScheduler from huggingface_hub import snapshot_download import textwrap from src.about import ( CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE, ) from src.display.css_html_js import custom_css from src.display.utils import ( BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoEvalColumn, ModelType, fields, WeightType, Precision ) from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_rag_leaderboard_df from src.submission.submit import add_new_eval import base64 def restart_space(): API.restart_space(repo_id=REPO_ID) def make_rate_chart(df: pd.DataFrame): """Return a Plotly bar chart of hallucination rates.""" # long-form dataframe for grouped bars df_long = df.melt( id_vars="Models", value_vars=["RAG Hallucination Rate (%)", "Non-RAG Hallucination Rate (%)"], var_name="Benchmark", value_name="Rate", ) fig = px.bar( df_long, x="Models", y="Rate", color="Benchmark", barmode="group", title="Hallucination Rates by Model", height=400, ) fig.update_layout(xaxis_title="", yaxis_title="%") return fig def make_leaderboard_plot(df: pd.DataFrame, col: str, title: str, bar_color: str): df_sorted = df.sort_values(col, ascending=False) fig = px.bar( df_sorted, x=col, y="Models", orientation="h", title=title, text_auto=".2f", height=400, color_discrete_sequence=[bar_color], ) fig.update_traces(textposition="outside", cliponaxis=False) fig.update_layout( xaxis_title="Hallucination Rate (%)", yaxis_title="", yaxis=dict(dtick=1), # ensure every model shown margin=dict(l=140, r=60, t=60, b=40) ) fig.update_traces(textposition="outside") return fig def make_rag_average_plot(df: pd.DataFrame, col: str, title: str, bar_color: str): rag_cols = [ "Context in System Prompt (%)", "Context and Question Single-Turn (%)", "Context and Question Two-Turns (%)", ] df_plot = df.copy() if col not in df_plot.columns: df_plot[col] = df_plot[rag_cols].mean(axis=1, skipna=True).round(2) df_plot["Std Dev"] = df_plot[rag_cols].std(axis=1, skipna=True).round(2) df_sorted = df_plot.sort_values(col, ascending=False) fig = px.bar( df_sorted, x=col, y="Models", orientation="h", title=title, height=400, color_discrete_sequence=[bar_color], error_x="Std Dev", ) fig.update_traces( texttemplate="%{x:.2f}", textposition="inside", insidetextanchor="middle", cliponaxis=False, ) fig.update_layout( xaxis_title="Hallucination Rate (%)", yaxis_title="", yaxis=dict(dtick=1), margin=dict(l=140, r=60, t=60, b=40), ) return fig def make_rag_method_average_plot(df: pd.DataFrame, title: str, bar_color: str): method_cols = [ "Context in System Prompt (%)", "Context and Question Single-Turn (%)", "Context and Question Two-Turns (%)", ] averages = df[method_cols].mean().round(2) stds = df[method_cols].std().round(2) avg_df = pd.DataFrame( { "RAG Method": averages.index, "Average Hallucination Rate (%)": averages.values, "Std Dev": stds.values, } ) fig = px.bar( avg_df, x="RAG Method", y="Average Hallucination Rate (%)", error_y="Std Dev", title=title, height=400, color_discrete_sequence=[bar_color], ) fig.update_traces( texttemplate="%{y:.2f}" if 'orientation' not in fig.data[0] or fig.data[0].orientation == 'v' else "%{x:.2f}", textposition="inside", insidetextanchor="start", cliponaxis=False, textfont_color="white", ) labels_map = { "Context in System Prompt (%)": "Context in
System Prompt", "Context and Question Single-Turn (%)": "Context & Question
Single-Turn", "Context and Question Two-Turns (%)": "Context & Question
Two-Turns", } fig.update_xaxes( tickmode="array", tickvals=list(labels_map.keys()), ticktext=list(labels_map.values()), tickangle=0, automargin=True, ) fig.update_layout( xaxis_title="", yaxis_title="Hallucination Rate (%)", margin=dict(l=40, r=100, t=60, b=120), ) return fig def color_scale(s, cmap): """ Return background-colour styles for a numeric Series (lower = greener, higher = redder). Works with any palette length. """ colours = px.colors.sequential.__dict__[cmap] n = len(colours) - 1 # max valid index rng = s.max() - s.min() norm = (s - s.min()) / (rng if rng else 1) return [f"background-color:{colours[int(v * n)]}" for v in 1 - norm] ### Space initialisation try: print(EVAL_REQUESTS_PATH) snapshot_download( repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN ) except Exception: # restart_space() print(f"[WARN] Skipping RESULTS sync: {Exception}") try: print(EVAL_RESULTS_PATH) snapshot_download( repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN ) except Exception: # restart_space() print(f"[WARN] Skipping RESULTS sync: {Exception}") LEADERBOARD_DF = get_leaderboard_df("leaderboard/data/leaderboard.csv") RAG_DF = get_rag_leaderboard_df("leaderboard/data/rag_methods_compare.csv") def init_leaderboard(df: pd.DataFrame): if df is None or df.empty: raise ValueError("Leaderboard DataFrame is empty or None.") return Leaderboard( value=df, datatype=["markdown", "markdown", "number", "number", "number"], select_columns=SelectColumns( default_selection=[ "Rank", "Models", "Average Hallucination Rate (%)", "RAG Hallucination Rate (%)", "Non-RAG Hallucination Rate (%)" ], cant_deselect=["Models", "Rank"], label="Select Columns to Display:", ), search_columns=["Models"], # column_widths=["3%"], bool_checkboxgroup_label=None, interactive=False, height=800 ) image_path = "static/kluster-color.png" with open(image_path, "rb") as img_file: b64_string = base64.b64encode(img_file.read()).decode("utf-8") # print("CUSTOM CSS\n", custom_css[-1000:], "\n---------") demo = gr.Blocks(css=custom_css) with demo: gr.HTML(f"""
kluster.ai logo
LLM Hallucination Detection Leaderboard
Evaluating factual accuracy and faithfulness of LLMs in both RAG and non-RAG settings with Verify by kluster.ai which provides an API for detecting hallucinations with any model.
""") with gr.Tabs(elem_classes="tab-buttons") as tabs: with gr.TabItem("๐Ÿ… Hallucination Leaderboard", elem_id="llm-benchmark-tab-table", id=0): gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text") # ---------- Chart ---------- with gr.Row(): with gr.Column(): gr.Plot( make_leaderboard_plot( LEADERBOARD_DF, "RAG Hallucination Rate (%)", "RAG Hallucination Rate (lower is better)", bar_color="#4CAF50", ), show_label=False, ) gr.Markdown("*HaluEval-QA benchmark (RAG): The model receives a question plus supporting context. We report the % of answers that introduce facts not found in that context โ€” lower is better. See the **Methodology** section below for more information.*", elem_classes="plot-caption") with gr.Column(): gr.Plot( make_leaderboard_plot( LEADERBOARD_DF, "Non-RAG Hallucination Rate (%)", "Non-RAG Hallucination Rate (lower is better)", bar_color="#FF7043", ), show_label=False, ) gr.Markdown("*UltraChat benchmark (~11 k prompts, non-RAG): Evaluates open-domain answers when only the question is given. Score is the % of hallucinated responses โ€” lower is better. See the **Methodology** section below for more information.*", elem_classes="plot-caption") # ---------- Leaderboard ---------- leaderboard = init_leaderboard(LEADERBOARD_DF) # ---------- Get Started with Verify ---------- verify_markdown = (Path(__file__).parent / "verify.md").read_text() gr.Markdown(verify_markdown, elem_classes="markdown-text") code_example_reliability = textwrap.dedent( r"""curl -X POST https://api.kluster.ai/v1/verify/reliability \ -H "Authorization: Bearer YOUR_API_KEY" \ -H "Content-Type: application/json" \ -d '{ "prompt": "Tell me about the new iPhone 20 features", "output": "The iPhone 20 includes a revolutionary holographic display, 200MP camera with AI scene detection, and can project 3D holograms up to 6 feet away for video calls.", "context": null }'""" ) gr.Code(code_example_reliability, language="shell") code_example_chat = textwrap.dedent( r"""curl -X POST https://api.kluster.ai/v1/chat/completions \ -H "Authorization: Bearer YOUR_API_KEY" \ -H "Content-Type: application/json" \ -d '{ "model": "klusterai/verify-reliability", "messages": [ { "role": "user", "content": "What can you tell me about Milos Burger Joint?" }, { "role": "assistant", "content": "Milos Burger Joint has been serving authentic Burgers cuisine since 1999 and just won 2 Michelin stars last week, making it the highest-rated burger restaurant in the city." } ] }'""" ) gr.Code(code_example_chat, language="shell") gr.Markdown((Path(__file__).parent / "docs.md").read_text(), elem_classes="markdown-text") with gr.TabItem("๐Ÿงช RAG Techniques and Hallucinations", elem_id="llm-benchmark-tab-table", id=2): rag_intro_markdown = (Path(__file__).parent / "rag_techniques_intro.md").read_text() rag_details_markdown = (Path(__file__).parent / "rag_techniques_details.md").read_text() gr.Markdown(rag_intro_markdown, elem_classes="markdown-text") with gr.Row(): with gr.Column(): gr.Plot( make_rag_method_average_plot( RAG_DF, "Average Hallucination Rate by RAG Method (lower is better)", bar_color="#4CAF50", ), show_label=False, ) gr.Markdown( "*Mean hallucination rate for each RAG prompting strategy across all models on the HaluEval-QA benchmark. Error bars represent ยฑ1 SD; lower is better.*", elem_classes="plot-caption", ) with gr.Column(): gr.Plot( make_rag_average_plot( RAG_DF, "Average Hallucination Rate (%)", "Average Hallucination Rate per Model (lower is better)", bar_color="#2196F3", ), show_label=False, ) gr.Markdown( "*Mean hallucination rate across the three RAG prompting settings for each individual model. Error bars show ยฑ1 SD across the three strategies; lower is better.*", elem_classes="plot-caption", ) rag_leaderboard = Leaderboard( value=RAG_DF, datatype=["markdown", "number", "number", "number"], select_columns=SelectColumns( default_selection=[ "Models", "Context in System Prompt (%)", "Context and Question Single-Turn (%)", "Context and Question Two-Turns (%)", ], cant_deselect=["Models"], label="Select RAG Method Columns:", ), search_columns=["Models"], bool_checkboxgroup_label=None, interactive=False, height=700 ) gr.Markdown(rag_details_markdown, elem_classes="markdown-text") with gr.TabItem("๐Ÿš€ Submit Here! ", elem_id="llm-benchmark-tab-table", id=4): gr.Markdown((Path(__file__).parent / "submit.md").read_text(), elem_classes="markdown-text") scheduler = BackgroundScheduler() scheduler.add_job(restart_space, "interval", seconds=1800) scheduler.start() demo.queue(default_concurrency_limit=40).launch(show_api=False)