Spaces:

kluster-ai
/

LLM-Hallucination-Detection-Leaderboard

Running

File size: 14,884 Bytes

import gradio as gr
import pandas as pd
from pathlib import Path
import plotly.express as px
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
import textwrap

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    BENCHMARK_COLS,
    COLS,
    EVAL_COLS,
    EVAL_TYPES,
    AutoEvalColumn,
    ModelType,
    fields,
    WeightType,
    Precision
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_rag_leaderboard_df
from src.submission.submit import add_new_eval
import base64


def restart_space() -> None:
    """Call ``API.restart_space`` for the Space identified by ``REPO_ID``.

    ``API`` and ``REPO_ID`` both come from ``src.envs``. This function is
    registered as a periodic scheduler job at the bottom of this module.
    """
    API.restart_space(repo_id=REPO_ID)



def make_rate_chart(df: pd.DataFrame):
    """Build a grouped bar chart comparing RAG and non-RAG hallucination rates.

    Args:
        df: Frame holding a ``Models`` column plus the two rate columns
            ``RAG Hallucination Rate (%)`` and ``Non-RAG Hallucination Rate (%)``.

    Returns:
        A Plotly figure with one group of bars per model.
    """
    rate_columns = ["RAG Hallucination Rate (%)", "Non-RAG Hallucination Rate (%)"]

    # Wide -> long: every (model, benchmark) pair becomes one row / one bar.
    tidy = df.melt(
        id_vars="Models",
        value_vars=rate_columns,
        var_name="Benchmark",
        value_name="Rate",
    )

    bar_options = dict(
        x="Models",
        y="Rate",
        color="Benchmark",
        barmode="group",
        title="Hallucination Rates by Model",
        height=400,
    )
    chart = px.bar(tidy, **bar_options)
    chart.update_layout(xaxis_title="", yaxis_title="%")
    return chart

def make_leaderboard_plot(df: pd.DataFrame, col: str, title: str, bar_color: str):
    """Return a horizontal bar chart of one hallucination-rate column per model.

    Args:
        df: Frame with a ``Models`` column and the numeric column ``col``.
        col: Name of the column plotted along the x-axis.
        title: Chart title.
        bar_color: Single colour applied to every bar.

    Returns:
        The Plotly figure.
    """
    df_sorted = df.sort_values(col, ascending=False)
    fig = px.bar(
        df_sorted,
        x=col,
        y="Models",
        orientation="h",
        title=title,
        text_auto=".2f",
        height=400,
        color_discrete_sequence=[bar_color],
    )
    # Value labels go outside the bars; cliponaxis=False keeps them from being
    # cut off at the edge of the plotting area. (Set once -- the original
    # repeated the textposition update a second time with no effect.)
    fig.update_traces(textposition="outside", cliponaxis=False)

    fig.update_layout(
        xaxis_title="Hallucination Rate (%)",
        yaxis_title="",
        yaxis=dict(dtick=1),   # ensure every model shown
        margin=dict(l=140, r=60, t=60, b=40),
    )
    return fig


def make_rag_average_plot(df: pd.DataFrame, col: str, title: str, bar_color: str):
    """Plot each model's average RAG hallucination rate with error bars.

    Args:
        df: Frame with a ``Models`` column and the three RAG method columns.
        col: Column holding the per-model average; it is computed from the
            three RAG method columns when ``df`` does not already have it.
        title: Chart title.
        bar_color: Single colour applied to every bar.

    Returns:
        A horizontal Plotly bar chart whose error bars are the row-wise
        standard deviation across the three RAG method columns.
    """
    rag_cols = [
        "Context in System Prompt (%)",
        "Context and Question Single-Turn (%)",
        "Context and Question Two-Turns (%)",
    ]

    # Work on a copy so the caller's frame never gains the helper columns.
    plot_frame = df.copy()
    method_rates = plot_frame[rag_cols]

    if col not in plot_frame.columns:
        plot_frame[col] = method_rates.mean(axis=1, skipna=True).round(2)
    plot_frame["Std Dev"] = method_rates.std(axis=1, skipna=True).round(2)

    ordered = plot_frame.sort_values(col, ascending=False)

    chart = px.bar(
        ordered,
        x=col,
        y="Models",
        orientation="h",
        error_x="Std Dev",
        title=title,
        height=400,
        color_discrete_sequence=[bar_color],
    )

    # Two-decimal value label centred inside each bar.
    label_style = dict(
        texttemplate="%{x:.2f}",
        textposition="inside",
        insidetextanchor="middle",
        cliponaxis=False,
    )
    chart.update_traces(**label_style)

    chart.update_layout(
        xaxis_title="Hallucination Rate (%)",
        yaxis_title="",
        yaxis=dict(dtick=1),
        margin=dict(l=140, r=60, t=60, b=40),
    )
    return chart


def make_rag_method_average_plot(df: pd.DataFrame, title: str, bar_color: str):
    """Plot the mean hallucination rate of each RAG method across all rows.

    Args:
        df: Frame containing the three RAG method columns.
        title: Chart title.
        bar_color: Single colour applied to every bar.

    Returns:
        A Plotly bar chart with one bar per RAG method; error bars are the
        column-wise standard deviation.
    """
    # Column name -> two-line tick label shown under each bar.
    tick_labels = {
        "Context in System Prompt (%)": "Context in<br>System Prompt",
        "Context and Question Single-Turn (%)": "Context & Question<br>Single-Turn",
        "Context and Question Two-Turns (%)": "Context & Question<br>Two-Turns",
    }
    method_cols = list(tick_labels)

    method_rates = df[method_cols]
    means = method_rates.mean().round(2)
    spreads = method_rates.std().round(2)

    summary = pd.DataFrame(
        {
            "RAG Method": means.index,
            "Average Hallucination Rate (%)": means.values,
            "Std Dev": spreads.values,
        }
    )

    chart = px.bar(
        summary,
        x="RAG Method",
        y="Average Hallucination Rate (%)",
        error_y="Std Dev",
        title=title,
        height=400,
        color_discrete_sequence=[bar_color],
    )

    # Pick the value axis for the label from the first trace's orientation.
    first_trace = chart.data[0]
    if "orientation" not in first_trace or first_trace.orientation == "v":
        value_template = "%{y:.2f}"
    else:
        value_template = "%{x:.2f}"

    chart.update_traces(
        texttemplate=value_template,
        textposition="inside",
        insidetextanchor="start",
        cliponaxis=False,
        textfont_color="white",
    )
    chart.update_xaxes(
        tickmode="array",
        tickvals=list(tick_labels.keys()),
        ticktext=list(tick_labels.values()),
        tickangle=0,
        automargin=True,
    )
    chart.update_layout(
        xaxis_title="",
        yaxis_title="Hallucination Rate (%)",
        margin=dict(l=40, r=100, t=60, b=120),
    )
    return chart


def color_scale(s, cmap):
    """
    Return background-colour styles for a numeric Series (lower = greener,
    higher = redder, assuming the chosen palette runs red -> green). Works
    with any palette length.

    The values are min-max normalised and inverted (``1 - norm``), so the
    smallest value maps to the last palette entry and the largest to the
    first.

    Args:
        s: Numeric pandas Series to colour.
        cmap: Name of a palette in ``plotly.express.colors.sequential``.

    Returns:
        One CSS string per element of ``s``. Missing (NaN) values get an
        empty string, i.e. no styling, instead of raising.

    Raises:
        KeyError: If ``cmap`` is not a name in the palette module.
    """
    colours = vars(px.colors.sequential)[cmap]
    n = len(colours) - 1                     # max valid index

    lowest = s.min()
    rng = s.max() - lowest
    # A zero range (all values equal) would divide by zero; fall back to 1.
    norm = (s - lowest) / (rng if rng else 1)

    # int(NaN) raises ValueError, so missing values are skipped explicitly.
    return [
        "" if pd.isna(v) else f"background-color:{colours[int(v * n)]}"
        for v in 1 - norm
    ]


### Space initialisation
# Both dataset syncs are best-effort: a failure is logged and start-up
# continues instead of restarting the Space.
try:
    print(EVAL_REQUESTS_PATH)
    snapshot_download(
        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
    )
except Exception as exc:
    # restart_space()
    # Report the caught error itself (not the Exception class) and name the
    # repo that actually failed.
    print(f"[WARN] Skipping QUEUE sync: {exc!r}")
try:
    print(EVAL_RESULTS_PATH)
    snapshot_download(
        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
    )
except Exception as exc:
    # restart_space()
    print(f"[WARN] Skipping RESULTS sync: {exc!r}")


# Leaderboard tables are loaded from CSV paths relative to the working directory.
LEADERBOARD_DF = get_leaderboard_df("leaderboard/data/leaderboard.csv")
RAG_DF = get_rag_leaderboard_df("leaderboard/data/rag_methods_compare.csv")


def init_leaderboard(df: pd.DataFrame):
    """Create the main ``Leaderboard`` component for ``df``.

    Args:
        df: Leaderboard table to display.

    Returns:
        A non-interactive ``Leaderboard`` with a column selector and a search
        box over the ``Models`` column.

    Raises:
        ValueError: If ``df`` is ``None`` or has no rows/columns.
    """
    has_data = df is not None and not df.empty
    if not has_data:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    shown_by_default = [
        "Rank",
        "Models",
        "Average Hallucination Rate (%)",
        "RAG Hallucination Rate (%)",
        "Non-RAG Hallucination Rate (%)",
    ]
    column_picker = SelectColumns(
        default_selection=shown_by_default,
        cant_deselect=["Models", "Rank"],
        label="Select Columns to Display:",
    )

    # One datatype entry per default column: two markdown cells, three numbers.
    cell_types = ["markdown"] * 2 + ["number"] * 3

    return Leaderboard(
        value=df,
        datatype=cell_types,
        select_columns=column_picker,
        search_columns=["Models"],
        bool_checkboxgroup_label=None,
        interactive=False,
        height=800,
    )

# Read the logo once at import time and keep it as base64 text so it can be
# embedded in the header HTML as a data URI.
image_path = "static/kluster-color.png"
b64_string = base64.b64encode(Path(image_path).read_bytes()).decode("utf-8")


# Build the Gradio UI: a header banner followed by three tabs
# (hallucination leaderboard, RAG techniques comparison, submission info).
# print("CUSTOM CSS\n", custom_css[-1000:], "\n---------")
demo = gr.Blocks(css=custom_css)
with demo:
    # Header banner: logo embedded as a base64 data URI, page title, and a
    # tagline linking to Verify and kluster.ai.
    gr.HTML(f"""
        <div style="text-align: center; margin-top: 2em; margin-bottom: 1em;">
            <img src="data:image/png;base64,{b64_string}" alt="kluster.ai logo"
                style="height: 80px; display: block; margin-left: auto; margin-right: auto;" />
            
            <div style="font-size: 2.5em; font-weight: bold; margin-top: 0.4em; color: var(--text-color);">
                LLM Hallucination Detection Leaderboard
            </div>
            
            <div style="font-size: 1.5em; margin-top: 0.5em;">
                Evaluating factual accuracy and faithfulness of LLMs in both RAG and non-RAG settings with
                <a href="https://platform.kluster.ai/verify" target="_blank">
                    Verify
                </a> by
                <a href="https://kluster.ai/" target="_blank">
                    kluster.ai
                </a> which provides an API for detecting hallucinations with any model. 
            </div>
        </div>
        """)


    # NOTE(review): all three tabs below share elem_id "llm-benchmark-tab-table";
    # confirm the custom CSS relies on that before making them unique.
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # Tab 0: main leaderboard with RAG / non-RAG charts side by side.
        with gr.TabItem("🏅 Hallucination Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
            gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
            # ----------  Chart  ----------
            with gr.Row():
                # Left column: RAG hallucination rate per model.
                with gr.Column():
                    gr.Plot(
                        make_leaderboard_plot(
                            LEADERBOARD_DF,
                            "RAG Hallucination Rate (%)",
                            "RAG Hallucination Rate (lower is better)",
                            bar_color="#4CAF50",
                        ),
                        show_label=False,
                    )
                    gr.Markdown("*HaluEval-QA benchmark (RAG): The model receives a question plus supporting context. We report the % of answers that introduce facts not found in that context — lower is better. See the **Methodology** section below for more information.*", elem_classes="plot-caption")
                # Right column: non-RAG hallucination rate per model.
                with gr.Column():
                    gr.Plot(
                        make_leaderboard_plot(
                            LEADERBOARD_DF,
                            "Non-RAG Hallucination Rate (%)",
                            "Non-RAG Hallucination Rate (lower is better)",
                            bar_color="#FF7043",
                        ),
                        show_label=False,
                    )
                    gr.Markdown("*UltraChat benchmark (~11 k prompts, non-RAG): Evaluates open-domain answers when only the question is given. Score is the % of hallucinated responses — lower is better. See the **Methodology** section below for more information.*", elem_classes="plot-caption")

            # ----------  Leaderboard  ----------
            leaderboard = init_leaderboard(LEADERBOARD_DF)

            # ----------  Get Started with Verify  ----------
            # Markdown files are read from the directory containing this file.
            verify_markdown = (Path(__file__).parent / "verify.md").read_text()

            gr.Markdown(verify_markdown, elem_classes="markdown-text")

            # Raw string so the shell line-continuation backslashes are kept
            # literally in the displayed example.
            code_example_reliability = textwrap.dedent(
                r"""curl -X POST https://api.kluster.ai/v1/verify/reliability \
  -H "Authorization: Bearer YOUR_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "prompt": "Tell me about the new iPhone 20 features",
    "output": "The iPhone 20 includes a revolutionary holographic display, 200MP camera with AI scene detection, and can project 3D holograms up to 6 feet away for video calls.",
    "context": null
  }'"""
            )

            gr.Code(code_example_reliability, language="shell")

            # Second example: the same service called through the chat-completions endpoint.
            code_example_chat = textwrap.dedent(
                r"""curl -X POST https://api.kluster.ai/v1/chat/completions \
  -H "Authorization: Bearer YOUR_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "klusterai/verify-reliability",
    "messages": [
      { "role": "user", "content": "What can you tell me about Milos Burger Joint?" },
      { "role": "assistant", "content": "Milos Burger Joint has been serving authentic Burgers cuisine since 1999 and just won 2 Michelin stars last week, making it the highest-rated burger restaurant in the city." }
    ]
  }'"""
            )

            gr.Code(code_example_chat, language="shell")

            with gr.Accordion("📄 Methodology & Benchmark Details", open=True):
                gr.Markdown((Path(__file__).parent / "docs.md").read_text(), elem_classes="markdown-text")

        
        # Tab 2: comparison of the three RAG prompting strategies.
        with gr.TabItem("🧪 RAG Techniques and Hallucinations", elem_id="llm-benchmark-tab-table", id=2):
            rag_intro_markdown = (Path(__file__).parent / "rag_techniques_intro.md").read_text()
            rag_details_markdown = (Path(__file__).parent / "rag_techniques_details.md").read_text()

            gr.Markdown(rag_intro_markdown, elem_classes="markdown-text")

            with gr.Row():
                # Left column: average rate per RAG method across all models.
                with gr.Column():
                    gr.Plot(
                        make_rag_method_average_plot(
                            RAG_DF,
                            "Average Hallucination Rate by RAG Method (lower is better)",
                            bar_color="#4CAF50",
                        ),
                        show_label=False,
                    )
                    gr.Markdown(
                        "*Mean hallucination rate for each RAG prompting strategy across all models on the HaluEval-QA benchmark. Error bars represent ±1 SD; lower is better.*",
                        elem_classes="plot-caption",
                    )
                # Right column: average rate per model across the three methods.
                with gr.Column():
                    gr.Plot(
                        make_rag_average_plot(
                            RAG_DF,
                            "Average Hallucination Rate (%)",
                            "Average Hallucination Rate per Model (lower is better)",
                            bar_color="#2196F3",
                        ),
                        show_label=False,
                    )
                    gr.Markdown(
                        "*Mean hallucination rate across the three RAG prompting settings for each individual model. Error bars show ±1 SD across the three strategies; lower is better.*",
                        elem_classes="plot-caption",
                    )


            # Per-model table of the three RAG method rates.
            rag_leaderboard = Leaderboard(
                value=RAG_DF,
                datatype=["markdown", "number", "number", "number"],
                select_columns=SelectColumns(
                    default_selection=[
                        "Models",
                        "Context in System Prompt (%)",
                        "Context and Question Single-Turn (%)",
                        "Context and Question Two-Turns (%)",
                    ],
                    cant_deselect=["Models"],
                    label="Select RAG Method Columns:",
                ),
                search_columns=["Models"],
                bool_checkboxgroup_label=None,
                interactive=False,
                height=700 
            )

            with gr.Accordion("📄 RAG Techniques & Benchmark Details", open=True):
                gr.Markdown(rag_details_markdown, elem_classes="markdown-text")




        # Tab 4: static submission instructions rendered from submit.md.
        with gr.TabItem("🚀 Submit Here! ", elem_id="llm-benchmark-tab-table", id=4):
            gr.Markdown((Path(__file__).parent / "submit.md").read_text(), elem_classes="markdown-text")

   
# Schedule restart_space to run every 1800 seconds (30 minutes), then start
# serving the UI with the request queue enabled.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, trigger="interval", seconds=1800)
scheduler.start()

# queue() returns the same Blocks object, so configuring and launching can be
# written as two separate statements.
demo.queue(default_concurrency_limit=40)
demo.launch(show_api=False)