# Hugging Face Space: kluster-ai/LLM-Hallucination-Detection-Leaderboard
# Status: Running
# File size: 11,905 Bytes

import gradio as gr
import pandas as pd
from pathlib import Path
import plotly.express as px
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    BENCHMARK_COLS,
    COLS,
    EVAL_COLS,
    EVAL_TYPES,
    AutoEvalColumn,
    ModelType,
    fields,
    WeightType,
    Precision
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval
import base64


def restart_space() -> None:
    """Restart the Space identified by ``REPO_ID`` via ``API.restart_space``.

    Registered further down in this file as a periodic scheduler job.
    """
    API.restart_space(repo_id=REPO_ID)



def make_rate_chart(df: pd.DataFrame):
    """Build a grouped bar chart comparing both hallucination rates per model.

    Args:
        df: Frame containing a "Models" column plus the
            "RAG Hallucination Rate (%)" and "Non-RAG Hallucination Rate (%)"
            columns.

    Returns:
        The Plotly figure, with one bar group per model and one colour per
        benchmark.
    """
    rate_columns = ["RAG Hallucination Rate (%)", "Non-RAG Hallucination Rate (%)"]

    # Grouped bars need one row per (model, benchmark) pair, so reshape the
    # wide frame into long form first.
    tidy = pd.melt(
        df,
        id_vars="Models",
        value_vars=rate_columns,
        var_name="Benchmark",
        value_name="Rate",
    )

    bar_options = dict(
        x="Models",
        y="Rate",
        color="Benchmark",
        barmode="group",
        title="Hallucination Rates by Model",
        height=400,
    )
    chart = px.bar(tidy, **bar_options)
    chart.update_layout(xaxis_title="", yaxis_title="%")
    return chart

def make_leaderboard_plot(df: pd.DataFrame, col: str, title: str, bar_color: str):
    """Return a horizontal bar chart of ``col`` per model, lowest value on top.

    Args:
        df: Frame containing a "Models" column and the numeric column ``col``.
        col: Name of the column plotted on the x-axis.
        title: Chart title.
        bar_color: Single colour used for every bar.

    Returns:
        The Plotly figure.
    """
    # Plotly places the categories of a horizontal bar chart from the bottom
    # of the y-axis upwards, so the rows are sorted in *descending* order:
    # the highest rate lands at the bottom and the lowest (best) at the top.
    df_sorted = df.sort_values(col, ascending=False)
    fig = px.bar(
        df_sorted,
        x=col,
        y="Models",
        orientation="h",
        title=title,
        text_auto=".2f",
        height=400,
        color_discrete_sequence=[bar_color],
    )
    # Draw the value labels outside the bars; cliponaxis=False keeps a label
    # from being cut off where it extends past the plotting area.
    fig.update_traces(textposition="outside", cliponaxis=False)

    fig.update_layout(
        xaxis_title="Hallucination Rate (%)",
        yaxis_title="",
        yaxis=dict(dtick=1),   # ensure every model shown
        margin=dict(l=140, r=60, t=60, b=40)
    )
    return fig


def color_scale(s, cmap):
    """Return one ``background-color`` CSS string per element of ``s``.

    Values are min-max normalised and mapped onto the named
    ``px.colors.sequential`` palette in reverse: the smallest value receives
    the last colour of the palette and the largest value the first. Whether
    that reads as "low = green, high = red" depends on the palette chosen.
    Works with any palette length.

    Args:
        s: Numeric pandas Series.
        cmap: Name of a palette in ``px.colors.sequential``.

    Returns:
        A list of CSS strings aligned with ``s``. Missing (NaN) values get an
        empty string, i.e. no styling, instead of raising.

    Raises:
        KeyError: If ``cmap`` is not a name in ``px.colors.sequential``.
    """
    colours = vars(px.colors.sequential)[cmap]
    n = len(colours) - 1                     # max valid index

    lowest = s.min()
    rng = s.max() - lowest
    # A constant Series has zero range; divide by 1 so every value maps to
    # the same palette entry instead of dividing by zero.
    norm = (s - lowest) / (rng if rng else 1)

    return [
        # int() cannot convert NaN, so missing cells are left unstyled.
        "" if pd.isna(v) else f"background-color:{colours[int(v * n)]}"
        for v in 1 - norm
    ]


### Space initialisation
# Best-effort sync of the two Hub datasets into their local directories.
# A failed download is logged and skipped so the app can still start.
try:
    print(EVAL_REQUESTS_PATH)
    snapshot_download(
        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
    )
except Exception as exc:
    # restart_space()
    # Report the caught exception itself (not the Exception class) so the
    # log shows what actually went wrong.
    print(f"[WARN] Skipping REQUESTS sync: {exc!r}")
try:
    print(EVAL_RESULTS_PATH)
    snapshot_download(
        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
    )
except Exception as exc:
    # restart_space()
    print(f"[WARN] Skipping RESULTS sync: {exc!r}")


# The leaderboard frame is built from a CSV path; the earlier call that took
# the results/requests directories is kept below, commented out.
# NOTE(review): the path is relative, so it presumably resolves against the
# process working directory — confirm against get_leaderboard_df.
# LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
LEADERBOARD_DF = get_leaderboard_df("leaderboard/data/leaderboard.csv")

# Evaluation-queue frames are currently disabled; they are only referenced by
# the commented-out submission UI further down in this file.
# (
#     finished_eval_queue_df,
#     running_eval_queue_df,
#     pending_eval_queue_df,
# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

def init_leaderboard(df: pd.DataFrame):
    """Create the read-only ``Leaderboard`` component for ``df``.

    Args:
        df: Leaderboard data to display.

    Returns:
        A non-interactive ``Leaderboard`` searchable by the "Models" column,
        with "Rank" and "Models" always shown.

    Raises:
        ValueError: If ``df`` is ``None`` or has no rows.
    """
    has_data = df is not None and not df.empty
    if not has_data:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # Columns shown by default; "Models" and "Rank" cannot be hidden.
    shown_by_default = [
        "Rank", "Models",
        "Average Hallucination Rate (%)",
        "RAG Hallucination Rate (%)",
        "Non-RAG Hallucination Rate (%)",
    ]
    column_picker = SelectColumns(
        default_selection=shown_by_default,
        cant_deselect=["Models", "Rank"],
        label="Select Columns to Display:",
    )
    column_types = ["markdown", "markdown", "number", "number", "number"]

    return Leaderboard(
        value=df,
        datatype=column_types,
        select_columns=column_picker,
        search_columns=["Models"],
        # column_widths=["3%"],
        bool_checkboxgroup_label=None,
        interactive=False,
    )

# Read the logo once at import time and keep it as base64 text so it can be
# embedded directly in the page header as a data URI.
image_path = "static/kluster-color.png"
with open(image_path, mode="rb") as img_file:
    logo_bytes = img_file.read()
b64_string = base64.b64encode(logo_bytes).decode("utf-8")


# print("CUSTOM CSS\n", custom_css[-1000:], "\n---------")
# Top-level Gradio app. Everything created inside the `with demo:` block is
# attached to this Blocks layout, which is styled with `custom_css`.
demo = gr.Blocks(css=custom_css)
with demo:
    # Page header: the logo is inlined as a base64 data URI, followed by the
    # title and a tagline linking to the Verify and kluster.ai pages.
    gr.HTML(f"""

        <div style="text-align: center; margin-top: 2em; margin-bottom: 1em;">

            <img src="data:image/png;base64,{b64_string}" alt="kluster.ai logo"

                style="height: 80px; display: block; margin-left: auto; margin-right: auto;" />

            

            <div style="font-size: 2.5em; font-weight: bold; margin-top: 0.4em; color: var(--text-color);">

                LLM Hallucination Detection Leaderboard

            </div>

            

            <div style="font-size: 1.5em; margin-top: 0.5em;">

                Evaluating factual accuracy and faithfulness of LLMs in both RAG and real-world knowledge settings with

                <a href="https://platform.kluster.ai/verify" target="_blank">

                    Verify

                </a> by

                <a href="https://platform.kluster.ai/" target="_blank">

                    kluster.ai

                </a>

            </div>

        </div>

        """)


    # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    # NOTE(review): all three tabs below share elem_id="llm-benchmark-tab-table";
    # confirm whether custom_css relies on that before making the ids unique.
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # Tab 0: two side-by-side bar charts (RAG / Non-RAG) above the table.
        with gr.TabItem("🏅 Hallucination Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
            # ----------  Chart  ----------
            with gr.Row():
                gr.Plot(
                    make_leaderboard_plot(
                        LEADERBOARD_DF,
                        "RAG Hallucination Rate (%)",
                        "RAG Hallucination Rate (lower is better)",
                        bar_color="#4CAF50",
                    ),
                    show_label=False,
                )
                gr.Plot(
                    make_leaderboard_plot(
                        LEADERBOARD_DF,
                        "Non-RAG Hallucination Rate (%)",
                        "Non-RAG Hallucination Rate (lower is better)",
                        bar_color="#FF7043",
                    ),
                    show_label=False,
                )

            # ----------  Leaderboard  ----------
            leaderboard = init_leaderboard(LEADERBOARD_DF)

        # Tab 2: markdown rendered from docs.md, located next to this file.
        # The file is read once, while the layout is being built.
        with gr.TabItem("📝 Details", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown((Path(__file__).parent / "docs.md").read_text())

        # Tab 3: markdown rendered from submit.md, located next to this file.
        # The interactive submission form below is currently commented out.
        with gr.TabItem("🚀 Submit Here! ", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown((Path(__file__).parent / "submit.md").read_text())

            # with gr.Column():
            #     with gr.Row():
            #         gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

            #     with gr.Column():
            #         with gr.Accordion(
            #             f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
            #             open=False,
            #         ):
            #             with gr.Row():
            #                 finished_eval_table = gr.components.Dataframe(
            #                     value=finished_eval_queue_df,
            #                     headers=EVAL_COLS,
            #                     datatype=EVAL_TYPES,
            #                     row_count=5,
            #                 )
            #         with gr.Accordion(
            #             f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
            #             open=False,
            #         ):
            #             with gr.Row():
            #                 running_eval_table = gr.components.Dataframe(
            #                     value=running_eval_queue_df,
            #                     headers=EVAL_COLS,
            #                     datatype=EVAL_TYPES,
            #                     row_count=5,
            #                 )

            #         with gr.Accordion(
            #             f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
            #             open=False,
            #         ):
            #             with gr.Row():
            #                 pending_eval_table = gr.components.Dataframe(
            #                     value=pending_eval_queue_df,
            #                     headers=EVAL_COLS,
            #                     datatype=EVAL_TYPES,
            #                     row_count=5,
            #                 )
            # with gr.Row():
            #     gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")

            # with gr.Row():
            #     with gr.Column():
            #         model_name_textbox = gr.Textbox(label="Model name")
            #         revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
            #         model_type = gr.Dropdown(
            #             choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
            #             label="Model type",
            #             multiselect=False,
            #             value=None,
            #             interactive=True,
            #         )

            #     with gr.Column():
            #         precision = gr.Dropdown(
            #             choices=[i.value.name for i in Precision if i != Precision.Unknown],
            #             label="Precision",
            #             multiselect=False,
            #             value="float16",
            #             interactive=True,
            #         )
            #         weight_type = gr.Dropdown(
            #             choices=[i.value.name for i in WeightType],
            #             label="Weights type",
            #             multiselect=False,
            #             value="Original",
            #             interactive=True,
            #         )
            #         base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

            # submit_button = gr.Button("Submit Eval")
            # submission_result = gr.Markdown()
            # submit_button.click(
            #     add_new_eval,
            #     [
            #         model_name_textbox,
            #         base_model_name_textbox,
            #         revision_name_textbox,
            #         precision,
            #         weight_type,
            #         model_type,
            #     ],
            #     submission_result,
            # )

    # with gr.Row():
    #     with gr.Accordion("📙 Citation", open=False):
    #         citation_button = gr.Textbox(
    #             value=CITATION_BUTTON_TEXT,
    #             label=CITATION_BUTTON_LABEL,
    #             lines=20,
    #             elem_id="citation-button",
    #             show_copy_button=True,
    #         )

# Schedule restart_space to run every 1800 seconds (30 minutes), then start
# the scheduler before handing control to the Gradio server.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
# Enable the request queue with a default concurrency limit of 40 and launch
# the app.
demo.queue(default_concurrency_limit=40).launch()