|
import gradio as gr |
|
import pandas as pd |
|
from pathlib import Path |
|
import plotly.express as px |
|
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns |
|
from apscheduler.schedulers.background import BackgroundScheduler |
|
from huggingface_hub import snapshot_download |
|
import textwrap |
|
|
|
from src.about import ( |
|
CITATION_BUTTON_LABEL, |
|
CITATION_BUTTON_TEXT, |
|
EVALUATION_QUEUE_TEXT, |
|
INTRODUCTION_TEXT, |
|
LLM_BENCHMARKS_TEXT, |
|
TITLE, |
|
) |
|
from src.display.css_html_js import custom_css |
|
from src.display.utils import ( |
|
BENCHMARK_COLS, |
|
COLS, |
|
EVAL_COLS, |
|
EVAL_TYPES, |
|
AutoEvalColumn, |
|
ModelType, |
|
fields, |
|
WeightType, |
|
Precision |
|
) |
|
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN |
|
from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_rag_leaderboard_df |
|
from src.submission.submit import add_new_eval |
|
import base64 |
|
|
|
|
|
def restart_space():

    """Restart the hosted Hugging Face Space via the Hub API.

    Invoked on a 30-minute interval by the BackgroundScheduler at the bottom
    of this file so the app periodically reloads freshly synced data.
    """

    API.restart_space(repo_id=REPO_ID)
|
|
|
|
|
|
|
def make_rate_chart(df: pd.DataFrame):
    """Build a grouped bar chart comparing RAG vs non-RAG hallucination rates.

    Args:
        df: Leaderboard frame containing a "Models" column plus the two
            hallucination-rate columns.

    Returns:
        A Plotly figure with one bar group per model, coloured by benchmark.
    """
    rate_columns = [
        "RAG Hallucination Rate (%)",
        "Non-RAG Hallucination Rate (%)",
    ]
    # Reshape wide -> long so Plotly can colour bars by benchmark type.
    long_form = df.melt(
        id_vars="Models",
        value_vars=rate_columns,
        var_name="Benchmark",
        value_name="Rate",
    )
    chart = px.bar(
        long_form,
        x="Models",
        y="Rate",
        color="Benchmark",
        barmode="group",
        title="Hallucination Rates by Model",
        height=400,
    )
    chart.update_layout(xaxis_title="", yaxis_title="%")
    return chart
|
|
|
def make_leaderboard_plot(df: pd.DataFrame, col: str, title: str, bar_color: str):
    """Horizontal bar chart of one hallucination-rate column, sorted descending.

    Args:
        df: Leaderboard frame containing "Models" and ``col``.
        col: Numeric column to plot (e.g. "RAG Hallucination Rate (%)").
        title: Chart title.
        bar_color: Single colour applied to every bar.

    Returns:
        A Plotly figure; because the data is sorted descending, the best
        (lowest-rate) models end up at the bottom of the chart.
    """
    df_sorted = df.sort_values(col, ascending=False)
    fig = px.bar(
        df_sorted,
        x=col,
        y="Models",
        orientation="h",
        title=title,
        text_auto=".2f",
        height=400,
        color_discrete_sequence=[bar_color],
    )
    # One combined trace update: place value labels outside the bars and let
    # them overflow the plot area. (The original called update_traces a second
    # time after update_layout with the same textposition — redundant.)
    fig.update_traces(textposition="outside", cliponaxis=False)

    fig.update_layout(
        xaxis_title="Hallucination Rate (%)",
        yaxis_title="",
        yaxis=dict(dtick=1),  # one tick per model so no labels are dropped
        margin=dict(l=140, r=60, t=60, b=40),
    )
    return fig
|
|
|
|
|
def make_rag_average_plot(df: pd.DataFrame, col: str, title: str, bar_color: str):
    """Per-model horizontal bars of the average RAG hallucination rate.

    The average is taken row-wise over the three RAG prompting-strategy
    columns; error bars show one standard deviation across those strategies.

    Args:
        df: RAG comparison frame with the three strategy columns.
        col: Name of the average column (computed here if absent from ``df``).
        title: Chart title.
        bar_color: Single colour applied to every bar.

    Returns:
        A Plotly figure sorted so higher averages appear first.
    """
    strategy_cols = [
        "Context in System Prompt (%)",
        "Context and Question Single-Turn (%)",
        "Context and Question Two-Turns (%)",
    ]

    plot_df = df.copy()
    # Derive the average column only when the caller's frame lacks it.
    if col not in plot_df.columns:
        plot_df[col] = plot_df[strategy_cols].mean(axis=1, skipna=True).round(2)

    # Spread across the three strategies, rendered as horizontal error bars.
    plot_df["Std Dev"] = plot_df[strategy_cols].std(axis=1, skipna=True).round(2)

    ordered = plot_df.sort_values(col, ascending=False)

    fig = px.bar(
        ordered,
        x=col,
        y="Models",
        orientation="h",
        title=title,
        height=400,
        color_discrete_sequence=[bar_color],
        error_x="Std Dev",
    )
    fig.update_traces(
        texttemplate="%{x:.2f}",
        textposition="inside",
        insidetextanchor="middle",
        cliponaxis=False,
    )
    fig.update_layout(
        xaxis_title="Hallucination Rate (%)",
        yaxis_title="",
        yaxis=dict(dtick=1),
        margin=dict(l=140, r=60, t=60, b=40),
    )
    return fig
|
|
|
|
|
def make_rag_method_average_plot(df: pd.DataFrame, title: str, bar_color: str):
    """Vertical bar chart of the mean hallucination rate for each RAG method.

    Each bar is the average of one strategy column across all models; error
    bars show one standard deviation across models.

    Args:
        df: RAG comparison frame with the three strategy columns.
        title: Chart title.
        bar_color: Single colour applied to every bar.

    Returns:
        A Plotly figure with one bar per prompting strategy.
    """
    method_cols = [
        "Context in System Prompt (%)",
        "Context and Question Single-Turn (%)",
        "Context and Question Two-Turns (%)",
    ]

    averages = df[method_cols].mean().round(2)
    stds = df[method_cols].std().round(2)

    avg_df = pd.DataFrame(
        {
            "RAG Method": averages.index,
            "Average Hallucination Rate (%)": averages.values,
            "Std Dev": stds.values,
        }
    )

    fig = px.bar(
        avg_df,
        x="RAG Method",
        y="Average Hallucination Rate (%)",
        error_y="Std Dev",
        title=title,
        height=400,
        color_discrete_sequence=[bar_color],
    )
    fig.update_traces(
        # Bars are always vertical here (categorical x, numeric y), so the
        # value to print lives on the y channel. The original guarded on
        # fig.data[0].orientation, but that condition could never pick the
        # "%{x:.2f}" branch — dead code, removed.
        texttemplate="%{y:.2f}",
        textposition="inside",
        insidetextanchor="start",
        cliponaxis=False,
        textfont_color="white",
    )
    # Wrap the long column names onto two lines so tick labels stay horizontal.
    labels_map = {
        "Context in System Prompt (%)": "Context in<br>System Prompt",
        "Context and Question Single-Turn (%)": "Context & Question<br>Single-Turn",
        "Context and Question Two-Turns (%)": "Context & Question<br>Two-Turns",
    }
    fig.update_xaxes(
        tickmode="array",
        tickvals=list(labels_map.keys()),
        ticktext=list(labels_map.values()),
        tickangle=0,
        automargin=True,
    )
    fig.update_layout(
        xaxis_title="",
        yaxis_title="Hallucination Rate (%)",
        margin=dict(l=40, r=100, t=60, b=120),
    )
    return fig
|
|
|
|
|
def color_scale(s, cmap):
    """Map a numeric Series to background-colour CSS style strings.

    Lower values map to colours at the end of the palette (greener for the
    usual red-to-green palettes), higher values to the start (redder).
    Works with any palette length.

    Args:
        s: Numeric pandas Series to colour.
        cmap: Name of a palette in ``plotly.express.colors.sequential``.

    Returns:
        List of ``background-color:...`` strings, one per element of ``s``.
    """
    # getattr is the idiomatic way to look up the palette by name (the
    # original reached into the module's __dict__ directly).
    colours = getattr(px.colors.sequential, cmap)
    n = len(colours) - 1

    rng = s.max() - s.min()
    # Guard against a constant series (rng == 0) to avoid division by zero.
    norm = (s - s.min()) / (rng if rng else 1)

    # Invert so low values index the tail of the palette.
    return [f"background-color:{colours[int(v * n)]}" for v in 1 - norm]
|
|
|
|
|
|
|
# Best-effort sync of the evaluation-requests queue; the app still starts if
# the download fails (e.g. missing token or offline environment).
try:
    print(EVAL_REQUESTS_PATH)
    snapshot_download(
        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
    )
except Exception as exc:
    # Bind the caught exception: the original printed `{Exception}` — the
    # class object itself — and mislabelled this block as the RESULTS sync.
    print(f"[WARN] Skipping REQUESTS sync: {exc}")

# Best-effort sync of the evaluation results dataset.
try:
    print(EVAL_RESULTS_PATH)
    snapshot_download(
        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
    )
except Exception as exc:
    print(f"[WARN] Skipping RESULTS sync: {exc}")


# The leaderboards are backed by static CSV snapshots committed to the repo.
LEADERBOARD_DF = get_leaderboard_df("leaderboard/data/leaderboard.csv")
RAG_DF = get_rag_leaderboard_df("leaderboard/data/rag_methods_compare.csv")
|
|
|
|
|
def init_leaderboard(df: pd.DataFrame):
    """Create the main Leaderboard widget from the scores DataFrame.

    Args:
        df: Leaderboard frame; must be non-empty.

    Returns:
        A configured ``Leaderboard`` component.

    Raises:
        ValueError: If ``df`` is ``None`` or has no rows.
    """
    if df is None or df.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # Columns shown on first load; users can toggle the rest.
    shown_by_default = [
        "Rank", "Models",
        "Average Hallucination Rate (%)",
        "RAG Hallucination Rate (%)",
        "Non-RAG Hallucination Rate (%)"
    ]

    return Leaderboard(
        value=df,
        datatype=["markdown", "markdown", "number", "number", "number"],
        select_columns=SelectColumns(
            default_selection=shown_by_default,
            cant_deselect=["Models", "Rank"],
            label="Select Columns to Display:",
        ),
        search_columns=["Models"],
        bool_checkboxgroup_label=None,
        interactive=False,
        height=800
    )
|
|
|
# Inline the kluster.ai logo as base64 so the header HTML needs no static
# file route (Path is already imported at the top of the file).
image_path = "static/kluster-color.png"
b64_string = base64.b64encode(Path(image_path).read_bytes()).decode("utf-8")
|
|
|
|
|
|
|
# --- Gradio UI (constructed at import time) ---
demo = gr.Blocks(css=custom_css)

with demo:

    # Page header: base64-inlined logo (b64_string computed above), title,
    # and a subtitle linking to the Verify product and kluster.ai.
    gr.HTML(f"""

    <div style="text-align: center; margin-top: 2em; margin-bottom: 1em;">

        <img src="data:image/png;base64,{b64_string}" alt="kluster.ai logo"

             style="height: 80px; display: block; margin-left: auto; margin-right: auto;" />



        <div style="font-size: 2.5em; font-weight: bold; margin-top: 0.4em; color: var(--text-color);">

            LLM Hallucination Detection Leaderboard

        </div>



        <div style="font-size: 1.5em; margin-top: 0.5em;">

            Evaluating factual accuracy and faithfulness of LLMs in both RAG and non-RAG settings with

            <a href="https://platform.kluster.ai/verify" target="_blank">

                Verify

            </a> by

            <a href="https://kluster.ai/" target="_blank">

                kluster.ai

            </a> which provides an API for detecting hallucinations with any model.

        </div>

    </div>

    """)



    with gr.Tabs(elem_classes="tab-buttons") as tabs:

        # Tab 1: main leaderboard — two per-model charts plus the score table.
        with gr.TabItem("🏅 Hallucination Leaderboard", elem_id="llm-benchmark-tab-table", id=0):

            gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")


            with gr.Row():

                with gr.Column():

                    gr.Plot(

                        make_leaderboard_plot(

                            LEADERBOARD_DF,

                            "RAG Hallucination Rate (%)",

                            "RAG Hallucination Rate (lower is better)",

                            bar_color="#4CAF50",

                        ),

                        show_label=False,

                    )

                    gr.Markdown("*HaluEval-QA benchmark (RAG): The model receives a question plus supporting context. We report the % of answers that introduce facts not found in that context — lower is better. See the **Methodology** section below for more information.*", elem_classes="plot-caption")

                with gr.Column():

                    gr.Plot(

                        make_leaderboard_plot(

                            LEADERBOARD_DF,

                            "Non-RAG Hallucination Rate (%)",

                            "Non-RAG Hallucination Rate (lower is better)",

                            bar_color="#FF7043",

                        ),

                        show_label=False,

                    )

                    gr.Markdown("*UltraChat benchmark (~11 k prompts, non-RAG): Evaluates open-domain answers when only the question is given. Score is the % of hallucinated responses — lower is better. See the **Methodology** section below for more information.*", elem_classes="plot-caption")



            # Interactive sortable/searchable score table.
            leaderboard = init_leaderboard(LEADERBOARD_DF)



            # Static markdown sections live next to this file.
            verify_markdown = (Path(__file__).parent / "verify.md").read_text()

            gr.Markdown(verify_markdown, elem_classes="markdown-text")

            # Example request: standalone Verify reliability endpoint.
            code_example_reliability = textwrap.dedent(

                r"""curl -X POST https://api.kluster.ai/v1/verify/reliability \

                -H "Authorization: Bearer YOUR_API_KEY" \

                -H "Content-Type: application/json" \

                -d '{

                    "prompt": "Tell me about the new iPhone 20 features",

                    "output": "The iPhone 20 includes a revolutionary holographic display, 200MP camera with AI scene detection, and can project 3D holograms up to 6 feet away for video calls.",

                    "context": null

                }'"""

            )

            gr.Code(code_example_reliability, language="shell")

            # Example request: same check via the OpenAI-compatible chat API.
            code_example_chat = textwrap.dedent(

                r"""curl -X POST https://api.kluster.ai/v1/chat/completions \

                -H "Authorization: Bearer YOUR_API_KEY" \

                -H "Content-Type: application/json" \

                -d '{

                    "model": "klusterai/verify-reliability",

                    "messages": [

                        { "role": "user", "content": "What can you tell me about Milos Burger Joint?" },

                        { "role": "assistant", "content": "Milos Burger Joint has been serving authentic Burgers cuisine since 1999 and just won 2 Michelin stars last week, making it the highest-rated burger restaurant in the city." }

                    ]

                }'"""

            )

            gr.Code(code_example_chat, language="shell")

            gr.Markdown((Path(__file__).parent / "docs.md").read_text(), elem_classes="markdown-text")



        # Tab 2: comparison of the three RAG prompting strategies.
        with gr.TabItem("🧪 RAG Techniques and Hallucinations", elem_id="llm-benchmark-tab-table", id=2):

            rag_intro_markdown = (Path(__file__).parent / "rag_techniques_intro.md").read_text()

            rag_details_markdown = (Path(__file__).parent / "rag_techniques_details.md").read_text()

            gr.Markdown(rag_intro_markdown, elem_classes="markdown-text")

            with gr.Row():

                with gr.Column():

                    gr.Plot(

                        make_rag_method_average_plot(

                            RAG_DF,

                            "Average Hallucination Rate by RAG Method (lower is better)",

                            bar_color="#4CAF50",

                        ),

                        show_label=False,

                    )

                    gr.Markdown(

                        "*Mean hallucination rate for each RAG prompting strategy across all models on the HaluEval-QA benchmark. Error bars represent ±1 SD; lower is better.*",

                        elem_classes="plot-caption",

                    )

                with gr.Column():

                    gr.Plot(

                        make_rag_average_plot(

                            RAG_DF,

                            "Average Hallucination Rate (%)",

                            "Average Hallucination Rate per Model (lower is better)",

                            bar_color="#2196F3",

                        ),

                        show_label=False,

                    )

                    gr.Markdown(

                        "*Mean hallucination rate across the three RAG prompting settings for each individual model. Error bars show ±1 SD across the three strategies; lower is better.*",

                        elem_classes="plot-caption",

                    )



            # Per-strategy hallucination rates for each model.
            rag_leaderboard = Leaderboard(

                value=RAG_DF,

                datatype=["markdown", "number", "number", "number"],

                select_columns=SelectColumns(

                    default_selection=[

                        "Models",

                        "Context in System Prompt (%)",

                        "Context and Question Single-Turn (%)",

                        "Context and Question Two-Turns (%)",

                    ],

                    cant_deselect=["Models"],

                    label="Select RAG Method Columns:",

                ),

                search_columns=["Models"],

                bool_checkboxgroup_label=None,

                interactive=False,

                height=700

            )

            gr.Markdown(rag_details_markdown, elem_classes="markdown-text")



        # Tab 3: submission instructions (static markdown only).
        with gr.TabItem("🚀 Submit Here! ", elem_id="llm-benchmark-tab-table", id=4):

            gr.Markdown((Path(__file__).parent / "submit.md").read_text(), elem_classes="markdown-text")
|
|
|
|
|
# Restart the Space every 30 minutes; since the data sync above runs at module
# import, each restart re-pulls the latest queue/results snapshots.
scheduler = BackgroundScheduler()

scheduler.add_job(restart_space, "interval", seconds=1800)

scheduler.start()

# Launch with request queueing enabled; show_api=False hides the REST API docs.
demo.queue(default_concurrency_limit=40).launch(show_api=False)
|
|