# NOTE: the next three comment lines are residue from the hosting page, not program text.
# rymc's picture
# remove accord
# 0d9790a verified
import gradio as gr
import pandas as pd
from pathlib import Path
import plotly.express as px
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
import textwrap
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
EVALUATION_QUEUE_TEXT,
INTRODUCTION_TEXT,
LLM_BENCHMARKS_TEXT,
TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
BENCHMARK_COLS,
COLS,
EVAL_COLS,
EVAL_TYPES,
AutoEvalColumn,
ModelType,
fields,
WeightType,
Precision
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_rag_leaderboard_df
from src.submission.submit import add_new_eval
import base64
def restart_space():
    """Ask the hub API to restart the Space identified by ``REPO_ID``."""
    API.restart_space(repo_id=REPO_ID)
def make_rate_chart(df: pd.DataFrame):
    """Build a grouped bar chart of RAG vs. non-RAG hallucination rates.

    Args:
        df: Table with a "Models" column plus the two rate columns
            "RAG Hallucination Rate (%)" and "Non-RAG Hallucination Rate (%)".

    Returns:
        The Plotly figure, one bar group per model.
    """
    rate_columns = ["RAG Hallucination Rate (%)", "Non-RAG Hallucination Rate (%)"]

    # Grouped bars need one row per (model, benchmark) pair, so reshape
    # the wide table into long form first.
    tidy = pd.melt(
        df,
        id_vars="Models",
        value_vars=rate_columns,
        var_name="Benchmark",
        value_name="Rate",
    )

    chart = px.bar(
        tidy,
        x="Models",
        y="Rate",
        color="Benchmark",
        barmode="group",
        title="Hallucination Rates by Model",
        height=400,
    )
    chart.update_layout(xaxis_title="", yaxis_title="%")
    return chart
def make_leaderboard_plot(df: pd.DataFrame, col: str, title: str, bar_color: str):
    """Draw a horizontal bar chart of one rate column, one bar per model.

    Args:
        df: Table containing a "Models" column and the column named by ``col``.
        col: Name of the numeric column to plot on the x axis.
        title: Chart title.
        bar_color: Single colour applied to every bar.

    Returns:
        The Plotly figure.
    """
    ranked = df.sort_values(by=col, ascending=False)

    bar_options = dict(
        x=col,
        y="Models",
        orientation="h",
        title=title,
        text_auto=".2f",
        height=400,
        color_discrete_sequence=[bar_color],
    )
    chart = px.bar(ranked, **bar_options)

    # Value labels sit outside the bars and must not be clipped at the axis edge.
    chart.update_traces(textposition="outside", cliponaxis=False)
    chart.update_layout(
        xaxis_title="Hallucination Rate (%)",
        yaxis_title="",
        yaxis=dict(dtick=1),  # ensure every model shown
        margin=dict(l=140, r=60, t=60, b=40),
    )
    return chart
def make_rag_average_plot(df: pd.DataFrame, col: str, title: str, bar_color: str):
    """Draw per-model average hallucination rate across the three RAG methods.

    The input frame is copied, never modified. If ``col`` is not already a
    column it is filled with the row-wise mean of the three method columns.
    A "Std Dev" column (row-wise standard deviation of the same three
    columns) is always computed and used for the horizontal error bars.

    Args:
        df: Table with a "Models" column and the three RAG method columns.
        col: Column to plot; created from the method columns when absent.
        title: Chart title.
        bar_color: Single colour applied to every bar.

    Returns:
        The Plotly figure.
    """
    rag_cols = [
        "Context in System Prompt (%)",
        "Context and Question Single-Turn (%)",
        "Context and Question Two-Turns (%)",
    ]

    working = df.copy()
    method_rates = working[rag_cols]

    if col not in working.columns:
        working[col] = method_rates.mean(axis=1, skipna=True).round(2)
    working["Std Dev"] = method_rates.std(axis=1, skipna=True).round(2)

    ranked = working.sort_values(by=col, ascending=False)
    chart = px.bar(
        ranked,
        x=col,
        y="Models",
        orientation="h",
        title=title,
        height=400,
        color_discrete_sequence=[bar_color],
        error_x="Std Dev",
    )

    # Labels go inside the bars, centred, so they do not collide with the error bars.
    chart.update_traces(
        texttemplate="%{x:.2f}",
        textposition="inside",
        insidetextanchor="middle",
        cliponaxis=False,
    )
    chart.update_layout(
        xaxis_title="Hallucination Rate (%)",
        yaxis_title="",
        yaxis=dict(dtick=1),
        margin=dict(l=140, r=60, t=60, b=40),
    )
    return chart
def make_rag_method_average_plot(df: pd.DataFrame, title: str, bar_color: str):
    """Draw the mean hallucination rate of each RAG prompting method.

    For each of the three method columns the mean and standard deviation over
    all rows of ``df`` are computed (rounded to two decimals) and shown as a
    vertical bar with a ±1 SD error bar.

    Args:
        df: Table containing the three RAG method columns.
        title: Chart title.
        bar_color: Single colour applied to every bar.

    Returns:
        The Plotly figure.
    """
    method_cols = [
        "Context in System Prompt (%)",
        "Context and Question Single-Turn (%)",
        "Context and Question Two-Turns (%)",
    ]
    method_rates = df[method_cols]
    averages = method_rates.mean().round(2)
    stds = method_rates.std().round(2)
    avg_df = pd.DataFrame(
        {
            "RAG Method": averages.index,
            "Average Hallucination Rate (%)": averages.values,
            "Std Dev": stds.values,
        }
    )

    fig = px.bar(
        avg_df,
        x="RAG Method",
        y="Average Hallucination Rate (%)",
        # Stated explicitly so the label template below does not depend on
        # Plotly Express inferring the orientation from the column dtypes.
        orientation="v",
        error_y="Std Dev",
        title=title,
        height=400,
        color_discrete_sequence=[bar_color],
    )
    # Vertical bars: the plotted value is the y coordinate.
    fig.update_traces(
        texttemplate="%{y:.2f}",
        textposition="inside",
        insidetextanchor="start",
        cliponaxis=False,
        textfont_color="white",
    )

    # Replace the long column names on the axis with two-line labels.
    labels_map = {
        "Context in System Prompt (%)": "Context in<br>System Prompt",
        "Context and Question Single-Turn (%)": "Context & Question<br>Single-Turn",
        "Context and Question Two-Turns (%)": "Context & Question<br>Two-Turns",
    }
    fig.update_xaxes(
        tickmode="array",
        tickvals=list(labels_map.keys()),
        ticktext=list(labels_map.values()),
        tickangle=0,
        automargin=True,
    )
    fig.update_layout(
        xaxis_title="",
        yaxis_title="Hallucination Rate (%)",
        margin=dict(l=40, r=100, t=60, b=120),
    )
    return fig
def color_scale(s, cmap):
    """Return one CSS background-colour style per value of a numeric Series.

    Values are min-max normalised and then inverted, so the smallest value
    gets the LAST colour of the palette and the largest value gets the FIRST.
    Which end looks "green" or "red" therefore depends on the palette chosen.
    Works with any palette length.

    Args:
        s: Numeric pandas Series.
        cmap: Name of a palette in ``plotly.express.colors.sequential``.

    Returns:
        A list of strings, one per element of ``s``: ``"background-color:<colour>"``
        for numeric entries and ``""`` (no styling) for missing entries.

    Raises:
        KeyError: If ``cmap`` is not a name in ``px.colors.sequential``.
    """
    colours = vars(px.colors.sequential)[cmap]
    last = len(colours) - 1  # max valid index

    lowest = s.min()
    span = s.max() - lowest
    # A constant column has span 0; divide by 1 instead to avoid 0/0.
    normalised = (s - lowest) / (span if span else 1)

    return [
        # NaN cannot be converted to an index; leave such cells unstyled.
        "" if pd.isna(v) else f"background-color:{colours[int(v * last)]}"
        for v in 1 - normalised
    ]
### Space initialisation
def _sync_dataset(repo_id, local_dir, label):
    """Download a dataset repo snapshot into ``local_dir``, tolerating failure.

    Any exception is reported on stdout and swallowed so that start-up
    continues; the Space is deliberately not restarted on failure.

    Args:
        repo_id: Hub dataset repository to download.
        local_dir: Local directory that receives the snapshot.
        label: Short name used in the warning message.
    """
    print(local_dir)
    try:
        snapshot_download(
            repo_id=repo_id,
            local_dir=local_dir,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
    except Exception as exc:
        # Report the caught exception itself, not the Exception class.
        print(f"[WARN] Skipping {label} sync: {exc}")


_sync_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH, "REQUESTS")
_sync_dataset(RESULTS_REPO, EVAL_RESULTS_PATH, "RESULTS")

LEADERBOARD_DF = get_leaderboard_df("leaderboard/data/leaderboard.csv")
RAG_DF = get_rag_leaderboard_df("leaderboard/data/rag_methods_compare.csv")
def init_leaderboard(df: pd.DataFrame):
    """Create the main, read-only ``Leaderboard`` component for ``df``.

    Args:
        df: Leaderboard table to display.

    Returns:
        The configured ``Leaderboard`` component.

    Raises:
        ValueError: If ``df`` is ``None`` or has no rows/columns.
    """
    has_data = df is not None and not df.empty
    if not has_data:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    shown_by_default = [
        "Rank",
        "Models",
        "Average Hallucination Rate (%)",
        "RAG Hallucination Rate (%)",
        "Non-RAG Hallucination Rate (%)",
    ]
    column_picker = SelectColumns(
        default_selection=shown_by_default,
        cant_deselect=["Models", "Rank"],
        label="Select Columns to Display:",
    )

    return Leaderboard(
        value=df,
        datatype=["markdown", "markdown", "number", "number", "number"],
        select_columns=column_picker,
        search_columns=["Models"],
        bool_checkboxgroup_label=None,
        interactive=False,
        height=800,
    )
image_path = "static/kluster-color.png"
# Read the logo once and keep it as base64 text for embedding in the page header.
b64_string = base64.b64encode(Path(image_path).read_bytes()).decode("utf-8")
# print("CUSTOM CSS\n", custom_css[-1000:], "\n---------")
# NOTE(review): the block nesting below was reconstructed from a copy of this
# file that had lost its indentation — confirm the layout against the running app.

# Markdown fragments that live next to this file. They are decoded explicitly
# as UTF-8 so the result does not depend on the host's locale settings.
_app_dir = Path(__file__).parent
verify_markdown = (_app_dir / "verify.md").read_text(encoding="utf-8")
docs_markdown = (_app_dir / "docs.md").read_text(encoding="utf-8")
rag_intro_markdown = (_app_dir / "rag_techniques_intro.md").read_text(encoding="utf-8")
rag_details_markdown = (_app_dir / "rag_techniques_details.md").read_text(encoding="utf-8")
submit_markdown = (_app_dir / "submit.md").read_text(encoding="utf-8")

# Page header: inline logo, title and tagline.
header_html = f"""
<div style="text-align: center; margin-top: 2em; margin-bottom: 1em;">
<img src="data:image/png;base64,{b64_string}" alt="kluster.ai logo"
style="height: 80px; display: block; margin-left: auto; margin-right: auto;" />
<div style="font-size: 2.5em; font-weight: bold; margin-top: 0.4em; color: var(--text-color);">
LLM Hallucination Detection Leaderboard
</div>
<div style="font-size: 1.5em; margin-top: 0.5em;">
Evaluating factual accuracy and faithfulness of LLMs in both RAG and non-RAG settings with
<a href="https://platform.kluster.ai/verify" target="_blank">
Verify
</a> by
<a href="https://kluster.ai/" target="_blank">
kluster.ai
</a> which provides an API for detecting hallucinations with any model.
</div>
</div>
"""

# Shell snippets shown in the "get started" section of the first tab.
code_example_reliability = textwrap.dedent(
    r"""curl -X POST https://api.kluster.ai/v1/verify/reliability \
-H "Authorization: Bearer YOUR_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"prompt": "Tell me about the new iPhone 20 features",
"output": "The iPhone 20 includes a revolutionary holographic display, 200MP camera with AI scene detection, and can project 3D holograms up to 6 feet away for video calls.",
"context": null
}'"""
)
code_example_chat = textwrap.dedent(
    r"""curl -X POST https://api.kluster.ai/v1/chat/completions \
-H "Authorization: Bearer YOUR_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"model": "klusterai/verify-reliability",
"messages": [
{ "role": "user", "content": "What can you tell me about Milos Burger Joint?" },
{ "role": "assistant", "content": "Milos Burger Joint has been serving authentic Burgers cuisine since 1999 and just won 2 Michelin stars last week, making it the highest-rated burger restaurant in the city." }
]
}'"""
)

demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(header_html)
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Hallucination Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
            gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
            # ---------- Chart ----------
            with gr.Row():
                with gr.Column():
                    gr.Plot(
                        make_leaderboard_plot(
                            LEADERBOARD_DF,
                            "RAG Hallucination Rate (%)",
                            "RAG Hallucination Rate (lower is better)",
                            bar_color="#4CAF50",
                        ),
                        show_label=False,
                    )
                    gr.Markdown("*HaluEval-QA benchmark (RAG): The model receives a question plus supporting context. We report the % of answers that introduce facts not found in that context — lower is better. See the **Methodology** section below for more information.*", elem_classes="plot-caption")
                with gr.Column():
                    gr.Plot(
                        make_leaderboard_plot(
                            LEADERBOARD_DF,
                            "Non-RAG Hallucination Rate (%)",
                            "Non-RAG Hallucination Rate (lower is better)",
                            bar_color="#FF7043",
                        ),
                        show_label=False,
                    )
                    gr.Markdown("*UltraChat benchmark (~11 k prompts, non-RAG): Evaluates open-domain answers when only the question is given. Score is the % of hallucinated responses — lower is better. See the **Methodology** section below for more information.*", elem_classes="plot-caption")
            # ---------- Leaderboard ----------
            leaderboard = init_leaderboard(LEADERBOARD_DF)
            # ---------- Get Started with Verify ----------
            gr.Markdown(verify_markdown, elem_classes="markdown-text")
            gr.Code(code_example_reliability, language="shell")
            gr.Code(code_example_chat, language="shell")
            gr.Markdown(docs_markdown, elem_classes="markdown-text")
        with gr.TabItem("🧪 RAG Techniques and Hallucinations", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(rag_intro_markdown, elem_classes="markdown-text")
            with gr.Row():
                with gr.Column():
                    gr.Plot(
                        make_rag_method_average_plot(
                            RAG_DF,
                            "Average Hallucination Rate by RAG Method (lower is better)",
                            bar_color="#4CAF50",
                        ),
                        show_label=False,
                    )
                    gr.Markdown(
                        "*Mean hallucination rate for each RAG prompting strategy across all models on the HaluEval-QA benchmark. Error bars represent ±1 SD; lower is better.*",
                        elem_classes="plot-caption",
                    )
                with gr.Column():
                    gr.Plot(
                        make_rag_average_plot(
                            RAG_DF,
                            "Average Hallucination Rate (%)",
                            "Average Hallucination Rate per Model (lower is better)",
                            bar_color="#2196F3",
                        ),
                        show_label=False,
                    )
                    gr.Markdown(
                        "*Mean hallucination rate across the three RAG prompting settings for each individual model. Error bars show ±1 SD across the three strategies; lower is better.*",
                        elem_classes="plot-caption",
                    )
            rag_leaderboard = Leaderboard(
                value=RAG_DF,
                datatype=["markdown", "number", "number", "number"],
                select_columns=SelectColumns(
                    default_selection=[
                        "Models",
                        "Context in System Prompt (%)",
                        "Context and Question Single-Turn (%)",
                        "Context and Question Two-Turns (%)",
                    ],
                    cant_deselect=["Models"],
                    label="Select RAG Method Columns:",
                ),
                search_columns=["Models"],
                bool_checkboxgroup_label=None,
                interactive=False,
                height=700,
            )
            gr.Markdown(rag_details_markdown, elem_classes="markdown-text")
        with gr.TabItem("🚀 Submit Here! ", elem_id="llm-benchmark-tab-table", id=4):
            gr.Markdown(submit_markdown, elem_classes="markdown-text")
# Schedule restart_space to run every 1800 seconds (30 minutes), then serve the app.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, trigger="interval", seconds=1800)
scheduler.start()

demo.queue(default_concurrency_limit=40)
demo.launch(show_api=False)