Files changed (3)
  1. app.py +7 -9
  2. src/populate.py +17 -2
  3. submit.md +75 -1
app.py CHANGED
@@ -161,21 +161,19 @@ demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(f"""
     <div style="text-align: center; margin-top: 2em; margin-bottom: 1em;">
-        <img src="data:image/png;base64,{b64_string}" alt="KlusterAI logo"
+        <img src="data:image/png;base64,{b64_string}" alt="kluster.ai logo"
             style="height: 80px; display: block; margin-left: auto; margin-right: auto;" />

         <div style="font-size: 2.5em; font-weight: bold; margin-top: 0.4em; color: var(--text-color);">
-            LLM Hallucination Detection <span style="color: var(--link-color);">Leaderboard</span>
+            LLM Hallucination Detection Leaderboard
         </div>

-        <div style="font-size: 1.5em; margin-top: 0.5em; color: var(--text-color);">
+        <div style="font-size: 1.5em; margin-top: 0.5em;">
             Evaluating factual accuracy and faithfulness of LLMs in both RAG and real-world knowledge settings with
-            <a href="https://platform.kluster.ai/verify" target="_blank"
-               style="color: var(--link-color); text-decoration: none;">
+            <a href="https://platform.kluster.ai/verify" target="_blank">
                 Verify
             </a> by
-            <a href="https://platform.kluster.ai/" target="_blank"
-               style="color: var(--link-color); text-decoration: none;">
+            <a href="https://platform.kluster.ai/" target="_blank">
                 kluster.ai
             </a>
         </div>
@@ -211,10 +209,10 @@ with demo:
     # ---------- Leaderboard ----------
     leaderboard = init_leaderboard(LEADERBOARD_DF)

-    with gr.TabItem("📝 Document", elem_id="llm-benchmark-tab-table", id=2):
+    with gr.TabItem("📝 Details", elem_id="llm-benchmark-tab-table", id=2):
         gr.Markdown((Path(__file__).parent / "docs.md").read_text())

-    with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+    with gr.TabItem("🚀 Submit Here! ", elem_id="llm-benchmark-tab-table", id=3):
         gr.Markdown((Path(__file__).parent / "submit.md").read_text())

     # with gr.Column():
src/populate.py CHANGED
@@ -33,8 +33,23 @@ def get_leaderboard_df(results_path):
     medal_map = {1: "🥇", 2: "🥈", 3: "🥉"}

     def medal_html(rank):
-        m = medal_map.get(rank)
-        return f'<span style="font-size:2.0rem;">{m}</span>' if m else rank
+        """Return the HTML cell content for the Rank column.
+
+        Each cell is prefixed with a hidden, zero-padded copy of the numeric
+        rank so that DataTables (used under the hood by the
+        gradio_leaderboard component) sorts the column by that hidden value
+        while still displaying the medal icon for the top 3 ranks. Ranks
+        above 3 get the same hidden prefix plus the plain integer.
+        """
+        medal = medal_map.get(rank)
+        if medal:
+            # Prepend a hidden numeric span so string sorting still works numerically.
+            return (
+                f'<span style="display:none">{rank:04}</span>'  # zero-padded for stable string sort
+                f'<span style="font-size:2.0rem;">{medal}</span>'
+            )
+        # For other ranks, also zero-pad to keep width and ensure proper string sort.
+        return f'<span style="display:none">{rank:04}</span>{rank}'

     df["Rank"] = df.index + 1
     df["Rank"] = df["Rank"].apply(medal_html)
submit.md CHANGED
@@ -1 +1,75 @@
-# If you are interested, please submit here ...
+# LLM Hallucination Detection Leaderboard Submission Guidelines
+
+Thank you for your interest in contributing to the **LLM Hallucination Detection Leaderboard**! We welcome submissions from researchers and practitioners who have built or finetuned language models that can be evaluated on our hallucination benchmarks.
+
+---
+
+## 1. What to Send
+
+Please email **ryan@kluster.ai** with the subject line:
+
+```
+[Verify Leaderboard Submission] <Your-Model-Name>
+```
+
+Attach **one ZIP file** that contains **all of the following** (see the packaging sketch after this list):
+
+1. **`model_card.md`** – A short Markdown file describing your model:
+   - Name and version
+   - Architecture / base model
+   - Training or finetuning procedure
+   - License
+   - Intended use & known limitations
+   - Contact information
+2. **`results.csv`** – A CSV file with **one row per prompt** and **one column per field** (see schema below).
+3. (Optional) **`extra_notes.md`** – Anything else you would like us to know (e.g., additional analysis).
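+
+A minimal packaging sketch using Python's standard `zipfile` module (file names as listed above):
+
+```python
+import zipfile
+from pathlib import Path
+
+# Bundle the submission files into one archive; extra_notes.md is optional.
+with zipfile.ZipFile("submission.zip", "w", zipfile.ZIP_DEFLATED) as zf:
+    for name in ("model_card.md", "results.csv", "extra_notes.md"):
+        if Path(name).exists():
+            zf.write(name)
+```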
+
+---
+
+## 2. CSV Schema
+
+| Column | Description |
+|-------------------|------------------------------------------------------------------|
+| `request` | The exact input prompt shown to the model. |
+| `response` | The raw output produced by the model. |
+| `verify_response` | The Verify judgment or explanation regarding hallucination. |
+| `verify_label` | The final boolean / categorical label (e.g., `TRUE`, `FALSE`). |
+| `task` | The benchmark or dataset name the sample comes from. |
+
+**Important:** Use UTF-8 encoding and **do not** add extra columns without prior discussion; anything beyond the schema belongs in `extra_notes.md`. All submissions must be judged with Verify by kluster.ai so that every leaderboard entry is scored the same way.
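+
+For illustration, a minimal sketch of writing a conforming `results.csv` with Python's standard `csv` module (the row values are placeholders, not real model or Verify output):
+
+```python
+import csv
+
+FIELDS = ["request", "response", "verify_response", "verify_label", "task"]
+
+# Placeholder row; in practice, append one dict per evaluated prompt.
+rows = [
+    {
+        "request": "What is the capital of the UK?",
+        "response": "London is the capital of the UK.",
+        "verify_response": "The statement is factually correct.",
+        "verify_label": "TRUE",
+        "task": "UltraChat",
+    },
+]
+
+with open("results.csv", "w", newline="", encoding="utf-8") as f:
+    writer = csv.DictWriter(f, fieldnames=FIELDS)
+    writer.writeheader()
+    writer.writerows(rows)  # the csv module quotes embedded commas/newlines
+```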
+
+---
+
+## 3. Evaluation Datasets
+
+Run your model on the following public datasets and include *all* examples in your CSV. You can load them directly from Hugging Face:
+
+| Dataset | Hugging Face Link |
+|---------|-------------------|
+| HaluEval QA (`qa_samples` subset, with Question and Knowledge columns) | https://huggingface.co/datasets/pminervini/HaluEval |
+| UltraChat | https://huggingface.co/datasets/kluster-ai/ultrachat-sampled |
+
+
52
+ ---
53
+
54
+ ## 5. Example Row
55
+
56
+ ```csv
57
+ request,response,verify_response,verify_label,task
58
+ "What is the capital of the UK?","London is the capital of the UK.","The statement is factually correct.",CORRECT,TruthfulQA
59
+ ```
60
+
61
+ ---
62
+
63
+ ## 6. Review Process
64
+
65
+ 1. We will sanity-check the file format and reproduce a random subset.
66
+ 2. If everything looks good, your scores will appear on the public leaderboard.
67
+ 3. We may reach out for clarifications, please keep an eye on your inbox.
68
+
69
+ ---
70
+
71
+ ## 7. Contact
72
+
73
+ Questions? Email **ryan@kluster.ai**.
74
+
75
+ We look forward to your submissions and to advancing reliable language models together!