import streamlit as st
import pandas as pd
from PIL import Image
import base64
from io import BytesIO
import numpy as np
# ─── Page config ───────────────────────────────────────────────────────────────
st.set_page_config(page_title="ExpertLongBench Leaderboard", layout="wide")
logo_small = Image.open("src/logo.png")
logo_image = Image.open("src/ExpertLongBench.png")
def encode_image(image):
    """Encode a PIL image as a base64 PNG string for inline HTML embedding."""
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")
# Encode the logos once so they can be embedded inline in the header HTML below
img_logo = encode_image(logo_small)
img_data = encode_image(logo_image)
st.markdown(
f"""
<div class="logo-container" style="display:flex; justify-content: center; align-items: center; gap: 20px;">
<img src="data:image/png;base64,{img_logo}" style="width:60px;"/>
<img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
</div>
""",
unsafe_allow_html=True
)
st.markdown(
'''
<div class="header">
<br/>
<p style="font-size:22px;">
ExpertLongBench: Benchmarking Language Models on Expert-Level Long-Form Generation with Structured Checklists
</p>
        <p style="font-size:20px;">
        <a href="https://arxiv.org/abs/2506.01241">Paper</a> | <a href="https://github.com/launchnlp/ExpertLongBench">GitHub</a> | <a href="https://huggingface.co/datasets/launch/ExpertLongBench">Public Dataset</a> |
        <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 12 | Updated: <strong>June 2025</strong>
        </p>
</div>
''',
unsafe_allow_html=True
)
# ─── Load data ─────────────────────────────────────────────────────────────────
@st.cache_data
def load_data(path):
    """Load a JSONL leaderboard file with a "Model" column and per-task scores T1-T11,
    add their average, and compute per-column ranks (1 = best)."""
    df = pd.read_json(path, lines=True)
    score_cols = [f"T{i}" for i in range(1, 12)]
    df["Avg"] = df[score_cols].mean(axis=1).round(1)
    # df["Avg"] = np.ceil(df[score_cols].mean(axis=1) * 10) / 10
    # Compute rank per column (1 = best)
    for col in score_cols + ["Avg"]:
        df[f"{col}_rank"] = df[col].rank(ascending=False, method="min").astype(int)
    return df
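# Example of the JSONL row format load_data expects (model name and values below
# are hypothetical, for illustration only):
#   {"Model": "some-model", "T1": 25.1, "T2": 18.4, ..., "T11": 30.2}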
# One-page description
st.markdown("## Leaderboard")
# st.markdown("**Leaderboard:** higher scores shaded green; best models bolded.")
tiers = ['F1', 'Accuracy']
selected_tier = st.selectbox('Select metric:', tiers)
if selected_tier == 'F1':
    df = load_data("src/models.json")
    # Precompute worst (largest) rank per column for color scaling
    score_cols = [f"T{i}" for i in range(1, 12)] + ["Avg"]
    max_ranks = {col: df[f"{col}_rank"].max() for col in score_cols}
    # Build raw HTML table
    cols = ["Model"] + [f"T{i}" for i in range(1, 12)] + ["Avg"]
    html = "<table style='border-collapse:collapse; width:100%; font-size:14px;'>"
    # header
    html += "<tr>" + "".join(f"<th style='padding:6px;'>{col}</th>" for col in cols) + "</tr>"
    # rows
    for _, row in df.iterrows():
        html += "<tr>"
        for col in cols:
            val = row[col]
            if col == "Model":
                html += f"<td style='padding:6px; text-align:left;'>{val}</td>"
            else:
                # Map rank to [0, 1]: rank 1 -> 1 (strongest shading), worst rank -> 0 (no shading);
                # the "or 1" avoids division by zero when every model ties at rank 1.
                rank = int(row[f"{col}_rank"])
                norm = 1 - (rank - 1) / ((max_ranks[col] - 1) or 1)
                # shade from rgb(182,243,255) at rank 1 to white at the worst rank (b is fixed at 255)
                r = int(255 - norm*(255-182))
                g = int(255 - norm*(255-243))
                b = 255
                bold = "font-weight:bold;" if rank == 1 else ""
                style = f"background-color:rgb({r},{g},{b}); padding:6px; {bold}"
                html += f"<td style='{style}'>{val}</td>"
        html += "</tr>"
    html += "</table>"
    st.markdown(html, unsafe_allow_html=True)
else:
    df2 = load_data("src/model_acc.json")
    # Precompute worst (largest) rank per column for color scaling
    score_cols = [f"T{i}" for i in range(1, 12)] + ["Avg"]
    max_ranks = {col: df2[f"{col}_rank"].max() for col in score_cols}
    # Build raw HTML table
    cols = ["Model"] + [f"T{i}" for i in range(1, 12)] + ["Avg"]
    html = "<table style='border-collapse:collapse; width:100%; font-size:14px;'>"
    # header
    html += "<tr>" + "".join(f"<th style='padding:6px;'>{col}</th>" for col in cols) + "</tr>"
    # rows
    for _, row in df2.iterrows():
        html += "<tr>"
        for col in cols:
            # Accuracy scores are shown with one decimal place
            val = f"{row[col]:.1f}" if col != "Model" else row[col]
            if col == "Model":
                html += f"<td style='padding:6px; text-align:left;'>{val}</td>"
            else:
                rank = int(row[f"{col}_rank"])
                norm = 1 - (rank - 1) / ((max_ranks[col] - 1) or 1)
                # shade from rgb(182,243,255) at rank 1 to white at the worst rank (b is fixed at 255)
                r = int(255 - norm*(255-182))
                g = int(255 - norm*(255-243))
                b = 255
                bold = "font-weight:bold;" if rank == 1 else ""
                style = f"background-color:rgb({r},{g},{b}); padding:6px; {bold}"
                html += f"<td style='{style}'>{val}</td>"
        html += "</tr>"
    html += "</table>"
    st.markdown(html, unsafe_allow_html=True)
pipeline_image = Image.open("src/pipeline.png")
buffered2 = BytesIO()
pipeline_image.save(buffered2, format="PNG")
img_data_pipeline = base64.b64encode(buffered2.getvalue()).decode("utf-8")
st.markdown("## Abstract")
st.write(
"""
The paper introduces ExpertLongBench, an expert-level benchmark containing 11 tasks from 9 domains that reflect realistic expert workflows and applications.
Beyond question answering, the application-driven tasks in ExpertLongBench demand long-form outputs that can exceed 5,000 tokens and strict adherence to domain-specific requirements. Notably, each task includes rubrics, designed or validated by domain experts, to specify task requirements and guide output evaluation. Furthermore, we propose CLEAR to support accurate evaluation of long-form model outputs on our benchmark.
For fine-grained, expert-aligned evaluation, CLEAR derives checklists from model outputs and reference outputs by extracting information corresponding to items on the task-specific rubrics.
Checklist items for model outputs are then compared with corresponding items for reference outputs to assess their correctness, enabling grounded evaluation.
We benchmark 11 large language models (LLMs) and analyze components in CLEAR, showing that:
(1) existing LLMs, with the top performer achieving only a 26.8% F1 score, require significant improvement for expert-level tasks;
(2) models can generate content corresponding to the required aspects, though often not accurately; and
(3) accurate checklist extraction and comparison in CLEAR can be achieved by open-weight models for more scalable and low-cost usage.
"""
)
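# ── Illustrative sketch (not used by this app) ─────────────────────────────────
# The abstract above describes CLEAR's checklist-based scoring: checklist items
# extracted from a model output are compared against the corresponding items
# extracted from the reference output, and an F1 score is reported. The helper
# below is a minimal sketch of such an item-level F1, assuming the comparison
# step yields a count of correctly matched items; the name and signature are
# illustrative, not the official evaluation code (see the GitHub repository).
def checklist_item_f1(correct_items: int, model_items: int, reference_items: int) -> float:
    """Item-level F1: precision over model-output items, recall over reference items."""
    if model_items == 0 or reference_items == 0:
        return 0.0
    precision = correct_items / model_items
    recall = correct_items / reference_items
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)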
st.markdown("## Evaluation Pipeline")
st.markdown(
f"""
<div class="logo-container" style="display:flex; justify-content: center;">
<img src="data:image/png;base64,{img_data_pipeline}" style="width:90%; max-width:900px;"/>
</div>
""",
unsafe_allow_html=True
)
st.markdown('## Submit Your Model')
st.write(
"""
    We provide both **public** and **private** subsets of the dataset.
    We recommend starting with the public set for initial testing and development.
    You're welcome to submit your model for evaluation on the private set; just make sure to include your results on the public set.
    You can submit your model through the following link: [https://forms.gle/mWa6joCfgQnwXsxeA](https://forms.gle/mWa6joCfgQnwXsxeA)
"""
)
st.markdown('## We Welcome Contributions from the Community')
st.write(
"""
    We actively encourage contributions from the research community, including:
    - Proposing new tasks and contributing data
    - Suggesting improvements to existing ones
    - Sharing domain-specific insights
    Your input is invaluable in making ExpertLongBench more representative and impactful across expert domains.
    If you're interested in contributing or collaborating, feel free to reach out to us: Jie Ruan (jieruan@umich.edu), Inderjeet Nair (inair@umich.edu), Shuyang Cao (caoshuy@umich.edu), Lu Wang (wangluxy@umich.edu).
    Let's build better evaluations for expert-level AI, together.
"""
)
st.markdown("## Citation")
st.write("""
```bibtex
@article{ruan2025expertlongbench,
  title={ExpertLongBench: Benchmarking Language Models on Expert-Level Long-Form Generation Tasks with Structured Checklists},
  author={Ruan, Jie and Nair, Inderjeet and Cao, Shuyang and Liu, Amy and Munir, Sheza and Pollens-Dempsey, Micah and Chiang, Tiffany and Kates, Lucy and David, Nicholas and Chen, Sihan and others},
  journal={arXiv preprint arXiv:2506.01241},
  year={2025}
}
```
""")