import streamlit as st
import pandas as pd
from PIL import Image
import base64
from io import BytesIO

# ─── Page config ──────────────────────────────────────────────────────────────
st.set_page_config(page_title="ExpertLongBench Leaderboard", layout="wide")

logo_small = Image.open("src/logo.png")
logo_image = Image.open("src/ExpertLongBench.png")

def encode_image(image):
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")

# Encode logos as base64 so they can be embedded inline in the HTML rendered below
img_logo = encode_image(logo_small)
img_data = encode_image(logo_image)

st.markdown(
    f"""
    <div class="logo-container" style="display:flex; justify-content: center; align-items: center; gap: 20px;">
        <img src="data:image/png;base64,{img_logo}" style="width:60px;"/>
        <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
    </div>
    """,
    unsafe_allow_html=True
)

st.markdown(
    '''
    <div class="header">
        <br/>
        <p style="font-size:22px;">
        ExpertLongBench: Benchmarking Language Models on Expert-Level Long-Form Generation Tasks with Structured Checklists
        </p>
        <p style="font-size:20px;">
              πŸ“‘ <a href="https://arxiv.org/abs/2506.01241">Paper</a> | πŸ’» <a href="https://github.com/launchnlp/ExpertLongBench">GitHub</a> | πŸ€— <a href="https://huggingface.co/datasets/launch/ExpertLongBench">Public Dataset</a> |     
               βš™οΈ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 12 | Updated: <strong>June 2025</strong>
        </p>
    </div>
    ''',
    unsafe_allow_html=True
)
# ─── Load data ────────────────────────────────────────────────────────────────
@st.cache_data
def load_data(path):
    """Load per-task scores from a JSON Lines file and add average and rank columns."""
    df = pd.read_json(path, lines=True)
    score_cols = [f"T{i}" for i in range(1, 12)]
    df["Avg"] = df[score_cols].mean(axis=1).round(1)
    # Compute rank per column (1 = best)
    for col in score_cols + ["Avg"]:
        df[f"{col}_rank"] = df[col].rank(ascending=False, method="min").astype(int)
    return df
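# Assumed input format (inferred from the column accesses above, not verified against the
# data files): each line of src/models.json / src/model_acc.json is a JSON object with a
# "Model" name and one numeric score per task, e.g.
#   {"Model": "ExampleModel", "T1": 23.4, "T2": 18.1, ..., "T11": 20.7}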


# ─── Leaderboard ──────────────────────────────────────────────────────────────
st.markdown("## 🏆 Leaderboard")

tiers = ['F1', 'Accuracy']
selected_tier = st.selectbox('Select metric:', tiers)

def render_leaderboard(df, fmt=None):
    """Render one leaderboard table as raw HTML with per-column shading (rank 1 = darkest, bold)."""
    score_cols = [f"T{i}" for i in range(1, 12)] + ["Avg"]
    # Precompute max ranks for color scaling
    max_ranks = {col: df[f"{col}_rank"].max() for col in score_cols}

    cols = ["Model"] + score_cols
    html = "<table style='border-collapse:collapse; width:100%; font-size:14px;'>"
    # header
    html += "<tr>" + "".join(f"<th style='padding:6px;'>{col}</th>" for col in cols) + "</tr>"
    # rows
    for _, row in df.iterrows():
        html += "<tr>"
        for col in cols:
            if col == "Model":
                html += f"<td style='padding:6px; text-align:left;'>{row[col]}</td>"
            else:
                val = fmt(row[col]) if fmt else row[col]
                rank = int(row[f"{col}_rank"])
                # Normalize rank to [0, 1]: 1 for the best model in the column, 0 for the worst
                norm = 1 - (rank - 1) / ((max_ranks[col] - 1) or 1)
                # Shade from white rgb(255,255,255) toward rgb(182,243,255): the red and green
                # channels are interpolated while blue stays at 255, so better scores get a
                # progressively stronger pale green-blue background
                r = int(255 - norm * (255 - 182))
                g = int(255 - norm * (255 - 243))
                b = 255
                bold = "font-weight:bold;" if rank == 1 else ""
                style = f"background-color:rgb({r},{g},{b}); padding:6px; {bold}"
                html += f"<td style='{style}'>{val}</td>"
        html += "</tr>"
    html += "</table>"
    st.markdown(html, unsafe_allow_html=True)


if selected_tier == 'F1':
    df = load_data("src/models.json")
    render_leaderboard(df)
else:
    df2 = load_data("src/model_acc.json")
    # Accuracy scores are shown with one decimal place
    render_leaderboard(df2, fmt=lambda v: f"{v:.1f}")


pipeline_image = Image.open("src/pipeline.png")
img_data_pipeline = encode_image(pipeline_image)
st.markdown("## 🧠 Abstract")
st.write(
"""
We introduce ExpertLongBench, an expert-level benchmark containing 11 tasks from 9 domains that reflect realistic expert workflows and applications.
Beyond question answering, the application-driven tasks in ExpertLongBench demand long-form outputs that can exceed 5,000 tokens and strict adherence to domain-specific requirements. Notably, each task includes rubrics, designed or validated by domain experts, to specify task requirements and guide output evaluation. Furthermore, we propose CLEAR to support accurate evaluation of long-form model outputs on our benchmark.

For fine-grained, expert-aligned evaluation, CLEAR derives checklists from model outputs and reference outputs by extracting information corresponding to items on the task-specific rubrics.
Checklist items for model outputs are then compared with corresponding items for reference outputs to assess their correctness, enabling grounded evaluation.  

We benchmark 11 large language models (LLMs) and analyze components in CLEAR, showing that:
(1) existing LLMs, with the top performer achieving only a 26.8% F1 score, require significant improvement for expert-level tasks;
(2) models can generate content corresponding to the required aspects, though often not accurately; and
(3) accurate checklist extraction and comparison in CLEAR can be achieved by open-weight models for more scalable and low-cost usage.
"""
)


st.markdown("## 🧰 Evaluation Pipeline")
st.markdown(
f"""
<div class="logo-container" style="display:flex; justify-content: center;">
    <img src="data:image/png;base64,{img_data_pipeline}" style="width:90%; max-width:900px;"/>
</div>
""",
unsafe_allow_html=True
)

st.markdown('## πŸ€– Submit Your Model')
st.write(
"""
We provide both 🌐 **public** and πŸ”’ **private** subsets of the dataset.

πŸ§ͺ We recommend starting with the public set for initial testing and development.

πŸ“€ You're welcome to submit your model for evaluation on the private set β€” just make sure to include your results on the public set.

πŸ‘‰ You can submit your model through the following link: [https://forms.gle/mWa6joCfgQnwXsxeA](https://forms.gle/mWa6joCfgQnwXsxeA)
"""
)

st.markdown('## πŸ“’ We Welcome Contributions from the Community')
st.write(
"""
We actively encourage contributions from the research community β€” including:

- βœ… Proposing new tasks and contributing data
- πŸ” Suggesting improvements to existing ones
- 🧠 Sharing domain-specific insights βš–οΈπŸ§ͺπŸ₯πŸ“š 

Your input is invaluable in making ExpertLongBench more representative and impactful across expert domains.

If you're interested in contributing or collaborating, feel free to reach out to us: Jie Ruan (jieruan@umich.edu), Inderjeet Nair (inair@umich.edu), Shuyang Cao (caoshuy@umich.edu), Lu Wang (wangluxy@umich.edu).

Let’s build better evaluations for expert-level AI β€” together πŸš€πŸ€
"""
)

st.markdown("## πŸ“š Citation")
st.write("""
```bibtex
@article{ruan2025expertlongbench,
  title={ExpertLongBench: Benchmarking Language Models on Expert-Level Long-Form Generation Tasks with Structured Checklists},
  author={Ruan, Jie and Nair, Inderjeet and Cao, Shuyang and Liu, Amy and Munir, Sheza and Pollens-Dempsey, Micah and Chiang, Tiffany and Kates, Lucy and David, Nicholas and Chen, Sihan and others},
  journal={arXiv preprint arXiv:2506.01241},
  year={2025}
}
```
""")
