JieRuan committed on
Commit 8540c68 · verified · 1 Parent(s): 8ade390

Update src/streamlit_app.py

Files changed (1)
  1. src/streamlit_app.py +174 -84
src/streamlit_app.py CHANGED
@@ -65,97 +65,187 @@ df = load_data()
  score_cols = [f"T{i}" for i in range(1, 12)] + ["Avg"]
  max_ranks = {col: df[f"{col}_rank"].max() for col in score_cols}

- # ─── Tabs ──────────────────────────────────────────────────────────────────────
- tab1, tab2, tab3, tab4 = st.tabs(["🏆 Leaderboard", "🔍 Benchmark Details", "🤖 Submit Your Model", "🧩 Community Contributions Welcome"])
-
- with tab1:
- # st.markdown("**Leaderboard:** higher scores shaded green; best models bolded.")
- # Build raw HTML table
- cols = ["Model"] + [f"T{i}" for i in range(1,12)] + ["Avg"]
- html = "<table style='border-collapse:collapse; width:100%; font-size:14px;'>"
- # header
- html += "<tr>" + "".join(f"<th style='padding:6px;'>{col}</th>" for col in cols) + "</tr>"
- # rows
- for _, row in df.iterrows():
- html += "<tr>"
- for col in cols:
- val = row[col]
- if col == "Model":
- html += f"<td style='padding:6px; text-align:left;'>{val}</td>"
- else:
- rank = int(row[f"{col}_rank"])
- norm = 1 - (rank - 1) / ((max_ranks[col] - 1) or 1)
- # interpolate green (182,243,182) → white (255,255,255)
- r = int(255 - norm*(255-182))
- g = int(255 - norm*(255-243))
- b = 255
- bold = "font-weight:bold;" if rank == 1 else ""
- style = f"background-color:rgb({r},{g},{b}); padding:6px; {bold}"
- html += f"<td style='{style}'>{val}</td>"
- html += "</tr>"
- html += "</table>"
- st.markdown(html, unsafe_allow_html=True)
-
- with tab2:
- pipeline_image = Image.open("src/pipeline.png")
- buffered2 = BytesIO()
- pipeline_image.save(buffered2, format="PNG")
- img_data_pipeline = base64.b64encode(buffered2.getvalue()).decode("utf-8")
- st.markdown("## Abstract")
- st.write(
- """
- The paper introduces ExpertLongBench, an expert-level benchmark containing 11 tasks from 9 domains that reflect realistic expert workflows and applications.
- Beyond question answering, the application-driven tasks in ExpertLongBench demand long-form outputs that can exceed 5,000 tokens and strict adherence to domain-specific requirements. Notably, each task includes rubrics, designed or validated by domain experts, to specify task requirements and guide output evaluation. Furthermore, we propose CLEAR to support accurate evaluation of long-form model outputs on our benchmark.
-
- For fine-grained, expert-aligned evaluation, CLEAR derives checklists from model outputs and reference outputs by extracting information corresponding to items on the task-specific rubrics.
- Checklist items for model outputs are then compared with corresponding items for reference outputs to assess their correctness, enabling grounded evaluation.
-
- We benchmark 11 large language models (LLMs) and analyze components in CLEAR, showing that:
- (1) existing LLMs, with the top performer achieving only a 26.8% F1 score, require significant improvement for expert-level tasks;
- (2) models can generate content corresponding to the required aspects, though often not accurately; and
- (3) accurate checklist extraction and comparison in CLEAR can be achieved by open-weight models for more scalable and low-cost usage.
- """
- )

-
- st.markdown("## Pipeline")
- st.markdown(
- f"""
- <div class="logo-container" style="display:flex; justify-content: center;">
- <img src="data:image/png;base64,{img_data_pipeline}" style="width:90%; max-width:900px;"/>
- </div>
- """,
- unsafe_allow_html=True
  )

- with tab3:
- st.markdown('## 🤖 Submit Your Model')
- st.write(
- """
- We provide both 🌐 **public** and 🔒 **private** subsets of the dataset.

- 🧪 We recommend starting with the public set for initial testing and development.

- 📤 You're welcome to submit your model for evaluation on the private set — just make sure to include your results on the public set.

- 👉 You can submit your model through the following link: [https://forms.gle/mWa6joCfgQnwXsxeA](https://forms.gle/mWa6joCfgQnwXsxeA)
- """
- )
-
- with tab4:
- st.markdown('## 📢 We Welcome Contributions from the Community')
- st.write(
- """
- We actively encourage contributions from the research community — including:

- - ✅ Proposing new tasks and contributing data
- - 🔍 Suggesting improvements to existing ones
- - 🧠 Sharing domain-specific insights ⚖️🧪🏥📚

- Your input is invaluable in making ExpertLongBench more representative and impactful across expert domains.

- If you're interested in contributing or collaborating, feel free to reach out to us: Jie Ruan (jieruan@umich.edu), Inderjeet Nair (inair@umich.edu), Shuyang Cao (caoshuy@umich.edu), Lu Wang (wangluxy@umich.edu).

- Let's build better evaluations for expert-level AI — together 🚀🤝
- """
- )
+ # one page description
+ # st.markdown("**Leaderboard:** higher scores shaded green; best models bolded.")
+ # Build raw HTML table
+ cols = ["Model"] + [f"T{i}" for i in range(1,12)] + ["Avg"]
+ html = "<table style='border-collapse:collapse; width:100%; font-size:14px;'>"
+ # header
+ html += "<tr>" + "".join(f"<th style='padding:6px;'>{col}</th>" for col in cols) + "</tr>"
+ # rows
+ for _, row in df.iterrows():
+ html += "<tr>"
+ for col in cols:
+ val = row[col]
+ if col == "Model":
+ html += f"<td style='padding:6px; text-align:left;'>{val}</td>"
+ else:
+ rank = int(row[f"{col}_rank"])
+ norm = 1 - (rank - 1) / ((max_ranks[col] - 1) or 1)
+ # interpolate green (182,243,182) → white (255,255,255)
+ r = int(255 - norm*(255-182))
+ g = int(255 - norm*(255-243))
+ b = 255
+ bold = "font-weight:bold;" if rank == 1 else ""
+ style = f"background-color:rgb({r},{g},{b}); padding:6px; {bold}"
+ html += f"<td style='{style}'>{val}</td>"
+ html += "</tr>"
+ html += "</table>"
+ st.markdown(html, unsafe_allow_html=True)
+
+ pipeline_image = Image.open("src/pipeline.png")
+ buffered2 = BytesIO()
+ pipeline_image.save(buffered2, format="PNG")
+ img_data_pipeline = base64.b64encode(buffered2.getvalue()).decode("utf-8")
+ st.markdown("## Abstract")
+ st.write(
+ """
+ The paper introduces ExpertLongBench, an expert-level benchmark containing 11 tasks from 9 domains that reflect realistic expert workflows and applications.
+ Beyond question answering, the application-driven tasks in ExpertLongBench demand long-form outputs that can exceed 5,000 tokens and strict adherence to domain-specific requirements. Notably, each task includes rubrics, designed or validated by domain experts, to specify task requirements and guide output evaluation. Furthermore, we propose CLEAR to support accurate evaluation of long-form model outputs on our benchmark.
+
+ For fine-grained, expert-aligned evaluation, CLEAR derives checklists from model outputs and reference outputs by extracting information corresponding to items on the task-specific rubrics.
+ Checklist items for model outputs are then compared with corresponding items for reference outputs to assess their correctness, enabling grounded evaluation.
+
+ We benchmark 11 large language models (LLMs) and analyze components in CLEAR, showing that:
+ (1) existing LLMs, with the top performer achieving only a 26.8% F1 score, require significant improvement for expert-level tasks;
+ (2) models can generate content corresponding to the required aspects, though often not accurately; and
+ (3) accurate checklist extraction and comparison in CLEAR can be achieved by open-weight models for more scalable and low-cost usage.
+ """
+ )

+
+ st.markdown("## Pipeline")
+ st.markdown(
+ f"""
+ <div class="logo-container" style="display:flex; justify-content: center;">
+ <img src="data:image/png;base64,{img_data_pipeline}" style="width:90%; max-width:900px;"/>
+ </div>
+ """,
+ unsafe_allow_html=True
+ )
+
+ st.markdown('## 🤖 Submit Your Model')
+ st.write(
+ """
+ We provide both 🌐 **public** and 🔒 **private** subsets of the dataset.
+
+ 🧪 We recommend starting with the public set for initial testing and development.
+
+ 📤 You're welcome to submit your model for evaluation on the private set — just make sure to include your results on the public set.
+
+ 👉 You can submit your model through the following link: [https://forms.gle/mWa6joCfgQnwXsxeA](https://forms.gle/mWa6joCfgQnwXsxeA)
+ """
  )

+ st.markdown('## 📢 We Welcome Contributions from the Community')
+ st.write(
+ """
+ We actively encourage contributions from the research community — including:
+
+ - ✅ Proposing new tasks and contributing data
+ - 🔍 Suggesting improvements to existing ones
+ - 🧠 Sharing domain-specific insights ⚖️🧪🏥📚
+
+ Your input is invaluable in making ExpertLongBench more representative and impactful across expert domains.
+
+ If you're interested in contributing or collaborating, feel free to reach out to us: Jie Ruan (jieruan@umich.edu), Inderjeet Nair (inair@umich.edu), Shuyang Cao (caoshuy@umich.edu), Lu Wang (wangluxy@umich.edu).
+
+ Let's build better evaluations for expert-level AI — together 🚀🤝
+ """
+ )
+
+
+ # # ─── Tabs ──────────────────────────────────────────────────────────────────────
+ # tab1, tab2, tab3, tab4 = st.tabs(["🏆 Leaderboard", "🔍 Benchmark Details", "🤖 Submit Your Model", "🧩 Community Contributions Welcome"])
+
+ # with tab1:
+ # # st.markdown("**Leaderboard:** higher scores shaded green; best models bolded.")
+ # # Build raw HTML table
+ # cols = ["Model"] + [f"T{i}" for i in range(1,12)] + ["Avg"]
+ # html = "<table style='border-collapse:collapse; width:100%; font-size:14px;'>"
+ # # header
+ # html += "<tr>" + "".join(f"<th style='padding:6px;'>{col}</th>" for col in cols) + "</tr>"
+ # # rows
+ # for _, row in df.iterrows():
+ # html += "<tr>"
+ # for col in cols:
+ # val = row[col]
+ # if col == "Model":
+ # html += f"<td style='padding:6px; text-align:left;'>{val}</td>"
+ # else:
+ # rank = int(row[f"{col}_rank"])
+ # norm = 1 - (rank - 1) / ((max_ranks[col] - 1) or 1)
+ # # interpolate green (182,243,182) → white (255,255,255)
+ # r = int(255 - norm*(255-182))
+ # g = int(255 - norm*(255-243))
+ # b = 255
+ # bold = "font-weight:bold;" if rank == 1 else ""
+ # style = f"background-color:rgb({r},{g},{b}); padding:6px; {bold}"
+ # html += f"<td style='{style}'>{val}</td>"
+ # html += "</tr>"
+ # html += "</table>"
+ # st.markdown(html, unsafe_allow_html=True)
+
+ # with tab2:
+ # pipeline_image = Image.open("src/pipeline.png")
+ # buffered2 = BytesIO()
+ # pipeline_image.save(buffered2, format="PNG")
+ # img_data_pipeline = base64.b64encode(buffered2.getvalue()).decode("utf-8")
+ # st.markdown("## Abstract")
+ # st.write(
+ # """
+ # The paper introduces ExpertLongBench, an expert-level benchmark containing 11 tasks from 9 domains that reflect realistic expert workflows and applications.
+ # Beyond question answering, the application-driven tasks in ExpertLongBench demand long-form outputs that can exceed 5,000 tokens and strict adherence to domain-specific requirements. Notably, each task includes rubrics, designed or validated by domain experts, to specify task requirements and guide output evaluation. Furthermore, we propose CLEAR to support accurate evaluation of long-form model outputs on our benchmark.
+
+ # For fine-grained, expert-aligned evaluation, CLEAR derives checklists from model outputs and reference outputs by extracting information corresponding to items on the task-specific rubrics.
+ # Checklist items for model outputs are then compared with corresponding items for reference outputs to assess their correctness, enabling grounded evaluation.
+
+ # We benchmark 11 large language models (LLMs) and analyze components in CLEAR, showing that:
+ # (1) existing LLMs, with the top performer achieving only a 26.8% F1 score, require significant improvement for expert-level tasks;
+ # (2) models can generate content corresponding to the required aspects, though often not accurately; and
+ # (3) accurate checklist extraction and comparison in CLEAR can be achieved by open-weight models for more scalable and low-cost usage.
+ # """
+ # )
+
+
+ # st.markdown("## Pipeline")
+ # st.markdown(
+ # f"""
+ # <div class="logo-container" style="display:flex; justify-content: center;">
+ # <img src="data:image/png;base64,{img_data_pipeline}" style="width:90%; max-width:900px;"/>
+ # </div>
+ # """,
+ # unsafe_allow_html=True
+ # )
+
+ # with tab3:
+ # st.markdown('## 🤖 Submit Your Model')
+ # st.write(
+ # """
+ # We provide both 🌐 **public** and 🔒 **private** subsets of the dataset.

+ # 🧪 We recommend starting with the public set for initial testing and development.

+ # 📤 You're welcome to submit your model for evaluation on the private set — just make sure to include your results on the public set.

+ # 👉 You can submit your model through the following link: [https://forms.gle/mWa6joCfgQnwXsxeA](https://forms.gle/mWa6joCfgQnwXsxeA)
+ # """
+ # )
+
+ # with tab4:
+ # st.markdown('## 📢 We Welcome Contributions from the Community')
+ # st.write(
+ # """
+ # We actively encourage contributions from the research community — including:

+ # - ✅ Proposing new tasks and contributing data
+ # - 🔍 Suggesting improvements to existing ones
+ # - 🧠 Sharing domain-specific insights ⚖️🧪🏥📚

+ # Your input is invaluable in making ExpertLongBench more representative and impactful across expert domains.

+ # If you're interested in contributing or collaborating, feel free to reach out to us: Jie Ruan (jieruan@umich.edu), Inderjeet Nair (inair@umich.edu), Shuyang Cao (caoshuy@umich.edu), Lu Wang (wangluxy@umich.edu).

+ # Let's build better evaluations for expert-level AI — together 🚀🤝
+ # """
+ # )
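
The only non-trivial logic in the new single-page layout is the cell shading in the leaderboard table: each score column's rank is normalized to [0, 1], the red and green channels are interpolated toward (182, 243) while blue stays at 255, and the rank-1 cell is bolded. Below is a minimal standalone sketch of that mapping for reference; the helper name `rank_to_style` and the demo loop are illustrative and not part of the commit.

```python
# Illustrative sketch only: mirrors the shading math committed in
# src/streamlit_app.py; the function name is hypothetical.

def rank_to_style(rank: int, max_rank: int) -> str:
    """Map a 1-based rank to the inline CSS style used for a table cell.

    norm is 1.0 for the best rank and 0.0 for the worst; the `or 1`
    guard avoids division by zero when a column has only one rank.
    """
    norm = 1 - (rank - 1) / ((max_rank - 1) or 1)
    # Red and green are interpolated toward (182, 243); blue is pinned at 255,
    # so the strongest shade is rgb(182, 243, 255), a pale blue-green.
    r = int(255 - norm * (255 - 182))
    g = int(255 - norm * (255 - 243))
    b = 255
    bold = "font-weight:bold;" if rank == 1 else ""
    return f"background-color:rgb({r},{g},{b}); padding:6px; {bold}"


if __name__ == "__main__":
    # Best, middle, and worst of 11 ranked models.
    for rank in (1, 6, 11):
        print(rank, rank_to_style(rank, max_rank=11))
```

Note that the inline comment in the committed code describes a green-to-white ramp; if that is the intent, the blue channel would need the same interpolation as red, e.g. `b = int(255 - norm * (255 - 182))`.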