Upgrade Gradio table
Browse files
- README.md +2 -2
- app.py +27 -6
- debug.ipynb → dev.ipynb +0 -0
README.md
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
emoji: ⚡
|
| 4 |
colorFrom: gray
|
| 5 |
colorTo: red
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
app_file: app.py
|
| 9 |
pinned: true
|
| 10 |
---
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Large Reasoning Models Leaderboard
|
| 3 |
emoji: ⚡
|
| 4 |
colorFrom: gray
|
| 5 |
colorTo: red
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 5.24.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: true
|
| 10 |
---
|
app.py
CHANGED
|
@@ -4,10 +4,10 @@ from pathlib import Path
|
|
| 4 |
import gradio as gr
|
| 5 |
import pandas as pd
|
| 6 |
|
| 7 |
-
TITLE = """<h1 align="center" id="space-title">
|
| 8 |
|
| 9 |
DESCRIPTION = f"""
|
| 10 |
-
Evaluation of
|
| 11 |
"""
|
| 12 |
|
| 13 |
BENCHMARKS_TO_SKIP = ["math", "mini_math", "aimo_math_integer_lvl4-5", "mini_math_v2"]
|
|
@@ -80,6 +80,14 @@ def get_leaderboard_df():
|
|
| 80 |
elif task.lower() == "agieval":
|
| 81 |
value = data["results"]["all"]["acc_norm"]
|
| 82 |
df.loc[model_revision, task] = float(value)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
# MATH reports qem
|
| 84 |
elif task.lower() in ["aimo_kaggle", "math_deepseek_cot", "math_deepseek_rl_cot"]:
|
| 85 |
value = data["results"]["all"]["qem"]
|
|
@@ -135,7 +143,10 @@ def get_leaderboard_df():
|
|
| 135 |
# Trim AIMO column names
|
| 136 |
df.columns = [c.replace("aimo_", "") for c in df.columns]
|
| 137 |
|
| 138 |
-
df = df.reset_index().rename(columns={"index": "Model"})
|
|
|
|
|
|
|
|
|
|
| 139 |
# Strip off date from model name
|
| 140 |
df["Model"] = df["Model"].apply(lambda x: x.rsplit("_", 1)[0])
|
| 141 |
|
|
@@ -154,6 +165,9 @@ def agg_df(df, agg: str = "max"):
|
|
| 154 |
|
| 155 |
# Convert all values to percentage
|
| 156 |
df[df.select_dtypes(include=["number"]).columns] *= 100.0
|
|
|
|
|
|
|
|
|
|
| 157 |
df = df.sort_values(by=["Average"], ascending=False)
|
| 158 |
return df
|
| 159 |
|
|
@@ -177,6 +191,9 @@ def filter_and_search(cols: list[str], search_query: str, agg: str):
|
|
| 177 |
df = df.copy().dropna(how="all", axis=0, subset=[c for c in df.columns if c in cols])
|
| 178 |
# Recompute average
|
| 179 |
df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
|
|
|
|
|
|
|
|
|
|
| 180 |
return df
|
| 181 |
|
| 182 |
|
|
@@ -187,7 +204,9 @@ with demo:
|
|
| 187 |
with gr.Column():
|
| 188 |
gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
|
| 189 |
with gr.Row():
|
| 190 |
-
search_bar = gr.Textbox(
|
|
|
|
|
|
|
| 191 |
agg = gr.Radio(
|
| 192 |
["min", "max", "mean"],
|
| 193 |
value="max",
|
|
@@ -196,7 +215,7 @@ with demo:
|
|
| 196 |
)
|
| 197 |
with gr.Row():
|
| 198 |
cols_bar = gr.CheckboxGroup(
|
| 199 |
-
choices=[c for c in leaderboard_df.columns[1:] if c
|
| 200 |
show_label=False,
|
| 201 |
info="Select columns to display",
|
| 202 |
)
|
|
@@ -204,8 +223,10 @@ with demo:
|
|
| 204 |
leaderboard_table = gr.Dataframe(
|
| 205 |
value=leaderboard_df,
|
| 206 |
wrap=True,
|
| 207 |
-
|
| 208 |
column_widths=[400, 110] + [(260 + len(c)) for c in leaderboard_df.columns[1:]],
|
|
|
|
|
|
|
| 209 |
)
|
| 210 |
|
| 211 |
cols_bar.change(filter_and_search, inputs=[cols_bar, search_bar, agg], outputs=[leaderboard_table])
|
|
|
|
| 4 |
import gradio as gr
|
| 5 |
import pandas as pd
|
| 6 |
|
| 7 |
+
TITLE = """<h1 align="center" id="space-title">Large Reasoning Models Leaderboard</h1>"""
|
| 8 |
|
| 9 |
DESCRIPTION = f"""
|
| 10 |
+
Evaluation of Open R1 models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
|
| 11 |
"""
|
| 12 |
|
| 13 |
BENCHMARKS_TO_SKIP = ["math", "mini_math", "aimo_math_integer_lvl4-5", "mini_math_v2"]
|
|
|
|
| 80 |
elif task.lower() == "agieval":
|
| 81 |
value = data["results"]["all"]["acc_norm"]
|
| 82 |
df.loc[model_revision, task] = float(value)
|
| 83 |
+
# AIME24 and 25 report pass@1
|
| 84 |
+
elif task.lower() in ["aime24", "aime25"]:
|
| 85 |
+
value = (
|
| 86 |
+
data["results"]["all"]["math_pass@1:32_samples"]
|
| 87 |
+
if "math_pass@1:32_samples" in data["results"]["all"]
|
| 88 |
+
else -1
|
| 89 |
+
)
|
| 90 |
+
df.loc[model_revision, task] = float(value)
|
| 91 |
# MATH reports qem
|
| 92 |
elif task.lower() in ["aimo_kaggle", "math_deepseek_cot", "math_deepseek_rl_cot"]:
|
| 93 |
value = data["results"]["all"]["qem"]
|
|
|
|
| 143 |
# Trim AIMO column names
|
| 144 |
df.columns = [c.replace("aimo_", "") for c in df.columns]
|
| 145 |
|
| 146 |
+
df = df.reset_index().rename(columns={"index": "Model"})
|
| 147 |
+
# Apply rounding only to numeric columns
|
| 148 |
+
numeric_cols = df.select_dtypes(include=["float64", "float32", "int64", "int32"]).columns
|
| 149 |
+
df[numeric_cols] = df[numeric_cols].round(4)
|
| 150 |
# Strip off date from model name
|
| 151 |
df["Model"] = df["Model"].apply(lambda x: x.rsplit("_", 1)[0])
|
| 152 |
|
|
|
|
| 165 |
|
| 166 |
# Convert all values to percentage
|
| 167 |
df[df.select_dtypes(include=["number"]).columns] *= 100.0
|
| 168 |
+
# Apply rounding only to numeric columns
|
| 169 |
+
numeric_cols = df.select_dtypes(include=["float64", "float32", "int64", "int32"]).columns
|
| 170 |
+
df[numeric_cols] = df[numeric_cols].round(4)
|
| 171 |
df = df.sort_values(by=["Average"], ascending=False)
|
| 172 |
return df
|
| 173 |
|
|
|
|
| 191 |
df = df.copy().dropna(how="all", axis=0, subset=[c for c in df.columns if c in cols])
|
| 192 |
# Recompute average
|
| 193 |
df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
|
| 194 |
+
# Apply rounding only to numeric columns
|
| 195 |
+
numeric_cols = df.select_dtypes(include=["float64", "float32", "int64", "int32"]).columns
|
| 196 |
+
df[numeric_cols] = df[numeric_cols].round(4)
|
| 197 |
return df
|
| 198 |
|
| 199 |
|
|
|
|
| 204 |
with gr.Column():
|
| 205 |
gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
|
| 206 |
with gr.Row():
|
| 207 |
+
search_bar = gr.Textbox(
|
| 208 |
+
placeholder="Search for your model. Use semicolons for multiple terms", show_label=False
|
| 209 |
+
)
|
| 210 |
agg = gr.Radio(
|
| 211 |
["min", "max", "mean"],
|
| 212 |
value="max",
|
|
|
|
| 215 |
)
|
| 216 |
with gr.Row():
|
| 217 |
cols_bar = gr.CheckboxGroup(
|
| 218 |
+
choices=sorted([c for c in leaderboard_df.columns[1:] if c not in ["Average", "Date"]]),
|
| 219 |
show_label=False,
|
| 220 |
info="Select columns to display",
|
| 221 |
)
|
|
|
|
| 223 |
leaderboard_table = gr.Dataframe(
|
| 224 |
value=leaderboard_df,
|
| 225 |
wrap=True,
|
| 226 |
+
max_height=1000,
|
| 227 |
column_widths=[400, 110] + [(260 + len(c)) for c in leaderboard_df.columns[1:]],
|
| 228 |
+
show_row_numbers=True,
|
| 229 |
+
show_copy_button=True,
|
| 230 |
)
|
| 231 |
|
| 232 |
cols_bar.change(filter_and_search, inputs=[cols_bar, search_bar, agg], outputs=[leaderboard_table])
|
debug.ipynb → dev.ipynb
RENAMED
|
File without changes
|