asdf #1
by galb-dai - opened
- .gitattributes +0 -3
- README.md +40 -22
- SECURITY.md +0 -4
- TRADEMARKS.md +0 -5
- app.py +184 -542
- assets/DominatingSetAnimation.mp4 +0 -3
- assets/banner.png +2 -2
- assets/{bag_modifications.png → banner_backup.png} +2 -2
- assets/perf_plot.png +0 -3
- docs/privacy-policy.md +0 -56
- dummy_submission.jsonl +0 -2
- requirements.txt +2 -3
- scripts/upload_f1_dataset.py +5 -20
- src/about.py +54 -110
- src/datamodel/data.py +10 -21
- src/display/__init__.py +0 -0
- src/display/css_html_js.py +75 -195
- src/display/formatting.py +17 -0
- src/display/utils.py +82 -19
- src/envs.py +0 -2
- src/leaderboard/read_evals.py +196 -0
- src/logger.py +1 -5
- src/populate.py +68 -56
- src/submission/check_validity.py +102 -0
- src/submission/submit.py +83 -171
- src/validation/__init__.py +0 -0
- src/validation/validate.py +0 -115
- terms/submission-agreement.md +0 -71
.gitattributes
CHANGED
@@ -34,6 +34,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
 *.png filter=lfs diff=lfs merge=lfs -text
-*.mp4 filter=lfs diff=lfs merge=lfs -text
-git filter=lfs diff=lfs merge=lfs -text
-add filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -4,37 +4,55 @@ emoji: 🥇
 colorFrom: green
 colorTo: indigo
 sdk: gradio
-hf_oauth: true
 app_file: app.py
 pinned: true
 license: apache-2.0
 short_description: FormulaOne Leaderboard
-sdk_version: 5.
-python_version: 3.12.0
+sdk_version: 5.19.0
 ---
 
-*Gal Beniamini, Yuval Dor, Alon Vinnikov, Shir Granot Peled, Or Weinstein, Or Sharir, Noam Wies, Tomer Nussbaum, Nadav Schweiger, Ido Ben Shaul, Tomer Zekharya, Yoav Levine, Shai Shalev-Shwartz, Amnon Shashua* <br>
-**AAI, July 2025**
+# Start the configuration
+
+Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
+
+Results files should have the following format and be stored as JSON files:
+```json
+{
+    "config": {
+        "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
+        "model_name": "path of the model on the hub: org/model",
+        "model_sha": "revision on the hub",
+    },
+    "results": {
+        "task_name": {
+            "metric_name": score,
+        },
+        "task_name2": {
+            "metric_name": score,
+        }
+    }
+}
+```
+
+Request files are created automatically by this tool.
+
+If you encounter problems on the space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
+
+# Code logic for more complex edits
+
+You'll find:
+- the main table's column names and properties in `src/display/utils.py`
+- the logic to read all results and request files, then convert them into dataframe lines, in `src/leaderboard/read_evals.py` and `src/populate.py`
+- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
+
+# Setting up the environment
+
+If you encounter issues with the SentencePiece package, see this thread:
+https://github.com/google/sentencepiece/issues/1083
+
+Specifically:
+```
+brew install cmake pkg-config
+export CMAKE_POLICY_VERSION_MINIMUM=3.5
+pip install sentencepiece --no-cache-dir
+```
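For orientation, here is a minimal sketch of how one results file in the format above can be flattened into a single leaderboard row. The `flatten_result` helper and the `"model"` column name are illustrative only, not part of the repo; the real logic lives in `src/leaderboard/read_evals.py` and `src/populate.py`:

```python
import json


def flatten_result(path: str) -> dict:
    """Flatten one results JSON file (format above) into one leaderboard row."""
    with open(path) as f:
        data = json.load(f)
    row = {"model": data["config"]["model_name"]}
    # One column per (task, metric) pair, e.g. "task_name.metric_name".
    for task, metrics in data["results"].items():
        for metric, score in metrics.items():
            row[f"{task}.{metric}"] = score
    return row
```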
SECURITY.md
DELETED
@@ -1,4 +0,0 @@
-To report a vulnerability or security concern, email
-security@formulaone-ai.org.
-Please include steps to reproduce and any relevant logs. We will
-acknowledge receipt within 3 business days.
TRADEMARKS.md
DELETED
@@ -1,5 +0,0 @@
-"FormulaOne" is used solely to identify this research benchmark.
-You may use the name to refer to the benchmark, but you may not use it:
-- to imply sponsorship or endorsement of your project or results, or
-- as part of your own product or service name without written
-permission.
app.py
CHANGED
@@ -1,83 +1,68 @@
-import math
+from functools import partial
 
 import gradio as gr
+from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
-import plotly.graph_objects as go
 from apscheduler.schedulers.background import BackgroundScheduler
-
-from huggingface_hub import whoami
-
-from src.about import WHAT_IS_F1_HTML_AFTER_VIDEO  # text immediately after the video
-from src.about import WHAT_IS_F1_HTML_BOTTOM_A_AFTER_TABS  # text after the heading, before the first figure
-from src.about import WHAT_IS_F1_HTML_BOTTOM_A_BEFORE_TABS  # up to (and including) the "Infinite Well" heading
-from src.about import WHAT_IS_F1_HTML_EVAL_BEFORE_WARMUPFIG  # evaluation section up to before Warmup fig
-from src.about import (  # tail after Tier1 fig; ⬅️ split to insert the tabs right after the heading
+
+# from huggingface_hub import snapshot_download
+
+from src.about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
     EVALUATION_QUEUE_TEXT,
+    INTRODUCTION_TEXT,
+    LLM_BENCHMARKS_TEXT,
+    TITLE,
 )
 from src.datamodel.data import F1Data
 from src.display.css_html_js import custom_css
-from src.display.formatting import styled_error
-from src.display.utils import AutoEvalColumn, fields
-from src.envs import API, CODE_PROBLEMS_REPO, REPO_ID, RESULTS_REPO, SUBMISSIONS_REPO
-from src.logger import get_logger
-from src.populate import get_leaderboard_df
-from src.submission.submit import add_new_solutions, fetch_user_info
-from src.validation.validate import MAX_INPUT_LENGTH, MIN_INPUT_LENGTH, is_submission_file_valid, is_valid
+from src.display.utils import (
+    # BENCHMARK_COLS,
+    COLS,
+    EVAL_COLS,
+    EVAL_TYPES,
+    AutoEvalColumn,
+    ModelType,
+    fields,
+    WeightType,
+    Precision,
+)
+from src.envs import API, REPO_ID, TOKEN, CODE_PROBLEMS_REPO, SUBMISSIONS_REPO, RESULTS_REPO
+from src.logger import get_logger
+from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.submission.submit import add_new_solutions
 
 logger = get_logger(__name__)
 
-SPLIT = "hard"  # warmup for debug
+SPLIT = "warmup"  # TODO temp
+SKIP_VALIDATION = True  # TODO temp
 
 
 def restart_space():
-    logger.info("Restarting space")
     API.restart_space(repo_id=REPO_ID)
 
 
-def refresh_leaderboard_data():
-    global leaderboard_df
-    try:
-        logger.info("Loading leaderboard data...")
-        new_leaderboard_df = get_leaderboard_df(RESULTS_REPO)
-
-        if new_leaderboard_df is not None:
-            logger.info("Leaderboard data refreshed successfully")
-            leaderboard_df = new_leaderboard_df
-        else:
-            logger.warning("No new leaderboard data found")
-            return None
-    except Exception as e:
-        logger.error(f"Error refreshing leaderboard data: {e}")
-        return None
-
-
-def init_leaderboard(dataframe: pd.DataFrame):
-    if dataframe is None:
-        raise ValueError("Leaderboard DataFrame is None.")
-
-    lb = Leaderboard(
+lbdb = F1Data(cp_ds_name=CODE_PROBLEMS_REPO, sub_ds_name=SUBMISSIONS_REPO, res_ds_name=RESULTS_REPO, split=SPLIT)
+leaderboard_df = get_leaderboard_df(RESULTS_REPO)
+
+logger.info("Initialized LBDB")
+
+# (
+#     finished_eval_queue_df,
+#     running_eval_queue_df,
+#     pending_eval_queue_df,
+# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+
+
+def init_leaderboard(dataframe):
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+    return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in fields(AutoEvalColumn)],
         select_columns=SelectColumns(
@@ -85,510 +70,166 @@ def init_leaderboard(dataframe: pd.DataFrame):
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
             label="Select Columns to Display:",
         ),
-        search_columns=[AutoEvalColumn.system.name, AutoEvalColumn.
+        search_columns=[AutoEvalColumn.system.name, AutoEvalColumn.system_type.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+        filter_columns=[
+            ColumnFilter(AutoEvalColumn.system_type.name, type="checkboxgroup", label="Model types"),
+            # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+            # ColumnFilter(
+            #     AutoEvalColumn.params.name,
+            #     type="slider",
+            #     min=0.01,
+            #     max=150,
+            #     label="Select the number of parameters (B)",
+            # ),
+            # ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True),
+        ],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
     )
-    lb.col_count = (1, "fixed")
-    return lb
-
-
-def add_solution_cbk(
-    system_name: str,
-    org: str,
-    submission_path: str,
-    profile: gr.OAuthProfile | None,
-    oauth_token: gr.OAuthToken | None,
-):
-    logger.info("Fetching user details for submission")
-    logger.info("PROFILE %s", profile)
-    logger.info("TOKEN %s", oauth_token)
-
-    if profile is None or oauth_token is None:
-        return styled_error("Please sign in with Hugging Face before submitting.")
-
-    # Display handle and display name (may change over time)
-    logger.info(f"User handle: {profile.username}")
-    display_name = profile.name or profile.username
-    logger.info(f"Display name: {display_name}")
-
-    # Stable account id
-    user_info = fetch_user_info(oauth_token)
-    logger.info("Logged in user info: %s", user_info)
-    stable_id = user_info.get("id") if user_info else None
-    logger.info(f"User stable ID: {stable_id}")
-
-    if not stable_id:
-        return styled_error("Could not retrieve your stable user ID. Please try signing in again.")
-    user_id = stable_id
-
-    if not profile.username:
-        return styled_error("Could not retrieve username. Please try signing in again.")
-
-    try:
-        # Validating the submission file.
-        if not submission_path:
-            return styled_error("Please upload JSONL submission file.")
-
-        if not is_submission_file_valid(
-            submission_path,
-            is_warmup_dataset=(SPLIT == "warmup"),
-        ):
-            return styled_error("Failed to read JSONL submission file. Please try again later.")
-
-        # Validating all user-supplied arguments.
-        for val, val_name in [
-            (system_name, "System name"),
-            (org, "Organisation name"),
-        ]:
-            if len(val) == 0:
-                return styled_error(f"Please fill in the '{val_name}' field.")
-
-            if not is_valid(val):
-                return styled_error(
-                    f"{val_name} is invalid! Must only contain characters [a-zA-Z0-9], spaces, "
-                    + "or the special characters '-' and '.', and be of length between "
-                    + f"{MIN_INPUT_LENGTH} and {MAX_INPUT_LENGTH}."
-                )
-    except Exception:
-        logger.warning("Failed to process user submission", exc_info=True)
-        return styled_error("An error occurred. Please try again later.")  # Intentionally vague.
-
-    return add_new_solutions(
-        lbdb,
-        profile.username,
-        user_id,
-        system_name,
-        org,
-        submission_path,
-        is_warmup_dataset=(SPLIT == "warmup"),
-        ensure_all_present=ENSURE_ALL_PRESENT,
-    )
-
-
-    @brief Toggles the visibility of the login box and submission panel based on the user's login status.
-    """
-    logger.info("GATE TOKEN %s", oauth_token)
-    if oauth_token is None:
-        logger.info("GATE: NO TOKEN")
-        return gr.update(visible=True), gr.update(visible=False)
-    try:
-        whoami(oauth_token.token)
-        logger.info("GATE: TOKEN IS VALID")
-        return gr.update(visible=False), gr.update(visible=True)
-    except Exception:
-        logger.info("GATE: TOKEN HAS EXPIRED")
-        return gr.update(visible=True), gr.update(visible=False)
-
-
-def get_theme():
-    # return gr.themes.Soft(
-    #     primary_hue=gr.themes.colors.blue,
-    #     secondary_hue=gr.themes.colors.sky,
-    #     neutral_hue=gr.themes.colors.gray,
-    # ).set(
-    #     body_background_fill="#FFFFFF",
-    #     panel_background_fill="#f3f4f6",
-    # )
-    return "light"
-
-
-# --- Gradio-based tabs for examples (no JS in HTML) ---
-def _select_example_tab(choice: str):
-    return (
-        gr.update(visible=(choice == "Shallow")),
-        gr.update(visible=(choice == "Deeper")),
-        gr.update(visible=(choice == "Deepest")),
-    )
-
-
-MODEL_RELEASES = {
-    "o3 Pro": "2025-06-10",
-}
-
-TIER_TOTALS = {"Shallow Tier": 100, "Deeper Tier": 100, "Deepest Tier": 20}
-MODELS_ORDER = ["GPT-5", "Gemini 2.5 Pro", "Grok 4", "Claude Opus 4", "o3 Pro"]
-
-ACCURACY_PCT = {
-    "Shallow Tier": {
-        "GPT-5": 49,
-        "Gemini 2.5 Pro": 30,
-        "Grok 4": 28,
-        "Claude Opus 4": 30,
-        "o3 Pro": 41,
-    },
-    "Deeper Tier": {
-        "GPT-5": 4,
-        "Gemini 2.5 Pro": 0,
-        "Grok 4": 0,
-        "Claude Opus 4": 0,
-        "o3 Pro": 1,
-    },
-    "Deepest Tier": {
-        "GPT-5": 0,
-        "Gemini 2.5 Pro": 0,
-        "Grok 4": 0,
-        "Claude Opus 4": 0,
-        "o3 Pro": 0,
-    },
-}
-
-
-def build_accuracy_figure(tier: str):
-    """Interactive scatter: x = release date (ISO str), y = accuracy (%). Hover shows solved/total."""
-    total = TIER_TOTALS[tier]
-    fig = go.Figure()
-
-    for model in MODELS_ORDER:
-        date_str = MODEL_RELEASES[model]  # e.g., "2025-08-07"
-        y = ACCURACY_PCT[tier][model]  # percent
-        solved = round(y * total / 100)
-
-        fig.add_trace(
-            go.Scatter(
-                x=[date_str],
-                y=[y],
-                mode="markers",
-                opacity=0.85,
-                name=model,  # distinct legend entry & color per model
-                marker=dict(size=8, opacity=0.85, line=dict(width=0.5)),
-                cliponaxis=False,  # let markers render over axes
-                hovertemplate=(
-                    f"<b>{model}</b><br>"
-                    "Release: %{x|%b %d, %Y}<br>"
-                    "Accuracy: %{y:.1f}%<br>"
-                    f"Solved: {solved}/{total}"
-                    "<extra></extra>"
-                ),
-            )
-        )
-
-    fig.update_layout(
-        template="plotly_white",
-        height=420,
-        margin=dict(l=30, r=120, t=10, b=40),  # extra right room for legend
-        xaxis=dict(
-            title="Model Release Date",
-            type="date",
-            tickformat="%b %Y",
-            showgrid=True,
-            title_standoff=10,  # small gap so the label doesn't crowd the ticks
-        ),
-        yaxis=dict(
-            title="Accuracy (%)",
-            range=[0, 100],  # fixed 0-100
-            tick0=0,
-            dtick=10,
-            showgrid=True,
-            layer="below traces",  # draw axis below points so dots aren't "cut"
-        ),
-        legend=dict(title="Models", orientation="v", y=1, x=1.02, yanchor="top"),
-        hovermode="closest",
-    )
-    return fig
-
-
-_initial_accuracy_fig = build_accuracy_figure("Deeper Tier")
-
-# Force light theme even if HF user prefers dark
-blocks = gr.Blocks(
-    css=custom_css,
-    theme=get_theme(),
-    js="""
-    () => {
-      // Force light theme (your original)
-      document.body.classList.remove('dark');
-      document.documentElement.setAttribute('data-theme','light');
-      document.documentElement.setAttribute('data-color-mode','light');
-
-      // Handle <a data-tab-target="..."> to switch Gradio tabs by panel id
-      document.addEventListener('click', (e) => {
-        const a = e.target.closest('a[data-tab-target]');
-        if (!a) return;
-        e.preventDefault();
-        const id = a.getAttribute('data-tab-target'); // e.g., "what-is"
-        const panel = document.getElementById(id);
-        if (!panel) return;
-
-        // Find the tab header button that controls this panel and click it
-        const btn = document.querySelector(`[role="tab"][aria-controls="${panel.id}"]`);
-        if (btn) btn.click();
-      }, true);
-    }
-    """,
-)
-with blocks:
-
-    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("FormulaOne", id=0, elem_id="landing-accuracy-tab"):
-
-            gr.HTML(
-                '<div align="center"><header class="text-center mb-12"><h1 class="text-4xl md:text-5xl font-bold text-gray-900 f1-h1" style="margin:0; display:inline;">FormulaOne</h1><span style="display:inline-block; margin-left:0.5em;"><h3 style="margin:0; display:inline;" class="text-4xl md:text-5xl font-bold text-gray-900 f1-h3 style=">by <a href="https://doubleai.com/">AAI</a></h3></header></div>'
-            )
-            with gr.Row(elem_id="landing-hero-row"):
-                with gr.Column(scale=7, elem_id="landing-hero-left"):
-                    gr.Markdown(
-                        """
-                        <div class="f1-container">
-                          <p class="f1-hero">
-                            A benchmark of novel, expert-level algorithmic problems over graphs that demand deep dynamic
-                            programming and logical reasoning. <strong>Shallow</strong> and <strong>Deeper</strong> tiers span moderate through
-                            challenging problems, while <strong>Deepest</strong> is research-level.
-                          </p>
-                        </div>
-                        """,
-                        elem_classes="markdown-text",
-                    )
-                with gr.Column(scale=3, elem_id="landing-hero-right"):
-                    learn_more_btn = gr.Button(
-                        "Learn More about FormulaOne",
-                        elem_id="learn-more-pill",
-                        variant="secondary",
-                    )
-
-            # Make the pill switch to the "What is FormulaOne" tab
-            learn_more_btn.click(
-                lambda: gr.Tabs(selected="what-is"),  # switches tabs
-                inputs=None,
-                outputs=tabs,  # 'tabs' is your Tabs handle
-            )
-            # Pill-style selector aligned to the top-right
-            with gr.Row(elem_id="f1-tier-select-row"):
-                tier_selector = gr.Radio(
-                    choices=list(reversed(list(TIER_TOTALS.keys()))),
-                    value="Deeper Tier",
-                    label=None,
-                    show_label=False,
-                    elem_id="f1-tier-select",
-                )
-
-                </div>
-                """,
-                elem_classes="markdown-text",
-            )
-
-            # Existing "What is FormulaOne" tab
-            with gr.TabItem("What is FormulaOne", id="what-is", elem_id="what-is-tab"):
-                gr.Image(
-                    "assets/banner.png",
-                    show_label=False,
-                    elem_classes=["f1-image"],
-                    show_share_button=False,
-                    show_download_button=False,
-                    show_fullscreen_button=False,
-                    width=550,
-                )
-
-                # Top content and categories table
-                gr.HTML(WHAT_IS_F1_HTML_TOP)
-
-                # ---- Bottom content pieces interleaved with real Gradio media ----
-                # Up to and including the "An Infinite Well" heading
-                gr.HTML(WHAT_IS_F1_HTML_BOTTOM_A_BEFORE_TABS)
-
-                # ===== Examples (now right after the "Infinite Well" heading; inner width 710px via CSS) =====
-                with gr.Group(elem_id="f1-examples", elem_classes=["f1-container"]):
-                    gr.HTML(
-                        '<div class="f1-tabs-body"><div class="f1-examples-chip">Examples of FormulaOne problems</div></div>'
-                    )
-
-                    _latex = [
-                        {"left": "$", "right": "$", "display": False},
-                        {"left": "\\(", "right": "\\)", "display": False},
-                        {"left": "\\[", "right": "\\]", "display": True},
-                    ]
-
-                    md_warmup = gr.Markdown(
-                        value=(
-                            '<p style="text-align: center;"><code>Union-of-Paths-and-Cycles</code></p>\n'
-                            "Given a tree-like graph $G=(V,E)$ and a weight function $w:V\\to\\mathbb{N}$, compute the sum of all weights of sets $S\\subseteq V$ such that the induced subgraph $G[S]$ is a <b>disjoint union of paths and cycles</b>."
-                        ),
-                        latex_delimiters=_latex,
-                        elem_classes=["f1-problem-markdown"],
-                    )
-                    md_tier1 = gr.Markdown(
-                        value=(
-                            '<p style="text-align: center;"><code>Maximal-Union-of-Paths-and-Cycles</code></p>\n'
-                            "Given a tree-like graph $G=(V,E)$ and a weight function $w:V\\to\\mathbb{N}$, compute the sum of all weights of sets $S\\subseteq V$ such that $G[S]$ is a <b>disjoint union of paths and cycles</b> and $S$ is <b>maximal</b> with respect to this property."
-                        ),
-                        visible=False,
-                        latex_delimiters=_latex,
-                        elem_classes=["f1-problem-markdown"],
-                    )
-                    md_tier2 = gr.Markdown(
-                        value=(
-                            '<p style="text-align: center;"><code>Maximal-Union-of-Cycles</code></p>\n'
-                            "Given a tree-like graph $G=(V,E)$ and a weight function $w:V\\to\\mathbb{N}$, compute the sum of all weights of sets $S\\subseteq V$ such that $G[S]$ is a <b>disjoint union of cycles</b> and $S$ is <b>maximal</b> with respect to this property."
-                        ),
-                        visible=False,
-                        latex_delimiters=_latex,
-                        elem_classes=["f1-problem-markdown"],
-                    )
-
-                    tab_radio = gr.Radio(
-                        label=None,
-                        show_label=False,
-                        elem_id="f1-example-radio",
-                    )
-                    tab_radio.change(_select_example_tab, inputs=tab_radio, outputs=[md_warmup, md_tier1, md_tier2])
-
-                # Continue the text after the heading (before the first figure)
-                gr.HTML(WHAT_IS_F1_HTML_BOTTOM_A_AFTER_TABS)
-
-                # Video (no autoplay/loop), smaller gap to caption via CSS
-                gr.Video(
-                    "assets/DominatingSetAnimation.mp4",
-                    autoplay=False,
-                    loop=False,
-                    show_label=False,
-                    interactive=False,
-                    elem_classes=["f1-video"],
-                    show_share_button=False,
-                    show_download_button=False,
-                )
-                gr.HTML(
-                    '<div class="f1-figcaption f1-figcaption-video">Brief explanation showcasing the design of a compressed dynamic programming state-space.</div>'
-                )
-
-                gr.HTML(WHAT_IS_F1_HTML_EVAL_BEFORE_WARMUPFIG, padding=False)
-                gr.Image(
-                    "assets/perf_plot.png",
-                    width=600,
-                    show_label=False,
-                    elem_classes=["f1-image"],
-                    show_share_button=False,
-                    show_download_button=False,
-                    show_fullscreen_button=False,
-                )
-                gr.HTML('<div class="f1-figcaption">Performance of frontier models on the FormulaOne dataset.</div>')
-
-                # Tail after Deeper Tier fig
-                gr.HTML(WHAT_IS_F1_HTML_AFTER_TIER1FIG_TAIL)
-
-            # Rename tab to "Leaderboard" and cap at 800px width
-            with gr.TabItem("Leaderboard", elem_id="formulaone-leaderboard-tab-table", id=2):
-                gr.Markdown(
-                    """
-                    Welcome to the FormulaOne leaderboard. This table tracks performance on the core FormulaOne benchmark, covering the **deeper** and **deepest** tiers (120 problems).
-                    Use the 'Select Columns to Display' dropdown to customize your view, and the search bar to find specific models or organizations.
-                    """,
-                    elem_classes="markdown-text",
-                )
-                refresh_leaderboard_data()
-                assert leaderboard_df is not None
-                leaderboard_component = init_leaderboard(leaderboard_df)
-
-            with gr.TabItem("Submit
+
+# Display image using Markdown
+# banner = ""
+
+demo = gr.Blocks(css=custom_css)
+with demo:
+    gr.Image(
+        "assets/banner.png",
+        interactive=False,
+        show_label=False,
+        show_download_button=False,
+        container=False,
+    )
+
+    # gr.Markdown(banner)
+    gr.HTML(
+        """
+        <style>
+        body {
+            background-color: #121212;
+            color: white;
+            margin: 0; /* Reset browser default */
+        }
+
+        /* Outer container margin & spacing */
+        .gradio-container {
+            max-width: 1100px;
+            margin: 2rem auto; /* top/bottom spacing + horizontal centering */
+            padding: 2rem; /* inner spacing */
+            background-color: rgba(0, 0, 0, 0.6); /* optional: semi-transparent panel */
+            border-radius: 12px; /* rounded corners */
+        }
+        </style>
+        """
+    )
+
+    gr.HTML(TITLE)
+    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 FormulaOne Leaderboard", elem_id="formulaone-leaderboar-tab-table", id=0):
+            leaderboard = init_leaderboard(leaderboard_df)
+
+        # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=1):
+        #     logger.info("Tab about")
+        #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=2):
             logger.info("Tab submission")
             with gr.Column():
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
+                # with gr.Column():
+                #     with gr.Accordion(
+                #         f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                #         open=False,
+                #     ):
+                #         with gr.Row():
+                #             finished_eval_table = gr.components.Dataframe(
+                #                 value=finished_eval_queue_df,
+                #                 headers=EVAL_COLS,
+                #                 datatype=EVAL_TYPES,
+                #                 row_count=5,
+                #             )
+                #     with gr.Accordion(
+                #         f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                #         open=False,
+                #     ):
+                #         with gr.Row():
+                #             running_eval_table = gr.components.Dataframe(
+                #                 value=running_eval_queue_df,
+                #                 headers=EVAL_COLS,
+                #                 datatype=EVAL_TYPES,
+                #                 row_count=5,
+                #             )
+
+                #     with gr.Accordion(
+                #         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                #         open=False,
+                #     ):
+                #         with gr.Row():
+                #             pending_eval_table = gr.components.Dataframe(
+                #                 value=pending_eval_queue_df,
+                #                 headers=EVAL_COLS,
+                #                 datatype=EVAL_TYPES,
+                #                 row_count=5,
+                #             )
                 with gr.Row():
-                    gr.Markdown("# ✉️✨ Submit your solutions", elem_classes="markdown-text")
+                    gr.Markdown("# ✉️✨ Submit your solutions here!", elem_classes="markdown-text")
 
-                gr.Markdown(SUBMISSION_TERMS_TEXT, elem_classes="markdown-text")
-
-                login_box = gr.Group(visible=True, elem_id="f1-login-box")
-                with login_box:
-                    gr.Markdown("Please sign in with Hugging Face to submit")
-                    gr.LoginButton(elem_id="hf-login-btn")
-
-                    agreement_checkbox = gr.Checkbox(
-                        value=False,
-                        elem_classes="markdown-text",
-                    )
-
-                    privacy_checkbox = gr.Checkbox(
-                        label="I have read the Privacy Notice.", value=False, elem_classes="markdown-text"
-                    )
-
-                    security_checkbox = gr.Checkbox(
-                        label="I confirm this submission does not attempt to access private tests or exfiltrate data.",
-                        value=False,
-                        elem_classes="markdown-text",
-                    )
-
-                    privacy_link = "https://huggingface.co/spaces/double-ai/FormulaOne-Leaderboard/blob/main/docs/privacy-policy.md"
-                    submission_agreement_link = "https://huggingface.co/spaces/double-ai/FormulaOne-Leaderboard/blob/main/terms/submission-agreement.md"
-
-                    gr.Markdown(
-                        f'<a href="{privacy_link}" target="_blank" rel="noopener noreferrer">Privacy Notice</a>; '
-                        f'<a href="{submission_agreement_link}" target="_blank" rel="noopener noreferrer">Submission Agreement</a>',
-                        elem_classes="markdown-text",
-                    )
-
-                    logger.info("Submit button")
-                    submit_button = gr.Button("Submit", variant="primary", interactive=False)
-                    submission_result = gr.Markdown()
-
-                    # Update submit button interactivity based on checkboxes
-                    def update_submit_button(agreement, privacy, security):
-                        return gr.update(interactive=agreement and privacy and security)
-
-                    for checkbox in [agreement_checkbox, privacy_checkbox, security_checkbox]:
-                        checkbox.change(
-                            update_submit_button,
-                            inputs=[agreement_checkbox, privacy_checkbox, security_checkbox],
-                            outputs=submit_button,
-                        )
+                with gr.Row():
+                    with gr.Column():
+                        system_name_textbox = gr.Textbox(label=AutoEvalColumn.system.name)
+                        org_textbox = gr.Textbox(label=AutoEvalColumn.organization.name)
+                        # revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                        sys_type_dropdown = gr.Dropdown(
+                            choices=[t.to_str(" ") for t in ModelType],
+                            label=AutoEvalColumn.system_type.name,
+                            multiselect=False,
+                            value=ModelType.LLM.to_str(" "),
+                            interactive=True,
+                        )
+
+                    # with gr.Column():
+                    submission_file = gr.File(label="JSONL solutions file", file_types=[".jsonl"])
+                    # precision = gr.Dropdown(
+                    #     choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                    #     label="Precision",
+                    #     multiselect=False,
+                    #     value="float16",
+                    #     interactive=True,
+                    # )
+                    # weight_type = gr.Dropdown(
+                    #     choices=[i.value.name for i in WeightType],
+                    #     label="Weights type",
+                    #     multiselect=False,
+                    #     value="Original",
+                    #     interactive=True,
+                    # )
+                    # base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
+            logger.info("Submit button")
+            submit_button = gr.Button("Submit")
+            submission_result = gr.Markdown()
+
+            def add_solution_cbk(system_name, org, sys_type, submission_path):
+                return add_new_solutions(
+                    lbdb, system_name, org, sys_type, submission_path, skip_validation=SKIP_VALIDATION
+                )
+
+            submit_button.click(
+                add_solution_cbk,
+                [
+                    system_name_textbox,
+                    org_textbox,
+                    sys_type_dropdown,
+                    submission_file,
+                ],
+                submission_result,
+            )
+
     with gr.Row():
         logger.info("Citation")
         with gr.Accordion(CITATION_BUTTON_LABEL, open=False):
@@ -596,17 +237,18 @@ with blocks:
             value=CITATION_BUTTON_TEXT.strip(),
             elem_id="citation-block",
         )
+        # citation_button = gr.Textbox(
+        #     value=CITATION_BUTTON_TEXT,
+        #     # label=CITATION_BUTTON_LABEL,
+        #     lines=20,
+        #     elem_id="citation-button",
+        #     show_copy_button=True,
+        # )
 
 logger.info("Scheduler")
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
-scheduler.add_job(refresh_leaderboard_data, "interval", seconds=120)
 scheduler.start()
 logger.info("Launch")
+demo.queue(default_concurrency_limit=40).launch()
 logger.info("Done")
assets/DominatingSetAnimation.mp4
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1b4f64a0f3d3e4a662214dfda8b1cf3a13dffbc7a2698a70e6bef15b0bddd3f6
-size 10836895
assets/banner.png
CHANGED
assets/{bag_modifications.png → banner_backup.png}
RENAMED
File without changes
assets/perf_plot.png
DELETED
docs/privacy-policy.md
DELETED
@@ -1,56 +0,0 @@
-FormulaOne AI Coding Challenge - Privacy Notice v1.2
-Last updated: 6 Aug 2025
-
-1. Controller
-FormulaOne Team, <legal entity and address>
-Contact: privacy@formulaone-ai.org
-
-2. Data We Collect
-- GitHub or Hugging Face username and public profile ID
-- Email address (if supplied via OAuth)
-- Uploaded Submission files, compile/runtime logs, numeric scores
-- IP address and browser telemetry for security and rate limiting
-
-3. Purposes and GDPR Legal Bases
-
-| Purpose | Legal basis (GDPR) |
-|----------------------------------------------|-------------------------------|
-| Evaluate and rank submissions | Contract (6(1)(b)) |
-| Site security and abuse prevention | Legitimate interests (6(1)(f))|
-| Academic, non-commercial research on models | Legitimate interests (6(1)(f))|
-| Contacting participants about results/collab | Legitimate interests (6(1)(f))|
-
-4. Public Disclosure
-Your display name, score and rank appear on a public leaderboard.
-You may use a pseudonym.
-5. Data Sharing
-Processors: Hugging Face Inc., GitHub Inc., <cloud host> - under
-written contracts.
-Independent controllers: GitHub and Hugging Face regarding your
-platform accounts.
-6. International Transfers
-Where data moves outside the EEA/UK, we rely on EU Standard
-Contractual Clauses or adequacy decisions.
-7. Retention
-- Leaderboard entries: indefinitely.
-- Raw submission files and logs: up to 5 years, then deleted or
-anonymised.
-- Security logs: 12 months.
-8. Your Rights
-Access, correction, erasure, restriction, portability, and
-objection. Contact us at the address above.
-You may lodge a complaint with your supervisory authority.
-9. Security
-Submissions run in network-restricted containers. Access is limited
-to authorised staff using multi-factor authentication.
-10. Cookies
-If you use only GitHub and Hugging Face, their cookies apply under
-their policies. If you later self-host a site for the leaderboard,
-publish your own cookie notice.
-11. Children
-The Competition is not directed to children under 16. Do not submit
-personal data if you are under 16.
-12. Changes
-We will post updates here and notify registered participants by
-email when material.
dummy_submission.jsonl
DELETED
@@ -1,2 +0,0 @@
-
{"problem_id": "142", "solution": "from dataclasses import dataclass\nfrom typing import TypeAlias\n\nfrom evaluation.graph_structure import Graph\n\nMOD = 1_000_000_007\n\n\n@dataclass(frozen=True)\nclass DPState:\n \"\"\"\n Dynamic Programming state for the Dominating Set problem on tree decompositions.\n This dataclass encapsulates the complete state information needed at each bag\n during the tree decomposition-based dynamic programming algorithm.\n\n Fields:\n ( 1 ) assign_mask : int Bitmask indicating which vertices in the current bag\n are assigned to the dominating set (1 = IN, 0 = OUT).\n Bit i corresponds to the i-th vertex in the sorted bag.\n\n ( 2 ) need_mask : int Bitmask indicating which OUT vertices in the current bag\n still require domination (1 = needs domination, 0 = already dominated).\n Only meaningful for OUT vertices (assign_mask bit = 0).\n IN vertices never have the need bit set since they dominate themselves.\n\n State Invariants:\n - For any bit position i: if (assign_mask >> i) & 1 == 1, then (need_mask >> i) & 1 == 0\n - The need_mask only tracks vertices that are OUT and not yet dominated by adjacent IN vertices\n - When a vertex is forgotten, it must not have the need bit set (invalid state otherwise)\n\n Usage in Algorithm:\n - LEAF: Initialize with vertex either IN (assign=1, need=0) or OUT (assign=0, need=1)\n - INTRODUCE: Insert new bit at appropriate position, update domination status\n - FORGET: Remove bit at appropriate position, reject states with undominated OUT vertices\n - JOIN: Merge compatible states (same assignment), combine need masks with AND operation\n\n All fields are immutable and hashable, making this object suitable as a dictionary key.\n \"\"\"\n\n assign_mask: int\n need_mask: int\n\n\n@dataclass\nclass DPValue:\n \"\"\"\n Dynamic Programming value representing the computational result for a given state.\n This dataclass replaces the previous Tuple[int, int] representation with a more\n structured and self-documenting approach for weighted model counting.\n\n Fields:\n ( 1 ) count : int Number of distinct vertex subsets (dominating sets) that\n achieve the current state configuration. This counts the\n multiplicity of solutions that lead to the same DP state.\n\n ( 2 ) weight : int Total weighted sum across all vertex subsets that achieve\n the current state configuration. 
Each dominating set contributes\n its total vertex weight sum to this field.\n\n Implementation Details:\n - Both fields are maintained modulo MOD (1,000,000,007)\n - The count field enables tracking the number of valid dominating sets\n - The weight field accumulates the total weight contribution from all valid sets\n - When combining values from different DP branches:\n * Counts are multiplied for independent choices\n * Weights are combined using inclusion-exclusion principle to avoid double-counting\n\n This structure enables simultaneous tracking of both solution count and cumulative\n weight during the tree decomposition DP computation.\n \"\"\"\n\n count: int\n weight: int\n\n\nBag: TypeAlias = set[int] # a bag is a set of vertices\n\n\ndef insert_bit(\n mask: int,\n pos: int,\n bit: int,\n) -> int:\n \"\"\"\n Insert `bit` (0/1) at position `pos` (LSB == position 0) in `mask`\n shifting higher bits left by one.\n \"\"\"\n lower = mask & ((1 << pos) - 1)\n higher = mask >> pos\n return lower | (bit << pos) | (higher << (pos + 1))\n\n\ndef remove_bit(\n mask: int,\n pos: int,\n) -> int:\n \"\"\"\n Delete the bit at position `pos` from `mask`, shifting higher bits right.\n \"\"\"\n lower = mask & ((1 << pos) - 1)\n higher = mask >> (pos + 1)\n return lower | (higher << pos)\n\n\ndef bag_tuple(bag: Bag) -> tuple[int, ...]:\n return tuple(sorted(bag))\n\n\ndef bag_selected_weight(\n assign_mask: int,\n bag_vertices: tuple[int, ...],\n vertex_weights: dict[int, int],\n) -> int:\n \"\"\"Sum of weights of vertices in the bag that are selected (IN).\"\"\"\n s = 0\n for idx, v in enumerate(bag_vertices):\n if (assign_mask >> idx) & 1:\n s += vertex_weights[v]\n return s % MOD\n\n\ndef accumulate(\n table: dict[DPState, DPValue],\n state: DPState,\n cnt: int,\n wsum: int,\n) -> None:\n \"\"\"Add (cnt, wsum) to existing entry of state inside table (MOD arithmetic).\"\"\"\n cnt %= MOD\n wsum %= MOD\n if state in table:\n existing = table[state]\n table[state] = DPValue(count=(existing.count + cnt) % MOD, weight=(existing.weight + wsum) % MOD)\n else:\n table[state] = DPValue(count=cnt, weight=wsum)\n\n\ndef leaf_callback(\n graph: Graph,\n cur_table: dict[DPState, DPValue],\n cur_bag_info: tuple[int, Bag],\n leaf_vertex: int,\n):\n bag_vertices = bag_tuple(cur_bag_info[1]) # (leaf_vertex,)\n assert len(bag_vertices) == 1 and bag_vertices[0] == leaf_vertex\n\n w_v = graph.vertex_weights[leaf_vertex] if graph.vertex_weights else 1\n\n # Case 1: vertex is IN the dominating set\n assign_mask = 1 # bit 0 == 1\n need_mask = 0 # IN vertices never need domination\n state_in = DPState(assign_mask=assign_mask, need_mask=need_mask)\n accumulate(cur_table, state_in, cnt=1, wsum=w_v % MOD)\n\n # Case 2: vertex is OUT - needs domination\n assign_mask = 0\n need_mask = 1 # NEEDS_DOMINATION\n state_out = DPState(assign_mask=assign_mask, need_mask=need_mask)\n accumulate(cur_table, state_out, cnt=1, wsum=0)\n\n\ndef introduce_callback(\n graph: Graph,\n cur_table: dict[DPState, DPValue],\n cur_bag_info: tuple[int, Bag],\n child_table: dict[DPState, DPValue],\n child_bag_info: tuple[int, Bag],\n introduced_vertex: int,\n):\n parent_vertices = bag_tuple(cur_bag_info[1])\n\n # index at which the new vertex was inserted\n idx_new = parent_vertices.index(introduced_vertex)\n w_new = graph.vertex_weights[introduced_vertex] if graph.vertex_weights else 1\n\n # pre-compute adjacency between introduced vertex and vertices in parent bag\n is_adj = [(v in graph.neighbors(introduced_vertex)) for v in parent_vertices]\n\n 
for child_state, dp_value in child_table.items():\n child_assign, child_need = child_state.assign_mask, child_state.need_mask\n cnt_child, wsum_child = dp_value.count, dp_value.weight\n # \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n # Choice A: new vertex is IN_X\n # \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n assign_in = insert_bit(child_assign, idx_new, 1)\n need_in = insert_bit(child_need, idx_new, 0)\n\n # when y is IN it may dominate some previously undominated OUT vertices\n for idx, adj in enumerate(is_adj):\n if idx == idx_new or not adj:\n continue\n # vertex idx is OUT?\n if (assign_in >> idx) & 1:\n continue # IN vertices never carry NEED flag\n # if it was NEED, clear it\n if (need_in >> idx) & 1:\n need_in &= ~(1 << idx)\n\n cnt_new = cnt_child\n wsum_new = (wsum_child + cnt_child * w_new) % MOD\n state_in = DPState(assign_mask=assign_in, need_mask=need_in)\n accumulate(cur_table, state_in, cnt_new, wsum_new)\n\n # \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n # Choice B: new vertex is NOT_IN_X\n # \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n assign_out = insert_bit(child_assign, idx_new, 0)\n\n # Determine if introduced vertex is already dominated by some\n # IN vertex present in the (extended) bag.\n dominated = False\n for idx, adj in enumerate(is_adj):\n if idx == idx_new or not adj:\n continue\n if (assign_out >> idx) & 1: # neighbor is IN\n dominated = True\n break\n need_bit = 0 if dominated else 1\n need_out = insert_bit(child_need, idx_new, need_bit)\n\n # ( no other vertices change status )\n state_out = DPState(assign_mask=assign_out, need_mask=need_out)\n accumulate(cur_table, state_out, cnt_child, wsum_child)\n\n\ndef forget_callback(\n graph: Graph,\n cur_table: dict[DPState, DPValue],\n cur_bag_info: tuple[int, Bag],\n child_table: dict[DPState, DPValue],\n child_bag_info: tuple[int, Bag],\n forgotten_vertex: int,\n):\n child_vertices = bag_tuple(child_bag_info[1])\n idx_forgot = child_vertices.index(forgotten_vertex)\n\n for child_state, dp_value in child_table.items():\n assign_child, need_child = child_state.assign_mask, child_state.need_mask\n cnt_child, wsum_child = dp_value.count, dp_value.weight\n bit_assign = (assign_child >> idx_forgot) & 1\n bit_need = (need_child >> idx_forgot) & 1\n\n # If forgotten vertex is OUT and still needs domination -> invalid state\n if bit_assign == 0 and bit_need == 1:\n continue\n\n assign_par = remove_bit(assign_child, idx_forgot)\n need_par = remove_bit(need_child, idx_forgot)\n\n state_par = DPState(assign_mask=assign_par, need_mask=need_par)\n accumulate(cur_table, state_par, cnt_child, wsum_child)\n\n\ndef join_callback(\n graph: Graph,\n cur_table: dict[DPState, DPValue],\n cur_bag_info: tuple[int, Bag],\n left_child_table: dict[DPState, DPValue],\n left_child_bag_info: tuple[int, Bag],\n right_child_table: dict[DPState, DPValue],\n 
right_child_bag_info: tuple[int, Bag],\n):\n bag_vertices = bag_tuple(cur_bag_info[1])\n vertex_weights = graph.vertex_weights\n assert vertex_weights is not None\n\n # Group right states by assignment mask for O(|L| + |R|) compatibility\n right_by_assign: dict[int, list[tuple[int, int, int]]] = {}\n for right_state, dp_value in right_child_table.items():\n assign_r, need_r = right_state.assign_mask, right_state.need_mask\n cnt_r, wsum_r = dp_value.count, dp_value.weight\n right_by_assign.setdefault(assign_r, []).append((need_r, cnt_r, wsum_r))\n\n for left_state, dp_value in left_child_table.items():\n assign_l, need_l = left_state.assign_mask, left_state.need_mask\n cnt_l, wsum_l = dp_value.count, dp_value.weight\n if assign_l not in right_by_assign:\n continue\n for need_r, cnt_r, wsum_r in right_by_assign[assign_l]:\n # Merge NEED flags: dominated if dominated in either side\n need_merge = need_l & need_r # bitwise AND keeps 1 only if both have NEED\n\n cnt_merge = (cnt_l * cnt_r) % MOD\n\n w_bag_sel = bag_selected_weight(assign_l, bag_vertices, vertex_weights)\n w_merge = (wsum_l * cnt_r + wsum_r * cnt_l - cnt_merge * w_bag_sel) % MOD\n if w_merge < 0:\n w_merge += MOD\n\n state_merge = DPState(assign_mask=assign_l, need_mask=need_merge)\n accumulate(cur_table, state_merge, cnt_merge, w_merge)\n\n\ndef extract_solution(root_table: dict[DPState, DPValue]) -> int:\n \"\"\"\n Sum the total weights of all globally valid dominating sets.\n Return -1 if none exist.\n \"\"\"\n answer = 0\n found = False\n for state, dp_value in root_table.items():\n assign_mask, need_mask = state.assign_mask, state.need_mask\n cnt, wsum = dp_value.count, dp_value.weight\n # Bag may be empty or not\n if assign_mask == 0 and need_mask == 0 and cnt == 0:\n # Defensive - shouldn't happen\n continue\n if need_mask != 0:\n # some vertex in root bag still needs domination -> invalid\n continue\n answer = (answer + wsum) % MOD\n found = True\n return answer if found else -1\n"}
-
{"problem_id": "120", "solution": "from dataclasses import dataclass\nfrom typing import TypeAlias\n\nfrom evaluation.graph_structure import Graph\n\nMOD = 1_000_000_007\n\n\n@dataclass(frozen=True)\nclass DPState:\n \"\"\"\n Dynamic Programming state for the Dominating Set problem on tree decompositions.\n This dataclass encapsulates the complete state information needed at each bag\n during the tree decomposition-based dynamic programming algorithm.\n\n Fields:\n ( 1 ) assign_mask : int Bitmask indicating which vertices in the current bag\n are assigned to the dominating set (1 = IN, 0 = OUT).\n Bit i corresponds to the i-th vertex in the sorted bag.\n\n ( 2 ) need_mask : int Bitmask indicating which OUT vertices in the current bag\n still require domination (1 = needs domination, 0 = already dominated).\n Only meaningful for OUT vertices (assign_mask bit = 0).\n IN vertices never have the need bit set since they dominate themselves.\n\n State Invariants:\n - For any bit position i: if (assign_mask >> i) & 1 == 1, then (need_mask >> i) & 1 == 0\n - The need_mask only tracks vertices that are OUT and not yet dominated by adjacent IN vertices\n - When a vertex is forgotten, it must not have the need bit set (invalid state otherwise)\n\n Usage in Algorithm:\n - LEAF: Initialize with vertex either IN (assign=1, need=0) or OUT (assign=0, need=1)\n - INTRODUCE: Insert new bit at appropriate position, update domination status\n - FORGET: Remove bit at appropriate position, reject states with undominated OUT vertices\n - JOIN: Merge compatible states (same assignment), combine need masks with AND operation\n\n All fields are immutable and hashable, making this object suitable as a dictionary key.\n \"\"\"\n\n assign_mask: int\n need_mask: int\n\n\n@dataclass\nclass DPValue:\n \"\"\"\n Dynamic Programming value representing the computational result for a given state.\n This dataclass replaces the previous Tuple[int, int] representation with a more\n structured and self-documenting approach for weighted model counting.\n\n Fields:\n ( 1 ) count : int Number of distinct vertex subsets (dominating sets) that\n achieve the current state configuration. This counts the\n multiplicity of solutions that lead to the same DP state.\n\n ( 2 ) weight : int Total weighted sum across all vertex subsets that achieve\n the current state configuration. 
Each dominating set contributes\n its total vertex weight sum to this field.\n\n Implementation Details:\n - Both fields are maintained modulo MOD (1,000,000,007)\n - The count field enables tracking the number of valid dominating sets\n - The weight field accumulates the total weight contribution from all valid sets\n - When combining values from different DP branches:\n * Counts are multiplied for independent choices\n * Weights are combined using inclusion-exclusion principle to avoid double-counting\n\n This structure enables simultaneous tracking of both solution count and cumulative\n weight during the tree decomposition DP computation.\n \"\"\"\n\n count: int\n weight: int\n\n\nBag: TypeAlias = set[int] # a bag is a set of vertices\n\n\ndef insert_bit(\n mask: int,\n pos: int,\n bit: int,\n) -> int:\n \"\"\"\n Insert `bit` (0/1) at position `pos` (LSB == position 0) in `mask`\n shifting higher bits left by one.\n \"\"\"\n lower = mask & ((1 << pos) - 1)\n higher = mask >> pos\n return lower | (bit << pos) | (higher << (pos + 1))\n\n\ndef remove_bit(\n mask: int,\n pos: int,\n) -> int:\n \"\"\"\n Delete the bit at position `pos` from `mask`, shifting higher bits right.\n \"\"\"\n lower = mask & ((1 << pos) - 1)\n higher = mask >> (pos + 1)\n return lower | (higher << pos)\n\n\ndef bag_tuple(bag: Bag) -> tuple[int, ...]:\n return tuple(sorted(bag))\n\n\ndef bag_selected_weight(\n assign_mask: int,\n bag_vertices: tuple[int, ...],\n vertex_weights: dict[int, int],\n) -> int:\n \"\"\"Sum of weights of vertices in the bag that are selected (IN).\"\"\"\n s = 0\n for idx, v in enumerate(bag_vertices):\n if (assign_mask >> idx) & 1:\n s += vertex_weights[v]\n return s % MOD\n\n\ndef accumulate(\n table: dict[DPState, DPValue],\n state: DPState,\n cnt: int,\n wsum: int,\n) -> None:\n \"\"\"Add (cnt, wsum) to existing entry of state inside table (MOD arithmetic).\"\"\"\n cnt %= MOD\n wsum %= MOD\n if state in table:\n existing = table[state]\n table[state] = DPValue(count=(existing.count + cnt) % MOD, weight=(existing.weight + wsum) % MOD)\n else:\n table[state] = DPValue(count=cnt, weight=wsum)\n\n\ndef leaf_callback(\n graph: Graph,\n cur_table: dict[DPState, DPValue],\n cur_bag_info: tuple[int, Bag],\n leaf_vertex: int,\n):\n bag_vertices = bag_tuple(cur_bag_info[1]) # (leaf_vertex,)\n assert len(bag_vertices) == 1 and bag_vertices[0] == leaf_vertex\n\n w_v = graph.vertex_weights[leaf_vertex] if graph.vertex_weights else 1\n\n # Case 1: vertex is IN the dominating set\n assign_mask = 1 # bit 0 == 1\n need_mask = 0 # IN vertices never need domination\n state_in = DPState(assign_mask=assign_mask, need_mask=need_mask)\n accumulate(cur_table, state_in, cnt=1, wsum=w_v % MOD)\n\n # Case 2: vertex is OUT - needs domination\n assign_mask = 0\n need_mask = 1 # NEEDS_DOMINATION\n state_out = DPState(assign_mask=assign_mask, need_mask=need_mask)\n accumulate(cur_table, state_out, cnt=1, wsum=0)\n\n\ndef introduce_callback(\n graph: Graph,\n cur_table: dict[DPState, DPValue],\n cur_bag_info: tuple[int, Bag],\n child_table: dict[DPState, DPValue],\n child_bag_info: tuple[int, Bag],\n introduced_vertex: int,\n):\n parent_vertices = bag_tuple(cur_bag_info[1])\n\n # index at which the new vertex was inserted\n idx_new = parent_vertices.index(introduced_vertex)\n w_new = graph.vertex_weights[introduced_vertex] if graph.vertex_weights else 1\n\n # pre-compute adjacency between introduced vertex and vertices in parent bag\n is_adj = [(v in graph.neighbors(introduced_vertex)) for v in parent_vertices]\n\n 
for child_state, dp_value in child_table.items():\n child_assign, child_need = child_state.assign_mask, child_state.need_mask\n cnt_child, wsum_child = dp_value.count, dp_value.weight\n # \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n # Choice A: new vertex is IN_X\n # \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n assign_in = insert_bit(child_assign, idx_new, 1)\n need_in = insert_bit(child_need, idx_new, 0)\n\n # when y is IN it may dominate some previously undominated OUT vertices\n for idx, adj in enumerate(is_adj):\n if idx == idx_new or not adj:\n continue\n # vertex idx is OUT?\n if (assign_in >> idx) & 1:\n continue # IN vertices never carry NEED flag\n # if it was NEED, clear it\n if (need_in >> idx) & 1:\n need_in &= ~(1 << idx)\n\n cnt_new = cnt_child\n wsum_new = (wsum_child + cnt_child * w_new) % MOD\n state_in = DPState(assign_mask=assign_in, need_mask=need_in)\n accumulate(cur_table, state_in, cnt_new, wsum_new)\n\n # \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n # Choice B: new vertex is NOT_IN_X\n # \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n assign_out = insert_bit(child_assign, idx_new, 0)\n\n # Determine if introduced vertex is already dominated by some\n # IN vertex present in the (extended) bag.\n dominated = False\n for idx, adj in enumerate(is_adj):\n if idx == idx_new or not adj:\n continue\n if (assign_out >> idx) & 1: # neighbor is IN\n dominated = True\n break\n need_bit = 0 if dominated else 1\n need_out = insert_bit(child_need, idx_new, need_bit)\n\n # ( no other vertices change status )\n state_out = DPState(assign_mask=assign_out, need_mask=need_out)\n accumulate(cur_table, state_out, cnt_child, wsum_child)\n\n\ndef forget_callback(\n graph: Graph,\n cur_table: dict[DPState, DPValue],\n cur_bag_info: tuple[int, Bag],\n child_table: dict[DPState, DPValue],\n child_bag_info: tuple[int, Bag],\n forgotten_vertex: int,\n):\n child_vertices = bag_tuple(child_bag_info[1])\n idx_forgot = child_vertices.index(forgotten_vertex)\n\n for child_state, dp_value in child_table.items():\n assign_child, need_child = child_state.assign_mask, child_state.need_mask\n cnt_child, wsum_child = dp_value.count, dp_value.weight\n bit_assign = (assign_child >> idx_forgot) & 1\n bit_need = (need_child >> idx_forgot) & 1\n\n # If forgotten vertex is OUT and still needs domination -> invalid state\n if bit_assign == 0 and bit_need == 1:\n continue\n\n assign_par = remove_bit(assign_child, idx_forgot)\n need_par = remove_bit(need_child, idx_forgot)\n\n state_par = DPState(assign_mask=assign_par, need_mask=need_par)\n accumulate(cur_table, state_par, cnt_child, wsum_child)\n\n\ndef join_callback(\n graph: Graph,\n cur_table: dict[DPState, DPValue],\n cur_bag_info: tuple[int, Bag],\n left_child_table: dict[DPState, DPValue],\n left_child_bag_info: tuple[int, Bag],\n right_child_table: dict[DPState, DPValue],\n 
right_child_bag_info: tuple[int, Bag],\n):\n bag_vertices = bag_tuple(cur_bag_info[1])\n vertex_weights = graph.vertex_weights\n assert vertex_weights is not None\n\n # Group right states by assignment mask for O(|L| + |R|) compatibility\n right_by_assign: dict[int, list[tuple[int, int, int]]] = {}\n for right_state, dp_value in right_child_table.items():\n assign_r, need_r = right_state.assign_mask, right_state.need_mask\n cnt_r, wsum_r = dp_value.count, dp_value.weight\n right_by_assign.setdefault(assign_r, []).append((need_r, cnt_r, wsum_r))\n\n for left_state, dp_value in left_child_table.items():\n assign_l, need_l = left_state.assign_mask, left_state.need_mask\n cnt_l, wsum_l = dp_value.count, dp_value.weight\n if assign_l not in right_by_assign:\n continue\n for need_r, cnt_r, wsum_r in right_by_assign[assign_l]:\n # Merge NEED flags: dominated if dominated in either side\n need_merge = need_l & need_r # bitwise AND keeps 1 only if both have NEED\n\n cnt_merge = (cnt_l * cnt_r) % MOD\n\n w_bag_sel = bag_selected_weight(assign_l, bag_vertices, vertex_weights)\n w_merge = (wsum_l * cnt_r + wsum_r * cnt_l - cnt_merge * w_bag_sel) % MOD\n if w_merge < 0:\n w_merge += MOD\n\n state_merge = DPState(assign_mask=assign_l, need_mask=need_merge)\n accumulate(cur_table, state_merge, cnt_merge, w_merge)\n\n\ndef extract_solution(root_table: dict[DPState, DPValue]) -> int:\n \"\"\"\n Sum the total weights of all globally valid dominating sets.\n Return -1 if none exist.\n \"\"\"\n answer = 0\n found = False\n for state, dp_value in root_table.items():\n assign_mask, need_mask = state.assign_mask, state.need_mask\n cnt, wsum = dp_value.count, dp_value.weight\n # Bag may be empty or not\n if assign_mask == 0 and need_mask == 0 and cnt == 0:\n # Defensive - shouldn't happen\n continue\n if need_mask != 0:\n # some vertex in root bag still needs domination -> invalid\n continue\n answer = (answer + wsum) % MOD\n found = True\n return answer if found else -1\n"}
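Two details of the deleted record above are worth spelling out. The JOIN weight update `w_merge = wsum_l * cnt_r + wsum_r * cnt_l - cnt_merge * w_bag_sel` is inclusion-exclusion: each merged set weighs `w_L + w_R` minus the weight of the selected bag vertices, which both children have already counted once, and summing that over all `cnt_l * cnt_r` pairings yields the formula. The INTRODUCE and FORGET steps rely on `insert_bit` and `remove_bit` being exact inverses; below is a minimal standalone check of that property, with the two helpers re-implemented verbatim from the record (this sketch is illustrative and not part of any submission):

```python
def insert_bit(mask: int, pos: int, bit: int) -> int:
    # Insert `bit` at position `pos`, shifting higher bits left by one.
    lower = mask & ((1 << pos) - 1)
    higher = mask >> pos
    return lower | (bit << pos) | (higher << (pos + 1))


def remove_bit(mask: int, pos: int) -> int:
    # Delete the bit at position `pos`, shifting higher bits right.
    lower = mask & ((1 << pos) - 1)
    higher = mask >> (pos + 1)
    return lower | (higher << pos)


# remove_bit undoes insert_bit at the same position, for every small mask.
for mask in range(16):
    for pos in range(4):
        for bit in (0, 1):
            widened = insert_bit(mask, pos, bit)
            assert (widened >> pos) & 1 == bit
            assert remove_bit(widened, pos) == mask
print("insert_bit/remove_bit round-trip OK")
```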
requirements.txt
CHANGED
@@ -2,7 +2,7 @@ APScheduler
black
datasets
pydantic==2.10.6
-gradio
+gradio
gradio[oauth]
gradio_leaderboard==0.0.13
gradio_client
@@ -14,5 +14,4 @@ python-dateutil
tqdm
transformers
tokenizers>=0.15.0
-sentencepiece
-plotly>=5
+sentencepiece

scripts/upload_f1_dataset.py
CHANGED
@@ -2,7 +2,6 @@ import argparse
import fnmatch
import json
import os
-from typing import Iterator

from datasets import Dataset

@@ -14,23 +13,9 @@ logger = get_logger(__name__)

def get_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument(
-        "--input_dir",
-        type=str,
-        help="Dir with .json files",
-        required=True,
-    )
-    parser.add_argument(
-        "--dataset_name",
-        type=str,
-        default=f"{CODE_PROBLEMS_REPO}",
-    )
-    parser.add_argument(
-        "--split",
-        type=str,
-        choices=["hard", "warmup"],
-        default="hard",
-    )
+    parser.add_argument("--input_dir", type=str, help="Dir with .json files", required=True)
+    parser.add_argument("--dataset_name", type=str, default=f"{CODE_PROBLEMS_REPO}")
+    parser.add_argument("--split", type=str, choices=["hard", "warmup"], default="hard")
    return parser.parse_args()


@@ -41,7 +26,7 @@ def main(args: argparse.Namespace) -> None:
        raise ValueError(f"No .json files in input dir {args.input_dir}")
    logger.info("Found %d code problems in %s", len(input_files), args.input_dir)

-    def ds_generator()
+    def ds_generator():
        for fname in sorted(input_files):
            formula_name = os.path.splitext(fname)[0]
            cp_path = os.path.join(args.input_dir, fname)
@@ -50,7 +35,7 @@ def main(args: argparse.Namespace) -> None:
            logger.info("Read code problem for formula %s from %s", formula_name, cp_path)
            yield dict(id=code_problem["id"], code_problem=code_problem)

-    ds
+    ds = Dataset.from_generator(ds_generator)
    logger.info("Created dataset")

    ds.push_to_hub(args.dataset_name, split=args.split, private=True)

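The change above moves the script to the `datasets` generator API: `Dataset.from_generator` materialises rows from a plain Python generator, and `push_to_hub` uploads them under a named split (the `split=` and `private=` arguments are taken from the script itself). A minimal sketch of the same pattern in isolation, with an invented repo id and record contents as placeholders; pushing requires Hugging Face credentials:

```python
from datasets import Dataset


def ds_generator():
    # One record per problem; id and payload are placeholders.
    yield dict(id="Dominating-Set", code_problem={"id": "Dominating-Set"})


ds = Dataset.from_generator(ds_generator)
ds.push_to_hub("my-org/my-code-problems", split="hard", private=True)
```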
src/about.py
CHANGED
@@ -1,122 +1,66 @@
-
-
-
-WHAT_IS_F1_HTML_TOP = f"""
-<div class="f1-container">
-<div align="center"><header class="text-center mb-12"><h1 class="text-4xl md:text-5xl font-bold text-gray-900 f1-h1" style="margin:0; display:inline;">FormulaOne</h1><span style="display:inline-block; margin-left:0.5em;"><h3 style="margin:0; display:inline;" class="text-4xl md:text-5xl font-bold text-gray-900 f1-h3 style=">by <a href="https://doubleai.com/">AAI</a></h3></header></div>
-<section>
-<br/>
-<p class="text-lg mb-4 f1-p">Frontier AI models have recently demonstrated strong performance on mathematical and algorithmic benchmarks, including earning <a href="https://deepmind.google/discover/blog/advanced-version-of-gemini-with-deep-think-officially-achieves-gold-medal-standard-at-the-international-mathematical-olympiad/" target="_blank" rel="noopener noreferrer" class="f1-a">gold medals in olympiads</a>, and attaining <a href="https://arxiv.org/html/2502.06807v1" target="_blank" rel="noopener noreferrer" class="f1-a">top percentile ratings</a> in competitive programming contests. How well do such benchmarks capture the true depth of algorithmic reasoning, as it arises in real-world research problems?</p>
-
-<p class="text-lg mb-4 f1-p">We believe that existing benchmarks fail to capture the deep reasoning skills required for complex, research-level algorithmic problems. To address this gap, <a href="{PAPER_URL}" target="_blank" rel="noopener noreferrer" class="f1-a">we introduce <strong>FormulaOne</strong></a>.</p>
-
-<p class="mb-4 f1-p"><strong>FormulaOne</strong> consists of 220 novel dynamic programming problems over graphs. The problems are organised into three categories, ranging from moderate difficulty and all the way up to research-level.</p>
-
-<!-- Clean, centered "table" using a single grid -->
-<div class="f1-grid-wrap" role="region" aria-label="FormulaOne categories">
-<div class="f1-grid-table" role="table">
-<div class="f1-grid-row f1-grid-head" role="row">
-<div class="f1-grid-cell" role="columnheader">Category</div>
-<div class="f1-grid-cell" role="columnheader">Size</div>
-<div class="f1-grid-cell" role="columnheader">Description</div>
-</div>
-<div class="f1-grid-row" role="row">
-<div class="f1-grid-cell" role="cell">Shallow</div>
-<div class="f1-grid-cell" role="cell">100</div>
-<div class="f1-grid-cell" role="cell">A set of “easier” problems.</div>
-</div>
-<div class="f1-grid-row" role="row">
-<div class="f1-grid-cell" role="cell">Deeper</div>
-<div class="f1-grid-cell" role="cell">100</div>
-<div class="f1-grid-cell" role="cell">A set of challenging problems.</div>
-</div>
-<div class="f1-grid-row" role="row">
-<div class="f1-grid-cell" role="cell">Deepest</div>
-<div class="f1-grid-cell" role="cell">20</div>
-<div class="f1-grid-cell" role="cell">A set of highly challenging problems.</div>
-</div>
-</div>
-</div>
-</section>
-</div>
-"""
-
-# Bottom is split so we can insert real Gradio media (images/video) from app.py.
-
-
-
-
-
-
-<p class="f1-p">Despite <a href="https://epoch.ai/frontiermath" target="_blank" rel="noopener noreferrer" class="f1-a">impressive</a> <a href="https://artificialanalysis.ai/evaluations/gpqa-diamond" target="_blank" rel="noopener noreferrer" class="f1-a">performance</a> on existing benchmarks, presently <strong>no model solves even a single 'Deepest Tier' problem</strong>.</p>
-</section>
-
-<section>
-<h2 class="f1-h2">An “Infinite Well” of Problems</h2>
-"""
-
-#
-
-
-
-
-
-
-
-
-
-# Text immediately after the video; opens Evaluation section header/content (up to before Warmup figure)
-WHAT_IS_F1_HTML_AFTER_VIDEO = """
-<p class="f1-p">The deceptive simplicity of the problem statements belies the <strong>extraordinary difficulty</strong> of discovering the correct dynamic programming solution. This process is riddled with subtle combinatorial and logical pitfalls, demanding a profound understanding of the problem’s underlying structure. For a detailed walkthrough of the fifteen interdependent reasoning steps required to solve a single hard problem — <code>Maximal-Cluster-Graph</code> — <a href="https://arxiv.org/pdf/2507.13337#appendix.A" target="_blank" rel="noopener noreferrer" class="f1-a">see the appendix of our paper</a>.</p>
-</section>
-
-<section id="evaluation">
-<h2 class="f1-h2">Evaluation</h2>
-<p class="mb-4 f1-p">All models were evaluated using their highest available reasoning settings and with the maximum context length permitted. To give models the best possible chance of success, we provide a generous few-shot prompt that covers a broad array of the ideas and techniques involved in solving these problems.</p>
-<p class="mb-4 f1-p">Each submitted solution is subjected to a rigorous and automated <a href="https://arxiv.org/pdf/2507.13337#section.4" target="_blank" rel="noopener noreferrer" class="f1-a">test suite</a> that measures three key aspects of its validity:</p>
-<ul class="list-disc list-inside space-y-2 mb-6">
-<li class="f1-li"><strong>Correctness:</strong> The output of the submitted algorithm must be correct on all graphs.</li>
-<li class="f1-li"><strong>Consistency:</strong> The solution must produce the same output for a given graph, regardless of the specific tree decomposition.</li>
-<li class="f1-li"><strong>Efficiency:</strong> The solution must be truly <a href="https://en.wikipedia.org/wiki/Parameterized_complexity" target="_blank" rel="noopener noreferrer" class="f1-a">fixed-parameter linear</a>.</li>
-</ul>
-<p class="mb-4 f1-p">To support research and encourage community contributions, the <code>FormulaOne-Shallow</code> ("warmup") dataset is released as a public resource for training and fine-tuning models. The complete test suite for all 100 'Shallow' problems is available, alongside a standalone evaluation environment, in our <a href="https://github.com/double-ai/formulaone-dataset/tree/main" target="_blank" rel="noopener noreferrer" class="f1-a">GitHub repository</a>.</p>
-<p class="f1-p">To maintain the integrity of the core benchmark, only a minimal subset of tests is released for the Deeper and Deepest Tier problems. Solutions submitted for evaluation on our benchmark are evaluated against a withheld comprehensive test-suite.</p>
-"""
-
-#
-
-
-
-
-"""
-
-#
-
-
-
-
-"""
-
-
-SUBMISSION_TERMS_TEXT = """
-### Competition terms
-- By submitting, you agree to the **FormulaOne Submission Agreement
-(v1.2)** and our **Privacy Notice**.
-- Your uploaded file remains yours; we only use it to evaluate, score,
-and contact you about your result.
-**Licensing for the public benchmark assets (informational)**
-- **Evaluator code:** Apache License 2.0
-- **Problem statements & public tests:** Creative Commons **CC BY 4.0**
-See the project's **README licence section** and full texts: `LICENSE-
-APACHE2`, `LICENSE-CC-BY` in our GitHub repo.
-**Platform**
-- Your use of Hugging Face is also governed by Hugging Face's Terms and
-Privacy Policy.
-"""
-
-EVALUATION_QUEUE_TEXT = """
-## Submitting to the FormulaOne Leaderboard
-
-This leaderboard evaluates systems on the FormulaOne core dataset. Submissions consist of a .jsonl file with solution code for each problem.
-
+from dataclasses import dataclass
+from enum import Enum
+
+
+@dataclass
+class Task:
+    benchmark: str
+    metric: str
+    col_name: str
+
+
+# Select your tasks here
+# ---------------------------------------------------
+class Tasks(Enum):
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    task0 = Task("FormulaOne", "success_rate", "Success Rate (%)")
+    # task1 = Task("logiqa", "acc_norm", "LogiQA")
+
+
+NUM_FEWSHOT = 0  # Change with your few shot
+# ---------------------------------------------------
+
+
+# Your leaderboard name
+# TITLE = """<h1 align="center" id="space-title">AAI FormulaOne Leaderboard</h1>"""
+
+TITLE = """
+<h1 id="space-title" style="
+    text-align: center;
+    font-family: 'Segoe UI', 'Helvetica Neue', sans-serif;
+    font-weight: 300;
+    letter-spacing: 0.05em;
+    color: white;
+    text-transform: none;
+    margin-top: 2rem;
+    font-size: 2.6rem;
+">
+FormulaOne Leaderboard
+</h1>
+"""
+
+# What does your leaderboard evaluate?
+INTRODUCTION_TEXT = """
+Welcome to the official leaderboard for the paper:
+
+**FormulaOne: Measuring the Depth of Algorithmic Reasoning Beyond Competitive Programming** <br>
+*Gal Beniamini, Yuval Dor, Alon Vinnikov, Shir Granot Peled, Or Weinstein, Or Sharir, Noam Wies, Tomer Nussbaum, Ido Ben Shaul, Tomer Zekharya, Yoav Levine, Shai Shalev-Shwartz, Amnon Shashua* <br>
+**AAI, July 2025**
+
+FormulaOne is a new benchmark designed to challenge frontier AI models. The benchmark is constructed from a vast and conceptually diverse family of dynamic programming problems derived from Monadic Second-Order (MSO) logic on graphs, a framework with profound connections to theoretical computer science.
+"""
+
+# Which evaluations are you running? how can people reproduce what you have?
+LLM_BENCHMARKS_TEXT = f"""
+## How it works
+
+## Reproducibility
+To reproduce our results, here is the commands you can run:
+
+"""
+
+EVALUATION_QUEUE_TEXT = """
+## 🧪 Submitting to the FormulaOne Leaderboard
+
+This leaderboard evaluates systems on the FormulaOne core dataset. Submissions consist of a .jsonl file with solution code for each problem.
+
@@ -134,7 +78,7 @@ Your submission must be a .jsonl file with one entry per problem:
- solution: A Python code implementing the required callback functions.

📄 Full list of problem_ids:
-View the [FormulaOne core dataset](https://github.com/double-ai/formulaone-dataset-release/
+View the [FormulaOne core dataset](https://github.com/double-ai/formulaone-dataset-release/dataset/formulaone) for the complete list of problem IDs.

⚠️ Validation Rules:
Submissions must:
@@ -151,7 +95,7 @@ Submissions must:
- **Organization**
- **System Type**
- Click **Submit**.
-
+
### ⏱️ After Submission

Submissions are validated and evaluated within ~24 hours. Results will appear on the leaderboard once processed.
@@ -161,12 +105,12 @@ Submissions are validated and evaluated within ~24 hours. Results will appear on
CITATION_BUTTON_LABEL = """📚 How to cite FormulaOne"""
CITATION_BUTTON_TEXT = r"""
@misc{beniamini2025formulaonemeasuringdepthalgorithmic,
-title={FormulaOne: Measuring the Depth of Algorithmic Reasoning Beyond Competitive Programming},
-author={Gal Beniamini and Yuval Dor and Alon Vinnikov and Shir Granot Peled and Or Weinstein and Or Sharir and Noam Wies and Tomer Nussbaum and
+title={FormulaOne: Measuring the Depth of Algorithmic Reasoning Beyond Competitive Programming},
+author={Gal Beniamini and Yuval Dor and Alon Vinnikov and Shir Granot Peled and Or Weinstein and Or Sharir and Noam Wies and Tomer Nussbaum and Ido Ben Shaul and Tomer Zekharya and Yoav Levine and Shai Shalev-Shwartz and Amnon Shashua},
year={2025},
eprint={2507.13337},
archivePrefix={arXiv},
primaryClass={cs.AI},
-url={https://arxiv.org/abs/2507.13337},
+url={https://arxiv.org/abs/2507.13337},
}
"""

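The EVALUATION_QUEUE_TEXT above describes the expected upload: a .jsonl file with one JSON object per problem, carrying `problem_id` and `solution`. A hedged sketch of producing such a file (only the field names come from the text above; the solution string here is an elided placeholder):

```python
import json

entries = [
    {"problem_id": "120", "solution": "# full callback code goes here"},
]

with open("submission.jsonl", "w") as f:
    for entry in entries:
        f.write(json.dumps(entry) + "\n")
```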
src/datamodel/data.py
CHANGED
@@ -3,20 +3,14 @@ import time

from datasets import load_dataset

-from src.envs import CODE_PROBLEMS_REPO, RESULTS_REPO, SUBMISSIONS_REPO
+from src.envs import TOKEN, CODE_PROBLEMS_REPO, RESULTS_REPO, SUBMISSIONS_REPO
from src.logger import get_logger

logger = get_logger(__name__)


class F1Data:
-    def __init__(
-        self,
-        cp_ds_name: str,  # Name of the dataset. Fixed.
-        sub_ds_name: str,  # Name of subdataset. Fixed.
-        res_ds_name: str,  # Name of results repository. Fixed.
-        split: str = "hard",  # Split is either 'hard' or 'easy'.
-    ):
+    def __init__(self, cp_ds_name: str, sub_ds_name: str, res_ds_name: str, split: str = "hard"):
        self.cp_dataset_name = cp_ds_name
        self.submissions_dataset_name = sub_ds_name
        self.results_dataset_name = res_ds_name
@@ -25,16 +19,16 @@ class F1Data:
        self._initialize()

    def _initialize(self):
-        logger.info(
+        logger.info("Initialize F1Data TOKEN='%s'", TOKEN)
        start_time = time.monotonic()
-        cp_ds = load_dataset(
+        cp_ds = load_dataset(self.cp_dataset_name, split=self.split, token=TOKEN)
+        logger.info(
+            "Loaded code-problems dataset from %s in %f sec",
            self.cp_dataset_name,
-
-            token=TOKEN,
+            time.monotonic() - start_time,
        )
-
-
-        logger.info(f"Loaded {len(self.code_problems)} code problems")
+        self.code_problems: dict[str, str] = {r["id"]: r["code_problem"] for r in cp_ds}
+        logger.info(f"Loaded %d code problems {len(self.code_problems)}")

    @functools.cached_property
    def code_problem_ids(self) -> set[str]:
@@ -43,11 +37,6 @@ class F1Data:

if __name__ == "__main__":
    split = "hard"
-    f1_data = F1Data(
-        cp_ds_name=CODE_PROBLEMS_REPO,
-        sub_ds_name=SUBMISSIONS_REPO,
-        res_ds_name=RESULTS_REPO,
-        split=split,
-    )
+    f1_data = F1Data(cp_ds_name=CODE_PROBLEMS_REPO, sub_ds_name=SUBMISSIONS_REPO, res_ds_name=RESULTS_REPO, split=split)

    print(f"Found {len(f1_data.code_problem_ids)} code problems in {split} split of {f1_data.cp_dataset_name}")

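`F1Data._initialize` now loads the code problems in one `load_dataset` call and indexes them by id. The same pattern in isolation (dataset name and token are placeholders; assumes the `datasets` package and access to the private repo):

```python
from datasets import load_dataset

cp_ds = load_dataset("my-org/my-code-problems", split="hard", token="hf_xxx")
code_problems = {r["id"]: r["code_problem"] for r in cp_ds}
print(f"Loaded {len(code_problems)} code problems")
```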
src/display/__init__.py
DELETED
File without changes
src/display/css_html_js.py
CHANGED
@@ -1,219 +1,99 @@
custom_css = """
-:root, [data-theme="light"] {
-    --f1-text: #111827;
-    --f1-subtle: #6b7280;
-    --f1-border: #e5e7eb;
-    --f1-bg: #ffffff;
-    --f1-bg-muted: #f9fafb;
-
-
-
-    --background-fill-primary: transparent !important;
-}
-
-
-
-.markdown-text { font-size: 16px !important; max-width: 800px; margin: 0 auto; }
-#what-is-tab { max-width: 800px; margin-left: auto; margin-right: auto; }
-/* requested 710px */
-#f1-examples { max-width: 710px; margin: 0 auto; }
-
-/* NEW: landing tab width + tier selector alignment */
-/* Landing tab width + tier selector alignment */
-#landing-accuracy-tab { max-width: 800px; margin-left: auto; margin-right: auto; }
-
-/* Right-align the switcher row; transparent background & border */
-#f1-tier-select-row {
-    justify-content: flex-end;
-    margin-bottom: 6px;
-    background: white !important;
-    border: none !important;
-}
-
-
-
-#f1-tier-select .wrap {
-    display: inline-flex;
-    gap: 6px;
-    padding: 4px;
-    background: #ffffff; /* white background for the switcher itself */
-    border: 1px solid var(--f1-border);
-    border-radius: 999px;
-}
-#f1-tier-select input[type="radio"] { display: none; }
-#f1-tier-select label {
-    border: none;
-    border-radius: 999px;
-    padding: 6px 12px;
-    background: transparent;
-    cursor: pointer;
-}
-#f1-tier-select input[type="radio"]:checked + span {
-    background: #eef2ff; /* only selected pill is tinted */
-    border-radius: 999px;
-    padding: 6px 12px;
-    box-shadow: 0 1px 2px rgba(0,0,0,0.04);
-}
-
-#
-
-#f1-tier-select { background: white; }
-
-#f1-accuracy-plot label { display: false; }
-
-#learn-more-btn, #learn-more-btn button {
-    background: transparent !important;
-    border: none !important;
-    color: #2563eb !important;
-    font-weight: 700 !important;
-    font-size: 1.05rem !important;
-    padding: 0 !important;
-    box-shadow: none !important;
-}
-#learn-more-btn button:hover { text-decoration: underline !important; background: transparent !important; }
-
-#landing-hero-row {
-    max-width: 800px;
-    margin-left: auto;
-    margin-right: auto;
-    margin-bottom: 8px;
-    align-items: center;
-}
-#landing-hero-left { padding-right: 8px; }
-#landing-hero-right { display: flex; justify-content: flex-end; }
-
-/* Blurb text */
-.f1-hero { margin: 0; font-size: 1.02rem; color: var(--f1-text); }
-
-/* Learn More pill — compact */
-#learn-more-pill,
-#learn-more-pill button {
-    width: auto !important;
-    min-width: 0 !important;
-    background: #2563eb !important;
-    color: #ffffff !important;
-    border: 0 !important;
-    border-radius: 999px !important;
-    padding: 6px 12px !important;
-    box-shadow: 0 1px 2px rgba(0,0,0,0.06) !important;
-    font-weight: 700 !important;
-    font-size: 0.95rem !important;
-    white-space: nowrap !important;
-}
-#learn-more-pill button:hover { filter: brightness(0.95) !important; }
-
-#landing-hero-left { min-width: 75%; }
-
-/* Text */
-.f1-p, .f1-li { line-height: 1.75; color: #374151; text-wrap: pretty; overflow-wrap: break-word; hyphens: auto; }
-
-/* Headings */
-.f1-h1 { font-weight: 700; font-size: 2.25rem; line-height: 2.5rem; color: var(--f1-text); text-align: center; margin-bottom: 1.25rem !important; }
-.f1-h2 { font-weight: 700; border-bottom: 1px solid var(--f1-border); padding-bottom: 0.45rem; margin-top: 1.75rem; margin-bottom: 0.9rem; color: var(--f1-text); font-size: 1.5rem; line-height: 2rem; }
-
-/* Links */
-.f1-a { color: #2563eb; text-decoration: none; font-weight: 500; }
-.f1-a:hover { text-decoration: underline; }
-
-/* Captions (centered + dark) */
-.f1-figcaption { margin-top: 4px; font-size: 0.875rem; color: #111827; text-align: center; }
-.f1-figcaption-video { margin-top: 2px; } /* tighter under the video */
-
-/* Problem name — force center from first render; code bg color #f9fafb */
-#f1-examples .f1-problem-markdown .markdown p:first-child { text-align: center !important; margin: 0 0 8px 0; }
-.f1-problem-markdown p code,
-#f1-examples .f1-problem-markdown .markdown p:first-child code {
-    display: inline-block; background: #f9fafb !important; padding: 2px 8px; border-radius: 6px; margin-left: auto; margin-right: auto;
-}
-
-
-
-
-#f1-examples .f1-problem-markdown .markdown p { margin: 0.35rem 0; }
-
-/* Pills (Radio) — compact spacing at bottom */
-#f1-example-radio { border-top: 1px solid var(--f1-border); padding: 8px 10px 4px 10px; margin: 0 8px 2px; }
-#f1-example-radio input[type="radio"] { display: none; }
-#f1-example-radio .wrap { display: flex; gap: 6px; flex-wrap: wrap; justify-content: flex-start; }
-#f1-example-radio label { border: 1px solid var(--f1-border); border-radius: 999px; padding: 6px 10px; cursor: pointer; background: #f3f4f6; }
-#f1-example-radio input[type="radio"]:checked + span { background: #e5e7eb; border-color: var(--f1-border); border-radius: 999px; padding: 6px 10px; }
-
-/* Examples card chrome */
-#f1-examples { background: var(--f1-bg-muted); border: 1px solid var(--f1-border); border-radius: 10px; box-shadow: 0 1px 2px rgba(0,0,0,0.04); margin-bottom: 12px; }
-#f1-examples .form { background: transparent !important; } /* no bg on inner form container */
-.f1-tabs-body { padding-top: 12px; text-align: center; }
-.f1-examples-chip { display: inline-block; background: #e5e7eb; color: #111827; padding: 6px 12px; border-radius: 999px; font-weight: 700; }
-
-/* Figures via Gradio components (centered, rounded, shadow) */
-.f1-image img, .f1-video video { width: 100%; max-width: 42rem; display: block; margin: 0 auto; border-radius: 12px; box-shadow: 0 2px 8px rgba(0,0,0,0.08); }
-
-/* Categories "table" — equal width per column set; compact first two, flexible third */
-.f1-grid-wrap { text-align: center; margin: 10px auto 8px auto; }
-.f1-grid-table {
-    display: inline-grid;
-    grid-template-columns: max-content 12ch minmax(360px, 1fr); /* 2nd col fixed width; 3rd grows */
-    border: 1px solid var(--f1-border);
-    background: var(--f1-bg);
-    border-radius: 8px;
-    overflow: hidden;
-}
-.f1-grid-row { display: contents; }
-.f1-grid-cell { padding: 8px 12px; text-align: left; border-left: 1px solid var(--f1-border); border-top: 1px solid var(--f1-border); white-space: normal; }
-.f1-grid-cell:nth-child(3n+1) { border-left: none; }
-.f1-grid-head .f1-grid-cell { font-weight: 600; text-align: center; border-top: none; }
-
-#
-
-
-
-#formulaone-leaderboard-tab-table .row, #formulaone-leaderboard-tab-table .column { width: 100% !important; max-width: 100% !important; }
-#formulaone-leaderboard-tab-table [data-testid="dropdown"], #formulaone-leaderboard-tab-table input[type="text"] { width: 100% !important; }
-
-
-
-
-}
-#hf-login-btn:hover, #hf-login-btn button:hover, button[data-testid="login-button"]:hover, [data-testid="login-button"] button:hover, div[data-testid="login-button"] > button:hover { background: #f9fafb !important; }
-
-/*
-
-
-
-
-    cursor: pointer !important;
-}
-
-.
-
-    font-size: 14px !important;
-    line-height: 1.5 !important;
-    cursor: pointer !important;
-    display: flex !important;
-    align-items: flex-start !important;
-    margin-bottom: 8px !important;
-}
-
-
-
-
-
-
-
-}
-
-
-
-
-
-
-
-
+
+.markdown-text {
+    font-size: 16px !important;
+}
+
+#models-to-add-text {
+    font-size: 18px !important;
+}
+
+#citation-button span {
+    font-size: 16px !important;
+}
+
+#citation-button textarea {
+    font-size: 16px !important;
+}
+
+#citation-button > label > button {
+    margin: 6px;
+    transform: scale(1.3);
+}
+
+#leaderboard-table {
+    margin-top: 15px
+}
+
+#leaderboard-table-lite {
+    margin-top: 15px
+}
+
+#search-bar-table-box > div:first-child {
+    background: none;
+    border: none;
+}
+
+#search-bar {
+    padding: 0px;
+}
+
+/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
+#leaderboard-table td:nth-child(2),
+#leaderboard-table th:nth-child(2) {
+    max-width: 400px;
+    overflow: auto;
+    white-space: nowrap;
+}
+
+.tab-buttons button {
+    font-size: 20px;
+}
+
+#scale-logo {
+    border-style: none !important;
+    box-shadow: none;
+    display: block;
+    margin-left: auto;
+    margin-right: auto;
+    max-width: 600px;
+}
+
+#scale-logo .download {
+    display: none;
+}
+#filter_type{
+    border: 0;
+    padding-left: 0;
+    padding-top: 0;
+}
+#filter_type label {
+    display: flex;
+}
+#filter_type label > span{
+    margin-top: var(--spacing-lg);
+    margin-right: 0.5em;
+}
+#filter_type label > .wrap{
+    width: 103px;
+}
+#filter_type label > .wrap .wrap-inner{
+    padding: 2px;
+}
+#filter_type label > .wrap .wrap-inner input{
+    width: 1px
+}
+#filter-columns-type{
+    border:0;
+    padding:0.5;
+}
+#filter-columns-size{
+    border:0;
+    padding:0.5;
+}
+#box-filter > .form{
+    border: 0
+}
"""

get_window_url_params = """

src/display/formatting.py
CHANGED
@@ -1,3 +1,12 @@
+def model_hyperlink(link, model_name):
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+
+def make_clickable_model(model_name):
+    link = f"https://huggingface.co/{model_name}"
+    return model_hyperlink(link, model_name)
+
+
def styled_error(error):
    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"

@@ -8,3 +17,11 @@ def styled_warning(warn):

def styled_message(message):
    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
+
+
+def has_no_nan_values(df, columns):
+    return df[columns].notna().all(axis=1)
+
+
+def has_nan_values(df, columns):
+    return df[columns].isna().any(axis=1)

CHANGED
@@ -1,15 +1,19 @@
|
|
1 |
-
from dataclasses import dataclass
|
|
|
2 |
from enum import Enum
|
3 |
|
|
|
4 |
|
5 |
-
|
6 |
-
return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
|
7 |
|
8 |
|
9 |
-
|
10 |
-
|
11 |
|
12 |
|
|
|
|
|
|
|
13 |
@dataclass
|
14 |
class ColumnContent:
|
15 |
name: str
|
@@ -19,57 +23,116 @@ class ColumnContent:
|
|
19 |
never_hidden: bool = False
|
20 |
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
@dataclass(frozen=True)
|
23 |
class AutoEvalColumn:
|
24 |
system = ColumnContent("System Name", "markdown", True, never_hidden=True)
|
|
|
25 |
organization = ColumnContent("Organization", "str", True, never_hidden=True)
|
26 |
-
|
27 |
-
|
28 |
-
success_rate_tier2 = ColumnContent("Deepest Tier Success (%)", "number", True)
|
29 |
submitted_on = ColumnContent("Submitted On", "datetime", True)
|
30 |
|
31 |
|
32 |
-
|
33 |
@dataclass(frozen=True)
|
34 |
class EvalQueueColumn: # Queue column
|
35 |
model = ColumnContent("model", "markdown", True)
|
36 |
revision = ColumnContent("revision", "str", True)
|
37 |
private = ColumnContent("private", "bool", True)
|
38 |
precision = ColumnContent("precision", "str", True)
|
39 |
-
weight_type = ColumnContent("weight_type", "str",
|
40 |
status = ColumnContent("status", "str", True)
|
41 |
|
42 |
|
43 |
-
|
44 |
@dataclass
|
45 |
class ModelDetails:
|
46 |
name: str
|
47 |
display_name: str = ""
|
|
|
48 |
|
49 |
|
50 |
class ModelType(Enum):
|
51 |
-
LLM = ModelDetails(name="LLM")
|
52 |
-
AgenticLLM = ModelDetails(name="AgenticLLM")
|
53 |
-
|
|
|
|
|
54 |
|
55 |
-
def to_str(self):
|
56 |
-
return self.value.name
|
57 |
|
58 |
@staticmethod
|
59 |
-
def from_str(type
|
60 |
-
if type
|
61 |
return ModelType.AgenticLLM
|
62 |
-
if type
|
63 |
return ModelType.LLM
|
|
|
|
|
|
|
|
|
64 |
return ModelType.Other
|
65 |
|
66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
class Precision(Enum):
|
68 |
float16 = ModelDetails("float16")
|
69 |
bfloat16 = ModelDetails("bfloat16")
|
70 |
Unknown = ModelDetails("?")
|
71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
|
|
|
73 |
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
|
|
|
74 |
EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
|
75 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
|
|
|
|
|
1 |
+
from dataclasses import dataclass, field, make_dataclass
|
2 |
+
from typing import ClassVar
|
3 |
from enum import Enum
|
4 |
|
5 |
+
import pandas as pd
|
6 |
|
7 |
+
from src.about import Tasks
|
|
|
8 |
|
9 |
|
10 |
+
def fields(raw_class):
|
11 |
+
return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
|
12 |
|
13 |
|
14 |
+
# These classes are for user facing column names,
|
15 |
+
# to avoid having to change them all around the code
|
16 |
+
# when a modif is needed
|
17 |
@dataclass
|
18 |
class ColumnContent:
|
19 |
name: str
|
|
|
23 |
never_hidden: bool = False
|
24 |
|
25 |
|
26 |
+
## Leaderboard columns
|
27 |
+
# auto_eval_column_fields = []
|
28 |
+
# # Init
|
29 |
+
# auto_eval_column_fields.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
|
30 |
+
# auto_eval_column_fields.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
31 |
+
# # Scores
|
32 |
+
# auto_eval_column_fields.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
33 |
+
# for task in Tasks:
|
34 |
+
# auto_eval_column_fields.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
35 |
+
# # Model information
|
36 |
+
# auto_eval_column_fields.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
37 |
+
# auto_eval_column_fields.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
38 |
+
# auto_eval_column_fields.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
|
39 |
+
# auto_eval_column_fields.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
|
40 |
+
# auto_eval_column_fields.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
|
41 |
+
# auto_eval_column_fields.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
|
42 |
+
# auto_eval_column_fields.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
|
43 |
+
# auto_eval_column_fields.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
|
44 |
+
# auto_eval_column_fields.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
|
45 |
+
#
|
46 |
+
#
|
47 |
+
#
|
48 |
+
# def make_classvar_dataclass(name: str, spec: list):
|
49 |
+
# ns = {"__annotations__": {}}
|
50 |
+
# for field_name, field_type, default in spec:
|
51 |
+
# # Mark as ClassVar so dataclass doesn't treat it as an instance field
|
52 |
+
# ns["__annotations__"][field_name] = ClassVar[field_type]
|
53 |
+
# ns[field_name] = default
|
54 |
+
# # No instance fields; just class-level descriptors
|
55 |
+
# return make_dataclass(name, [], frozen=True, namespace=ns)
|
56 |
+
#
|
57 |
+
# # We use make dataclass to dynamically fill the scores from Tasks
|
58 |
+
# AutoEvalColumn = make_classvar_dataclass("AutoEvalColumn", auto_eval_column_fields)
|
59 |
+
|
60 |
+
|
61 |
@dataclass(frozen=True)
|
62 |
class AutoEvalColumn:
|
63 |
system = ColumnContent("System Name", "markdown", True, never_hidden=True)
|
64 |
+
system_type = ColumnContent("System Type", "str", True)
|
65 |
organization = ColumnContent("Organization", "str", True, never_hidden=True)
|
66 |
+
success_rate = ColumnContent("Success Rate (%)", "number", True)
|
67 |
+
problems_solved = ColumnContent("Problems Solved", "number", True)
|
|
|
68 |
submitted_on = ColumnContent("Submitted On", "datetime", True)
|
69 |
|
70 |
|
71 |
+
## For the queue columns in the submission tab
|
72 |
@dataclass(frozen=True)
|
73 |
class EvalQueueColumn: # Queue column
|
74 |
model = ColumnContent("model", "markdown", True)
|
75 |
revision = ColumnContent("revision", "str", True)
|
76 |
private = ColumnContent("private", "bool", True)
|
77 |
precision = ColumnContent("precision", "str", True)
|
78 |
+
weight_type = ColumnContent("weight_type", "str", "Original")
|
79 |
status = ColumnContent("status", "str", True)
|
80 |
|
81 |
|
82 |
+
## All the model information that we might need
|
83 |
@dataclass
|
84 |
class ModelDetails:
|
85 |
name: str
|
86 |
display_name: str = ""
|
87 |
+
symbol: str = "" # emoji
|
88 |
|
89 |
|
90 |
class ModelType(Enum):
|
91 |
+
LLM = ModelDetails(name="LLM", symbol="🟢")
|
92 |
+
AgenticLLM = ModelDetails(name="AgenticLLM", symbol="🔶")
|
93 |
+
# IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
|
94 |
+
# RL = ModelDetails(name="RL-tuned", symbol="🟦")
|
95 |
+
Other = ModelDetails(name="Other", symbol="?")
|
96 |
|
97 |
+
def to_str(self, separator=" "):
|
98 |
+
return f"{self.value.symbol}{separator}{self.value.name}"
|
99 |
|
100 |
@staticmethod
|
101 |
+
def from_str(type):
|
102 |
+
if "AgenticLLM" in type or "🔶" in type:
|
103 |
return ModelType.AgenticLLM
|
104 |
+
if "LLM" in type or "🟢" in type:
|
105 |
return ModelType.LLM
|
106 |
+
# if "RL-tuned" in type or "🟦" in type:
|
107 |
+
# return ModelType.RL
|
108 |
+
# if "instruction-tuned" in type or "⭕" in type:
|
109 |
+
# return ModelType.IFT
|
110 |
return ModelType.Other
|
111 |
|
112 |
|
113 |
+
class WeightType(Enum):
|
114 |
+
Adapter = ModelDetails("Adapter")
|
115 |
+
Original = ModelDetails("Original")
|
116 |
+
Delta = ModelDetails("Delta")
|
117 |
+
|
118 |
+
|
119 |
class Precision(Enum):
|
120 |
float16 = ModelDetails("float16")
|
121 |
bfloat16 = ModelDetails("bfloat16")
|
122 |
Unknown = ModelDetails("?")
|
123 |
|
124 |
+
def from_str(precision):
|
125 |
+
if precision in ["torch.float16", "float16"]:
|
126 |
+
return Precision.float16
|
127 |
+
if precision in ["torch.bfloat16", "bfloat16"]:
|
128 |
+
return Precision.bfloat16
|
129 |
+
return Precision.Unknown
|
130 |
+
|
131 |
|
132 |
+
# Column selection
|
133 |
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
|
134 |
+
|
135 |
EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
|
136 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
137 |
+
|
138 |
+
# BENCHMARK_COLS = [t.value.col_name for t in Tasks]
|
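Note that `ModelType.from_str` checks for `AgenticLLM` before `LLM`, since the plain substring `LLM` also occurs inside `AgenticLLM`. A quick check of the parsing and the symbol-aware display (assumes the Space's `src` package is importable):

```python
from src.display.utils import ModelType

assert ModelType.from_str("🔶 AgenticLLM") is ModelType.AgenticLLM
assert ModelType.from_str("LLM") is ModelType.LLM
assert ModelType.from_str("something else") is ModelType.Other
print(ModelType.LLM.to_str())  # "🟢 LLM"
```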
src/envs.py
CHANGED
@@ -1,5 +1,4 @@
import os
-import gradio as gr

from huggingface_hub import HfApi

@@ -17,7 +16,6 @@ RESULTS_REPO = f"{OWNER}/dev-f1-leaderboard-results"
# If you setup a cache later, just change HF_HOME
CACHE_PATH = os.getenv("HF_HOME", ".")

-print(f"RUNNING gradio {gr.__version__}")
print(f"{TOKEN=}")
print(f"{REPO_ID=}")

src/leaderboard/read_evals.py
ADDED
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import glob
|
2 |
+
import json
|
3 |
+
import math
|
4 |
+
import os
|
5 |
+
from dataclasses import dataclass
|
6 |
+
|
7 |
+
import dateutil
|
8 |
+
import numpy as np
|
9 |
+
|
10 |
+
from src.display.formatting import make_clickable_model
|
11 |
+
from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
|
12 |
+
from src.submission.check_validity import is_model_on_hub
|
13 |
+
|
14 |
+
|
15 |
+
@dataclass
|
16 |
+
class EvalResult:
|
17 |
+
"""Represents one full evaluation. Built from a combination of the result and request file for a given run.
|
18 |
+
"""
|
19 |
+
eval_name: str # org_model_precision (uid)
|
20 |
+
full_model: str # org/model (path on hub)
|
21 |
+
org: str
|
22 |
+
model: str
|
23 |
+
revision: str # commit hash, "" if main
|
24 |
+
results: dict
|
25 |
+
precision: Precision = Precision.Unknown
|
26 |
+
model_type: ModelType = ModelType.LLM # Pretrained, fine tuned, ...
|
27 |
+
weight_type: WeightType = WeightType.Original # Original or Adapter
|
28 |
+
architecture: str = "Unknown"
|
29 |
+
license: str = "?"
|
30 |
+
likes: int = 0
|
31 |
+
num_params: int = 0
|
32 |
+
date: str = "" # submission date of request file
|
33 |
+
still_on_hub: bool = False
|
34 |
+
|
35 |
+
@classmethod
|
36 |
+
def init_from_json_file(self, json_filepath):
|
37 |
+
"""Inits the result from the specific model result file"""
|
38 |
+
with open(json_filepath) as fp:
|
39 |
+
data = json.load(fp)
|
40 |
+
|
41 |
+
config = data.get("config")
|
42 |
+
|
43 |
+
# Precision
|
44 |
+
precision = Precision.from_str(config.get("model_dtype"))
|
45 |
+
|
46 |
+
# Get model and org
|
47 |
+
org_and_model = config.get("model_name", config.get("model_args", None))
|
48 |
+
org_and_model = org_and_model.split("/", 1)
|
49 |
+
|
50 |
+
if len(org_and_model) == 1:
|
51 |
+
org = None
|
52 |
+
model = org_and_model[0]
|
53 |
+
result_key = f"{model}_{precision.value.name}"
|
54 |
+
else:
|
55 |
+
org = org_and_model[0]
|
56 |
+
model = org_and_model[1]
|
57 |
+
result_key = f"{org}_{model}_{precision.value.name}"
|
58 |
+
full_model = "/".join(org_and_model)
|
59 |
+
|
60 |
+
still_on_hub, _, model_config = is_model_on_hub(
|
61 |
+
full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
|
62 |
+
)
|
63 |
+
architecture = "?"
|
64 |
+
if model_config is not None:
|
65 |
+
architectures = getattr(model_config, "architectures", None)
|
66 |
+
if architectures:
|
67 |
+
architecture = ";".join(architectures)
|
68 |
+
|
69 |
+
# Extract results available in this file (some results are split in several files)
|
70 |
+
results = {}
|
71 |
+
for task in Tasks:
|
72 |
+
task = task.value
|
73 |
+
|
74 |
+
# We average all scores of a given metric (not all metrics are present in all files)
|
75 |
+
accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
|
76 |
+
if accs.size == 0 or any([acc is None for acc in accs]):
|
77 |
+
continue
|
78 |
+
|
79 |
+
mean_acc = np.mean(accs) * 100.0
|
80 |
+
results[task.benchmark] = mean_acc
|
81 |
+
|
82 |
+
return self(
|
83 |
+
eval_name=result_key,
|
84 |
+
full_model=full_model,
|
85 |
+
org=org,
|
86 |
+
model=model,
|
87 |
+
results=results,
|
88 |
+
precision=precision,
|
89 |
+
revision= config.get("model_sha", ""),
|
90 |
+
still_on_hub=still_on_hub,
|
91 |
+
architecture=architecture
|
92 |
+
)
|
93 |
+
|
94 |
+
def update_with_request_file(self, requests_path):
|
95 |
+
"""Finds the relevant request file for the current model and updates info with it"""
|
96 |
+
request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
|
97 |
+
|
98 |
+
try:
|
99 |
+
with open(request_file, "r") as f:
|
100 |
+
request = json.load(f)
|
101 |
+
self.model_type = ModelType.from_str(request.get("model_type", ""))
|
102 |
+
self.weight_type = WeightType[request.get("weight_type", "Original")]
|
103 |
+
self.license = request.get("license", "?")
|
104 |
+
self.likes = request.get("likes", 0)
|
105 |
+
self.num_params = request.get("params", 0)
|
106 |
+
self.date = request.get("submitted_time", "")
|
107 |
+
except Exception:
|
108 |
+
print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
|
109 |
+
|
110 |
+
def to_dict(self):
|
111 |
+
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
112 |
+
average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
|
113 |
+
data_dict = {
|
114 |
+
"eval_name": self.eval_name, # not a column, just a save name,
|
115 |
+
AutoEvalColumn.precision.name: self.precision.value.name,
|
116 |
+
AutoEvalColumn.model_type.name: self.model_type.value.name,
|
117 |
+
AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
|
118 |
+
AutoEvalColumn.weight_type.name: self.weight_type.value.name,
|
119 |
+
AutoEvalColumn.architecture.name: self.architecture,
|
120 |
+
AutoEvalColumn.model.name: make_clickable_model(self.full_model),
|
121 |
+
AutoEvalColumn.revision.name: self.revision,
|
122 |
+
AutoEvalColumn.average.name: average,
|
123 |
+
AutoEvalColumn.license.name: self.license,
|
124 |
+
AutoEvalColumn.likes.name: self.likes,
|
125 |
+
AutoEvalColumn.params.name: self.num_params,
|
126 |
+
AutoEvalColumn.still_on_hub.name: self.still_on_hub,
|
127 |
+
}
|
128 |
+
|
129 |
+
for task in Tasks:
|
130 |
+
data_dict[task.value.col_name] = self.results[task.value.benchmark]
|
131 |
+
|
132 |
+
return data_dict
|
133 |
+
|
134 |
+
|
135 |
+
def get_request_file_for_model(requests_path, model_name, precision):
|
136 |
+
"""Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
|
137 |
+
request_files = os.path.join(
|
138 |
+
requests_path,
|
139 |
+
f"{model_name}_eval_request_*.json",
|
140 |
+
)
|
141 |
+
request_files = glob.glob(request_files)
|
142 |
+
|
143 |
+
# Select correct request file (precision)
|
144 |
+
request_file = ""
|
145 |
+
request_files = sorted(request_files, reverse=True)
|
146 |
+
for tmp_request_file in request_files:
|
147 |
+
with open(tmp_request_file, "r") as f:
|
148 |
+
req_content = json.load(f)
|
149 |
+
if (
|
150 |
+
req_content["status"] in ["FINISHED"]
|
151 |
+
and req_content["precision"] == precision.split(".")[-1]
|
152 |
+
):
|
153 |
+
request_file = tmp_request_file
|
154 |
+
return request_file
|
155 |
+
|
156 |
+
|
157 |
+
def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
|
158 |
+
"""From the path of the results folder root, extract all needed info for results"""
|
159 |
+
model_result_filepaths = []
|
160 |
+
|
161 |
+
for root, _, files in os.walk(results_path):
|
162 |
+
# We should only have json files in model results
|
163 |
+
if len(files) == 0 or any([not f.endswith(".json") for f in files]):
|
164 |
+
continue
|
165 |
+
|
166 |
+
# Sort the files by date
|
167 |
+
try:
|
168 |
+
files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
|
169 |
+
except dateutil.parser._parser.ParserError:
|
170 |
+
files = [files[-1]]
|
171 |
+
|
172 |
+
for file in files:
|
173 |
+
model_result_filepaths.append(os.path.join(root, file))
|
174 |
+
|
175 |
+
eval_results = {}
|
176 |
+
for model_result_filepath in model_result_filepaths:
|
177 |
+
# Creation of result
|
178 |
+
eval_result = EvalResult.init_from_json_file(model_result_filepath)
|
179 |
+
eval_result.update_with_request_file(requests_path)
|
180 |
+
|
181 |
+
# Store results of same eval together
|
182 |
+
eval_name = eval_result.eval_name
|
183 |
+
if eval_name in eval_results.keys():
|
184 |
+
eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
|
185 |
+
else:
|
186 |
+
eval_results[eval_name] = eval_result
|
187 |
+
|
188 |
+
results = []
|
189 |
+
for v in eval_results.values():
|
190 |
+
try:
|
191 |
+
v.to_dict() # we test if the dict version is complete
|
192 |
+
results.append(v)
|
193 |
+
except KeyError: # not all eval values present
|
194 |
+
continue
|
195 |
+
|
196 |
+
return results
|
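Read together, these helpers turn a folder of per-model result files into display rows. A minimal driver sketch, assuming local `eval-results` and `eval-queue` folders (placeholder paths, not values taken from this repo):

```python
# Hypothetical wiring: parse every result file, then build the display dataframe.
import pandas as pd

from src.leaderboard.read_evals import get_raw_eval_results

raw_results = get_raw_eval_results("eval-results", "eval-queue")  # placeholder local paths
leaderboard_df = pd.DataFrame([r.to_dict() for r in raw_results])
print(leaderboard_df.head())
```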
src/logger.py
CHANGED

```diff
@@ -1,11 +1,7 @@
 import logging
 import sys

-def get_logger(
-    filename: str,
-    level=logging.INFO,
-) -> logging.Logger:
+def get_logger(filename: str, level=logging.INFO) -> logging.Logger:
     new_logger = logging.getLogger(filename)
     fmt = logging.Formatter(fmt="%(asctime)s - %(name)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s")
     handler = logging.StreamHandler(sys.stderr)
```
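The collapsed one-line signature behaves identically at call sites, e.g.:

```python
from src.logger import get_logger

logger = get_logger(__name__)  # level defaults to logging.INFO
logger.info("leaderboard refresh started")
```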
src/populate.py
CHANGED

```diff
@@ -1,98 +1,110 @@
+import json
+import os
+
 import pandas as pd
-from datasets import
+from datasets import load_dataset, get_dataset_config_names
 from datasets.exceptions import DatasetNotFoundError
 from tqdm.auto import tqdm

-from src.display.
+from src.display.formatting import has_no_nan_values, make_clickable_model
+from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.envs import TOKEN
+from src.leaderboard.read_evals import get_raw_eval_results
 from src.logger import get_logger

 logger = get_logger(__name__)


 def get_leaderboard_df(results_dataset_name: str) -> pd.DataFrame:
-    """
-    @brief Creates a dataframe from all the individual experiment results.
-    """
-
-    empty_df = pd.DataFrame(
-        columns=[
-            AutoEvalColumn.system.name,
-            AutoEvalColumn.organization.name,
-            AutoEvalColumn.success_rate_overall.name,
-            AutoEvalColumn.success_rate_tier1.name,
-            AutoEvalColumn.success_rate_tier2.name,
-            AutoEvalColumn.submitted_on.name,
-        ]
-    )
+    """Creates a dataframe from all the individual experiment results"""

     try:
-        configs = get_dataset_config_names(
-            results_dataset_name,
-            token=TOKEN,
-        )
+        configs = get_dataset_config_names(results_dataset_name, token=TOKEN)
     except (DatasetNotFoundError, FileNotFoundError):
         # Return an empty DataFrame with expected columns
-        return empty_df
+        return pd.DataFrame(
+            columns=[
+                "System Name",
+                "System Type",
+                "Organization",
+                "Success Rate (%)",
+                "Problems Solved",
+                "Submitted On",
+            ]
+        )

     rows = []
-    for submission_id in tqdm(
-        configs,
-        total=len(configs),
-        desc="Processing Submission Results",
-    ):
-        submission_ds = load_dataset(
-            results_dataset_name,
-            submission_id,
-            split="train",
-            token=TOKEN,
-        )
+    for submission_id in tqdm(configs, total=len(configs), desc="Processing Submission Results"):
+        submission_ds = load_dataset(results_dataset_name, submission_id, split="train", token=TOKEN)
         submission_df = pd.DataFrame(submission_ds)

         if submission_df.empty or "did_pass" not in submission_df.columns or submission_df.did_pass.isna().any():
             logger.warning(f"Skipping {submission_id} due to invalid did_pass values")
             continue

-        assert submission_df["tier"].isin([1, 2]).all(), "Invalid tier values found in submission_df"
         success_rate = 100 * submission_df["did_pass"].mean()
-        tier2_success_rate = 100 * submission_df[submission_df["tier"] == 2]["did_pass"].mean()
+        num_solved = submission_df["did_pass"].sum()
         first_row = submission_df.iloc[0]

         rows.append(
             {
+                "System Name": first_row["system_name"],
+                "System Type": first_row["system_type"],
+                "Organization": first_row["organization"],
+                "Success Rate (%)": success_rate,
+                "Problems Solved": num_solved,
+                "Submitted On": pd.to_datetime(first_row["submission_ts"]).strftime("%Y-%m-%d %H:%M"),
             }
         )

     full_df = pd.DataFrame(rows)

+    # TODO: forbid multiple submissions under the same name?
     # Keep only the latest entry per unique (System Name, System Type, Organization) triplet
     final_df = (
         full_df.sort_values("Submitted On", ascending=False)
-        .drop_duplicates(subset=[
-        .sort_values(by=[AutoEvalColumn.
+        .drop_duplicates(subset=["System Name", "System Type", "Organization"], keep="first")
+        .sort_values(by=[AutoEvalColumn.success_rate.name], ascending=False)
         .reset_index(drop=True)
     )

-    cols_to_round = [
-        AutoEvalColumn.success_rate_overall.name,
-        AutoEvalColumn.success_rate_tier1.name,
-        AutoEvalColumn.success_rate_tier2.name,
-    ]
+    cols_to_round = ["Success Rate (%)"]
     final_df[cols_to_round] = final_df[cols_to_round].round(decimals=2)

     return final_df
+
+
+def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
+    """Creates the different dataframes for the evaluation queue requests"""
+    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
+    all_evals = []
+
+    for entry in entries:
+        if ".json" in entry:
+            file_path = os.path.join(save_path, entry)
+            with open(file_path) as fp:
+                data = json.load(fp)
+
+            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+
+            all_evals.append(data)
+        elif ".md" not in entry:
+            # this is a folder
+            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
+            for sub_entry in sub_entries:
+                file_path = os.path.join(save_path, entry, sub_entry)
+                with open(file_path) as fp:
+                    data = json.load(fp)
+
+                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+                all_evals.append(data)
+
+    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
+    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
+    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
+    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
+    df_running = pd.DataFrame.from_records(running_list, columns=cols)
+    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
+    return df_finished[cols], df_running[cols], df_pending[cols]
```
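For reference, each per-submission config in the results dataset is expected to carry one row per problem with the fields read above. A representative row, with every value invented for illustration:

```python
# Illustrative only -- the fields get_leaderboard_df() reads from each results row.
example_row = {
    "problem_id": 17,                      # hypothetical
    "did_pass": True,                      # aggregated into Success Rate (%) and Problems Solved
    "system_name": "my-agent",             # hypothetical
    "system_type": "open-source",          # hypothetical
    "organization": "Example Lab",         # hypothetical
    "submission_ts": 1736000000000000000,  # nanosecond timestamp, as written by time.time_ns()
}
```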
src/submission/check_validity.py
ADDED

```python
import json
import os
import re
from collections import defaultdict
from datetime import datetime, timedelta, timezone

from datasets import get_dataset_config_names
import huggingface_hub
from huggingface_hub import ModelCard
from huggingface_hub.hf_api import ModelInfo
from transformers import AutoConfig
from transformers.models.auto.tokenization_auto import AutoTokenizer

from src.envs import SUBMISSIONS_REPO


def check_model_card(repo_id: str) -> tuple[bool, str]:
    """Checks if the model card and license exist and have been filled"""
    try:
        card = ModelCard.load(repo_id)
    except huggingface_hub.utils.EntryNotFoundError:
        return False, "Please add a model card to your model to explain how you trained/fine-tuned it."

    # Enforce license metadata
    if card.data.license is None:
        if not ("license_name" in card.data and "license_link" in card.data):
            return False, (
                "License not found. Please add a license to your model card using the `license` metadata or a"
                " `license_name`/`license_link` pair."
            )

    # Enforce card content
    if len(card.text) < 200:
        return False, "Please add a description to your model card, it is too short."

    return True, ""


def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
    """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
    try:
        config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
        if test_tokenizer:
            try:
                tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
            except ValueError as e:
                return (
                    False,
                    f"uses a tokenizer which is not in a transformers release: {e}",
                    None
                )
            except Exception as e:
                return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
        return True, None, config

    except ValueError:
        return (
            False,
            "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
            None
        )

    except Exception as e:
        return False, "was not found on hub!", None


def get_model_size(model_info: ModelInfo, precision: str):
    """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
    try:
        model_size = round(model_info.safetensors["total"] / 1e9, 3)
    except (AttributeError, TypeError):
        return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py

    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
    model_size = size_factor * model_size
    return model_size


def get_model_arch(model_info: ModelInfo):
    """Gets the model architecture from the configuration"""
    return model_info.config.get("architectures", "Unknown")


def already_submitted_models(requested_models_dir: str) -> set[str]:
    """Gather a list of already submitted models to avoid duplicates"""
    depth = 1
    file_names = []
    users_to_submission_dates = defaultdict(list)

    for root, _, files in os.walk(requested_models_dir):
        current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
        if current_depth == depth:
            for file in files:
                if not file.endswith(".json"):
                    continue
                with open(os.path.join(root, file), "r") as f:
                    info = json.load(f)
                    file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")

                    # Select organisation
                    if info["model"].count("/") == 0 or "submitted_time" not in info:
                        continue
                    organisation, _ = info["model"].split("/")
                    users_to_submission_dates[organisation].append(info["submitted_time"])

    return set(file_names), users_to_submission_dates
```
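These checks follow the stock leaderboard template. A sketch of how they might be invoked from a submission flow (the repo id is a placeholder):

```python
from src.submission.check_validity import check_model_card, is_model_on_hub

ok, msg = check_model_card("some-org/some-model")  # placeholder repo id
if not ok:
    print(msg)

# Note: despite the annotation, is_model_on_hub returns a 3-tuple.
on_hub, error, config = is_model_on_hub("some-org/some-model", revision="main", test_tokenizer=True)
if not on_hub:
    print(f"Model {error}")
```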
src/submission/submit.py
CHANGED

```diff
@@ -1,110 +1,88 @@
+import json
 import os
-import time
-from datetime import datetime, timezone, timedelta
-import requests
+from datetime import datetime, timezone
+import time

+from datasets import Dataset, DatasetDict
 import pandas as pd
-from datasets import Dataset, get_dataset_config_names
-from datasets.exceptions import DatasetNotFoundError
-from pandas.api.types import is_integer_dtype
-import gradio as gr
+from pandas.api.types import is_integer_dtype, is_string_dtype

 from src.datamodel.data import F1Data
-from src.display.formatting import styled_error, styled_message
+from src.display.formatting import styled_error, styled_message, styled_warning
 from src.display.utils import ModelType
-from src.envs import SUBMISSIONS_REPO, TOKEN
+from src.envs import API, SUBMISSIONS_REPO, TOKEN
 from src.logger import get_logger
-from src.validation.validate import is_submission_file_valid, is_valid
+
+# from src.submission.check_validity import (
+#     already_submitted_models,
+#     check_model_card,
+#     get_model_size,
+#     is_model_on_hub,
+# )

 logger = get_logger(__name__)

-MIN_WAIT_TIME_PER_USER_HRS = 24
-RATE_LIMIT_WINDOW_HRS = 24
-MAX_SUBMISSIONS_PER_WINDOW = 10

+def validate_submission(lbdb: F1Data, pd_ds: pd.DataFrame) -> str | None:
+    logger.info("Validating DS size %d columns %s set %s", len(pd_ds), pd_ds.columns, set(pd_ds.columns))
+    expected_cols = ["problem_id", "solution"]
+
+    if set(pd_ds.columns) != set(expected_cols):
+        return f"Expected attributes: {expected_cols}, Got: {pd_ds.columns.tolist()}"
+
+    if not is_integer_dtype(pd_ds["problem_id"]):
+        return "problem_id must be str convertible to int"
+
+    if any(type(v) != str for v in pd_ds["solution"]):
+        return "solution must be of type str"
+
+    submitted_ids = set(pd_ds.problem_id.astype(str))
+    if submitted_ids != lbdb.code_problem_ids:
+        missing = lbdb.code_problem_ids - submitted_ids
+        unknown = submitted_ids - lbdb.code_problem_ids
+        return f"Mismatched problem IDs: {len(missing)} missing, {len(unknown)} unknown"
+    if len(pd_ds) > len(lbdb.code_problem_ids):
+        return "Duplicate problem IDs exist in uploaded file"
+
+    return None


 def add_new_solutions(
     lbdb: F1Data,
-    username: str,
-    user_id: str,
     system_name: str,
     org: str,
+    sys_type: str,
     submission_path: str,
-    is_warmup_dataset: bool,
-    ensure_all_present: bool = False,
+    skip_validation: bool = False,
 ):
-    try:
-        submitted_ids = get_dataset_config_names(SUBMISSIONS_REPO, token=TOKEN)
-    except (DatasetNotFoundError, FileNotFoundError):
-        submitted_ids = []
-
-    if submitted_ids == ["default"]:
-        # means empty dataset
-        submitted_ids = []
-
-    logger.info(f"Found {len(submitted_ids)} submissions")
-
-    # Rate limits:
-    # 1. Users must wait MIN_WAIT_TIME_PER_USER_HRS hours between submissions.
-    # 2. No more than MAX_SUBMISSIONS_PER_WINDOW submissions RATE_LIMIT_WINDOW_HRS hours overall.
-
-    sub_df = pd.DataFrame.from_dict(
-        {
-            "submission_id": submitted_ids,
-            "user_id": map(_submission_id_to_user_id, submitted_ids),
-            "timestamp": map(_submission_id_to_timestamp, submitted_ids),
-        }
-    )
-
-        remaining_hrs = (user_last_submission_ts - cutoff_user).total_seconds() / 3600
-        logger.info(f"{username} must wait {remaining_hrs:.2f} more hours.")
-        return styled_error(
-            f"You must wait {MIN_WAIT_TIME_PER_USER_HRS} hours between submissions. "
-            f"Remaining wait time: {remaining_hrs:.2f} hours"
-        )
-
-    # Overall limit
-    cutoff_overall = now - timedelta(hours=RATE_LIMIT_WINDOW_HRS)
-    if len(sub_df.timestamp > cutoff_overall) >= MAX_SUBMISSIONS_PER_WINDOW:
-        logger.info(
-            f"Too many submissions in the last {RATE_LIMIT_WINDOW_HRS} hours: {len(sub_df.timestamp > cutoff_overall)}."
-        )
-        return styled_error("The leaderboard has reached its submission capacity for now. Please try again later.")
-
-    logger.info(
-        f"Adding new submission: {system_name=}, {org=}, and {submission_path=}",
-    )
-
-    assert is_valid(val)
-    assert is_submission_file_valid(submission_path, is_warmup_dataset=is_warmup_dataset)
+    logger.info("ADD SUBMISSION! %s path %s", str((system_name, org, sys_type)), submission_path)
+    if not system_name:
+        return styled_error("Please fill system name")
+
+    if not org:
+        return styled_error("Please fill organization name")
+
+    if not sys_type:
+        return styled_error("Please select system type")
+    sys_type = ModelType.from_str(sys_type).name
+
+    if not submission_path:
+        return styled_error("Please upload JSONL solutions file")

     try:
         submission_df = pd.read_json(submission_path, lines=True)
-
-    submission_id = f"{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}_{username}_{user_id}"
-
-    # Seems good, creating the eval
+    except Exception as e:
+        return styled_error(f"Cannot read uploaded JSONL file: {str(e)}")
+
+    if not skip_validation:
+        validation_error = validate_submission(lbdb, submission_df)
+        if validation_error:
+            return styled_error(validation_error)
+
+    submission_id = f"{system_name}_{org}_{sys_type}_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}"
+
+    # Seems good, creating the eval
+    print(f"Adding new submission: {submission_id}")
     submission_ts = time.time_ns()

     def add_info(row):
@@ -112,103 +90,37 @@ def add_new_solutions(
             **row,
             "system_name": system_name,
             "organization": org,
+            "system_type": sys_type,
             "submission_id": submission_id,
             "submission_ts": submission_ts,
-            "evaluation_id": "",  # This will be set later when the evaluation is launched in the backend
-            "evaluation_start_ts": "",  # This will be set when the evaluation starts
         }

     ds = Dataset.from_pandas(submission_df).map(add_info)

-    dsdict.push_to_hub(
-        SUBMISSIONS_REPO,
-        submission_id,
-        private=True,
-    )
-
-    return styled_message(
-        "Your request has been submitted to the evaluation queue!\n"
-        + "Results may take up to 24 hours to be processed and shown in the leaderboard."
-    )
+    # dsdict = DatasetDict({submission_id: ds})
+    # dsdict.push_to_hub(SUBMISSIONS_REPO, private=True)

-        return None
-    try:
-        oidc_meta = requests.get(f"{provider}/.well-known/openid-configuration", timeout=5)
-        oidc_meta = oidc_meta.json()
-        userinfo_ep = oidc_meta["userinfo_endpoint"]
-        claims = requests.get(userinfo_ep, headers={"Authorization": f"Bearer {oauth_token.token}"}, timeout=5)
-        logger.info(f"userinfo_endpoint response: status={claims.status_code}\nheaders={dict(claims.headers)}")
-        claims = claims.json()
-        # Typical fields: sub (stable id), preferred_username, name, picture
-        return {
-            "sub": claims.get("sub"),
-            "preferred_username": claims.get("preferred_username"),
-            "name": claims.get("name"),
-        }
-    except Exception as e:
-        logger.warning(f"Failed to fetch user claims: {e}")
-        return None
+    ds.push_to_hub(SUBMISSIONS_REPO, submission_id, private=True)
+    # print("Creating eval file")
+    # OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
+    # os.makedirs(OUT_DIR, exist_ok=True)
+    # out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"

-        logger.info("RESP content %s", r.text)
-        if r.status_code != 200:
-            return None
-        return r.json()
-    except:
-        logger.exception("Cannot get user info")
-        return None
-
-
-def _validate_all_submissions_present(
-    lbdb: F1Data,
-    pd_ds: pd.DataFrame,
-):
-    logger.info(f"Validating DS size {len(pd_ds)} columns {pd_ds.columns} set {set(pd_ds.columns)}")
-    expected_cols = ["problem_id", "solution"]
-
-    if not is_integer_dtype(pd_ds["problem_id"]):
-        return ValueError("problem_id must be str convertible to int")
-
-    submitted_ids = set(pd_ds.problem_id.astype(str))
-    if submitted_ids != lbdb.code_problem_ids:
-        missing = lbdb.code_problem_ids - submitted_ids
-        unknown = submitted_ids - lbdb.code_problem_ids
-        raise ValueError(f"Mismatched problem IDs: {len(missing)} missing, {len(unknown)} unknown")
-    if len(pd_ds) > len(lbdb.code_problem_ids):
-        return ValueError("Duplicate problem IDs exist in uploaded file")
+    # with open(out_path, "w") as f:
+    #     f.write(json.dumps(eval_entry))

-
-def _submission_id_to_user_id(submission_id: str) -> str:
-    """
-    Extracts the user ID from the submission ID: "YYYYMMDD_HHMMSS_username_userid"
-    """
-    return submission_id.rsplit("_", 1)[-1]
-
-
-def _submission_id_to_timestamp(submission_id: str) -> datetime:
-    """
-    Extracts the timestamp from the submission ID: "YYYYMMDD_HHMMSS_username_userid"
-    """
-    ts_str = "_".join(submission_id.split("_", 2)[:2])
-    return datetime.strptime(ts_str, "%Y%m%d_%H%M%S").replace(tzinfo=timezone.utc)
+    # print("Uploading eval file")
+    # API.upload_file(
+    #     path_or_fileobj=out_path,
+    #     path_in_repo=out_path.split("eval-queue/")[1],
+    #     repo_id=QUEUE_REPO,
+    #     repo_type="dataset",
+    #     commit_message=f"Add {model} to eval queue",
+    # )

+    # # Remove the local file
+    # os.remove(out_path)
+
+    return styled_message(
+        "Your request has been submitted to the evaluation queue!\nResults may take up to 24 hours to be processed and shown in the leaderboard."
+    )
```
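End to end, a submission is a JSONL file with exactly one `{"problem_id", "solution"}` object per line. A hedged sketch of preparing and submitting one (file name and contents invented for illustration; `lbdb` construction is elided since `F1Data` setup depends on the Space's environment):

```python
# Illustrative only: write a tiny submission file and hand it to the new flow.
import json

rows = [
    {"problem_id": 0, "solution": "def solve(): ..."},  # hypothetical solutions
    {"problem_id": 1, "solution": "def solve(): ..."},
]
with open("submission.jsonl", "w") as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")

# from src.submission.submit import add_new_solutions
# add_new_solutions(lbdb, "my-agent", "Example Lab", "open-source", "submission.jsonl")
```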
src/validation/__init__.py
DELETED
File without changes

src/validation/validate.py
DELETED

```python
import json
import os
import string

from src.logger import get_logger

WARMUP_DATASET_SIZE = 100
DATASET_SIZE = 120

MIN_INPUT_LENGTH = 2
MAX_INPUT_LENGTH = 20

MIN_SUBMISSION_SIZE = 1
MAX_SUBMISSION_SIZE = 1024 * 1024 * 120  # 120 MB.
MAX_SINGLE_SUBMISSION_SIZE = 1024 * 1024  # 1MB.
MAX_SUBMISSION_LINES = DATASET_SIZE + 1  # Allow empty line.

logger = get_logger(__name__)


def is_valid(
    s: str,
    min_length: int = MIN_INPUT_LENGTH,
    max_length: int = MAX_INPUT_LENGTH,
) -> bool:
    """
    @brief Checks whether the given string is valid.
    @param s The string to validate.
    @return True iff all characters are in [a-zA-Z0-9], spaces, or '.' and '-', and the length is between
            min length and max length.
    """

    characters = [c for c in s]  # Not using the length from len(.) as that includes unicode characters.
    if len(characters) < min_length or len(characters) > max_length:
        return False

    # Very important: We delimit using underscores. So these _CANNOT_ be allowed in sanitised strings.
    ALLOWED = (
        [c for c in string.ascii_lowercase]
        + [c for c in string.ascii_uppercase]
        + [c for c in string.digits]
        + [" ", ".", "-"]
    )
    for c in s:
        if c not in ALLOWED:
            return False
    return True


def is_submission_file_valid(
    submission_path: str,
    is_warmup_dataset: bool,
) -> bool:
    """
    @brief Checks whether the given submission file is valid.
    @param submission_path The path to the submission file.
    @param is_warmup_dataset Whether we are working on the regular or the warmup dataset.
    @return True iff the file is within the size constraints, a JSONL, and every line is no longer than
            the fixed maximum bound.
    """

    if not os.path.exists(submission_path):
        logger.warning(f"Could not find submission file {submission_path=}")
        return False

    submission_size = os.stat(submission_path).st_size
    if submission_size < MIN_SUBMISSION_SIZE or submission_size > MAX_SUBMISSION_SIZE:
        logger.warning(f"Submission size was {submission_size}, exceeding [{MIN_SUBMISSION_SIZE, MAX_SUBMISSION_SIZE}]")
        return False

    with open(submission_path, "r") as f:
        # Not using readlines() to avoid consuming a large buffer at once.
        n_lines = 0
        seen_ids = set()
        while len(line := f.readline(MAX_SINGLE_SUBMISSION_SIZE)) > 0:
            n_lines += 1
            if n_lines > MAX_SUBMISSION_LINES:
                logger.warning(f"Got submission with more than {MAX_SUBMISSION_LINES} lines")
                return False

            if not (line.startswith("{") and (line.endswith("}") or line.endswith("}\n"))):
                logger.warning("Submission has line that does not appear to be a JSONL")
                return False

            d = json.loads(line)
            if set(d.keys()) != set(["problem_id", "solution"]):
                logger.warning("Found unexpected keys")
                return False

            if not ((type(d["problem_id"]) is str or type(d["problem_id"]) is int) and type(d["solution"]) is str):
                logger.warning("Found unexpected types")
                return False

            try:
                problem_id = int(d["problem_id"])
            except Exception:
                logger.warning("Could not convert problem ID to int")
                return False

            if is_warmup_dataset:
                if problem_id < DATASET_SIZE or problem_id >= DATASET_SIZE + WARMUP_DATASET_SIZE:
                    logger.warning(f"Problem ID {problem_id} is beyond allowed bounds")
                    return False
            else:
                if problem_id < 0 or problem_id >= DATASET_SIZE:
                    logger.warning(f"Problem ID {problem_id} is beyond allowed bounds")
                    return False

            if problem_id in seen_ids:
                logger.warning(f"Got duplicate submission -- ID {problem_id} appears twice")
                return False  # Duplicate submission.
            seen_ids.add(problem_id)

    return True
```
terms/submission-agreement.md
DELETED

```
FormulaOne AI Coding Challenge - Submission Agreement v1.2

Last updated: 6 Aug 2025

1. Definitions

"Submission" means the code file you upload for a listed problem.

"Organiser" means the FormulaOne Team, c/o <legal entity and address>.

"Competition Site" means the leaderboard hosted at <HF URL>.

2. Licence to Organiser

You retain all IP in your Submission. You grant the Organiser and its academic partners a worldwide, royalty-free, non-exclusive licence to:
a) copy, run, test and modify the Submission solely to evaluate it against public and private test cases;
b) store the Submission on secure servers;
c) manually review the Submission for non-commercial, scientific research;
d) reproduce small code excerpts (<= 25 lines) in research papers or blogs;
e) contact you via the email linked to your platform account to ask clarification questions, explore collaboration, or publicise notable results.

3. Security, Integrity and Acceptable Use

- You confirm the Submission contains no deliberately malicious code, back-doors, attempts to exfiltrate data, or calls to external network resources beyond what the submission guidelines permit.
- You will not attempt to discover, reconstruct, scrape, or share private test cases, nor to probe the sandbox environment.
- The Organiser may refuse, delete, re-run or disqualify any Submission at its sole discretion for suspected abuse, tampering or rule violations.

4. Competition Rules

- One account per team.
- You must own or have rights to your Submission.
- We may re-execute Submissions to confirm scores; ties may be broken by earliest valid submission or additional hidden tests.
- We may correct or withdraw scores if errors or rule breaches are discovered later.

5. Leaderboard and Publicity

Your chosen display name, organisation (if provided), total score and ranking may be shown publicly and archived indefinitely. You may use a pseudonym.

6. Ownership of Benchmark Assets

All evaluation code, problem statements and test data remain (c) 2025 FormulaOne Team and are licensed under Apache 2.0 (code) and CC BY 4.0 (content) as described in the public repository.

7. Export Control and Sanctions Compliance

You represent that your participation is not prohibited by applicable export-control or sanctions laws and that you are not located in, under the control of, or a national or resident of any country or person embargoed by relevant authorities.

8. No Warranty

The benchmark and infrastructure are provided "as is." To the maximum extent permitted by law, the Organiser disclaims all warranties, express or implied.

9. Limitation of Liability

The Organiser's total liability arising out of or in connection with the Competition will not exceed USD 100.

10. Changes to Rules

We may update these terms and technical rules from time to time. Material changes will be posted on the Competition Site and apply to submissions made after the effective date.

11. Governing Law and Venue

This Agreement is governed by the laws of England and Wales. The courts of London, UK, have exclusive jurisdiction.

12. Contact

Questions about these terms: legal@formulaone-ai.org
```