# app.py
import math

import gradio as gr
import pandas as pd
import plotly.graph_objects as go
from apscheduler.schedulers.background import BackgroundScheduler
from gradio_leaderboard import Leaderboard, SelectColumns
from huggingface_hub import whoami

# HTML is split so we can inject Gradio media (images/video) where needed.
from src.about import WHAT_IS_F1_HTML_AFTER_VIDEO  # text immediately after the video
from src.about import WHAT_IS_F1_HTML_BOTTOM_A_AFTER_TABS  # text after the heading, before the first figure
from src.about import WHAT_IS_F1_HTML_BOTTOM_A_BEFORE_TABS  # up to (and including) the "Infinite Well" heading
from src.about import WHAT_IS_F1_HTML_EVAL_BEFORE_WARMUPFIG  # evaluation section, up to before the Warmup figure
from src.about import (  # tail after the Tier 1 figure; ⬅️ split so the tabs can be inserted right after the heading
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    SUBMISSION_TERMS_TEXT,
    WHAT_IS_F1_HTML_AFTER_TIER1FIG_TAIL,
    WHAT_IS_F1_HTML_TOP,
)
from src.datamodel.data import F1Data
from src.display.css_html_js import custom_css
from src.display.formatting import styled_error
from src.display.utils import AutoEvalColumn, fields
from src.envs import API, CODE_PROBLEMS_REPO, REPO_ID, RESULTS_REPO, SUBMISSIONS_REPO
from src.logger import get_logger
from src.populate import get_leaderboard_df
from src.submission.submit import add_new_solutions, fetch_user_info
from src.validation.validate import MAX_INPUT_LENGTH, MIN_INPUT_LENGTH, is_submission_file_valid, is_valid

logger = get_logger(__name__)

ENSURE_ALL_PRESENT = True
SPLIT = "hard"  # set to "warmup" for debugging

lbdb = F1Data(
    cp_ds_name=CODE_PROBLEMS_REPO,
    sub_ds_name=SUBMISSIONS_REPO,
    res_ds_name=RESULTS_REPO,
    split=SPLIT,
)
leaderboard_df = None

logger.info("Initialized LBDB")


def restart_space():
    logger.info("Restarting space")
    API.restart_space(repo_id=REPO_ID)


def refresh_leaderboard_data():
    """Refresh the leaderboard data from the latest results."""
    global leaderboard_df
    try:
        logger.info("Loading leaderboard data...")
        new_leaderboard_df = get_leaderboard_df(RESULTS_REPO)
        if new_leaderboard_df is not None:
            logger.info("Leaderboard data refreshed successfully")
            leaderboard_df = new_leaderboard_df
        else:
            logger.warning("No new leaderboard data found")
            return None
    except Exception as e:
        logger.error(f"Error refreshing leaderboard data: {e}")
        return None
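
# Note: refresh_leaderboard_data() mutates the module-level `leaderboard_df`, which is what
# blocks.load() near the bottom of this file returns into the Leaderboard component; the
# background scheduler re-runs it periodically so the table stays current between restarts.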


def init_leaderboard(dataframe: pd.DataFrame):
    if dataframe is None:
        raise ValueError("Leaderboard DataFrame is None.")
    lb = Leaderboard(
        value=dataframe,
        datatype=[c.type for c in fields(AutoEvalColumn)],
        select_columns=SelectColumns(
            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
            label="Select Columns to Display:",
        ),
        search_columns=[AutoEvalColumn.system.name, AutoEvalColumn.organization.name],
        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )
    lb.col_count = (1, "fixed")
    return lb


def add_solution_cbk(
    system_name: str,
    org: str,
    submission_path: str,
    profile: gr.OAuthProfile | None,
    oauth_token: gr.OAuthToken | None,
):
    logger.info("Fetching user details for submission")
    logger.info("PROFILE %s", profile)
    logger.info("TOKEN %s", oauth_token)
    if profile is None or oauth_token is None:
        return styled_error("Please sign in with Hugging Face before submitting.")

    # User handle and display name (may change over time)
    logger.info(f"User handle: {profile.username}")
    display_name = profile.name or profile.username
    logger.info(f"Display name: {display_name}")

    # Stable account id
    user_info = fetch_user_info(oauth_token)
    logger.info("Logged in user info: %s", user_info)
    stable_id = user_info.get("id") if user_info else None
    logger.info(f"User stable ID: {stable_id}")
    if not stable_id:
        return styled_error("Could not retrieve your stable user ID. Please try signing in again.")
    user_id = stable_id

    if not profile.username:
        return styled_error("Could not retrieve username. Please try signing in again.")

    try:
        # Validate the submission file.
        if not submission_path:
            return styled_error("Please upload a JSONL submission file.")
        if not is_submission_file_valid(
            submission_path,
            is_warmup_dataset=(SPLIT == "warmup"),
        ):
            return styled_error("Failed to read the JSONL submission file. Please try again later.")

        # Validate all user-supplied arguments.
        for val, val_name in [
            (system_name, "System name"),
            (org, "Organisation name"),
        ]:
            if len(val) == 0:
                return styled_error(f"Please fill in the '{val_name}' field.")
            if not is_valid(val):
                return styled_error(
                    f"{val_name} is invalid! It must only contain characters [a-zA-Z0-9], spaces, "
                    + "or the special characters '-' and '.', and be between "
                    + f"{MIN_INPUT_LENGTH} and {MAX_INPUT_LENGTH} characters long."
                )
    except Exception:
        logger.warning("Failed to process user submission", exc_info=True)
        return styled_error("An error occurred. Please try again later.")  # Intentionally vague.

    return add_new_solutions(
        lbdb,
        profile.username,
        user_id,
        system_name,
        org,
        submission_path,
        is_warmup_dataset=(SPLIT == "warmup"),
        ensure_all_present=ENSURE_ALL_PRESENT,
    )
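
# Gradio injects `profile` and `oauth_token` automatically into event handlers whose parameters
# are annotated with gr.OAuthProfile / gr.OAuthToken, which is why the submit_button.click()
# wiring further below passes only the three visible inputs. A minimal sketch of the same
# pattern (handler and component names are illustrative, not part of this app):
#
#     def my_handler(text: str, profile: gr.OAuthProfile | None):
#         return f"Hello {profile.username if profile else 'anonymous'}: {text}"
#
#     # btn.click(my_handler, inputs=[textbox], outputs=[out])  # profile is auto-injected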


def gate_submission(oauth_token: gr.OAuthToken | None):
    """Toggle visibility of the login box and the submission panel based on the user's login status."""
    logger.info("GATE TOKEN %s", oauth_token)
    if oauth_token is None:
        logger.info("GATE: NO TOKEN")
        return gr.update(visible=True), gr.update(visible=False)
    try:
        whoami(oauth_token.token)
        logger.info("GATE: TOKEN IS VALID")
        return gr.update(visible=False), gr.update(visible=True)
    except Exception:
        logger.info("GATE: TOKEN HAS EXPIRED")
        return gr.update(visible=True), gr.update(visible=False)
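
# gate_submission() returns two gr.update() payloads in the same order as the
# `outputs=[login_box, submit_panel]` wiring in blocks.load() below: the first toggles the
# login prompt, the second the submission form. A hypothetical third gated panel would just
# extend both the returned tuple and the outputs list.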


def get_theme():
    # return gr.themes.Soft(
    #     primary_hue=gr.themes.colors.blue,
    #     secondary_hue=gr.themes.colors.sky,
    #     neutral_hue=gr.themes.colors.gray,
    # ).set(
    #     body_background_fill="#FFFFFF",
    #     panel_background_fill="#f3f4f6",
    # )
    return "light"


# --- Gradio-based tabs for examples (no JS in HTML) ---
def _select_example_tab(choice: str):
    return (
        gr.update(visible=(choice == "Shallow")),
        gr.update(visible=(choice == "Deeper")),
        gr.update(visible=(choice == "Deepest")),
    )


# === Static, made-up results for the landing chart (not tied to the leaderboard) ===
MODEL_RELEASES = {
    "GPT-5": "2025-08-07",
    "Gemini 2.5 Pro": "2025-03-25",
    "Grok 4": "2025-07-09",
    "Claude Opus 4": "2025-05-22",
    "o3 Pro": "2025-06-10",
}
TIER_TOTALS = {"Shallow Tier": 100, "Deeper Tier": 100, "Deepest Tier": 20}
MODELS_ORDER = ["GPT-5", "Gemini 2.5 Pro", "Grok 4", "Claude Opus 4", "o3 Pro"]
ACCURACY_PCT = {
    "Shallow Tier": {
        "GPT-5": 49,
        "Gemini 2.5 Pro": 30,
        "Grok 4": 28,
        "Claude Opus 4": 30,
        "o3 Pro": 41,
    },
    "Deeper Tier": {
        "GPT-5": 4,
        "Gemini 2.5 Pro": 0,
        "Grok 4": 0,
        "Claude Opus 4": 0,
        "o3 Pro": 1,
    },
    "Deepest Tier": {
        "GPT-5": 0,
        "Gemini 2.5 Pro": 0,
        "Grok 4": 0,
        "Claude Opus 4": 0,
        "o3 Pro": 0,
    },
}
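
# Optional consistency check for the hard-coded chart data above (a sketch, not part of the
# app's flow): every model in MODELS_ORDER needs a release date and a per-tier accuracy entry,
# otherwise build_accuracy_figure() below would raise a KeyError.
#
#     for _tier in TIER_TOTALS:
#         for _model in MODELS_ORDER:
#             assert _model in MODEL_RELEASES and _model in ACCURACY_PCT[_tier]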


def build_accuracy_figure(tier: str):
    """Interactive scatter: x = release date (ISO str), y = accuracy (%). Hover shows solved/total."""
    total = TIER_TOTALS[tier]
    fig = go.Figure()
    for model in MODELS_ORDER:
        date_str = MODEL_RELEASES[model]  # e.g., "2025-08-07"
        y = ACCURACY_PCT[tier][model]  # percent
        solved = round(y * total / 100)
        fig.add_trace(
            go.Scatter(
                x=[date_str],
                y=[y],
                mode="markers",
                opacity=0.85,
                name=model,  # distinct legend entry & color per model
                marker=dict(size=8, opacity=0.85, line=dict(width=0.5)),
                cliponaxis=False,  # let markers render over axes
                hovertemplate=(
                    f"<b>{model}</b><br>"
                    "Release: %{x|%b %d, %Y}<br>"
                    "Accuracy: %{y:.1f}%<br>"
                    f"Solved: {solved}/{total}"
                    "<extra></extra>"
                ),
            )
        )
    fig.update_layout(
        template="plotly_white",
        height=420,
        margin=dict(l=30, r=120, t=10, b=40),  # extra right room for legend
        xaxis=dict(
            title="Model Release Date",
            type="date",
            tickformat="%b %Y",
            showgrid=True,
            title_standoff=10,  # small gap so the label doesn’t crowd the ticks
        ),
        yaxis=dict(
            title="Accuracy (%)",
            range=[0, 100],  # fixed 0–100
            tick0=0,
            dtick=10,
            showgrid=True,
            layer="below traces",  # draw axis below points so dots aren't “cut”
        ),
        legend=dict(title="Models", orientation="v", y=1, x=1.02, yanchor="top"),
        hovermode="closest",
    )
    return fig
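
# To preview the landing chart outside the Space (a minimal sketch; the output file name is
# illustrative), plotly figures can be written to standalone HTML:
#
#     build_accuracy_figure("Shallow Tier").write_html("accuracy_preview.html")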


_initial_accuracy_fig = build_accuracy_figure("Deeper Tier")

# Force the light theme even if the HF user prefers dark
blocks = gr.Blocks(
    css=custom_css,
    theme=get_theme(),
    js="""
    () => {
        // Force light theme
        document.body.classList.remove('dark');
        document.documentElement.setAttribute('data-theme', 'light');
        document.documentElement.setAttribute('data-color-mode', 'light');

        // Handle <a data-tab-target="..."> to switch Gradio tabs by panel id
        document.addEventListener('click', (e) => {
            const a = e.target.closest('a[data-tab-target]');
            if (!a) return;
            e.preventDefault();
            const id = a.getAttribute('data-tab-target');  // e.g., "what-is"
            const panel = document.getElementById(id);
            if (!panel) return;
            // Find the tab header button that controls this panel and click it
            const btn = document.querySelector(`[role="tab"][aria-controls="${panel.id}"]`);
            if (btn) btn.click();
        }, true);
    }
    """,
)
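
# The click handler installed above lets static HTML from src.about switch Gradio tabs without
# inline JS: a link such as the following (illustrative markup, assumed to live in the about-page
# HTML) jumps to the tab whose id matches the data-tab-target value.
#
#     <a href="#" data-tab-target="what-is">Read more</a>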

with blocks:
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("FormulaOne", id=0, elem_id="landing-accuracy-tab"):
            gr.HTML(
                '<div align="center"><header class="text-center mb-12">'
                '<h1 class="text-4xl md:text-5xl font-bold text-gray-900 f1-h1" style="margin:0; display:inline;">FormulaOne</h1>'
                '<span style="display:inline-block; margin-left:0.5em;">'
                '<h3 class="text-4xl md:text-5xl font-bold text-gray-900 f1-h3" style="margin:0; display:inline;">by <a href="https://doubleai.com/">AAI</a></h3>'
                '</span></header></div>'
            )
with gr.Row(elem_id="landing-hero-row"): | |
with gr.Column(scale=7, elem_id="landing-hero-left"): | |
gr.Markdown( | |
""" | |
<div class="f1-container"> | |
<p class="f1-hero"> | |
A benchmark of novel, expert-level algorithmic problems over graphs that demand deep dynamic | |
programming and logical reasoning. <strong>Shallow</strong> and <strong>Deeper</strong> tiers span moderate through | |
challenging problems, while <strong>Deepest</strong> is research-level. | |
</p> | |
</div> | |
""", | |
elem_classes="markdown-text", | |
) | |
with gr.Column(scale=3, elem_id="landing-hero-right"): | |
learn_more_btn = gr.Button( | |
"Learn More about FormulaOne", | |
elem_id="learn-more-pill", | |
variant="secondary", | |
) | |
# Make the pill switch to the "What is FormulaOne" tab | |
learn_more_btn.click( | |
lambda: gr.Tabs(selected="what-is"), # switches tabs | |
inputs=None, | |
outputs=tabs, # 'tabs' is your Tabs handle | |
) | |
            # Pill-style selector aligned to the top-right
            with gr.Row(elem_id="f1-tier-select-row"):
                tier_selector = gr.Radio(
                    choices=list(reversed(list(TIER_TOTALS.keys()))),
                    value="Deeper Tier",
                    label=None,
                    show_label=False,
                    elem_id="f1-tier-select",
                )
            accuracy_plot = gr.Plot(
                value=_initial_accuracy_fig,
                elem_id="f1-accuracy-plot",
                show_label=False,
            )
            tier_selector.change(
                lambda t: build_accuracy_figure(t),
                inputs=tier_selector,
                outputs=accuracy_plot,
            )
            # Footnote (sampling + prompt details)
            gr.Markdown(
                """
                <div class="f1-container">
                  <p class="f1-p" style="font-size:0.95rem;color:var(--f1-subtle);">
                    All models were sampled with their highest available reasoning settings and a maximum token budget.
                    We also provided the models with a diverse few-shot prompt that is highly supportive for FormulaOne problems,
                    covering many of the subtle details of state design and maintenance, drawn from a broad array of categories.
                  </p>
                </div>
                """,
                elem_classes="markdown-text",
            )
# Existing "What is FormulaOne" tab | |
with gr.TabItem("What is FormulaOne", id="what-is", elem_id="what-is-tab"): | |
gr.Image( | |
"assets/banner.png", | |
show_label=False, | |
elem_classes=["f1-image"], | |
show_share_button=False, | |
show_download_button=False, | |
show_fullscreen_button=False, | |
width=550, | |
) | |
# Top content and categories table | |
gr.HTML(WHAT_IS_F1_HTML_TOP) | |
            # ---- Bottom content pieces interleaved with real Gradio media ----
            # Up to and including the "An Infinite Well" heading
            gr.HTML(WHAT_IS_F1_HTML_BOTTOM_A_BEFORE_TABS)

            # ===== Examples (right after the “Infinite Well” heading; inner width 710px via CSS) =====
            with gr.Group(elem_id="f1-examples", elem_classes=["f1-container"]):
                gr.HTML(
                    '<div class="f1-tabs-body"><div class="f1-examples-chip">Examples of FormulaOne problems</div></div>'
                )
                _latex = [
                    {"left": "$$", "right": "$$", "display": True},
                    {"left": "$", "right": "$", "display": False},
                    {"left": "\\(", "right": "\\)", "display": False},
                    {"left": "\\[", "right": "\\]", "display": True},
                ]
                md_warmup = gr.Markdown(
                    value=(
                        '<p style="text-align: center;"><code>Union-of-Paths-and-Cycles</code></p>\n'
                        "Given a tree-like graph $G=(V,E)$ and a weight function $w:V\\to\\mathbb{N}$, compute the sum of all weights of sets $S\\subseteq V$ such that the induced subgraph $G[S]$ is a <b>disjoint union of paths and cycles</b>."
                    ),
                    latex_delimiters=_latex,
                    elem_classes=["f1-problem-markdown"],
                )
                md_tier1 = gr.Markdown(
                    value=(
                        '<p style="text-align: center;"><code>Maximal-Union-of-Paths-and-Cycles</code></p>\n'
                        "Given a tree-like graph $G=(V,E)$ and a weight function $w:V\\to\\mathbb{N}$, compute the sum of all weights of sets $S\\subseteq V$ such that $G[S]$ is a <b>disjoint union of paths and cycles</b> and $S$ is <b>maximal</b> with respect to this property."
                    ),
                    visible=False,
                    latex_delimiters=_latex,
                    elem_classes=["f1-problem-markdown"],
                )
                md_tier2 = gr.Markdown(
                    value=(
                        '<p style="text-align: center;"><code>Maximal-Union-of-Cycles</code></p>\n'
                        "Given a tree-like graph $G=(V,E)$ and a weight function $w:V\\to\\mathbb{N}$, compute the sum of all weights of sets $S\\subseteq V$ such that $G[S]$ is a <b>disjoint union of cycles</b> and $S$ is <b>maximal</b> with respect to this property."
                    ),
                    visible=False,
                    latex_delimiters=_latex,
                    elem_classes=["f1-problem-markdown"],
                )
                tab_radio = gr.Radio(
                    choices=["Shallow", "Deeper", "Deepest"],
                    value="Shallow",
                    label=None,
                    show_label=False,
                    elem_id="f1-example-radio",
                )
                tab_radio.change(_select_example_tab, inputs=tab_radio, outputs=[md_warmup, md_tier1, md_tier2])
            # Continue the text after the heading (before the first figure)
            gr.HTML(WHAT_IS_F1_HTML_BOTTOM_A_AFTER_TABS)

            # Video (no autoplay/loop); smaller gap to the caption via CSS
            gr.Video(
                "assets/DominatingSetAnimation.mp4",
                autoplay=False,
                loop=False,
                show_label=False,
                interactive=False,
                elem_classes=["f1-video"],
                show_share_button=False,
                show_download_button=False,
            )
            gr.HTML(
                '<div class="f1-figcaption f1-figcaption-video">Brief explanation showcasing the design of a compressed dynamic programming state-space.</div>'
            )
            gr.HTML(WHAT_IS_F1_HTML_AFTER_VIDEO)

            # Evaluation: Warmup figure
            gr.HTML(WHAT_IS_F1_HTML_EVAL_BEFORE_WARMUPFIG, padding=False)
            gr.Image(
                "assets/perf_plot.png",
                width=600,
                show_label=False,
                elem_classes=["f1-image"],
                show_share_button=False,
                show_download_button=False,
                show_fullscreen_button=False,
            )
            gr.HTML('<div class="f1-figcaption">Performance of frontier models on the FormulaOne dataset.</div>')

            # Tail after the Deeper Tier figure
            gr.HTML(WHAT_IS_F1_HTML_AFTER_TIER1FIG_TAIL)
# Rename tab to "Leaderboard" and cap at 800px width | |
with gr.TabItem("Leaderboard", elem_id="formulaone-leaderboard-tab-table", id=2): | |
gr.Markdown( | |
""" | |
Welcome to the FormulaOne leaderboard. This table tracks performance on the core FormulaOne benchmark, covering the **deeper** and **deepest** tiers (120 problems). | |
Use the 'Select Columns to Display' dropdown to customize your view, and the search bar to find specific models or organizations. | |
""", | |
elem_classes="markdown-text", | |
) | |
refresh_leaderboard_data() | |
assert leaderboard_df is not None | |
leaderboard_component = init_leaderboard(leaderboard_df) | |
with gr.TabItem("Submit Solutions", elem_id="formulaone-submit-tab-table", id=3): | |
logger.info("Tab submission") | |
with gr.Column(): | |
with gr.Row(): | |
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text") | |
with gr.Row(): | |
gr.Markdown("# ✉️✨ Submit your solutions", elem_classes="markdown-text") | |
gr.Markdown(SUBMISSION_TERMS_TEXT, elem_classes="markdown-text") | |
login_box = gr.Group(visible=True, elem_id="f1-login-box") | |
with login_box: | |
gr.Markdown("Please sign in with Hugging Face to submit") | |
gr.LoginButton(elem_id="hf-login-btn") | |
submit_panel = gr.Group(visible=False, elem_classes="markdown-text") | |
with submit_panel: | |
with gr.Row(): | |
with gr.Column(): | |
system_name_textbox = gr.Textbox(label=AutoEvalColumn.system.name) | |
org_textbox = gr.Textbox(label=AutoEvalColumn.organization.name) | |
submission_file = gr.File(label="JSONL solutions file", file_types=[".jsonl"]) | |
# Required checkboxes | |
agreement_checkbox = gr.Checkbox( | |
label="I agree to the FormulaOne Submission Agreement (v1.2).", | |
value=False, | |
elem_classes="markdown-text", | |
) | |
privacy_checkbox = gr.Checkbox( | |
label="I have read the Privacy Notice.", value=False, elem_classes="markdown-text" | |
) | |
security_checkbox = gr.Checkbox( | |
label="I confirm this submission does not attempt to access private tests or exfiltrate data.", | |
value=False, | |
elem_classes="markdown-text", | |
) | |
privacy_link = "https://huggingface.co/spaces/double-ai/FormulaOne-Leaderboard/blob/main/docs/privacy-policy.md" | |
submission_agreement_link = "https://huggingface.co/spaces/double-ai/FormulaOne-Leaderboard/blob/main/terms/submission-agreement.md" | |
gr.Markdown( | |
f'<a href="{privacy_link}" target="_blank" rel="noopener noreferrer">Privacy Notice</a>; ' | |
f'<a href="{submission_agreement_link}" target="_blank" rel="noopener noreferrer">Submission Agreement</a>', | |
elem_classes="markdown-text", | |
) | |
logger.info("Submit button") | |
submit_button = gr.Button("Submit", variant="primary", interactive=False) | |
submission_result = gr.Markdown() | |
                            # Update submit-button interactivity based on the checkboxes
                            def update_submit_button(agreement, privacy, security):
                                return gr.update(interactive=agreement and privacy and security)

                            for checkbox in [agreement_checkbox, privacy_checkbox, security_checkbox]:
                                checkbox.change(
                                    update_submit_button,
                                    inputs=[agreement_checkbox, privacy_checkbox, security_checkbox],
                                    outputs=submit_button,
                                )

                            submit_button.click(
                                add_solution_cbk,
                                [
                                    system_name_textbox,
                                    org_textbox,
                                    submission_file,
                                ],
                                submission_result,
                            )

    with gr.Row():
        logger.info("Citation")
        with gr.Accordion(CITATION_BUTTON_LABEL, open=False):
            gr.Code(
                value=CITATION_BUTTON_TEXT.strip(),
                elem_id="citation-block",
            )

    blocks.load(lambda: leaderboard_df, inputs=[], outputs=[leaderboard_component])
    blocks.load(gate_submission, inputs=None, outputs=[login_box, submit_panel])

logger.info("Scheduler")
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.add_job(refresh_leaderboard_data, "interval", seconds=120)
scheduler.start()

logger.info("Launch")
blocks.queue(default_concurrency_limit=40).launch()
logger.info("Done")