# app.py
#
# Gradio Space for the FormulaOne leaderboard: a landing chart, a "What is
# FormulaOne" explainer, the leaderboard table, and a gated submission form.
#
# NOTE(review): this file was recovered from an extraction-mangled copy in which
# all HTML markup inside string literals was stripped. Strings marked with
# NOTE(review) below kept only their visible text; restore the exact markup
# (tags, classes, ids) from version control.
import math  # NOTE(review): not referenced below — possibly used by stripped content; kept.

import gradio as gr
import pandas as pd
import plotly.graph_objects as go
from apscheduler.schedulers.background import BackgroundScheduler
from gradio_leaderboard import Leaderboard, SelectColumns
from huggingface_hub import whoami

# HTML is split so we can inject Gradio media (images/video) where needed.
from src.about import WHAT_IS_F1_HTML_AFTER_VIDEO  # text immediately after the video
from src.about import WHAT_IS_F1_HTML_BOTTOM_A_AFTER_TABS  # text after the heading, before the first figure
from src.about import WHAT_IS_F1_HTML_BOTTOM_A_BEFORE_TABS  # up to (and including) the "Infinite Well" heading
from src.about import WHAT_IS_F1_HTML_EVAL_BEFORE_WARMUPFIG  # evaluation section up to before Warmup fig
from src.about import (  # tail after Tier1 fig; split to insert the tabs right after the heading
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    SUBMISSION_TERMS_TEXT,
    WHAT_IS_F1_HTML_AFTER_TIER1FIG_TAIL,
    WHAT_IS_F1_HTML_TOP,
)
from src.datamodel.data import F1Data
from src.display.css_html_js import custom_css
from src.display.formatting import styled_error
from src.display.utils import AutoEvalColumn, fields
from src.envs import API, CODE_PROBLEMS_REPO, REPO_ID, RESULTS_REPO, SUBMISSIONS_REPO
from src.logger import get_logger
from src.populate import get_leaderboard_df
from src.submission.submit import add_new_solutions, fetch_user_info
from src.validation.validate import MAX_INPUT_LENGTH, MIN_INPUT_LENGTH, is_submission_file_valid, is_valid

logger = get_logger(__name__)

ENSURE_ALL_PRESENT = True
SPLIT = "hard"  # "warmup" for debug

# Dataset handles for code problems, submissions and results.
lbdb = F1Data(
    cp_ds_name=CODE_PROBLEMS_REPO,
    sub_ds_name=SUBMISSIONS_REPO,
    res_ds_name=RESULTS_REPO,
    split=SPLIT,
)
# Cached leaderboard DataFrame; refreshed periodically by the scheduler.
leaderboard_df = None

logger.info("Initialized LBDB")


def restart_space():
    """Restart the Hugging Face Space (called on a timer to pick up new data)."""
    logger.info("Restarting space")
    API.restart_space(repo_id=REPO_ID)


def refresh_leaderboard_data():
    """Refresh the leaderboard data from the latest results.

    Updates the module-level ``leaderboard_df`` in place; on failure the
    previous DataFrame is kept and ``None`` is returned.
    """
    global leaderboard_df
    try:
        logger.info("Loading leaderboard data...")
        new_leaderboard_df = get_leaderboard_df(RESULTS_REPO)
        if new_leaderboard_df is not None:
            logger.info("Leaderboard data refreshed successfully")
            leaderboard_df = new_leaderboard_df
        else:
            logger.warning("No new leaderboard data found")
            return None
    except Exception as e:
        # Best-effort refresh: keep serving the stale table rather than crash.
        logger.error(f"Error refreshing leaderboard data: {e}")
        return None


def init_leaderboard(dataframe: pd.DataFrame):
    """Build the Leaderboard component from a results DataFrame.

    Raises:
        ValueError: if ``dataframe`` is None (refresh must have run first).
    """
    if dataframe is None:
        raise ValueError("Leaderboard DataFrame is None.")
    lb = Leaderboard(
        value=dataframe,
        datatype=[c.type for c in fields(AutoEvalColumn)],
        select_columns=SelectColumns(
            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
            label="Select Columns to Display:",
        ),
        search_columns=[AutoEvalColumn.system.name, AutoEvalColumn.organization.name],
        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )
    lb.col_count = (1, "fixed")
    return lb


def add_solution_cbk(
    system_name: str,
    org: str,
    submission_path: str,
    profile: gr.OAuthProfile | None,
    oauth_token: gr.OAuthToken | None,
):
    """Submit-button callback: validate the user, the file and the form fields,
    then hand the submission to ``add_new_solutions``.

    ``profile`` and ``oauth_token`` are injected by Gradio's OAuth integration.
    Returns either a styled error message or the result of ``add_new_solutions``.
    """
    logger.info("Fetching user details for submission")
    logger.info("PROFILE %s", profile)
    logger.info("TOKEN %s", oauth_token)
    if profile is None or oauth_token is None:
        return styled_error("Please sign in with Hugging Face before submitting.")

    # Display handle and display name (may change over time)
    logger.info(f"User handle: {profile.username}")
    display_name = profile.name or profile.username
    logger.info(f"Display name: {display_name}")

    # Stable account id (does not change when the user renames their account)
    user_info = fetch_user_info(oauth_token)
    logger.info("Logged in user info: %s", user_info)
    stable_id = user_info.get("id") if user_info else None
    logger.info(f"User stable ID: {stable_id}")
    if not stable_id:
        return styled_error("Could not retrieve your stable user ID. Please try signing in again.")
    user_id = stable_id

    if not profile.username:
        return styled_error("Could not retrieve username. Please try signing in again.")

    try:
        # Validating the submission file.
        if not submission_path:
            return styled_error("Please upload JSONL submission file.")
        if not is_submission_file_valid(
            submission_path,
            is_warmup_dataset=(SPLIT == "warmup"),
        ):
            return styled_error("Failed to read JSONL submission file. Please try again later.")

        # Validating all user-supplied arguments.
        for val, val_name in [
            (system_name, "System name"),
            (org, "Organisation name"),
        ]:
            if len(val) == 0:
                return styled_error(f"Please fill in the '{val_name}' field.")
            if not is_valid(val):
                return styled_error(
                    f"{val_name} is invalid! Must only contain characters [a-zA-Z0-9], spaces, "
                    + "or the special characters '-' and '.', and be of length between "
                    + f"{MIN_INPUT_LENGTH} and {MAX_INPUT_LENGTH}."
                )
    except Exception:
        logger.warning("Failed to process user submission", exc_info=True)
        return styled_error("An error occurred. Please try again later.")  # Intentionally vague.

    return add_new_solutions(
        lbdb,
        profile.username,
        user_id,
        system_name,
        org,
        submission_path,
        is_warmup_dataset=(SPLIT == "warmup"),
        ensure_all_present=ENSURE_ALL_PRESENT,
    )


def gate_submission(oauth_token: gr.OAuthToken | None):
    """
    @brief Toggles the visibility of the login box and submission panel based on the user's login status.

    Returns a pair of ``gr.update`` objects: (login_box visibility, submit_panel visibility).
    """
    logger.info("GATE TOKEN %s", oauth_token)
    if oauth_token is None:
        logger.info("GATE: NO TOKEN")
        return gr.update(visible=True), gr.update(visible=False)
    try:
        # whoami raises if the token is invalid or expired.
        whoami(oauth_token.token)
        logger.info("GATE: TOKEN IS VALID")
        return gr.update(visible=False), gr.update(visible=True)
    except Exception:
        logger.info("GATE: TOKEN HAS EXPIRED")
        return gr.update(visible=True), gr.update(visible=False)


def get_theme():
    """Return the Gradio theme; kept as a hook for a future custom Soft theme."""
    # return gr.themes.Soft(
    #     primary_hue=gr.themes.colors.blue,
    #     secondary_hue=gr.themes.colors.sky,
    #     neutral_hue=gr.themes.colors.gray,
    # ).set(
    #     body_background_fill="#FFFFFF",
    #     panel_background_fill="#f3f4f6",
    # )
    return "light"


# --- Gradio-based tabs for examples (no JS in HTML) ---
def _select_example_tab(choice: str):
    """Show exactly one of the three example markdowns (Shallow/Deeper/Deepest)."""
    return (
        gr.update(visible=(choice == "Shallow")),
        gr.update(visible=(choice == "Deeper")),
        gr.update(visible=(choice == "Deepest")),
    )


# === Static, made-up results for the landing chart (not tied to leaderboard) ===
MODEL_RELEASES = {
    "GPT-5": "2025-08-07",
    "Gemini 2.5 Pro": "2025-03-25",
    "Grok 4": "2025-07-09",
    "Claude Opus 4": "2025-05-22",
    "o3 Pro": "2025-06-10",
}
TIER_TOTALS = {"Shallow Tier": 100, "Deeper Tier": 100, "Deepest Tier": 20}
MODELS_ORDER = ["GPT-5", "Gemini 2.5 Pro", "Grok 4", "Claude Opus 4", "o3 Pro"]
ACCURACY_PCT = {
    "Shallow Tier": {
        "GPT-5": 49,
        "Gemini 2.5 Pro": 30,
        "Grok 4": 28,
        "Claude Opus 4": 30,
        "o3 Pro": 41,
    },
    "Deeper Tier": {
        "GPT-5": 4,
        "Gemini 2.5 Pro": 0,
        "Grok 4": 0,
        "Claude Opus 4": 0,
        "o3 Pro": 1,
    },
    "Deepest Tier": {
        "GPT-5": 0,
        "Gemini 2.5 Pro": 0,
        "Grok 4": 0,
        "Claude Opus 4": 0,
        "o3 Pro": 0,
    },
}


def build_accuracy_figure(tier: str):
    """Interactive scatter: x = release date (ISO str), y = accuracy (%).

    Hover shows solved/total.
    """
    total = TIER_TOTALS[tier]
    fig = go.Figure()
    for model in MODELS_ORDER:
        date_str = MODEL_RELEASES[model]  # e.g., "2025-08-07"
        y = ACCURACY_PCT[tier][model]  # percent
        solved = round(y * total / 100)
        fig.add_trace(
            go.Scatter(
                x=[date_str],
                y=[y],
                mode="markers",
                opacity=0.85,
                name=model,  # distinct legend entry & color per model
                marker=dict(size=8, opacity=0.85, line=dict(width=0.5)),
                cliponaxis=False,  # let markers render over axes
                # Reconstructed: the extraction stripped the "<br>" separators and
                # the "<extra></extra>" suffix (hides plotly's secondary hover box).
                hovertemplate=(
                    f"{model}<br>"
                    "Release: %{x|%b %d, %Y}<br>"
                    "Accuracy: %{y:.1f}%<br>"
                    f"Solved: {solved}/{total}"
                    "<extra></extra>"
                ),
            )
        )
    fig.update_layout(
        template="plotly_white",
        height=420,
        margin=dict(l=30, r=120, t=10, b=40),  # extra right room for legend
        xaxis=dict(
            title="Model Release Date",
            type="date",
            tickformat="%b %Y",
            showgrid=True,
            title_standoff=10,  # small gap so the label doesn't crowd the ticks
        ),
        yaxis=dict(
            title="Accuracy (%)",
            range=[0, 100],  # fixed 0-100
            tick0=0,
            dtick=10,
            showgrid=True,
            layer="below traces",  # draw axis below points so dots aren't "cut"
        ),
        legend=dict(title="Models", orientation="v", y=1, x=1.02, yanchor="top"),
        hovermode="closest",
    )
    return fig


_initial_accuracy_fig = build_accuracy_figure("Deeper Tier")

# Force light theme even if HF user prefers dark
blocks = gr.Blocks(
    css=custom_css,
    theme=get_theme(),
    js="""
    () => {
      // Force light theme (your original)
      document.body.classList.remove('dark');
      document.documentElement.setAttribute('data-theme','light');
      document.documentElement.setAttribute('data-color-mode','light');

      // Handle to switch Gradio tabs by panel id
      document.addEventListener('click', (e) => {
        const a = e.target.closest('a[data-tab-target]');
        if (!a) return;
        e.preventDefault();
        const id = a.getAttribute('data-tab-target'); // e.g., "what-is"
        const panel = document.getElementById(id);
        if (!panel) return;
        // Find the tab header button that controls this panel and click it
        const btn = document.querySelector(`[role="tab"][aria-controls="${panel.id}"]`);
        if (btn) btn.click();
      }, true);
    }
    """,
)

with blocks:
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("FormulaOne", id=0, elem_id="landing-accuracy-tab"):
            # NOTE(review): hero-header markup stripped by extraction; only the
            # visible text survived ("FormulaOne" / "by AAI"). Restore tags from VCS.
            gr.HTML("FormulaOne by AAI")
            with gr.Row(elem_id="landing-hero-row"):
                with gr.Column(scale=7, elem_id="landing-hero-left"):
                    # NOTE(review): surrounding markup stripped; text preserved.
                    gr.Markdown(
                        """
                        A benchmark of novel, expert-level algorithmic problems over graphs
                        that demand deep dynamic programming and logical reasoning. Shallow
                        and Deeper tiers span moderate through challenging problems, while
                        Deepest is research-level.
                        """,
                        elem_classes="markdown-text",
                    )
                with gr.Column(scale=3, elem_id="landing-hero-right"):
                    learn_more_btn = gr.Button(
                        "Learn More about FormulaOne",
                        elem_id="learn-more-pill",
                        variant="secondary",
                    )
                    # Make the pill switch to the "What is FormulaOne" tab
                    learn_more_btn.click(
                        lambda: gr.Tabs(selected="what-is"),  # switches tabs
                        inputs=None,
                        outputs=tabs,  # 'tabs' is your Tabs handle
                    )

            # Pill-style selector aligned to the top-right
            with gr.Row(elem_id="f1-tier-select-row"):
                tier_selector = gr.Radio(
                    choices=list(reversed(list(TIER_TOTALS.keys()))),
                    value="Deeper Tier",
                    label=None,
                    show_label=False,
                    elem_id="f1-tier-select",
                )
            accuracy_plot = gr.Plot(
                value=_initial_accuracy_fig,
                elem_id="f1-accuracy-plot",
                show_label=False,
            )
            tier_selector.change(
                lambda t: build_accuracy_figure(t),
                inputs=tier_selector,
                outputs=accuracy_plot,
            )

            # Footnote (sampling + prompt details)
            # NOTE(review): surrounding markup stripped; text preserved.
            gr.Markdown(
                """
                All models were sampled with their highest available reasoning settings
                and a maximum token budget. We also provided the models with a diverse
                few-shot prompt that is highly supportive for FormulaOne problems,
                covering many of the subtle details of state design and maintenance,
                from a broad array of categories.
                """,
                elem_classes="markdown-text",
            )

        # Existing "What is FormulaOne" tab
        with gr.TabItem("What is FormulaOne", id="what-is", elem_id="what-is-tab"):
            gr.Image(
                "assets/banner.png",
                show_label=False,
                elem_classes=["f1-image"],
                show_share_button=False,
                show_download_button=False,
                show_fullscreen_button=False,
                width=550,
            )
            # Top content and categories table
            gr.HTML(WHAT_IS_F1_HTML_TOP)

            # ---- Bottom content pieces interleaved with real Gradio media ----
            # Up to and including the "An Infinite Well" heading
            gr.HTML(WHAT_IS_F1_HTML_BOTTOM_A_BEFORE_TABS)

            # ===== Examples (right after the "Infinite Well" heading; inner width 710px via CSS) =====
            with gr.Group(elem_id="f1-examples", elem_classes=["f1-container"]):
                # NOTE(review): heading markup stripped; text preserved.
                gr.HTML("Examples of FormulaOne problems")
                # Delimiters so gr.Markdown renders inline/display LaTeX.
                _latex = [
                    {"left": "$$", "right": "$$", "display": True},
                    {"left": "$", "right": "$", "display": False},
                    {"left": "\\(", "right": "\\)", "display": False},
                    {"left": "\\[", "right": "\\]", "display": True},
                ]
                md_warmup = gr.Markdown(
                    value=(
                        # NOTE(review): heading markup stripped; text preserved.
                        "Union-of-Paths-and-Cycles\n"
                        "Given a tree-like graph $G=(V,E)$ and a weight function $w:V\\to\\mathbb{N}$, "
                        "compute the sum of all weights of sets $S\\subseteq V$ such that the induced "
                        "subgraph $G[S]$ is a disjoint union of paths and cycles."
                    ),
                    latex_delimiters=_latex,
                    elem_classes=["f1-problem-markdown"],
                )
                md_tier1 = gr.Markdown(
                    value=(
                        # NOTE(review): heading markup stripped; text preserved.
                        "Maximal-Union-of-Paths-and-Cycles\n"
                        "Given a tree-like graph $G=(V,E)$ and a weight function $w:V\\to\\mathbb{N}$, "
                        "compute the sum of all weights of sets $S\\subseteq V$ such that $G[S]$ is a "
                        "disjoint union of paths and cycles and $S$ is maximal with respect to this property."
                    ),
                    visible=False,
                    latex_delimiters=_latex,
                    elem_classes=["f1-problem-markdown"],
                )
                md_tier2 = gr.Markdown(
                    value=(
                        # NOTE(review): heading markup stripped; text preserved.
                        "Maximal-Union-of-Cycles\n"
                        "Given a tree-like graph $G=(V,E)$ and a weight function $w:V\\to\\mathbb{N}$, "
                        "compute the sum of all weights of sets $S\\subseteq V$ such that $G[S]$ is a "
                        "disjoint union of cycles and $S$ is maximal with respect to this property."
                    ),
                    visible=False,
                    latex_delimiters=_latex,
                    elem_classes=["f1-problem-markdown"],
                )
                tab_radio = gr.Radio(
                    choices=["Shallow", "Deeper", "Deepest"],
                    value="Shallow",
                    label=None,
                    show_label=False,
                    elem_id="f1-example-radio",
                )
                tab_radio.change(_select_example_tab, inputs=tab_radio, outputs=[md_warmup, md_tier1, md_tier2])

            # Continue the text after the heading (before the first figure)
            gr.HTML(WHAT_IS_F1_HTML_BOTTOM_A_AFTER_TABS)

            # Video (no autoplay/loop), smaller gap to caption via CSS
            gr.Video(
                "assets/DominatingSetAnimation.mp4",
                autoplay=False,
                loop=False,
                show_label=False,
                interactive=False,
                elem_classes=["f1-video"],
                show_share_button=False,
                show_download_button=False,
            )
            # NOTE(review): caption markup stripped; text preserved.
            gr.HTML("Brief explanation showcasing the design of a compressed dynamic programming state-space.")
            gr.HTML(WHAT_IS_F1_HTML_AFTER_VIDEO)

            # Evaluation: Warmup figure
            gr.HTML(WHAT_IS_F1_HTML_EVAL_BEFORE_WARMUPFIG, padding=False)
            gr.Image(
                "assets/perf_plot.png",
                width=600,
                show_label=False,
                elem_classes=["f1-image"],
                show_share_button=False,
                show_download_button=False,
                show_fullscreen_button=False,
            )
            # NOTE(review): caption markup stripped; text preserved.
            gr.HTML("Performance of frontier models on the FormulaOne dataset.")

            # Tail after Deeper Tier fig
            gr.HTML(WHAT_IS_F1_HTML_AFTER_TIER1FIG_TAIL)

        # Rename tab to "Leaderboard" and cap at 800px width
        with gr.TabItem("Leaderboard", elem_id="formulaone-leaderboard-tab-table", id=2):
            gr.Markdown(
                """
                Welcome to the FormulaOne leaderboard. This table tracks performance on the core FormulaOne benchmark,
                covering the **deeper** and **deepest** tiers (120 problems).
                Use the 'Select Columns to Display' dropdown to customize your view,
                and the search bar to find specific models or organizations.
                """,
                elem_classes="markdown-text",
            )
            refresh_leaderboard_data()
            assert leaderboard_df is not None
            leaderboard_component = init_leaderboard(leaderboard_df)

        with gr.TabItem("Submit Solutions", elem_id="formulaone-submit-tab-table", id=3):
            logger.info("Tab submission")
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
                with gr.Row():
                    gr.Markdown("# ✉️✨ Submit your solutions", elem_classes="markdown-text")
                    gr.Markdown(SUBMISSION_TERMS_TEXT, elem_classes="markdown-text")

                # Shown until a valid OAuth token is seen; toggled by gate_submission.
                login_box = gr.Group(visible=True, elem_id="f1-login-box")
                with login_box:
                    gr.Markdown("Please sign in with Hugging Face to submit")
                    gr.LoginButton(elem_id="hf-login-btn")

                submit_panel = gr.Group(visible=False, elem_classes="markdown-text")
                with submit_panel:
                    with gr.Row():
                        with gr.Column():
                            system_name_textbox = gr.Textbox(label=AutoEvalColumn.system.name)
                            org_textbox = gr.Textbox(label=AutoEvalColumn.organization.name)
                            submission_file = gr.File(label="JSONL solutions file", file_types=[".jsonl"])
                            # Required checkboxes
                            agreement_checkbox = gr.Checkbox(
                                label="I agree to the FormulaOne Submission Agreement (v1.2).",
                                value=False,
                                elem_classes="markdown-text",
                            )
                            privacy_checkbox = gr.Checkbox(
                                label="I have read the Privacy Notice.",
                                value=False,
                                elem_classes="markdown-text",
                            )
                            security_checkbox = gr.Checkbox(
                                label="I confirm this submission does not attempt to access private tests or exfiltrate data.",
                                value=False,
                                elem_classes="markdown-text",
                            )
                            privacy_link = "https://huggingface.co/spaces/double-ai/FormulaOne-Leaderboard/blob/main/docs/privacy-policy.md"
                            submission_agreement_link = "https://huggingface.co/spaces/double-ai/FormulaOne-Leaderboard/blob/main/terms/submission-agreement.md"
                            # Reconstructed anchors: the original f-strings interpolated
                            # these links into <a href=...> tags that extraction stripped.
                            gr.Markdown(
                                f'<a href="{privacy_link}">Privacy Notice</a>; '
                                f'<a href="{submission_agreement_link}">Submission Agreement</a>',
                                elem_classes="markdown-text",
                            )
                            logger.info("Submit button")
                            submit_button = gr.Button("Submit", variant="primary", interactive=False)
                            submission_result = gr.Markdown()

                            # Update submit button interactivity based on checkboxes
                            def update_submit_button(agreement, privacy, security):
                                return gr.update(interactive=agreement and privacy and security)

                            for checkbox in [agreement_checkbox, privacy_checkbox, security_checkbox]:
                                checkbox.change(
                                    update_submit_button,
                                    inputs=[agreement_checkbox, privacy_checkbox, security_checkbox],
                                    outputs=submit_button,
                                )

                            # profile/oauth_token params are auto-injected by Gradio OAuth.
                            submit_button.click(
                                add_solution_cbk,
                                [
                                    system_name_textbox,
                                    org_textbox,
                                    submission_file,
                                ],
                                submission_result,
                            )

    with gr.Row():
        logger.info("Citation")
        with gr.Accordion(CITATION_BUTTON_LABEL, open=False):
            gr.Code(
                value=CITATION_BUTTON_TEXT.strip(),
                elem_id="citation-block",
            )

    # Re-push the freshest cached data on every page load, and gate the submit form.
    blocks.load(lambda: leaderboard_df, inputs=[], outputs=[leaderboard_component])
    blocks.load(gate_submission, inputs=None, outputs=[login_box, submit_panel])

logger.info("Scheduler")
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.add_job(refresh_leaderboard_data, "interval", seconds=120)
scheduler.start()

logger.info("Launch")
blocks.queue(default_concurrency_limit=40).launch()
logger.info("Done")