Alvinn-aai's picture
improve LB description
49f9252
# app.py
import math
import gradio as gr
import pandas as pd
import plotly.graph_objects as go
from apscheduler.schedulers.background import BackgroundScheduler
from gradio_leaderboard import Leaderboard, SelectColumns
from huggingface_hub import whoami
# HTML is split so we can inject Gradio media (images/video) where needed.
from src.about import WHAT_IS_F1_HTML_AFTER_VIDEO # text immediately after the video
from src.about import WHAT_IS_F1_HTML_BOTTOM_A_AFTER_TABS # text after the heading, before the first figure
from src.about import WHAT_IS_F1_HTML_BOTTOM_A_BEFORE_TABS # up to (and including) the "Infinite Well" heading
from src.about import WHAT_IS_F1_HTML_EVAL_BEFORE_WARMUPFIG # evaluation section up to before Warmup fig
from src.about import ( # tail after Tier1 fig; ⬅️ split to insert the tabs right after the heading
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
EVALUATION_QUEUE_TEXT,
SUBMISSION_TERMS_TEXT,
WHAT_IS_F1_HTML_AFTER_TIER1FIG_TAIL,
WHAT_IS_F1_HTML_TOP,
)
from src.datamodel.data import F1Data
from src.display.css_html_js import custom_css
from src.display.formatting import styled_error
from src.display.utils import AutoEvalColumn, fields
from src.envs import API, CODE_PROBLEMS_REPO, REPO_ID, RESULTS_REPO, SUBMISSIONS_REPO
from src.logger import get_logger
from src.populate import get_leaderboard_df
from src.submission.submit import add_new_solutions, fetch_user_info
from src.validation.validate import MAX_INPUT_LENGTH, MIN_INPUT_LENGTH, is_submission_file_valid, is_valid
# Module-level logger used by every callback in this file.
logger = get_logger(__name__)
# Forwarded to add_new_solutions() as `ensure_all_present` on every submission.
ENSURE_ALL_PRESENT = True
# Dataset split served by the app; the value "warmup" turns on the warmup-dataset checks in add_solution_cbk().
SPLIT = "hard" # warmup for debug
# F1Data instance configured with the code-problems, submissions and results repo names;
# it is handed to add_new_solutions() when a submission is accepted.
lbdb = F1Data(
cp_ds_name=CODE_PROBLEMS_REPO,
sub_ds_name=SUBMISSIONS_REPO,
res_ds_name=RESULTS_REPO,
split=SPLIT,
)
# Latest leaderboard table; written by refresh_leaderboard_data() and read by the blocks.load hook.
leaderboard_df = None
logger.info("Initialized LBDB")
def restart_space():
    """Log the restart and ask the Hub API to restart the Space identified by ``REPO_ID``."""
    logger.info("Restarting space")
    # Registered below as a periodic scheduler job.
    API.restart_space(repo_id=REPO_ID)
def refresh_leaderboard_data():
    """Refresh the module-level ``leaderboard_df`` from the latest results.

    The global is replaced only when ``get_leaderboard_df`` returns a non-``None``
    value; on ``None`` or on any exception the previous table is kept so the UI
    keeps serving the last good data.

    Returns:
        Always ``None``; the result is published through the ``leaderboard_df`` global.
    """
    global leaderboard_df
    try:
        logger.info("Loading leaderboard data...")
        new_leaderboard_df = get_leaderboard_df(RESULTS_REPO)
    except Exception as e:
        # Best-effort refresh: never let a failed fetch kill the scheduler job,
        # but keep the traceback so the failure can actually be diagnosed.
        logger.error("Error refreshing leaderboard data: %s", e, exc_info=True)
        return None

    if new_leaderboard_df is None:
        logger.warning("No new leaderboard data found")
        return None

    logger.info("Leaderboard data refreshed successfully")
    leaderboard_df = new_leaderboard_df
    return None
def init_leaderboard(dataframe: pd.DataFrame):
    """Build the ``Leaderboard`` component that displays ``dataframe``.

    Column types, default selection, non-deselectable columns and hidden columns
    are all derived from the ``AutoEvalColumn`` field descriptors.

    Raises:
        ValueError: if ``dataframe`` is ``None``.
    """
    if dataframe is None:
        raise ValueError("Leaderboard DataFrame is None.")

    # Materialise the column descriptors once and reuse them for every option.
    columns = list(fields(AutoEvalColumn))
    column_types = [col.type for col in columns]
    shown_by_default = [col.name for col in columns if col.displayed_by_default]
    always_shown = [col.name for col in columns if col.never_hidden]
    hidden = [col.name for col in columns if col.hidden]

    column_picker = SelectColumns(
        default_selection=shown_by_default,
        cant_deselect=always_shown,
        label="Select Columns to Display:",
    )
    searchable = [AutoEvalColumn.system.name, AutoEvalColumn.organization.name]

    lb = Leaderboard(
        value=dataframe,
        datatype=column_types,
        select_columns=column_picker,
        search_columns=searchable,
        hide_columns=hidden,
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )
    lb.col_count = (1, "fixed")
    return lb
def add_solution_cbk(
    system_name: str,
    org: str,
    submission_path: str,
    profile: gr.OAuthProfile | None,
    oauth_token: gr.OAuthToken | None,
):
    """Validate a submission from the "Submit Solutions" form and pass it to ``add_new_solutions``.

    Args:
        system_name: Value of the system-name textbox.
        org: Value of the organisation textbox.
        submission_path: Path of the uploaded JSONL file (falsy when nothing was uploaded).
        profile: OAuth profile of the signed-in user, or ``None`` when not signed in.
        oauth_token: OAuth token of the signed-in user, or ``None`` when not signed in.

    Returns:
        A ``styled_error(...)`` message when the user is not signed in or a check
        fails; otherwise whatever ``add_new_solutions`` returns.
    """
    logger.info("Fetching user details for submission")
    logger.info("PROFILE %s", profile)
    # Never log the token object itself: it carries the user's bearer credential.
    logger.info("TOKEN present: %s", oauth_token is not None)

    if profile is None or oauth_token is None:
        return styled_error("Please sign in with Hugging Face before submitting.")

    # Display handle and display name (may change over time)
    logger.info(f"User handle: {profile.username}")
    display_name = profile.name or profile.username
    logger.info(f"Display name: {display_name}")

    # Stable account id
    user_info = fetch_user_info(oauth_token)
    # NOTE(review): user_info may contain personal data — confirm it is acceptable to log it in full.
    logger.info("Logged in user info: %s", user_info)
    stable_id = user_info.get("id") if user_info else None
    logger.info(f"User stable ID: {stable_id}")

    if not stable_id:
        return styled_error("Could not retrieve your stable user ID. Please try signing in again.")
    user_id = stable_id

    if not profile.username:
        return styled_error("Could not retrieve username. Please try signing in again.")

    try:
        # Validating the submission file.
        if not submission_path:
            return styled_error("Please upload JSONL submission file.")

        if not is_submission_file_valid(
            submission_path,
            is_warmup_dataset=(SPLIT == "warmup"),
        ):
            return styled_error("Failed to read JSONL submission file. Please try again later.")

        # Validating all user-supplied arguments.
        for val, val_name in [
            (system_name, "System name"),
            (org, "Organisation name"),
        ]:
            # Truthiness also covers None, which would otherwise fall into the vague error below.
            if not val:
                return styled_error(f"Please fill in the '{val_name}' field.")

            if not is_valid(val):
                return styled_error(
                    f"{val_name} is invalid! Must only contain characters [a-zA-Z0-9], spaces, "
                    + "or the special characters '-' and '.', and be of length between "
                    + f"{MIN_INPUT_LENGTH} and {MAX_INPUT_LENGTH}."
                )
    except Exception:
        logger.warning("Failed to process user submission", exc_info=True)
        return styled_error("An error occurred. Please try again later.")  # Intentionally vague.

    return add_new_solutions(
        lbdb,
        profile.username,
        user_id,
        system_name,
        org,
        submission_path,
        is_warmup_dataset=(SPLIT == "warmup"),
        ensure_all_present=ENSURE_ALL_PRESENT,
    )
def gate_submission(oauth_token: gr.OAuthToken | None):
    """Toggle the login box and the submission panel based on the user's login status.

    Args:
        oauth_token: OAuth token of the current visitor, or ``None`` when not signed in.

    Returns:
        A pair of ``gr.update`` objects: the first is applied to the login box, the
        second to the submission panel (the order used by the ``blocks.load`` hook).
        The login box is shown and the panel hidden unless ``whoami`` accepts the token.
    """
    # Log only whether a token exists; the token itself is a bearer credential.
    logger.info("GATE TOKEN present: %s", oauth_token is not None)
    if oauth_token is None:
        logger.info("GATE: NO TOKEN")
        return gr.update(visible=True), gr.update(visible=False)

    try:
        whoami(oauth_token.token)
    except Exception:
        # Any failure of the check is treated as "not logged in".
        logger.info("GATE: TOKEN HAS EXPIRED")
        return gr.update(visible=True), gr.update(visible=False)

    logger.info("GATE: TOKEN IS VALID")
    return gr.update(visible=False), gr.update(visible=True)
def get_theme():
    """Return the theme value handed to ``gr.Blocks`` (currently the string ``"light"``)."""
    # A customised Soft theme was used here before and is kept for reference:
    #   gr.themes.Soft(
    #       primary_hue=gr.themes.colors.blue,
    #       secondary_hue=gr.themes.colors.sky,
    #       neutral_hue=gr.themes.colors.gray,
    #   ).set(
    #       body_background_fill="#FFFFFF",
    #       panel_background_fill="#f3f4f6",
    #   )
    theme_name = "light"
    return theme_name
# --- Gradio-based tabs for examples (no JS in HTML) ---
def _select_example_tab(choice: str):
    """Return three visibility updates, one per example panel, showing only the chosen one.

    The updates are ordered Shallow, Deeper, Deepest; a ``choice`` matching none
    of these labels hides all three panels.
    """
    panel_labels = ("Shallow", "Deeper", "Deepest")
    return tuple(gr.update(visible=(choice == label)) for label in panel_labels)
# === Static, made-up results for the landing chart (not tied to leaderboard) ===
# Order in which models are added to the landing chart (and therefore to its legend).
MODELS_ORDER = ["GPT-5", "Gemini 2.5 Pro", "Grok 4", "Claude Opus 4", "o3 Pro"]

# Release date per model as an ISO "YYYY-MM-DD" string, listed in MODELS_ORDER order.
MODEL_RELEASES = dict(
    zip(
        MODELS_ORDER,
        ("2025-08-07", "2025-03-25", "2025-07-09", "2025-05-22", "2025-06-10"),
    )
)

# Number of problems in each tier; used to turn an accuracy percentage into a solved count.
TIER_TOTALS = {"Shallow Tier": 100, "Deeper Tier": 100, "Deepest Tier": 20}

# Accuracy in percent, keyed by tier and then by model; each row is listed in MODELS_ORDER order.
ACCURACY_PCT = {
    tier_name: dict(zip(MODELS_ORDER, percentages))
    for tier_name, percentages in (
        ("Shallow Tier", (49, 30, 28, 30, 41)),
        ("Deeper Tier", (4, 0, 0, 0, 1)),
        ("Deepest Tier", (0, 0, 0, 0, 0)),
    )
}
def build_accuracy_figure(tier: str):
    """Build the landing-page scatter plot for one tier.

    Each model in ``MODELS_ORDER`` becomes its own single-point trace: x is the
    release date (ISO string), y is the accuracy in percent, and the hover text
    also shows the solved/total count derived from ``TIER_TOTALS``.

    Args:
        tier: A key of ``TIER_TOTALS`` / ``ACCURACY_PCT`` (e.g. ``"Deeper Tier"``).

    Returns:
        The configured ``go.Figure``.

    Raises:
        KeyError: if ``tier`` is not a known tier name.
    """
    tier_total = TIER_TOTALS[tier]
    figure = go.Figure()

    for model_name in MODELS_ORDER:
        release_date = MODEL_RELEASES[model_name]  # e.g., "2025-08-07"
        accuracy = ACCURACY_PCT[tier][model_name]  # percent
        n_solved = round(accuracy * tier_total / 100)

        # The %{...} placeholders are filled in by Plotly; only the f-string parts are resolved here.
        hover_text = (
            f"<b>{model_name}</b><br>"
            "Release: %{x|%b %d, %Y}<br>"
            "Accuracy: %{y:.1f}%<br>"
            f"Solved: {n_solved}/{tier_total}"
            "<extra></extra>"
        )
        point = go.Scatter(
            x=[release_date],
            y=[accuracy],
            mode="markers",
            opacity=0.85,
            name=model_name,  # one legend entry (and colour) per model
            marker={"size": 8, "opacity": 0.85, "line": {"width": 0.5}},
            cliponaxis=False,  # markers may be drawn over the axes
            hovertemplate=hover_text,
        )
        figure.add_trace(point)

    x_axis = {
        "title": "Model Release Date",
        "type": "date",
        "tickformat": "%b %Y",
        "showgrid": True,
        "title_standoff": 10,  # small gap between the label and the ticks
    }
    y_axis = {
        "title": "Accuracy (%)",
        "range": [0, 100],  # fixed 0-100 scale
        "tick0": 0,
        "dtick": 10,
        "showgrid": True,
        "layer": "below traces",  # axis drawn under the points so dots are not cut off
    }
    legend = {"title": "Models", "orientation": "v", "y": 1, "x": 1.02, "yanchor": "top"}

    figure.update_layout(
        template="plotly_white",
        height=420,
        margin={"l": 30, "r": 120, "t": 10, "b": 40},  # extra room on the right for the legend
        xaxis=x_axis,
        yaxis=y_axis,
        legend=legend,
        hovermode="closest",
    )
    return figure
# Figure used as the initial value of the landing plot; "Deeper Tier" is also the tier selector's default value.
_initial_accuracy_fig = build_accuracy_figure("Deeper Tier")
# Force light theme even if HF user prefers dark
# The `js` snippet does two things: it strips the dark-mode markers from the document, and it
# installs a click handler so that <a data-tab-target="..."> links click the matching Gradio tab button.
blocks = gr.Blocks(
css=custom_css,
theme=get_theme(),
js="""
() => {
// Force light theme (your original)
document.body.classList.remove('dark');
document.documentElement.setAttribute('data-theme','light');
document.documentElement.setAttribute('data-color-mode','light');
// Handle <a data-tab-target="..."> to switch Gradio tabs by panel id
document.addEventListener('click', (e) => {
const a = e.target.closest('a[data-tab-target]');
if (!a) return;
e.preventDefault();
const id = a.getAttribute('data-tab-target'); // e.g., "what-is"
const panel = document.getElementById(id);
if (!panel) return;
// Find the tab header button that controls this panel and click it
const btn = document.querySelector(`[role="tab"][aria-controls="${panel.id}"]`);
if (btn) btn.click();
}, true);
}
""",
)
# Root Blocks context for the UI declared below.
with blocks:
# `tabs` is kept as a handle so the "Learn More" button can switch the selected tab.
with gr.Tabs(elem_classes="tab-buttons") as tabs:
# Landing tab: header, short description, tier selector and accuracy plot.
with gr.TabItem("FormulaOne", id=0, elem_id="landing-accuracy-tab"):
# Landing header ("FormulaOne by AAI"). The markup is kept on one logical line, split
# across adjacent string literals for readability. Compared with the previous version,
# the stray ` style=` inside the <h3> class attribute is gone and the <span> is closed.
gr.HTML(
    '<div align="center"><header class="text-center mb-12">'
    '<h1 class="text-4xl md:text-5xl font-bold text-gray-900 f1-h1" style="margin:0; display:inline;">FormulaOne</h1>'
    '<span style="display:inline-block; margin-left:0.5em;">'
    '<h3 style="margin:0; display:inline;" class="text-4xl md:text-5xl font-bold text-gray-900 f1-h3">'
    'by <a href="https://doubleai.com/">AAI</a></h3>'
    "</span></header></div>"
)
# Hero row: benchmark blurb (scale 7) next to the "Learn More" button (scale 3).
with gr.Row(elem_id="landing-hero-row"):
with gr.Column(scale=7, elem_id="landing-hero-left"):
gr.Markdown(
"""
<div class="f1-container">
<p class="f1-hero">
A benchmark of novel, expert-level algorithmic problems over graphs that demand deep dynamic
programming and logical reasoning. <strong>Shallow</strong> and <strong>Deeper</strong> tiers span moderate through
challenging problems, while <strong>Deepest</strong> is research-level.
</p>
</div>
""",
elem_classes="markdown-text",
)
with gr.Column(scale=3, elem_id="landing-hero-right"):
learn_more_btn = gr.Button(
"Learn More about FormulaOne",
elem_id="learn-more-pill",
variant="secondary",
)
# Make the pill switch to the "What is FormulaOne" tab
# ("what-is" is the id given to that TabItem further down).
learn_more_btn.click(
lambda: gr.Tabs(selected="what-is"), # switches tabs
inputs=None,
outputs=tabs, # 'tabs' is your Tabs handle
)
# Pill-style selector aligned to the top-right
# Choices are the TIER_TOTALS keys in reverse order, so "Deepest Tier" is listed first.
with gr.Row(elem_id="f1-tier-select-row"):
tier_selector = gr.Radio(
choices=list(reversed(list(TIER_TOTALS.keys()))),
value="Deeper Tier",
label=None,
show_label=False,
elem_id="f1-tier-select",
)
accuracy_plot = gr.Plot(
value=_initial_accuracy_fig,
elem_id="f1-accuracy-plot",
show_label=False,
)
# Rebuild the figure whenever a different tier is selected.
tier_selector.change(
lambda t: build_accuracy_figure(t),
inputs=tier_selector,
outputs=accuracy_plot,
)
# Footnote (sampling + prompt details)
gr.Markdown(
"""
<div class="f1-container">
<p class="f1-p" style="font-size:0.95rem;color:var(--f1-subtle);">
All models were sampled with their highest available reasoning settings and a maximum token budget.
We also provided the models with a diverse few-shot prompt that is highly supportive for FormulaOne problems,
covering many of the subtle details of state design and maintenance, from a broad array of categories.
</p>
</div>
""",
elem_classes="markdown-text",
)
# Existing "What is FormulaOne" tab
# Its id "what-is" is the value the "Learn More" button selects.
with gr.TabItem("What is FormulaOne", id="what-is", elem_id="what-is-tab"):
gr.Image(
"assets/banner.png",
show_label=False,
elem_classes=["f1-image"],
show_share_button=False,
show_download_button=False,
show_fullscreen_button=False,
width=550,
)
# Top content and categories table
gr.HTML(WHAT_IS_F1_HTML_TOP)
# ---- Bottom content pieces interleaved with real Gradio media ----
# Up to and including the "An Infinite Well" heading
gr.HTML(WHAT_IS_F1_HTML_BOTTOM_A_BEFORE_TABS)
# ===== Examples (now right after the "Infinite Well" heading; inner width 710px via CSS) =====
with gr.Group(elem_id="f1-examples", elem_classes=["f1-container"]):
gr.HTML(
'<div class="f1-tabs-body"><div class="f1-examples-chip">Examples of FormulaOne problems</div></div>'
)
# Delimiter pairs passed to each example Markdown so inline and display math are rendered.
_latex = [
{"left": "$$", "right": "$$", "display": True},
{"left": "$", "right": "$", "display": False},
{"left": "\\(", "right": "\\)", "display": False},
{"left": "\\[", "right": "\\]", "display": True},
]
# Three example statements; only the first is visible initially, matching the radio default "Shallow".
md_warmup = gr.Markdown(
value=(
'<p style="text-align: center;"><code>Union-of-Paths-and-Cycles</code></p>\n'
"Given a tree-like graph $G=(V,E)$ and a weight function $w:V\\to\\mathbb{N}$, compute the sum of all weights of sets $S\\subseteq V$ such that the induced subgraph $G[S]$ is a <b>disjoint union of paths and cycles</b>."
),
latex_delimiters=_latex,
elem_classes=["f1-problem-markdown"],
)
md_tier1 = gr.Markdown(
value=(
'<p style="text-align: center;"><code>Maximal-Union-of-Paths-and-Cycles</code></p>\n'
"Given a tree-like graph $G=(V,E)$ and a weight function $w:V\\to\\mathbb{N}$, compute the sum of all weights of sets $S\\subseteq V$ such that $G[S]$ is a <b>disjoint union of paths and cycles</b> and $S$ is <b>maximal</b> with respect to this property."
),
visible=False,
latex_delimiters=_latex,
elem_classes=["f1-problem-markdown"],
)
md_tier2 = gr.Markdown(
value=(
'<p style="text-align: center;"><code>Maximal-Union-of-Cycles</code></p>\n'
"Given a tree-like graph $G=(V,E)$ and a weight function $w:V\\to\\mathbb{N}$, compute the sum of all weights of sets $S\\subseteq V$ such that $G[S]$ is a <b>disjoint union of cycles</b> and $S$ is <b>maximal</b> with respect to this property."
),
visible=False,
latex_delimiters=_latex,
elem_classes=["f1-problem-markdown"],
)
tab_radio = gr.Radio(
choices=["Shallow", "Deeper", "Deepest"],
value="Shallow",
label=None,
show_label=False,
elem_id="f1-example-radio",
)
# _select_example_tab returns one visibility update per panel, in this output order.
tab_radio.change(_select_example_tab, inputs=tab_radio, outputs=[md_warmup, md_tier1, md_tier2])
# Continue the text after the heading (before the first figure)
gr.HTML(WHAT_IS_F1_HTML_BOTTOM_A_AFTER_TABS)
# Video (no autoplay/loop), smaller gap to caption via CSS
gr.Video(
"assets/DominatingSetAnimation.mp4",
autoplay=False,
loop=False,
show_label=False,
interactive=False,
elem_classes=["f1-video"],
show_share_button=False,
show_download_button=False,
)
gr.HTML(
'<div class="f1-figcaption f1-figcaption-video">Brief explanation showcasing the design of a compressed dynamic programming state-space.</div>'
)
gr.HTML(WHAT_IS_F1_HTML_AFTER_VIDEO)
# Evaluation: Warmup figure
gr.HTML(WHAT_IS_F1_HTML_EVAL_BEFORE_WARMUPFIG, padding=False)
gr.Image(
"assets/perf_plot.png",
width=600,
show_label=False,
elem_classes=["f1-image"],
show_share_button=False,
show_download_button=False,
show_fullscreen_button=False,
)
gr.HTML('<div class="f1-figcaption">Performance of frontier models on the FormulaOne dataset.</div>')
# Tail after Deeper Tier fig
gr.HTML(WHAT_IS_F1_HTML_AFTER_TIER1FIG_TAIL)
# Rename tab to "Leaderboard" and cap at 800px width
with gr.TabItem("Leaderboard", elem_id="formulaone-leaderboard-tab-table", id=2):
gr.Markdown(
"""
Welcome to the FormulaOne leaderboard. This table tracks performance on the core FormulaOne benchmark, covering the **deeper** and **deepest** tiers (120 problems).
Use the 'Select Columns to Display' dropdown to customize your view, and the search bar to find specific models or organizations.
"""
,
elem_classes="markdown-text",
)
# Load the results once while the UI is being built, so the component starts with data.
refresh_leaderboard_data()
# NOTE(review): `assert` is stripped under `python -O`; init_leaderboard() raises ValueError on None anyway.
assert leaderboard_df is not None
leaderboard_component = init_leaderboard(leaderboard_df)
# Submission tab: instructions, a login box, and a submission panel that starts hidden.
with gr.TabItem("Submit Solutions", elem_id="formulaone-submit-tab-table", id=3):
logger.info("Tab submission")
with gr.Column():
with gr.Row():
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
with gr.Row():
gr.Markdown("# ✉️✨ Submit your solutions", elem_classes="markdown-text")
gr.Markdown(SUBMISSION_TERMS_TEXT, elem_classes="markdown-text")
# login_box starts visible and submit_panel hidden; gate_submission() toggles both.
login_box = gr.Group(visible=True, elem_id="f1-login-box")
with login_box:
gr.Markdown("Please sign in with Hugging Face to submit")
gr.LoginButton(elem_id="hf-login-btn")
submit_panel = gr.Group(visible=False, elem_classes="markdown-text")
with submit_panel:
with gr.Row():
with gr.Column():
system_name_textbox = gr.Textbox(label=AutoEvalColumn.system.name)
org_textbox = gr.Textbox(label=AutoEvalColumn.organization.name)
submission_file = gr.File(label="JSONL solutions file", file_types=[".jsonl"])
# Required checkboxes
agreement_checkbox = gr.Checkbox(
label="I agree to the FormulaOne Submission Agreement (v1.2).",
value=False,
elem_classes="markdown-text",
)
privacy_checkbox = gr.Checkbox(
label="I have read the Privacy Notice.", value=False, elem_classes="markdown-text"
)
security_checkbox = gr.Checkbox(
label="I confirm this submission does not attempt to access private tests or exfiltrate data.",
value=False,
elem_classes="markdown-text",
)
privacy_link = "https://huggingface.co/spaces/double-ai/FormulaOne-Leaderboard/blob/main/docs/privacy-policy.md"
submission_agreement_link = "https://huggingface.co/spaces/double-ai/FormulaOne-Leaderboard/blob/main/terms/submission-agreement.md"
gr.Markdown(
f'<a href="{privacy_link}" target="_blank" rel="noopener noreferrer">Privacy Notice</a>; '
f'<a href="{submission_agreement_link}" target="_blank" rel="noopener noreferrer">Submission Agreement</a>',
elem_classes="markdown-text",
)
logger.info("Submit button")
# The button starts disabled and is enabled only once all three checkboxes are ticked.
submit_button = gr.Button("Submit", variant="primary", interactive=False)
submission_result = gr.Markdown()
# Update submit button interactivity based on checkboxes
# NOTE(review): the checkbox values are not passed to add_solution_cbk, so the gate is UI-only.
def update_submit_button(agreement, privacy, security):
return gr.update(interactive=agreement and privacy and security)
for checkbox in [agreement_checkbox, privacy_checkbox, security_checkbox]:
checkbox.change(
update_submit_button,
inputs=[agreement_checkbox, privacy_checkbox, security_checkbox],
outputs=submit_button,
)
# Only the three form fields are listed as inputs; the callback's `profile` and `oauth_token`
# parameters are presumably injected by Gradio from their OAuth type hints — confirm against the Gradio docs.
submit_button.click(
add_solution_cbk,
[
system_name_textbox,
org_textbox,
submission_file,
],
submission_result,
)
# Collapsible citation block (closed by default).
with gr.Row():
logger.info("Citation")
with gr.Accordion(CITATION_BUTTON_LABEL, open=False):
gr.Code(
value=CITATION_BUTTON_TEXT.strip(),
elem_id="citation-block",
)
# Registered with blocks.load: feed the current module-level leaderboard_df to the leaderboard component.
blocks.load(lambda: leaderboard_df, inputs=[], outputs=[leaderboard_component])
# Registered with blocks.load: show either the login box or the submission panel, as decided by gate_submission().
blocks.load(gate_submission, inputs=None, outputs=[login_box, submit_panel])
logger.info("Scheduler")
# Background jobs: restart the Space every 1800 s and re-read the leaderboard results every 120 s.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.add_job(refresh_leaderboard_data, "interval", seconds=120)
scheduler.start()
logger.info("Launch")
# Enable the request queue with a default concurrency limit of 40, then launch the app.
blocks.queue(default_concurrency_limit=40).launch()
logger.info("Done")