"""Static configuration and display text for the FormulaOne leaderboard UI.

Defines the benchmark/metric columns (``Tasks``) and the markdown/HTML
strings rendered by the leaderboard front-end.
"""

from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    """One leaderboard column: where to read the score and how to label it."""

    benchmark: str  # task_key in the results json file
    metric: str     # metric_key in the results json file
    col_name: str   # column header displayed in the leaderboard


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    task0 = Task("FormulaOne", "success_rate", "Success Rate (%)")
    # task1 = Task("logiqa", "acc_norm", "LogiQA")


NUM_FEWSHOT = 0  # Change with your few shot
# ---------------------------------------------------


# Your leaderboard name
# TITLE = """AAI FormulaOne Leaderboard"""
TITLE = """

FormulaOne Leaderboard

"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
Welcome to the official leaderboard for the paper:

**FormulaOne: Measuring the Depth of Algorithmic Reasoning Beyond Competitive Programming**

*Gal Beniamini, Yuval Dor, Alon Vinnikov, Shir Granot Peled, Or Weinstein, Or Sharir, Noam Wies, Tomer Nussbaum, Ido Ben Shaul, Tomer Zekharya, Yoav Levine, Shai Shalev-Shwartz, Amnon Shashua*

**AAI, July 2025**

FormulaOne is a new benchmark designed to challenge frontier AI models. The benchmark is constructed from a vast and conceptually diverse family of dynamic programming problems derived from Monadic Second-Order (MSO) logic on graphs, a framework with profound connections to theoretical computer science.
"""

# Which evaluations are you running? how can people reproduce what you have?
# (plain string, not an f-string: there are no interpolated placeholders)
LLM_BENCHMARKS_TEXT = """
## How it works

## Reproducibility
To reproduce our results, here are the commands you can run:

"""

# NOTE(review): the dataset link below is missing a `/tree/<branch>/` segment
# for a GitHub directory URL — confirm it resolves before release.
EVALUATION_QUEUE_TEXT = """
## 🧪 Submitting to the FormulaOne Leaderboard

This leaderboard evaluates systems on the FormulaOne core dataset.
Submissions consist of a .jsonl file with solution code for each problem.

### 📝 I. Format Your Submission File

Your submission must be a .jsonl file with one entry per problem:

```json
{"problem_id": "1", "solution": ""}
{"problem_id": "2", "solution": ""}
...
```

- problem_id: Must match the official list of FormulaOne core problems.
- solution: A Python code implementing the required callback functions.

📄 Full list of problem_ids: View the [FormulaOne core dataset](https://github.com/double-ai/formulaone-dataset-release/dataset/formulaone) for the complete list of problem IDs.

⚠️ Validation Rules: Submissions must:

- Contain exactly two columns: ["problem_id", "solution"]
- Include all required problems (no missing/unknown IDs)
- Provide solutions as Python strings
- Avoid duplicates

### 📤 II. Submit via the UI below

- Upload your `.jsonl` file.
- Fill in the following fields:
  - **System Name**
  - **Organization**
  - **System Type**
- Click **Submit**.

### ⏱️ After Submission

Submissions are validated and evaluated within ~24 hours.
Results will appear on the leaderboard once processed.
"""

CITATION_BUTTON_LABEL = """📚 How to cite FormulaOne"""
CITATION_BUTTON_TEXT = r"""
@misc{beniamini2025formulaonemeasuringdepthalgorithmic,
      title={FormulaOne: Measuring the Depth of Algorithmic Reasoning Beyond Competitive Programming},
      author={Gal Beniamini and Yuval Dor and Alon Vinnikov and Shir Granot Peled and Or Weinstein and Or Sharir and Noam Wies and Tomer Nussbaum and Ido Ben Shaul and Tomer Zekharya and Yoav Levine and Shai Shalev-Shwartz and Amnon Shashua},
      year={2025},
      eprint={2507.13337},
      archivePrefix={arXiv},
      primaryClass={cs.AI},
      url={https://arxiv.org/abs/2507.13337},
}
"""