"""Static configuration and display text for the FormulaOne leaderboard UI.

Defines the benchmark/metric columns (``Tasks``) and the markdown/HTML
strings rendered by the leaderboard front-end.
"""

from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    """One leaderboard column: where to read the score and how to label it."""

    benchmark: str  # task_key in the results json file
    metric: str     # metric_key in the results json file
    col_name: str   # column header displayed in the leaderboard


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    task0 = Task("FormulaOne", "success_rate", "Success Rate (%)")
    # task1 = Task("logiqa", "acc_norm", "LogiQA")


NUM_FEWSHOT = 0  # Change with your few shot
# ---------------------------------------------------


# Your leaderboard name
# TITLE = """AAI FormulaOne Leaderboard"""
TITLE = """

FormulaOne Leaderboard

"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
Welcome to the official leaderboard for the paper:

**FormulaOne: Measuring the Depth of Algorithmic Reasoning Beyond Competitive Programming**

*Gal Beniamini, Yuval Dor, Alon Vinnikov, Shir Granot Peled, Or Weinstein, Or Sharir, Noam Wies, Tomer Nussbaum, Ido Ben Shaul, Tomer Zekharya, Yoav Levine, Shai Shalev-Shwartz, Amnon Shashua*

**AAI, July 2025**

FormulaOne is a new benchmark designed to challenge frontier AI models. The benchmark is constructed from a vast and conceptually diverse family of dynamic programming problems derived from Monadic Second-Order (MSO) logic on graphs, a framework with profound connections to theoretical computer science.
"""

# Which evaluations are you running? how can people reproduce what you have?
# (plain string, not an f-string: there are no interpolated placeholders)
LLM_BENCHMARKS_TEXT = """
## How it works

## Reproducibility
To reproduce our results, here are the commands you can run:

"""

# NOTE(review): the dataset link below is missing a `/tree/<branch>/` segment
# for a GitHub directory URL — confirm it resolves before release.
EVALUATION_QUEUE_TEXT = """
## 🧪 Submitting to the FormulaOne Leaderboard

This leaderboard evaluates systems on the FormulaOne core dataset.
Submissions consist of a .jsonl file with solution code for each problem.

### 📝 I. Format Your Submission File

Your submission must be a .jsonl file with one entry per problem:

```json
{"problem_id": "1", "solution": ""}
{"problem_id": "2", "solution": ""}
...
```

- problem_id: Must match the official list of FormulaOne core problems.
- solution: A Python code implementing the required callback functions.

📄 Full list of problem_ids: View the [FormulaOne core dataset](https://github.com/double-ai/formulaone-dataset-release/dataset/formulaone) for the complete list of problem IDs.

⚠️ Validation Rules: Submissions must:

- Contain exactly two columns: ["problem_id", "solution"]
- Include all required problems (no missing/unknown IDs)
- Provide solutions as Python strings
- Avoid duplicates

### 📤 II. Submit via the UI below

- Upload your `.jsonl` file.
- Fill in the following fields:
  - **System Name**
  - **Organization**
  - **System Type**
- Click **Submit**.

### ⏱️ After Submission

Submissions are validated and evaluated within ~24 hours.
Results will appear on the leaderboard once processed.
"""

CITATION_BUTTON_LABEL = """📚 How to cite FormulaOne"""
CITATION_BUTTON_TEXT = r"""
@misc{beniamini2025formulaonemeasuringdepthalgorithmic,
      title={FormulaOne: Measuring the Depth of Algorithmic Reasoning Beyond Competitive Programming},
      author={Gal Beniamini and Yuval Dor and Alon Vinnikov and Shir Granot Peled and Or Weinstein and Or Sharir and Noam Wies and Tomer Nussbaum and Ido Ben Shaul and Tomer Zekharya and Yoav Levine and Shai Shalev-Shwartz and Amnon Shashua},
      year={2025},
      eprint={2507.13337},
      archivePrefix={arXiv},
      primaryClass={cs.AI},
      url={https://arxiv.org/abs/2507.13337},
}
"""