from dataclasses import dataclass
from enum import Enum
@dataclass
class Task:
    """One evaluation task displayed on the leaderboard.

    Groups the three strings needed to locate a task's scores in the
    results data and render them as a leaderboard column.
    """

    # key identifying this task in the results data
    benchmark: str
    # name of the metric used to score the task (e.g. "acc")
    metric: str
    # human-readable column header shown on the leaderboard
    col_name: str
# Define our evaluation tasks
# ---------------------------------------------------
class Tasks(Enum):
    """Closed set of evaluation tasks shown on the leaderboard.

    Each member's value is a ``Task(benchmark_key, metric_name, display_name)``.
    """

    # task_key in the data, metric name, display name
    news = Task("news", "acc", "News")
    polymarket = Task("polymarket", "acc", "PolyMarket")
# Your leaderboard name (rendered as the page heading)
TITLE: str = """
🔮 FutureBench Leaderboard
"""
# What does your leaderboard evaluate? Intro text shown under the title.
# NOTE(review): this text mentions sports events, but only `news` and
# `polymarket` tasks are defined in the Tasks enum — confirm sports is
# still an active event source.
INTRODUCTION_TEXT: str = """
FutureBench is a benchmarking system for evaluating AI models on predicting future events.
This leaderboard shows how well different AI models perform at forecasting real-world outcomes
across various domains including news events, sports, and prediction markets.
📝
Read our blog post for more details about FutureBench.
"""
# Additional information about the benchmark (longer methodology write-up)
ABOUT_TEXT: str = """
FutureBench evaluates AI models on their ability to predict future events by:
- **Ingesting real-world events** from multiple sources (news, sports, prediction markets)
- **Collecting AI predictions** before events resolve
- **Measuring accuracy** once outcomes are known
- **Ranking models** based on their predictive performance
- **News Events**: Predictions about political developments, economic changes, and current events
- **PolyMarket**: Predictions on various real-world events traded on prediction markets
Models are evaluated using **accuracy** - the percentage of correct predictions made.
The **Average** score shows overall performance across all event types.
All predictions are made before events resolve, ensuring fair evaluation.
The leaderboard updates as new events are resolved and model performances are calculated.
"""