from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str  # key identifying the task in the results data
    metric: str  # metric used for scoring (e.g. "acc")
    col_name: str  # column header displayed on the leaderboard


# Define our evaluation tasks
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the data, metric name, display name
    news = Task("news", "acc", "News")
    polymarket = Task("polymarket", "acc", "PolyMarket")
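

# Illustrative usage (an assumption about how the app consumes this enum, not
# part of the original module): leaderboard columns are typically derived by
# iterating over Tasks, e.g. to build the results-table header.
def task_columns() -> list[str]:
    """Return the display-column name of every evaluation task."""
    return [task.value.col_name for task in Tasks]  # ["News", "PolyMarket"]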


# Your leaderboard name
TITLE = """<h1 align="center" id="space-title" style="font-size: 4.375rem; font-weight: bold; margin-bottom: 1rem;">🔮 FutureBench Leaderboard</h1>"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """<div class="section-card">
<h3 class="section-header"><span class="section-icon">🎯</span> About FutureBench</h3>
FutureBench is a benchmarking system that evaluates AI models on their ability to predict future events.
This leaderboard shows how well different models forecast real-world outcomes
across domains including news events, sports, and prediction markets.
<br><br>
๐Ÿ“ <a href="https://www.together.ai/blog/futurebench" target="_blank" style="color: #007acc; text-decoration: none;">Read our blog post</a> for more details about FutureBench.
</div>"""

# Additional information about the benchmark
ABOUT_TEXT = """
<div class="section-card fade-in-up">
<h2 class="section-header"><span class="section-icon">⚙️</span> How it works</h2>

FutureBench evaluates AI models on their ability to predict future events by:

- **Ingesting real-world events** from multiple sources (news, sports, prediction markets)
- **Collecting AI predictions** before events resolve
- **Measuring accuracy** once outcomes are known
- **Ranking models** based on their predictive performance
</div>

<div class="section-card fade-in-up stagger-1">
<h2 class="section-header"><span class="section-icon">📊</span> Event Types</h2>

- **News Events**: Predictions about political developments, economic changes, and current events
- **PolyMarket**: Predictions on various real-world events traded on prediction markets
</div>

<div class="section-card fade-in-up stagger-2">
<h2 class="section-header"><span class="section-icon">📈</span> Metrics</h2>

Models are evaluated using **accuracy**: the percentage of correct predictions.
The **Average** score shows overall performance across all event types.
</div>

<div class="section-card fade-in-up stagger-3">
<h2 class="section-header"><span class="section-icon">🔒</span> Data Integrity</h2>

All predictions are recorded before events resolve, ensuring a fair evaluation.
The leaderboard updates as new events resolve and model scores are recalculated.
</div>
"""