Upload 23 files
- README.md +64 -24
- app.py +292 -199
- app_old.py +204 -0
- config.json +8 -0
- database.py +98 -0
- leaderboard_gaia.csv +9 -0
- leaderboard_swe.csv +10 -0
- model +6 -0
- models.json +6 -0
- requirements.txt +4 -1
- setup.py +51 -0
- start.sh +6 -0
README.md
CHANGED
@@ -1,3 +1,4 @@
+
 ---
 title: TRAIL
 emoji: 🥇
@@ -7,40 +8,79 @@ sdk: gradio
 app_file: app.py
 pinned: true
 license: mit
-short_description:
+short_description: 'TRAIL: Trace Reasoning and Agentic Issue Localization'
 sdk_version: 5.19.0
 ---
-
-Results files should have the following format and be stored as json files:
-```json
-{
-    },
-    "results": {
-        "task_name": {
-            "metric_name": score,
-        },
-        "task_name2": {
-            "metric_name": score,
-        }
-    }
-}
-```
-
-- the main table's column names and properties in `src/display/utils.py`
-- the logic to read all results and request files, then convert them into dataframe lines, in `src/leaderboard/read_evals.py` and `src/populate.py`
-- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
+# Model Performance Leaderboard
+
+This is a Hugging Face Space that hosts a leaderboard for comparing model performance across the metrics of the TRAIL dataset.
+
+## Features
+
+- **Submit Model Results**: Share your model's performance metrics
+- **Interactive Leaderboard**: View and sort all submissions
+- **Integrated Backend**: Stores all submissions with timestamp and attribution
+- **Customizable Metrics**: Configure which metrics to display and track
+
+## Installation
+
+### Setting Up Your Space
+
+1. Upload all files to your Hugging Face Space
+2. Make sure `start.sh` is executable:
+```bash
+chmod +x start.sh
+```
+3. Configure your Space to use the `start.sh` script as the entry point
+
+### Troubleshooting Installation Issues
+
+If you encounter JSON parsing errors:
+1. Check that `models.json` exists and is a valid JSON file
+2. Run `python setup.py` to regenerate configuration files
+3. If problems persist, delete the `models.json` file and let the setup script create a new one
+
+## How to Use
+
+### Viewing the Leaderboard
+
+Navigate to the "Leaderboard" tab to see all submitted models. You can:
+- Sort by any metric (click on the dropdown)
+- Change sort order (ascending/descending)
+- Refresh the leaderboard for the latest submissions
+
+### Submitting a Model
+
+1. Go to the "Submit Model" tab
+2. Fill in your model name, your name, and an optional description
+3. Enter values for the requested metrics
+4. Click "Submit Model"
+
+## Configuration
+
+You can customize this leaderboard by modifying the `models.json` file:
+
+```json
+{
+  "title": "TRAIL Performance Leaderboard",
+  "description": "This leaderboard tracks and compares model performance across multiple metrics. Submit your model results to see how they stack up!",
+  "metrics": ["accuracy", "f1_score", "precision", "recall"],
+  "main_metric": "accuracy"
+}
+```
+
+- `title`: The title of your leaderboard
+- `description`: A description that appears at the top
+- `metrics`: List of metrics to track
+- `main_metric`: Default metric for sorting
+
+## Technical Details
+
+This leaderboard is built using:
+- Gradio for the UI components
+- A file-based database to store submissions
+- Pandas for data manipulation and display
+
+## License
+
+This project is open source and available under the MIT license.
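The Configuration section of the new README lists the fields read from `models.json`. As a rough sketch of how a Space script might load and sanity-check that file (the helper name and fallback defaults below are illustrative, not part of this commit):

```python
import json

# Hypothetical helper: load models.json and fall back to the README's example
# values when the file is missing or unparsable.
def load_leaderboard_config(path="models.json"):
    defaults = {
        "title": "TRAIL Performance Leaderboard",
        "description": "",
        "metrics": ["accuracy", "f1_score", "precision", "recall"],
        "main_metric": "accuracy",
    }
    try:
        with open(path) as f:
            config = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        # Mirrors the troubleshooting advice above: regenerate on bad JSON.
        return defaults
    # The main metric should be one of the tracked metrics.
    if config.get("main_metric") not in config.get("metrics", []):
        config["main_metric"] = config.get("metrics", defaults["metrics"])[0]
    return {**defaults, **config}
```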
app.py
CHANGED
@@ -1,204 +1,297 @@
+import gradio as gr
+import pandas as pd
+import os
+import shutil
+
+# Function to load leaderboard data from a CSV file
+def load_leaderboard_data(csv_file_path):
+    try:
+        df = pd.read_csv(csv_file_path)
+        return df
+    except Exception as e:
+        print(f"Error loading CSV file: {e}")
+        return pd.DataFrame()  # Return an empty DataFrame in case of error
+
+# Function to process uploaded JSON file
+def process_json_file(json_file):
+    try:
+        # Read the JSON file
+        data = pd.read_json(json_file.name)
+        # Here you can process the data as needed
+        # For demonstration, we'll just return the data as a dictionary
+        return data.to_dict()
+    except Exception as e:
+        return {"error": str(e)}
+
+# Load the leaderboard data
+leaderboard1 = load_leaderboard_data("leaderboard_swe.csv")
+leaderboard2 = load_leaderboard_data("leaderboard_gaia.csv")
+
+# Function to save the uploaded JSON file
+def save_json_file(file_path):
+    if not file_path:
+        return "No file uploaded."
+
+    # Define the directory to save uploaded files
+    save_dir = "uploaded_jsons"
+    os.makedirs(save_dir, exist_ok=True)
+
+    # Extract the original filename
+    original_filename = os.path.basename(file_path)
+
+    # Define the path to save the file
+    save_path = os.path.join(save_dir, original_filename)
+
+    # Move the uploaded file to the save directory
+    shutil.move(file_path, save_path)
+
+    return f"File saved to {save_path}"
+
+# Create the Gradio interface
+with gr.Blocks() as demo:
+    gr.Markdown("# 🥇 Leaderboards")
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("## TRAIL-SWE Leaderboard")
+            gr.Dataframe(leaderboard1)
+        with gr.Column():
+            gr.Markdown("## TRAIL-GAIA Leaderboard")
+            gr.Dataframe(leaderboard2)
+
+"""
+gr.Markdown("# Submit Here")
+with gr.Row():
+    json_input = gr.File(label="Upload JSON File", type="filepath")
+    json_output = gr.JSON(label="Processed Output")
+submit_button = gr.Button("Submit")
+submit_button.click(process_json_file, inputs=json_input, outputs=json_output)
+"""
+with gr.Blocks() as submit_page:
+    gr.Markdown("## Submit Your JSON File Here")
+    file_input = gr.File(label="Upload JSON File", type="filepath", file_types=['.json'])
+    submit_button = gr.Button("Submit", interactive=True)
+    output = gr.Textbox("")  # Successfully submitted! Thank you for your contribution!
+    submit_button.click(fn=save_json_file, inputs=file_input, outputs=output)
+
+
+if __name__ == "__main__":
+    demo.launch()
+
+
+
+"""
+import gradio as gr
+import pandas as pd
+import os
+import json
+import uuid
+import hashlib
+from datetime import datetime
+from huggingface_hub import HfApi, login, HfFolder
+
+# Configuration
+LEADERBOARD_CSV = "leaderboard.csv"
+SUBMISSIONS_FOLDER = "submissions"
+CONFIG_FILE = "config.json"
+DEFAULT_COLUMNS = ["rank", "submission_name", "score", "user", "timestamp"]
+VERIFY_USERS = False  # Set to True to enable HF authentication
+
+# Default configuration
+DEFAULT_CONFIG = {
+    "title": "Hugging Face Competition Leaderboard",
+    "description": "Submit your results for the competition",
+    "metric_name": "Score",
+    "higher_is_better": True,
+    "max_submissions_per_user": 5,
+    "allow_submission_edits": True
+}
+
+# Ensure submissions folder exists
+os.makedirs(SUBMISSIONS_FOLDER, exist_ok=True)
+
+# Load or create config
+if os.path.exists(CONFIG_FILE):
+    with open(CONFIG_FILE, "r") as f:
+        config = json.load(f)
+else:
+    config = DEFAULT_CONFIG
+    with open(CONFIG_FILE, "w") as f:
+        json.dump(config, f, indent=2)
+
+# Initialize leaderboard if it doesn't exist
+if not os.path.exists(LEADERBOARD_CSV):
+    pd.DataFrame(columns=DEFAULT_COLUMNS).to_csv(LEADERBOARD_CSV, index=False)
+
+def read_leaderboard():
+    # Read the current leaderboard
+    if os.path.exists(LEADERBOARD_CSV):
+        df = pd.read_csv(LEADERBOARD_CSV)
+        return df
+    return pd.DataFrame(columns=DEFAULT_COLUMNS)
+
+def verify_user(username, token):
+    # Verify a user with their Hugging Face token
+    if not VERIFY_USERS:
+        return True
+
+    try:
+        api = HfApi(token=token)
+        user_info = api.whoami()
+        return user_info["name"] == username
+    except:
+        return False
+
+def count_user_submissions(username):
+    # Count how many submissions a user already has
+    df = read_leaderboard()
+    return len(df[df["user"] == username])
+
+def update_leaderboard():
+    # Update the leaderboard based on submissions
+    # Read all submissions
+    submissions = []
+    for filename in os.listdir(SUBMISSIONS_FOLDER):
+        if filename.endswith(".json"):
+            with open(os.path.join(SUBMISSIONS_FOLDER, filename), "r") as f:
+                try:
+                    data = json.load(f)
+                    submissions.append(data)
+                except json.JSONDecodeError:
+                    print(f"Error decoding {filename}")
+
+    if not submissions:
+        return pd.DataFrame(columns=DEFAULT_COLUMNS)
+
+    # Create dataframe and sort by score
+    df = pd.DataFrame(submissions)
+
+    # Sort based on configuration (higher or lower is better)
+    ascending = not config.get("higher_is_better", True)
+    df = df.sort_values("score", ascending=ascending)
+
+    # Add rank
+    df["rank"] = range(1, len(df) + 1)
+
+    # Save updated leaderboard
+    df.to_csv(LEADERBOARD_CSV, index=False)
+    return df
+
+def submit(submission_name, score, username, hf_token="", submission_details=None):
+    # Add a new submission to the leaderboard
+    if not submission_name or not username:
+        return "Submission name and username are required", None
+
+    try:
+        score = float(score)
+    except ValueError:
+        return "Score must be a valid number", None
+
+    # Verify user if enabled
+    if VERIFY_USERS and not verify_user(username, hf_token):
+        return "Invalid Hugging Face credentials", None
+
+    # Check submission limit
+    max_submissions = config.get("max_submissions_per_user", 5)
+    if count_user_submissions(username) >= max_submissions:
+        return f"You've reached the maximum of {max_submissions} submissions", None
+
+    # Create submission entry
+    submission_id = str(uuid.uuid4())[:8]
+    submission = {
+        "submission_id": submission_id,
+        "submission_name": submission_name,
+        "score": score,
+        "user": username,
+        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    }
+
+    # Add optional details
+    if submission_details:
+        submission["details"] = submission_details
+
+    # Save submission to file
+    filename = f"{username}_{submission_name.replace(' ', '_')}_{submission_id}.json"
+    with open(os.path.join(SUBMISSIONS_FOLDER, filename), "w") as f:
+        json.dump(submission, f)
+
+    # Update leaderboard
+    leaderboard = update_leaderboard()
+    return f"Submission '{submission_name}' added successfully!", leaderboard
+
+def render_leaderboard():
+    # Display the current leaderboard
+    df = update_leaderboard()
+    if len(df) == 0:
+        return "No submissions yet."
+
+    # Format the dataframe for display
+    display_df = df[DEFAULT_COLUMNS].copy()
+    return display_df
+
+# Create the Gradio interface
+with gr.Blocks(title=config["title"]) as demo:
+    gr.Markdown(f"# {config['title']}")
+    gr.Markdown(f"{config['description']}")
+
+    with gr.Tab("Leaderboard"):
+        gr.Markdown("## Current Rankings")
+        metric_name = config.get("metric_name", "Score")
+        higher_better = "higher is better" if config.get("higher_is_better", True) else "lower is better"
+        gr.Markdown(f"*Ranked by {metric_name} ({higher_better})*")
+
+        leaderboard_output = gr.Dataframe(
+            headers=["Rank", "Submission", metric_name, "User", "Timestamp"],
+            datatype=["number", "str", "number", "str", "str"],
+            interactive=False
+        )
+        refresh_btn = gr.Button("Refresh Leaderboard")
+        refresh_btn.click(render_leaderboard, inputs=[], outputs=[leaderboard_output])
+
+    with gr.Tab("Submit"):
+        gr.Markdown("## Submit Your Results")
+        with gr.Row():
+            with gr.Column():
+                submission_name = gr.Textbox(label="Submission Name", placeholder="MyAwesomeModel v1.0")
+                score = gr.Number(label=metric_name, precision=4)
+                username = gr.Textbox(label="Username", placeholder="Your Hugging Face username")
+
+                # Only show token field if verification is enabled
+                if VERIFY_USERS:
+                    hf_token = gr.Textbox(
+                        label="Hugging Face Token",
+                        placeholder="hf_...",
+                        type="password"
+                    )
+                else:
+                    hf_token = gr.Textbox(visible=False)
+
+                submission_details = gr.Textbox(
+                    label="Additional Details (optional)",
+                    placeholder="Model details, training info, etc.",
+                    lines=5
+                )
+                submit_btn = gr.Button("Submit to Leaderboard")
+
+        submit_output = gr.Markdown()
+        submission_leaderboard = gr.Dataframe(
+            headers=["Rank", "Submission", metric_name, "User", "Timestamp"],
+            datatype=["number", "str", "number", "str", "str"],
+            interactive=False
+        )
+
+        submit_btn.click(
+            submit,
+            inputs=[submission_name, score, username, hf_token, submission_details],
+            outputs=[submit_output, submission_leaderboard]
+        )
+
+    # Add admin tab if desired
+    with gr.Tab("About"):
+        gr.Markdown("## About This Leaderboard")
+
+    # Initialize the leaderboard on load
+    demo.load(render_leaderboard, inputs=[], outputs=[leaderboard_output])
+
+if __name__ == "__main__":
+    demo.launch()
+"""
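A quick way to exercise the two helpers in the new `app.py` locally (a minimal sketch; the sample file name and JSON keys are made up, and importing `app` builds the Gradio UI as a side effect without launching it):

```python
import json
from app import load_leaderboard_data, save_json_file

# Load one of the bundled CSVs and inspect the top of the table.
df = load_leaderboard_data("leaderboard_gaia.csv")
print(df.head())

# Write a throwaway JSON file and hand its path to save_json_file,
# which moves it into the uploaded_jsons/ directory.
with open("example_submission.json", "w") as f:
    json.dump({"model": "my-model", "Joint F1": 0.1}, f)
print(save_json_file("example_submission.json"))
```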
app_old.py
ADDED
@@ -0,0 +1,204 @@
+import gradio as gr
+from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
+import pandas as pd
+from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import snapshot_download
+
+from src.about import (
+    CITATION_BUTTON_LABEL,
+    CITATION_BUTTON_TEXT,
+    EVALUATION_QUEUE_TEXT,
+    INTRODUCTION_TEXT,
+    LLM_BENCHMARKS_TEXT,
+    TITLE,
+)
+from src.display.css_html_js import custom_css
+from src.display.utils import (
+    BENCHMARK_COLS,
+    COLS,
+    EVAL_COLS,
+    EVAL_TYPES,
+    AutoEvalColumn,
+    ModelType,
+    fields,
+    WeightType,
+    Precision
+)
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.submission.submit import add_new_eval
+
+
+def restart_space():
+    API.restart_space(repo_id=REPO_ID)
+
+### Space initialisation
+try:
+    print(EVAL_REQUESTS_PATH)
+    snapshot_download(
+        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+    )
+except Exception:
+    restart_space()
+try:
+    print(EVAL_RESULTS_PATH)
+    snapshot_download(
+        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+    )
+except Exception:
+    restart_space()
+
+
+LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+
+(
+    finished_eval_queue_df,
+    running_eval_queue_df,
+    pending_eval_queue_df,
+) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+
+def init_leaderboard(dataframe):
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in fields(AutoEvalColumn)],
+        select_columns=SelectColumns(
+            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+            label="Select Columns to Display:",
+        ),
+        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+        filter_columns=[
+            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+            ColumnFilter(
+                AutoEvalColumn.params.name,
+                type="slider",
+                min=0.01,
+                max=150,
+                label="Select the number of parameters (B)",
+            ),
+            ColumnFilter(
+                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+            ),
+        ],
+        bool_checkboxgroup_label="Hide models",
+        interactive=False,
+    )
+
+
+demo = gr.Blocks(css=custom_css)
+with demo:
+    gr.HTML(TITLE)
+    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+            leaderboard = init_leaderboard(LEADERBOARD_DF)
+
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+            with gr.Column():
+                with gr.Row():
+                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+                with gr.Column():
+                    with gr.Accordion(
+                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            finished_eval_table = gr.components.Dataframe(
+                                value=finished_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+                    with gr.Accordion(
+                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            running_eval_table = gr.components.Dataframe(
+                                value=running_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+
+                    with gr.Accordion(
+                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            pending_eval_table = gr.components.Dataframe(
+                                value=pending_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+            with gr.Row():
+                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+
+            with gr.Row():
+                with gr.Column():
+                    model_name_textbox = gr.Textbox(label="Model name")
+                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                    model_type = gr.Dropdown(
+                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                        label="Model type",
+                        multiselect=False,
+                        value=None,
+                        interactive=True,
+                    )
+
+                with gr.Column():
+                    precision = gr.Dropdown(
+                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                        label="Precision",
+                        multiselect=False,
+                        value="float16",
+                        interactive=True,
+                    )
+                    weight_type = gr.Dropdown(
+                        choices=[i.value.name for i in WeightType],
+                        label="Weights type",
+                        multiselect=False,
+                        value="Original",
+                        interactive=True,
+                    )
+                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
+            submit_button = gr.Button("Submit Eval")
+            submission_result = gr.Markdown()
+            submit_button.click(
+                add_new_eval,
+                [
+                    model_name_textbox,
+                    base_model_name_textbox,
+                    revision_name_textbox,
+                    precision,
+                    weight_type,
+                    model_type,
+                ],
+                submission_result,
+            )
+
+    with gr.Row():
+        with gr.Accordion("📙 Citation", open=False):
+            citation_button = gr.Textbox(
+                value=CITATION_BUTTON_TEXT,
+                label=CITATION_BUTTON_LABEL,
+                lines=20,
+                elem_id="citation-button",
+                show_copy_button=True,
+            )
+
+scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=1800)
+scheduler.start()
+demo.queue(default_concurrency_limit=40).launch()
config.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "title": "TRAIL Performance Leaderboard",
+  "description": "Submit your results for the TRAIL Trace Reasoning and Issue Localization competition. Models are evaluated on a combination of Categorical F1 and Location Accuracy (Joint F1)",
+  "metric_name": "F1 Score",
+  "higher_is_better": true,
+  "max_submissions_per_user": 3,
+  "allow_submission_edits": false
+}
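As a sketch of how the fields in `config.json` are meant to be consumed (mirroring the commented-out submission app embedded in `app.py`; the example rows below are invented):

```python
import json
import pandas as pd

with open("config.json") as f:
    config = json.load(f)

# higher_is_better controls the sort direction of the score column.
scores = pd.DataFrame([
    {"submission_name": "run-a", "score": 0.12, "user": "alice"},
    {"submission_name": "run-b", "score": 0.31, "user": "bob"},
])
ascending = not config.get("higher_is_better", True)
ranked = scores.sort_values("score", ascending=ascending).reset_index(drop=True)
ranked["rank"] = ranked.index + 1

# max_submissions_per_user caps how many rows a single user may contribute.
limit = config.get("max_submissions_per_user", 3)
print(ranked, f"(per-user limit: {limit})")
```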
database.py
ADDED
@@ -0,0 +1,98 @@
+import os
+import json
+import datetime
+from pathlib import Path
+import numpy as np
+
+class Database:
+    def __init__(self, submission_dir="submissions"):
+        self.submission_dir = submission_dir
+        os.makedirs(submission_dir, exist_ok=True)
+
+    def add_submission(self, submission):
+        """Add a new submission to the database"""
+        # Generate a timestamp and ID for the submission
+        timestamp = datetime.datetime.now().isoformat()
+        submission_id = f"{submission['model_name'].replace(' ', '_')}_{timestamp.replace(':', '-')}"
+
+        # Add timestamp and ID to submission
+        submission['timestamp'] = timestamp
+        submission['id'] = submission_id
+
+        # Save submission to a JSON file
+        file_path = os.path.join(self.submission_dir, f"{submission_id}.json")
+        with open(file_path, 'w') as f:
+            json.dump(submission, f, indent=2)
+
+        return submission_id
+
+    def get_submission(self, submission_id):
+        """Get a specific submission by ID"""
+        file_path = os.path.join(self.submission_dir, f"{submission_id}.json")
+        if os.path.exists(file_path):
+            with open(file_path, 'r') as f:
+                return json.load(f)
+        return None
+
+    def get_all_submissions(self):
+        """Get all submissions"""
+        submissions = []
+        for file_name in os.listdir(self.submission_dir):
+            if file_name.endswith('.json'):
+                file_path = os.path.join(self.submission_dir, file_name)
+                with open(file_path, 'r') as f:
+                    submissions.append(json.load(f))
+        return submissions
+
+    def get_leaderboard(self, sort_by="score", ascending=False):
+        """Get submissions sorted for leaderboard display"""
+        submissions = self.get_all_submissions()
+
+        # Make sure we have submissions to sort
+        if not submissions:
+            return []
+
+        # Sort submissions
+        if sort_by in submissions[0]:
+            submissions.sort(key=lambda x: x.get(sort_by, 0), reverse=not ascending)
+
+        return submissions
+
+    def delete_submission(self, submission_id):
+        """Delete a submission by ID"""
+        file_path = os.path.join(self.submission_dir, f"{submission_id}.json")
+        if os.path.exists(file_path):
+            os.remove(file_path)
+            return True
+        return False
+
+# Load leaderboard configuration
+def load_config():
+    try:
+        if os.path.exists("models.json") and os.path.getsize("models.json") > 0:
+            with open("models.json", "r") as f:
+                return json.load(f)
+        else:
+            print("models.json file is empty or missing. Creating with default configuration.")
+            # Default configuration
+            config = {
+                "title": "TRAIL Model Leaderboard",
+                "description": "Submit and compare model performances",
+                "metrics": ["Cat. F1", "Loc. Acc", "Joint F1"],
+                "main_metric": "Cat. F1"
+            }
+            with open("models.json", "w") as f:
+                json.dump(config, f, indent=2)
+            return config
+    except json.JSONDecodeError:
+        print("Error parsing models.json. Creating with default configuration.")
+        # Default configuration if JSON is invalid
+        config = {
+            "title": "TRAIL Model Leaderboard",
+            "description": "Submit and compare model performances",
+            "metrics": ["Cat. F1", "Loc. Acc", "Joint F1"],
+            "main_metric": "Cat. F1"
+        }
+        with open("models.json", "w") as f:
+            json.dump(config, f, indent=2)
+        return config
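A short usage sketch for the `Database` class above (the submission values are invented; note that `add_submission` expects a `model_name` key and `get_leaderboard` sorts by `score`, highest first, by default):

```python
from database import Database

db = Database(submission_dir="submissions")

# Record one made-up submission; the class adds an id and timestamp itself.
sub_id = db.add_submission({
    "model_name": "example-model",
    "score": 0.42,
})
print("stored as", sub_id)

# Retrieve it and list the current ranking.
print(db.get_submission(sub_id))
for row in db.get_leaderboard(sort_by="score"):
    print(row["model_name"], row["score"])
```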
leaderboard_gaia.csv
ADDED
@@ -0,0 +1,9 @@
+Rank,Model,Joint F1,Categorical F1,Location Accuracy,Date
+1,Gemini-2.5-Pro-Preview-05-06,0.183,0.389,0.546,2025-05-14
+2,Gemini-2.5-Flash-Preview-04-17,0.100,0.337,0.372,2025-05-14
+3,Open AI o3,0.092,0.296,0.535,2025-05-14
+4,Anthropic Claude-3.7-Sonnet,0.047,0.254,0.204,2025-05-14
+5,GPT-4.1,0.028,0.218,0.107,2025-05-14
+6,Open AI o1,0.013,0.138,0.040,2025-05-14
+7,Llama-4-Maverick-17B-128E-Instruct,0.122,0.023,0.000,2025-05-14
+8,Llama-4-Scout-17B-16E-Instruct,0.041,0.000,0.000,2025-05-14
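For reference, a small pandas sketch that reads the CSV above and pulls the three models with the highest Categorical F1 (illustrative only):

```python
import pandas as pd

# Load the GAIA split of the leaderboard and sort by Categorical F1.
gaia = pd.read_csv("leaderboard_gaia.csv")
top3 = gaia.sort_values("Categorical F1", ascending=False).head(3)
print(top3[["Model", "Categorical F1", "Joint F1", "Location Accuracy"]])
```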
leaderboard_swe.csv
ADDED
@@ -0,0 +1,10 @@
+Rank,Model,Joint F1,Categorical F1,Location Accuracy,Date
+1,Gemini-2.5-Pro-Preview-05-06,0.050,0.148,0.238,2025-05-14
+2,Gemini-2.5-Flash-Preview-04-17,0.000,0.213,0.060,2025-05-14
+3,Llama-4-Maverick-17B-128E-Instruct,0.000,0.191,0.083,2025-05-14
+4,GPT-4.1,0.000,0.166,0.000,2025-05-14
+5,Llama-4-Scout-17B-16E-Instruct,0.000,0.050,0.000,2025-05-14
+6,Open AI o1,CLE,CLE,CLE,2025-05-14
+7,Open AI o3,CLE,CLE,CLE,2025-05-14
+8,Anthropic Claude-3.7-Sonnet,CLE,CLE,CLE,2025-05-14
+
model
ADDED
@@ -0,0 +1,6 @@
+{
+  "title": "TRAIL Performance Leaderboard",
+  "description": "This leaderboard tracks and compares model performance across multiple metrics. Submit your model results to see how they stack up!",
+  "metrics": ["Cat. F1", "Loc. Acc", "Joint F1"],
+  "main_metric": "Cat. F1"
+}
models.json
ADDED
@@ -0,0 +1,6 @@
+{
+  "title": "TRAIL Performance Leaderboard",
+  "description": "This leaderboard tracks and compares model performance across multiple metrics. Submit your model results to see how they stack up!",
+  "metrics": ["Cat. F1", "Loc. Acc", "Joint F1"],
+  "main_metric": "Cat. F1"
+}
requirements.txt
CHANGED
@@ -13,4 +13,7 @@ python-dateutil
 tqdm
 transformers
 tokenizers>=0.15.0
-sentencepiece
+sentencepiece
+numpy>=1.24.3
+pandas
+huggingface_hub
setup.py
ADDED
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+"""
+Setup script to ensure all necessary files and directories are created
+before running the application.
+"""
+
+import os
+import json
+import sys
+
+def setup():
+    """Create necessary directories and files if they don't exist."""
+    print("Setting up leaderboard application...")
+
+    # Create submissions directory
+    if not os.path.exists("submissions"):
+        print("Creating submissions directory...")
+        os.makedirs("submissions", exist_ok=True)
+
+    # Create models.json if it doesn't exist or is empty
+    if not os.path.exists("models.json") or os.path.getsize("models.json") == 0:
+        print("Creating models.json configuration file...")
+        config = {
+            "title": "TRAIL Performance Leaderboard",
+            "description": "This leaderboard tracks and compares model performance across multiple metrics. Submit your model results to see how they stack up!",
+            "metrics": ["Cat. F1", "Loc. Acc", "Joint F1"],
+            "main_metric": "Cat. F1"
+        }
+        with open("models.json", "w") as f:
+            json.dump(config, f, indent=2)
+    else:
+        # Validate JSON format
+        try:
+            with open("models.json", "r") as f:
+                json.load(f)
+            print("models.json exists and is valid.")
+        except json.JSONDecodeError:
+            print("models.json exists but has invalid JSON. Creating new file...")
+            config = {
+                "title": "Model Performance Leaderboard",
+                "description": "This leaderboard tracks and compares model performance across multiple metrics. Submit your model results to see how they stack up!",
+                "metrics": ["Cat. F1", "Loc. Acc", "Joint F1"],
+                "main_metric": "Cat. F1"
+            }
+            with open("models.json", "w") as f:
+                json.dump(config, f, indent=2)
+
+    print("Setup complete.")
+
+if __name__ == "__main__":
+    setup()
start.sh
ADDED
@@ -0,0 +1,6 @@
+#!/bin/bash
+# Run setup script first
+python setup.py
+
+# Then start the main application
+python app.py