jitinpatronus committed on
Commit 0380c4f (verified) · 1 Parent(s): 872c476

Upload 23 files

Files changed (12)
  1. README.md +64 -24
  2. app.py +292 -199
  3. app_old.py +204 -0
  4. config.json +8 -0
  5. database.py +98 -0
  6. leaderboard_gaia.csv +9 -0
  7. leaderboard_swe.csv +10 -0
  8. model +6 -0
  9. models.json +6 -0
  10. requirements.txt +4 -1
  11. setup.py +51 -0
  12. start.sh +6 -0
README.md CHANGED
@@ -1,3 +1,4 @@
 
1
  ---
2
  title: TRAIL
3
  emoji: 🥇
@@ -7,40 +8,79 @@ sdk: gradio
7
  app_file: app.py
8
  pinned: true
9
  license: mit
10
- short_description: Leaderboard for TRAIL
11
  sdk_version: 5.19.0
12
  ---
13
 
14
- # Start the configuration
15
 
16
- Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
17
 
18
- Results files should have the following format and be stored as json files:
19
  ```json
20
  {
21
- "config": {
22
- "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
23
- "model_name": "path of the model on the hub: org/model",
24
- "model_sha": "revision on the hub",
25
- },
26
- "results": {
27
- "task_name": {
28
- "metric_name": score,
29
- },
30
- "task_name2": {
31
- "metric_name": score,
32
- }
33
- }
34
  }
35
  ```
36
 
37
- Request files are created automatically by this tool.
38
 
39
- If you encounter problem on the space, don't hesitate to restart it to remove the create eval-queue, eval-queue-bk, eval-results and eval-results-bk created folder.
40
 
41
- # Code logic for more complex edits
42
 
43
- You'll find
44
- - the main table' columns names and properties in `src/display/utils.py`
45
- - the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
46
- - the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
 
1
+
2
  ---
3
  title: TRAIL
4
  emoji: 🥇
 
8
  app_file: app.py
9
  pinned: true
10
  license: mit
11
+ short_description: 'TRAIL: Trace Reasoning and Agentic Issue Localization'
12
  sdk_version: 5.19.0
13
  ---
14
+ # Model Performance Leaderboard
15
+
16
+ This is a Hugging Face Space that hosts a leaderboard for comparing model performance across the metrics of the TRAIL dataset.
17
+
18
+ ## Features
19
+
20
+ - **Submit Model Results**: Share your model's performance metrics
21
+ - **Interactive Leaderboard**: View and sort all submissions
22
+ - **Integrated Backend**: Stores all submissions with timestamp and attribution
23
+ - **Customizable Metrics**: Configure which metrics to display and track
24
+
25
+ ## Installation
26
+
27
+ ### Setting Up Your Space
28
+
29
+ 1. Upload all files to your Hugging Face Space
30
+ 2. Make sure to make `start.sh` executable:
31
+ ```bash
32
+ chmod +x start.sh
33
+ ```
34
+ 3. Configure your Space to use the `start.sh` script as the entry point
35
+
36
+ ### Troubleshooting Installation Issues
37
 
38
+ If you encounter JSON parsing errors:
39
+ 1. Check if `models.json` exists and is a valid JSON file
40
+ 2. Run `python setup.py` to regenerate configuration files
41
+ 3. If problems persist, delete the `models.json` file and let the setup script create a new one
42
 
43
+ ## How to Use
44
+
45
+ ### Viewing the Leaderboard
46
+
47
+ Navigate to the "Leaderboard" tab to see all submitted models. You can:
48
+ - Sort by any metric (click on the dropdown)
49
+ - Change sort order (ascending/descending)
50
+ - Refresh the leaderboard for the latest submissions
51
+
52
+ ### Submitting a Model
53
+
54
+ 1. Go to the "Submit Model" tab
55
+ 2. Fill in your model name, your name, and optional description
56
+ 3. Enter values for the requested metrics
57
+ 4. Click "Submit Model"
58
+
59
+ ## Configuration
60
+
61
+ You can customize this leaderboard by modifying the `models.json` file:
62
 
 
63
  ```json
64
  {
65
+ "title": "TRAIL Performance Leaderboard",
66
+ "description": "This leaderboard tracks and compares model performance across multiple metrics. Submit your model results to see how they stack up!",
67
+ "metrics": ["accuracy", "f1_score", "precision", "recall"],
68
+ "main_metric": "accuracy"
69
  }
70
  ```
71
 
72
+ - `title`: The title of your leaderboard
73
+ - `description`: A description that appears at the top
74
+ - `metrics`: List of metrics to track
75
+ - `main_metric`: Default metric for sorting
76
+
77
+ ## Technical Details
78
 
79
+ This leaderboard is built using:
80
+ - Gradio for the UI components
81
+ - A file-based database to store submissions
82
+ - Pandas for data manipulation and display
83
 
84
+ ## License
85
 
86
+ This project is open source and available under the MIT license.
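
As a small illustration of the Configuration section above: a consumer of this Space can read `models.json` and sort a results table by the configured main metric. The `sort_by_main_metric` helper below is a hypothetical sketch written against this commit's file layout, not code that ships with it; note that the bundled CSVs use slightly different column names (`Joint F1`, `Categorical F1`, `Location Accuracy`) than the metric keys in `models.json`.

```python
import json

import pandas as pd


def sort_by_main_metric(csv_path: str, config_path: str = "models.json") -> pd.DataFrame:
    """Hypothetical helper: sort a leaderboard CSV by the main metric in models.json."""
    with open(config_path) as f:
        config = json.load(f)
    df = pd.read_csv(csv_path)
    metric = config.get("main_metric")
    # If the configured metric is not an actual column (the metric keys in
    # models.json differ from the CSV headers), return the table unsorted.
    if metric in df.columns:
        df = df.sort_values(metric, ascending=False)
    return df


if __name__ == "__main__":
    print(sort_by_main_metric("leaderboard_gaia.csv").head())
```
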
app.py CHANGED
@@ -1,204 +1,297 @@
1
  import gradio as gr
2
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
  import pandas as pd
4
- from apscheduler.schedulers.background import BackgroundScheduler
5
- from huggingface_hub import snapshot_download
6
-
7
- from src.about import (
8
- CITATION_BUTTON_LABEL,
9
- CITATION_BUTTON_TEXT,
10
- EVALUATION_QUEUE_TEXT,
11
- INTRODUCTION_TEXT,
12
- LLM_BENCHMARKS_TEXT,
13
- TITLE,
14
- )
15
- from src.display.css_html_js import custom_css
16
- from src.display.utils import (
17
- BENCHMARK_COLS,
18
- COLS,
19
- EVAL_COLS,
20
- EVAL_TYPES,
21
- AutoEvalColumn,
22
- ModelType,
23
- fields,
24
- WeightType,
25
- Precision
26
- )
27
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
- from src.submission.submit import add_new_eval
30
-
31
-
32
- def restart_space():
33
- API.restart_space(repo_id=REPO_ID)
34
-
35
- ### Space initialisation
36
- try:
37
- print(EVAL_REQUESTS_PATH)
38
- snapshot_download(
39
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
40
- )
41
- except Exception:
42
- restart_space()
43
- try:
44
- print(EVAL_RESULTS_PATH)
45
- snapshot_download(
46
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
47
- )
48
- except Exception:
49
- restart_space()
50
-
51
-
52
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
53
-
54
- (
55
- finished_eval_queue_df,
56
- running_eval_queue_df,
57
- pending_eval_queue_df,
58
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
-
60
- def init_leaderboard(dataframe):
61
- if dataframe is None or dataframe.empty:
62
- raise ValueError("Leaderboard DataFrame is empty or None.")
63
- return Leaderboard(
64
- value=dataframe,
65
- datatype=[c.type for c in fields(AutoEvalColumn)],
66
- select_columns=SelectColumns(
67
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
68
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
- label="Select Columns to Display:",
70
- ),
71
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
- filter_columns=[
74
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
- ColumnFilter(
77
- AutoEvalColumn.params.name,
78
- type="slider",
79
- min=0.01,
80
- max=150,
81
- label="Select the number of parameters (B)",
82
- ),
83
- ColumnFilter(
84
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
- ),
86
- ],
87
- bool_checkboxgroup_label="Hide models",
88
- interactive=False,
89
- )
90
-
91
-
92
- demo = gr.Blocks(css=custom_css)
93
- with demo:
94
- gr.HTML(TITLE)
95
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
-
97
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
- leaderboard = init_leaderboard(LEADERBOARD_DF)
100
-
101
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
-
104
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
105
- with gr.Column():
106
- with gr.Row():
107
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
108
-
109
- with gr.Column():
110
- with gr.Accordion(
111
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
112
- open=False,
113
- ):
114
- with gr.Row():
115
- finished_eval_table = gr.components.Dataframe(
116
- value=finished_eval_queue_df,
117
- headers=EVAL_COLS,
118
- datatype=EVAL_TYPES,
119
- row_count=5,
120
- )
121
- with gr.Accordion(
122
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
123
- open=False,
124
- ):
125
- with gr.Row():
126
- running_eval_table = gr.components.Dataframe(
127
- value=running_eval_queue_df,
128
- headers=EVAL_COLS,
129
- datatype=EVAL_TYPES,
130
- row_count=5,
131
- )
132
-
133
- with gr.Accordion(
134
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
135
- open=False,
136
- ):
137
- with gr.Row():
138
- pending_eval_table = gr.components.Dataframe(
139
- value=pending_eval_queue_df,
140
- headers=EVAL_COLS,
141
- datatype=EVAL_TYPES,
142
- row_count=5,
143
- )
144
- with gr.Row():
145
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
146
-
147
- with gr.Row():
148
- with gr.Column():
149
- model_name_textbox = gr.Textbox(label="Model name")
150
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
151
- model_type = gr.Dropdown(
152
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
153
- label="Model type",
154
- multiselect=False,
155
- value=None,
156
- interactive=True,
157
- )
158
 
159
- with gr.Column():
160
- precision = gr.Dropdown(
161
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
162
- label="Precision",
163
- multiselect=False,
164
- value="float16",
165
- interactive=True,
166
- )
167
- weight_type = gr.Dropdown(
168
- choices=[i.value.name for i in WeightType],
169
- label="Weights type",
170
- multiselect=False,
171
- value="Original",
172
- interactive=True,
173
- )
174
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
175
-
176
- submit_button = gr.Button("Submit Eval")
177
- submission_result = gr.Markdown()
178
- submit_button.click(
179
- add_new_eval,
180
- [
181
- model_name_textbox,
182
- base_model_name_textbox,
183
- revision_name_textbox,
184
- precision,
185
- weight_type,
186
- model_type,
187
- ],
188
- submission_result,
189
190
 
191
  with gr.Row():
192
- with gr.Accordion("📙 Citation", open=False):
193
- citation_button = gr.Textbox(
194
- value=CITATION_BUTTON_TEXT,
195
- label=CITATION_BUTTON_LABEL,
196
- lines=20,
197
- elem_id="citation-button",
198
- show_copy_button=True,
199
- )
200
-
201
- scheduler = BackgroundScheduler()
202
- scheduler.add_job(restart_space, "interval", seconds=1800)
203
- scheduler.start()
204
- demo.queue(default_concurrency_limit=40).launch()
1
  import gradio as gr
 
2
  import pandas as pd
3
+ import os
4
+ import shutil
5
 
6
+ # Function to load leaderboard data from a CSV file
7
+ def load_leaderboard_data(csv_file_path):
8
+ try:
9
+ df = pd.read_csv(csv_file_path)
10
+ return df
11
+ except Exception as e:
12
+ print(f"Error loading CSV file: {e}")
13
+ return pd.DataFrame() # Return an empty DataFrame in case of error
14
+
15
+ # Function to process uploaded JSON file
16
+ def process_json_file(json_file):
17
+ try:
18
+ # Read the JSON file
19
+ data = pd.read_json(json_file.name)
20
+ # Here you can process the data as needed
21
+ # For demonstration, we'll just return the data as a dictionary
22
+ return data.to_dict()
23
+ except Exception as e:
24
+ return {"error": str(e)}
25
+
26
+ # Load the leaderboard data
27
+ leaderboard1 = load_leaderboard_data("leaderboard_swe.csv")
28
+ leaderboard2 = load_leaderboard_data("leaderboard_gaia.csv")
29
+
30
+ # Function to save the uploaded JSON file
31
+ def save_json_file(file_path):
32
+ if not file_path:
33
+ return "No file uploaded."
34
+
35
+ # Define the directory to save uploaded files
36
+ save_dir = "uploaded_jsons"
37
+ os.makedirs(save_dir, exist_ok=True)
38
+
39
+ # Extract the original filename
40
+ original_filename = os.path.basename(file_path)
41
 
42
+ # Define the path to save the file
43
+ save_path = os.path.join(save_dir, original_filename)
44
+
45
+ # Move the uploaded file to the save directory
46
+ shutil.move(file_path, save_path)
47
+
48
+ return f"File saved to {save_path}"
49
+
50
+ # Create the Gradio interface
51
+ with gr.Blocks() as demo:
52
+ gr.Markdown("# 🥇 Leaderboards")
53
  with gr.Row():
54
+ with gr.Column():
55
+ gr.Markdown("## TRAIL-SWE Leaderboard")
56
+ gr.Dataframe(leaderboard1)
57
+ with gr.Column():
58
+ gr.Markdown("## TRAIL-GAIA Leaderboard")
59
+ gr.Dataframe(leaderboard2)
60
+
61
+ """
62
+ gr.Markdown("# Submit Here")
63
+ with gr.Row():
64
+ json_input = gr.File(label="Upload JSON File", type="filepath")
65
+ json_output = gr.JSON(label="Processed Output")
66
+ submit_button = gr.Button("Submit")
67
+ submit_button.click(process_json_file, inputs=json_input, outputs=json_output)
68
+ """
69
+ with gr.Blocks() as submit_page:
70
+ gr.Markdown("## Submit Your JSON File Here")
71
+ file_input = gr.File(label="Upload JSON File", type="filepath", file_types=['.json'])
72
+ submit_button = gr.Button("Submit", interactive=True)
73
+ output = gr.Textbox("")  # Displays the result message, e.g. "Successfully submitted! Thank you for your contribution!"
74
+ submit_button.click(fn=save_json_file, inputs=file_input, outputs=output)
75
+
76
+
77
+ if __name__ == "__main__":
78
+ demo.launch()
79
+
80
+
81
+
82
+ """
83
+ import gradio as gr
84
+ import pandas as pd
85
+ import os
86
+ import json
87
+ import uuid
88
+ import hashlib
89
+ from datetime import datetime
90
+ from huggingface_hub import HfApi, login, HfFolder
91
+
92
+ # Configuration
93
+ LEADERBOARD_CSV = "leaderboard.csv"
94
+ SUBMISSIONS_FOLDER = "submissions"
95
+ CONFIG_FILE = "config.json"
96
+ DEFAULT_COLUMNS = ["rank", "submission_name", "score", "user", "timestamp"]
97
+ VERIFY_USERS = False # Set to True to enable HF authentication
98
+
99
+ # Default configuration
100
+ DEFAULT_CONFIG = {
101
+ "title": "Hugging Face Competition Leaderboard",
102
+ "description": "Submit your results for the competition",
103
+ "metric_name": "Score",
104
+ "higher_is_better": True,
105
+ "max_submissions_per_user": 5,
106
+ "allow_submission_edits": True
107
+ }
108
+
109
+ # Ensure submissions folder exists
110
+ os.makedirs(SUBMISSIONS_FOLDER, exist_ok=True)
111
+
112
+ # Load or create config
113
+ if os.path.exists(CONFIG_FILE):
114
+ with open(CONFIG_FILE, "r") as f:
115
+ config = json.load(f)
116
+ else:
117
+ config = DEFAULT_CONFIG
118
+ with open(CONFIG_FILE, "w") as f:
119
+ json.dump(config, f, indent=2)
120
+
121
+ # Initialize leaderboard if it doesn't exist
122
+ if not os.path.exists(LEADERBOARD_CSV):
123
+ pd.DataFrame(columns=DEFAULT_COLUMNS).to_csv(LEADERBOARD_CSV, index=False)
124
+
125
+ def read_leaderboard():
126
+ #Read the current leaderboard
127
+ if os.path.exists(LEADERBOARD_CSV):
128
+ df = pd.read_csv(LEADERBOARD_CSV)
129
+ return df
130
+ return pd.DataFrame(columns=DEFAULT_COLUMNS)
131
+
132
+ def verify_user(username, token):
133
+ #Verify a user with their Hugging Face token
134
+ if not VERIFY_USERS:
135
+ return True
136
+
137
+ try:
138
+ api = HfApi(token=token)
139
+ user_info = api.whoami()
140
+ return user_info["name"] == username
141
+ except:
142
+ return False
143
+
144
+ def count_user_submissions(username):
145
+ #Count how many submissions a user already has
146
+ df = read_leaderboard()
147
+ return len(df[df["user"] == username])
148
+
149
+ def update_leaderboard():
150
+ #Update the leaderboard based on submissions
151
+ # Read all submissions
152
+ submissions = []
153
+ for filename in os.listdir(SUBMISSIONS_FOLDER):
154
+ if filename.endswith(".json"):
155
+ with open(os.path.join(SUBMISSIONS_FOLDER, filename), "r") as f:
156
+ try:
157
+ data = json.load(f)
158
+ submissions.append(data)
159
+ except json.JSONDecodeError:
160
+ print(f"Error decoding {filename}")
161
+
162
+ if not submissions:
163
+ return pd.DataFrame(columns=DEFAULT_COLUMNS)
164
+
165
+ # Create dataframe and sort by score
166
+ df = pd.DataFrame(submissions)
167
+
168
+ # Sort based on configuration (higher or lower is better)
169
+ ascending = not config.get("higher_is_better", True)
170
+ df = df.sort_values("score", ascending=ascending)
171
+
172
+ # Add rank
173
+ df["rank"] = range(1, len(df) + 1)
174
+
175
+ # Save updated leaderboard
176
+ df.to_csv(LEADERBOARD_CSV, index=False)
177
+ return df
178
+
179
+ def submit(submission_name, score, username, hf_token="", submission_details=None):
180
+ #Add a new submission to the leaderboard
181
+ if not submission_name or not username:
182
+ return "Submission name and username are required", None
183
+
184
+ try:
185
+ score = float(score)
186
+ except ValueError:
187
+ return "Score must be a valid number", None
188
+
189
+ # Verify user if enabled
190
+ if VERIFY_USERS and not verify_user(username, hf_token):
191
+ return "Invalid Hugging Face credentials", None
192
+
193
+ # Check submission limit
194
+ max_submissions = config.get("max_submissions_per_user", 5)
195
+ if count_user_submissions(username) >= max_submissions:
196
+ return f"You've reached the maximum of {max_submissions} submissions", None
197
+
198
+ # Create submission entry
199
+ submission_id = str(uuid.uuid4())[:8]
200
+ submission = {
201
+ "submission_id": submission_id,
202
+ "submission_name": submission_name,
203
+ "score": score,
204
+ "user": username,
205
+ "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
206
+ }
207
+
208
+ # Add optional details
209
+ if submission_details:
210
+ submission["details"] = submission_details
211
+
212
+ # Save submission to file
213
+ filename = f"{username}_{submission_name.replace(' ', '_')}_{submission_id}.json"
214
+ with open(os.path.join(SUBMISSIONS_FOLDER, filename), "w") as f:
215
+ json.dump(submission, f)
216
+
217
+ # Update leaderboard
218
+ leaderboard = update_leaderboard()
219
+ return f"Submission '{submission_name}' added successfully!", leaderboard
220
+
221
+ def render_leaderboard():
222
+ #Display the current leaderboard
223
+ df = update_leaderboard()
224
+ if len(df) == 0:
225
+ return "No submissions yet."
226
+
227
+ # Format the dataframe for display
228
+ display_df = df[DEFAULT_COLUMNS].copy()
229
+ return display_df
230
+
231
+ # Create the Gradio interface
232
+ with gr.Blocks(title=config["title"]) as demo:
233
+ gr.Markdown(f"# {config['title']}")
234
+ gr.Markdown(f"{config['description']}")
235
+
236
+ with gr.Tab("Leaderboard"):
237
+ gr.Markdown("## Current Rankings")
238
+ metric_name = config.get("metric_name", "Score")
239
+ higher_better = "higher is better" if config.get("higher_is_better", True) else "lower is better"
240
+ gr.Markdown(f"*Ranked by {metric_name} ({higher_better})*")
241
+
242
+ leaderboard_output = gr.Dataframe(
243
+ headers=["Rank", "Submission", metric_name, "User", "Timestamp"],
244
+ datatype=["number", "str", "number", "str", "str"],
245
+ interactive=False
246
+ )
247
+ refresh_btn = gr.Button("Refresh Leaderboard")
248
+ refresh_btn.click(render_leaderboard, inputs=[], outputs=[leaderboard_output])
249
+
250
+ with gr.Tab("Submit"):
251
+ gr.Markdown("## Submit Your Results")
252
+ with gr.Row():
253
+ with gr.Column():
254
+ submission_name = gr.Textbox(label="Submission Name", placeholder="MyAwesomeModel v1.0")
255
+ score = gr.Number(label=metric_name, precision=4)
256
+ username = gr.Textbox(label="Username", placeholder="Your Hugging Face username")
257
+
258
+ # Only show token field if verification is enabled
259
+ if VERIFY_USERS:
260
+ hf_token = gr.Textbox(
261
+ label="Hugging Face Token",
262
+ placeholder="hf_...",
263
+ type="password"
264
+ )
265
+ else:
266
+ hf_token = gr.Textbox(visible=False)
267
+
268
+ submission_details = gr.Textbox(
269
+ label="Additional Details (optional)",
270
+ placeholder="Model details, training info, etc.",
271
+ lines=5
272
+ )
273
+ submit_btn = gr.Button("Submit to Leaderboard")
274
+
275
+ submit_output = gr.Markdown()
276
+ submission_leaderboard = gr.Dataframe(
277
+ headers=["Rank", "Submission", metric_name, "User", "Timestamp"],
278
+ datatype=["number", "str", "number", "str", "str"],
279
+ interactive=False
280
+ )
281
+
282
+ submit_btn.click(
283
+ submit,
284
+ inputs=[submission_name, score, username, hf_token, submission_details],
285
+ outputs=[submit_output, submission_leaderboard]
286
+ )
287
+
288
+ # Add admin tab if desired
289
+ with gr.Tab("About"):
290
+ gr.Markdown("## About This Leaderboard")
291
+
292
+ # Initialize the leaderboard on load
293
+ demo.load(render_leaderboard, inputs=[], outputs=[leaderboard_output])
294
+
295
+ if __name__ == "__main__":
296
+ demo.launch()
297
+ """
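
A quick way to smoke-test the rewritten `app.py` above is to call its two helpers directly from the repository root (so the bundled CSVs resolve). Note that `submit_page` is a second `gr.Blocks` defined after `demo` and is never launched, so `demo.launch()` only serves the two leaderboard tables. The sketch below is illustrative; `example_submission.json` is a placeholder for whatever JSON file you want to test with, not a file in this commit.

```python
# Sketch: exercise the helpers defined in the new app.py without launching Gradio.
# Run from the repository root so leaderboard_swe.csv / leaderboard_gaia.csv resolve.
from app import load_leaderboard_data, save_json_file

print(load_leaderboard_data("leaderboard_gaia.csv").head())

# Moves the given file into ./uploaded_jsons/ and returns a confirmation string,
# or "No file uploaded." if the path is empty.
print(save_json_file("example_submission.json"))
```
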
app_old.py ADDED
@@ -0,0 +1,204 @@
1
+ import gradio as gr
2
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
+ import pandas as pd
4
+ from apscheduler.schedulers.background import BackgroundScheduler
5
+ from huggingface_hub import snapshot_download
6
+
7
+ from src.about import (
8
+ CITATION_BUTTON_LABEL,
9
+ CITATION_BUTTON_TEXT,
10
+ EVALUATION_QUEUE_TEXT,
11
+ INTRODUCTION_TEXT,
12
+ LLM_BENCHMARKS_TEXT,
13
+ TITLE,
14
+ )
15
+ from src.display.css_html_js import custom_css
16
+ from src.display.utils import (
17
+ BENCHMARK_COLS,
18
+ COLS,
19
+ EVAL_COLS,
20
+ EVAL_TYPES,
21
+ AutoEvalColumn,
22
+ ModelType,
23
+ fields,
24
+ WeightType,
25
+ Precision
26
+ )
27
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
+ from src.submission.submit import add_new_eval
30
+
31
+
32
+ def restart_space():
33
+ API.restart_space(repo_id=REPO_ID)
34
+
35
+ ### Space initialisation
36
+ try:
37
+ print(EVAL_REQUESTS_PATH)
38
+ snapshot_download(
39
+ repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
40
+ )
41
+ except Exception:
42
+ restart_space()
43
+ try:
44
+ print(EVAL_RESULTS_PATH)
45
+ snapshot_download(
46
+ repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
47
+ )
48
+ except Exception:
49
+ restart_space()
50
+
51
+
52
+ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
53
+
54
+ (
55
+ finished_eval_queue_df,
56
+ running_eval_queue_df,
57
+ pending_eval_queue_df,
58
+ ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
+
60
+ def init_leaderboard(dataframe):
61
+ if dataframe is None or dataframe.empty:
62
+ raise ValueError("Leaderboard DataFrame is empty or None.")
63
+ return Leaderboard(
64
+ value=dataframe,
65
+ datatype=[c.type for c in fields(AutoEvalColumn)],
66
+ select_columns=SelectColumns(
67
+ default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
68
+ cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
+ label="Select Columns to Display:",
70
+ ),
71
+ search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
+ hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
+ filter_columns=[
74
+ ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
+ ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
+ ColumnFilter(
77
+ AutoEvalColumn.params.name,
78
+ type="slider",
79
+ min=0.01,
80
+ max=150,
81
+ label="Select the number of parameters (B)",
82
+ ),
83
+ ColumnFilter(
84
+ AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
+ ),
86
+ ],
87
+ bool_checkboxgroup_label="Hide models",
88
+ interactive=False,
89
+ )
90
+
91
+
92
+ demo = gr.Blocks(css=custom_css)
93
+ with demo:
94
+ gr.HTML(TITLE)
95
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
+
97
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
+ with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
+ leaderboard = init_leaderboard(LEADERBOARD_DF)
100
+
101
+ with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
+
104
+ with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
105
+ with gr.Column():
106
+ with gr.Row():
107
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
108
+
109
+ with gr.Column():
110
+ with gr.Accordion(
111
+ f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
112
+ open=False,
113
+ ):
114
+ with gr.Row():
115
+ finished_eval_table = gr.components.Dataframe(
116
+ value=finished_eval_queue_df,
117
+ headers=EVAL_COLS,
118
+ datatype=EVAL_TYPES,
119
+ row_count=5,
120
+ )
121
+ with gr.Accordion(
122
+ f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
123
+ open=False,
124
+ ):
125
+ with gr.Row():
126
+ running_eval_table = gr.components.Dataframe(
127
+ value=running_eval_queue_df,
128
+ headers=EVAL_COLS,
129
+ datatype=EVAL_TYPES,
130
+ row_count=5,
131
+ )
132
+
133
+ with gr.Accordion(
134
+ f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
135
+ open=False,
136
+ ):
137
+ with gr.Row():
138
+ pending_eval_table = gr.components.Dataframe(
139
+ value=pending_eval_queue_df,
140
+ headers=EVAL_COLS,
141
+ datatype=EVAL_TYPES,
142
+ row_count=5,
143
+ )
144
+ with gr.Row():
145
+ gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
146
+
147
+ with gr.Row():
148
+ with gr.Column():
149
+ model_name_textbox = gr.Textbox(label="Model name")
150
+ revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
151
+ model_type = gr.Dropdown(
152
+ choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
153
+ label="Model type",
154
+ multiselect=False,
155
+ value=None,
156
+ interactive=True,
157
+ )
158
+
159
+ with gr.Column():
160
+ precision = gr.Dropdown(
161
+ choices=[i.value.name for i in Precision if i != Precision.Unknown],
162
+ label="Precision",
163
+ multiselect=False,
164
+ value="float16",
165
+ interactive=True,
166
+ )
167
+ weight_type = gr.Dropdown(
168
+ choices=[i.value.name for i in WeightType],
169
+ label="Weights type",
170
+ multiselect=False,
171
+ value="Original",
172
+ interactive=True,
173
+ )
174
+ base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
175
+
176
+ submit_button = gr.Button("Submit Eval")
177
+ submission_result = gr.Markdown()
178
+ submit_button.click(
179
+ add_new_eval,
180
+ [
181
+ model_name_textbox,
182
+ base_model_name_textbox,
183
+ revision_name_textbox,
184
+ precision,
185
+ weight_type,
186
+ model_type,
187
+ ],
188
+ submission_result,
189
+ )
190
+
191
+ with gr.Row():
192
+ with gr.Accordion("📙 Citation", open=False):
193
+ citation_button = gr.Textbox(
194
+ value=CITATION_BUTTON_TEXT,
195
+ label=CITATION_BUTTON_LABEL,
196
+ lines=20,
197
+ elem_id="citation-button",
198
+ show_copy_button=True,
199
+ )
200
+
201
+ scheduler = BackgroundScheduler()
202
+ scheduler.add_job(restart_space, "interval", seconds=1800)
203
+ scheduler.start()
204
+ demo.queue(default_concurrency_limit=40).launch()
config.json ADDED
@@ -0,0 +1,8 @@
1
+ {
2
+ "title": "TRAIL Performance Leaderboard",
3
+ "description": "Submit your results for the TRAIL Trace Reasoning and Issue Localization competition. Models are evaluated on a combination of Categorical F1 and Location Accuracy (Joint F1)",
4
+ "metric_name": "F1 Score",
5
+ "higher_is_better": true,
6
+ "max_submissions_per_user": 3,
7
+ "allow_submission_edits": false
8
+ }
database.py ADDED
@@ -0,0 +1,98 @@
1
+ import os
2
+ import json
3
+ import datetime
4
+ from pathlib import Path
5
+ import numpy as np
6
+
7
+ class Database:
8
+ def __init__(self, submission_dir="submissions"):
9
+ self.submission_dir = submission_dir
10
+ os.makedirs(submission_dir, exist_ok=True)
11
+
12
+ def add_submission(self, submission):
13
+ """Add a new submission to the database"""
14
+ # Generate a timestamp and ID for the submission
15
+ timestamp = datetime.datetime.now().isoformat()
16
+ submission_id = f"{submission['model_name'].replace(' ', '_')}_{timestamp.replace(':', '-')}"
17
+
18
+ # Add timestamp and ID to submission
19
+ submission['timestamp'] = timestamp
20
+ submission['id'] = submission_id
21
+
22
+ # Save submission to a JSON file
23
+ file_path = os.path.join(self.submission_dir, f"{submission_id}.json")
24
+ with open(file_path, 'w') as f:
25
+ json.dump(submission, f, indent=2)
26
+
27
+ return submission_id
28
+
29
+ def get_submission(self, submission_id):
30
+ """Get a specific submission by ID"""
31
+ file_path = os.path.join(self.submission_dir, f"{submission_id}.json")
32
+ if os.path.exists(file_path):
33
+ with open(file_path, 'r') as f:
34
+ return json.load(f)
35
+ return None
36
+
37
+ def get_all_submissions(self):
38
+ """Get all submissions"""
39
+ submissions = []
40
+ for file_name in os.listdir(self.submission_dir):
41
+ if file_name.endswith('.json'):
42
+ file_path = os.path.join(self.submission_dir, file_name)
43
+ with open(file_path, 'r') as f:
44
+ submissions.append(json.load(f))
45
+ return submissions
46
+
47
+ def get_leaderboard(self, sort_by="score", ascending=False):
48
+ """Get submissions sorted for leaderboard display"""
49
+ submissions = self.get_all_submissions()
50
+
51
+ # Make sure we have submissions to sort
52
+ if not submissions:
53
+ return []
54
+
55
+ # Sort submissions
56
+ if sort_by in submissions[0]:
57
+ submissions.sort(key=lambda x: x.get(sort_by, 0), reverse=not ascending)
58
+
59
+ return submissions
60
+
61
+ def delete_submission(self, submission_id):
62
+ """Delete a submission by ID"""
63
+ file_path = os.path.join(self.submission_dir, f"{submission_id}.json")
64
+ if os.path.exists(file_path):
65
+ os.remove(file_path)
66
+ return True
67
+ return False
68
+
69
+ # Load leaderboard configuration
70
+ def load_config():
71
+ try:
72
+ if os.path.exists("models.json") and os.path.getsize("models.json") > 0:
73
+ with open("models.json", "r") as f:
74
+ return json.load(f)
75
+ else:
76
+ print("models.json file is empty or missing. Creating with default configuration.")
77
+ # Default configuration
78
+ config = {
79
+ "title": "TRAIL Model Leaderboard",
80
+ "description": "Submit and compare model performances",
81
+ "metrics": ["Cat. F1", "Loc. Acc", "Joint F1"],
82
+ "main_metric": "Cat. F1"
83
+ }
84
+ with open("models.json", "w") as f:
85
+ json.dump(config, f, indent=2)
86
+ return config
87
+ except json.JSONDecodeError:
88
+ print("Error parsing models.json. Creating with default configuration.")
89
+ # Default configuration if JSON is invalid
90
+ config = {
91
+ "title": "TRAIL Model Leaderboard",
92
+ "description": "Submit and compare model performances",
93
+ "metrics": ["Cat. F1", "Loc. Acc", "Joint F1"],
94
+ "main_metric": "Cat. F1"
95
+ }
96
+ with open("models.json", "w") as f:
97
+ json.dump(config, f, indent=2)
98
+ return config
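
For reference, a minimal sketch of how the `Database` class and `load_config()` added above might be driven; the model name, score, and user below are placeholder values, not real submissions.

```python
# Sketch: drive the new database.py helpers with placeholder data.
from database import Database, load_config

db = Database(submission_dir="submissions")
submission_id = db.add_submission({
    "model_name": "Example Model",  # placeholder entry
    "score": 0.123,
    "user": "example-user",
})
print(db.get_submission(submission_id))

# get_leaderboard sorts by the given key, highest first by default.
for entry in db.get_leaderboard(sort_by="score"):
    print(entry["model_name"], entry["score"])

# load_config() returns models.json, writing a default config if it is missing or invalid.
print(load_config()["main_metric"])
```
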
leaderboard_gaia.csv ADDED
@@ -0,0 +1,9 @@
1
+ Rank,Model,Joint F1,Categorical F1,Location Accuracy,Date
2
+ 1,Gemini-2.5-Pro-Preview-05-06,0.183,0.389,0.546,2025-05-14
3
+ 2,Gemini-2.5-Flash-Preview-04-17,0.100,0.337,0.372,2025-05-14
4
+ 3,Open AI o3,0.092,0.296,0.535,2025-05-14
5
+ 4,Anthropic Claude-3.7-Sonnet,0.047,0.254,0.204,2025-05-14
6
+ 5,GPT-4.1,0.028,0.218,0.107,2025-05-14
7
+ 6,Open AI o1,0.013,0.138,0.040,2025-05-14
8
+ 7,Llama-4-Maverick-17B-128E-Instruct,0.122,0.023,0.000,2025-05-14
9
+ 8,Llama-4-Scout-17B-16E-Instruct,0.041,0.000,0.000,2025-05-14
leaderboard_swe.csv ADDED
@@ -0,0 +1,10 @@
1
+ Rank,Model,Joint F1,Categorical F1,Location Accuracy,Date
2
+ 1,Gemini-2.5-Pro-Preview-05-06,0.050,0.148,0.238,2025-05-14
3
+ 2,Gemini-2.5-Flash-Preview-04-17,0.000,0.213,0.060,2025-05-14
4
+ 3,Llama-4-Maverick-17B-128E-Instruct,0.000,0.191,0.083,2025-05-14
5
+ 4,GPT-4.1,0.000,0.166,0.000,2025-05-14
6
+ 5,Llama-4-Scout-17B-16E-Instruct,0.000,0.050,0.000,2025-05-14
7
+ 6,Open AI o1,CLE,CLE,CLE,2025-05-14
8
+ 7,Open AI o3,CLE,CLE,CLE,2025-05-14
9
+ 8,Anthropic Claude-3.7-Sonnet,CLE,CLE,CLE,2025-05-14
10
+
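
One practical note on the two CSVs added in this commit: several rows in `leaderboard_swe.csv` carry the placeholder value `CLE` instead of a number, so the metric columns load as strings. A small sketch of coercing them before any numeric sorting:

```python
import pandas as pd

# Load the two result tables added in this commit.
gaia = pd.read_csv("leaderboard_gaia.csv")
swe = pd.read_csv("leaderboard_swe.csv")

# "CLE" rows in the SWE table are non-numeric; coerce them to NaN so the
# metric columns can be sorted numerically.
metrics = ["Joint F1", "Categorical F1", "Location Accuracy"]
swe[metrics] = swe[metrics].apply(pd.to_numeric, errors="coerce")

print(swe.sort_values("Joint F1", ascending=False))
```
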
model ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "title": "TRAIL Performance Leaderboard",
3
+ "description": "This leaderboard tracks and compares model performance across multiple metrics. Submit your model results to see how they stack up!",
4
+ "metrics": ["Cat. F1", "Loc. Acc", "Joint F1"],
5
+ "main_metric": "Cat. F1"
6
+ }
models.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "title": "TRAIL Performance Leaderboard",
3
+ "description": "This leaderboard tracks and compares model performance across multiple metrics. Submit your model results to see how they stack up!",
4
+ "metrics": ["Cat. F1", "Loc. Acc", "Joint F1"],
5
+ "main_metric": "Cat. F1"
6
+ }
requirements.txt CHANGED
@@ -13,4 +13,7 @@ python-dateutil
13
  tqdm
14
  transformers
15
  tokenizers>=0.15.0
16
- sentencepiece
13
  tqdm
14
  transformers
15
  tokenizers>=0.15.0
16
+ sentencepiece
17
+ numpy>=1.24.3
18
+ pandas
19
+ huggingface_hub
setup.py ADDED
@@ -0,0 +1,51 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Setup script to ensure all necessary files and directories are created
4
+ before running the application.
5
+ """
6
+
7
+ import os
8
+ import json
9
+ import sys
10
+
11
+ def setup():
12
+ """Create necessary directories and files if they don't exist."""
13
+ print("Setting up leaderboard application...")
14
+
15
+ # Create submissions directory
16
+ if not os.path.exists("submissions"):
17
+ print("Creating submissions directory...")
18
+ os.makedirs("submissions", exist_ok=True)
19
+
20
+ # Create models.json if it doesn't exist or is empty
21
+ if not os.path.exists("models.json") or os.path.getsize("models.json") == 0:
22
+ print("Creating models.json configuration file...")
23
+ config = {
24
+ "title": "TRAIL Performance Leaderboard",
25
+ "description": "This leaderboard tracks and compares model performance across multiple metrics. Submit your model results to see how they stack up!",
26
+ "metrics": ["Cat. F1", "Loc. Acc", "Joint F1"],
27
+ "main_metric": "Cat. F1"
28
+ }
29
+ with open("models.json", "w") as f:
30
+ json.dump(config, f, indent=2)
31
+ else:
32
+ # Validate JSON format
33
+ try:
34
+ with open("models.json", "r") as f:
35
+ json.load(f)
36
+ print("models.json exists and is valid.")
37
+ except json.JSONDecodeError:
38
+ print("models.json exists but has invalid JSON. Creating new file...")
39
+ config = {
40
+ "title": "Model Performance Leaderboard",
41
+ "description": "This leaderboard tracks and compares model performance across multiple metrics. Submit your model results to see how they stack up!",
42
+ "metrics": ["Cat. F1", "Loc. Acc", "Joint F1"],
43
+ "main_metric": "Cat. F1"
44
+ }
45
+ with open("models.json", "w") as f:
46
+ json.dump(config, f, indent=2)
47
+
48
+ print("Setup complete.")
49
+
50
+ if __name__ == "__main__":
51
+ setup()
start.sh ADDED
@@ -0,0 +1,6 @@
1
+ #!/bin/bash
2
+ # Run setup script first
3
+ python setup.py
4
+
5
+ # Then start the main application
6
+ python app.py