jitinpatronus committed on
Commit 0380c4f (verified) · 1 Parent(s): 872c476

Upload 23 files

Files changed (12)
  1. README.md +64 -24
  2. app.py +292 -199
  3. app_old.py +204 -0
  4. config.json +8 -0
  5. database.py +98 -0
  6. leaderboard_gaia.csv +9 -0
  7. leaderboard_swe.csv +10 -0
  8. model +6 -0
  9. models.json +6 -0
  10. requirements.txt +4 -1
  11. setup.py +51 -0
  12. start.sh +6 -0
README.md CHANGED
@@ -1,3 +1,4 @@
 
1
  ---
2
  title: TRAIL
3
  emoji: 🥇
@@ -7,40 +8,79 @@ sdk: gradio
7
  app_file: app.py
8
  pinned: true
9
  license: mit
10
- short_description: Leaderboard for TRAIL
11
  sdk_version: 5.19.0
12
  ---
13
 
14
- # Start the configuration
15
 
16
- Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
17
 
18
- Results files should have the following format and be stored as json files:
19
  ```json
20
  {
21
- "config": {
22
- "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
23
- "model_name": "path of the model on the hub: org/model",
24
- "model_sha": "revision on the hub",
25
- },
26
- "results": {
27
- "task_name": {
28
- "metric_name": score,
29
- },
30
- "task_name2": {
31
- "metric_name": score,
32
- }
33
- }
34
  }
35
  ```
36
 
37
- Request files are created automatically by this tool.
38
 
39
- If you encounter problem on the space, don't hesitate to restart it to remove the create eval-queue, eval-queue-bk, eval-results and eval-results-bk created folder.
40
 
41
- # Code logic for more complex edits
42
 
43
- You'll find
44
- - the main table' columns names and properties in `src/display/utils.py`
45
- - the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
46
- - the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
 
1
+
2
  ---
3
  title: TRAIL
4
  emoji: 🥇
 
8
  app_file: app.py
9
  pinned: true
10
  license: mit
11
+ short_description: 'TRAIL: Trace Reasoning and Agentic Issue Localization'
12
  sdk_version: 5.19.0
13
  ---
14
+ # Model Performance Leaderboard
15
+
16
+ This is a Hugging Face Space that hosts a leaderboard for comparing model performance across the metrics of the TRAIL dataset.
17
+
18
+ ## Features
19
+
20
+ - **Submit Model Results**: Share your model's performance metrics
21
+ - **Interactive Leaderboard**: View and sort all submissions
22
+ - **Integrated Backend**: Stores all submissions with timestamp and attribution
23
+ - **Customizable Metrics**: Configure which metrics to display and track
24
+
25
+ ## Installation
26
+
27
+ ### Setting Up Your Space
28
+
29
+ 1. Upload all files to your Hugging Face Space
30
+ 2. Make sure to make `start.sh` executable:
31
+ ```bash
32
+ chmod +x start.sh
33
+ ```
34
+ 3. Configure your Space to use the `start.sh` script as the entry point
35
+
36
+ ### Troubleshooting Installation Issues
37
 
38
+ If you encounter JSON parsing errors:
39
+ 1. Check if `models.json` exists and is a valid JSON file
40
+ 2. Run `python setup.py` to regenerate configuration files
41
+ 3. If problems persist, delete the `models.json` file and let the setup script create a new one
42
 
43
+ ## How to Use
44
+
45
+ ### Viewing the Leaderboard
46
+
47
+ Navigate to the "Leaderboard" tab to see all submitted models. You can:
48
+ - Sort by any metric (click on the dropdown)
49
+ - Change sort order (ascending/descending)
50
+ - Refresh the leaderboard for the latest submissions
51
+
52
+ ### Submitting a Model
53
+
54
+ 1. Go to the "Submit Model" tab
55
+ 2. Fill in your model name, your name, and optional description
56
+ 3. Enter values for the requested metrics
57
+ 4. Click "Submit Model"
58
+
59
+ ## Configuration
60
+
61
+ You can customize this leaderboard by modifying the `models.json` file:
62
 
 
63
  ```json
64
  {
65
+ "title": "TRAIL Performance Leaderboard",
66
+ "description": "This leaderboard tracks and compares model performance across multiple metrics. Submit your model results to see how they stack up!",
67
+ "metrics": ["accuracy", "f1_score", "precision", "recall"],
68
+ "main_metric": "accuracy"
69
  }
70
  ```
71
 
72
+ - `title`: The title of your leaderboard
73
+ - `description`: A description that appears at the top
74
+ - `metrics`: List of metrics to track
75
+ - `main_metric`: Default metric for sorting
76
+
77
+ ## Technical Details
78
 
79
+ This leaderboard is built using:
80
+ - Gradio for the UI components
81
+ - A file-based database to store submissions
82
+ - Pandas for data manipulation and display
83
 
84
+ ## License
85
 
86
+ This project is open source and available under the MIT license.
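
As a small illustration of the Configuration section above: a consumer of this Space can read `models.json` and sort a results table by the configured main metric. The `sort_by_main_metric` helper below is a hypothetical sketch written against this commit's file layout, not code that ships with it; note that the bundled CSVs use slightly different column names (`Joint F1`, `Categorical F1`, `Location Accuracy`) than the metric keys in `models.json`.

```python
import json

import pandas as pd


def sort_by_main_metric(csv_path: str, config_path: str = "models.json") -> pd.DataFrame:
    """Hypothetical helper: sort a leaderboard CSV by the main metric in models.json."""
    with open(config_path) as f:
        config = json.load(f)
    df = pd.read_csv(csv_path)
    metric = config.get("main_metric")
    # If the configured metric is not an actual column (the metric keys in
    # models.json differ from the CSV headers), return the table unsorted.
    if metric in df.columns:
        df = df.sort_values(metric, ascending=False)
    return df


if __name__ == "__main__":
    print(sort_by_main_metric("leaderboard_gaia.csv").head())
```
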
app.py CHANGED
@@ -1,204 +1,297 @@
1
  import gradio as gr
2
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
  import pandas as pd
4
- from apscheduler.schedulers.background import BackgroundScheduler
5
- from huggingface_hub import snapshot_download
6
-
7
- from src.about import (
8
- CITATION_BUTTON_LABEL,
9
- CITATION_BUTTON_TEXT,
10
- EVALUATION_QUEUE_TEXT,
11
- INTRODUCTION_TEXT,
12
- LLM_BENCHMARKS_TEXT,
13
- TITLE,
14
- )
15
- from src.display.css_html_js import custom_css
16
- from src.display.utils import (
17
- BENCHMARK_COLS,
18
- COLS,
19
- EVAL_COLS,
20
- EVAL_TYPES,
21
- AutoEvalColumn,
22
- ModelType,
23
- fields,
24
- WeightType,
25
- Precision
26
- )
27
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
- from src.submission.submit import add_new_eval
30
-
31
-
32
- def restart_space():
33
- API.restart_space(repo_id=REPO_ID)
34
-
35
- ### Space initialisation
36
- try:
37
- print(EVAL_REQUESTS_PATH)
38
- snapshot_download(
39
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
40
- )
41
- except Exception:
42
- restart_space()
43
- try:
44
- print(EVAL_RESULTS_PATH)
45
- snapshot_download(
46
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
47
- )
48
- except Exception:
49
- restart_space()
50
-
51
-
52
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
53
-
54
- (
55
- finished_eval_queue_df,
56
- running_eval_queue_df,
57
- pending_eval_queue_df,
58
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
-
60
- def init_leaderboard(dataframe):
61
- if dataframe is None or dataframe.empty:
62
- raise ValueError("Leaderboard DataFrame is empty or None.")
63
- return Leaderboard(
64
- value=dataframe,
65
- datatype=[c.type for c in fields(AutoEvalColumn)],
66
- select_columns=SelectColumns(
67
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
68
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
- label="Select Columns to Display:",
70
- ),
71
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
- filter_columns=[
74
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
- ColumnFilter(
77
- AutoEvalColumn.params.name,
78
- type="slider",
79
- min=0.01,
80
- max=150,
81
- label="Select the number of parameters (B)",
82
- ),
83
- ColumnFilter(
84
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
- ),
86
- ],
87
- bool_checkboxgroup_label="Hide models",
88
- interactive=False,
89
- )
90
-
91
-
92
- demo = gr.Blocks(css=custom_css)
93
- with demo:
94
- gr.HTML(TITLE)
95
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
-
97
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
- leaderboard = init_leaderboard(LEADERBOARD_DF)
100
-
101
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
-
104
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
105
- with gr.Column():
106
- with gr.Row():
107
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
108
-
109
- with gr.Column():
110
- with gr.Accordion(
111
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
112
- open=False,
113
- ):
114
- with gr.Row():
115
- finished_eval_table = gr.components.Dataframe(
116
- value=finished_eval_queue_df,
117
- headers=EVAL_COLS,
118
- datatype=EVAL_TYPES,
119
- row_count=5,
120
- )
121
- with gr.Accordion(
122
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
123
- open=False,
124
- ):
125
- with gr.Row():
126
- running_eval_table = gr.components.Dataframe(
127
- value=running_eval_queue_df,
128
- headers=EVAL_COLS,
129
- datatype=EVAL_TYPES,
130
- row_count=5,
131
- )
132
-
133
- with gr.Accordion(
134
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
135
- open=False,
136
- ):
137
- with gr.Row():
138
- pending_eval_table = gr.components.Dataframe(
139
- value=pending_eval_queue_df,
140
- headers=EVAL_COLS,
141
- datatype=EVAL_TYPES,
142
- row_count=5,
143
- )
144
- with gr.Row():
145
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
146
-
147
- with gr.Row():
148
- with gr.Column():
149
- model_name_textbox = gr.Textbox(label="Model name")
150
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
151
- model_type = gr.Dropdown(
152
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
153
- label="Model type",
154
- multiselect=False,
155
- value=None,
156
- interactive=True,
157
- )
158
 
159
- with gr.Column():
160
- precision = gr.Dropdown(
161
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
162
- label="Precision",
163
- multiselect=False,
164
- value="float16",
165
- interactive=True,
166
- )
167
- weight_type = gr.Dropdown(
168
- choices=[i.value.name for i in WeightType],
169
- label="Weights type",
170
- multiselect=False,
171
- value="Original",
172
- interactive=True,
173
- )
174
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
175
-
176
- submit_button = gr.Button("Submit Eval")
177
- submission_result = gr.Markdown()
178
- submit_button.click(
179
- add_new_eval,
180
- [
181
- model_name_textbox,
182
- base_model_name_textbox,
183
- revision_name_textbox,
184
- precision,
185
- weight_type,
186
- model_type,
187
- ],
188
- submission_result,
189
190
 
191
  with gr.Row():
192
- with gr.Accordion("📙 Citation", open=False):
193
- citation_button = gr.Textbox(
194
- value=CITATION_BUTTON_TEXT,
195
- label=CITATION_BUTTON_LABEL,
196
- lines=20,
197
- elem_id="citation-button",
198
- show_copy_button=True,
199
- )
200
-
201
- scheduler = BackgroundScheduler()
202
- scheduler.add_job(restart_space, "interval", seconds=1800)
203
- scheduler.start()
204
- demo.queue(default_concurrency_limit=40).launch()
1
  import gradio as gr
 
2
  import pandas as pd
3
+ import os
4
+ import shutil
5
 
6
+ # Function to load leaderboard data from a CSV file
7
+ def load_leaderboard_data(csv_file_path):
8
+ try:
9
+ df = pd.read_csv(csv_file_path)
10
+ return df
11
+ except Exception as e:
12
+ print(f"Error loading CSV file: {e}")
13
+ return pd.DataFrame() # Return an empty DataFrame in case of error
14
+
15
+ # Function to process uploaded JSON file
16
+ def process_json_file(json_file):
17
+ try:
18
+ # Read the JSON file
19
+ data = pd.read_json(json_file.name)
20
+ # Here you can process the data as needed
21
+ # For demonstration, we'll just return the data as a dictionary
22
+ return data.to_dict()
23
+ except Exception as e:
24
+ return {"error": str(e)}
25
+
26
+ # Load the leaderboard data
27
+ leaderboard1 = load_leaderboard_data("leaderboard_swe.csv")
28
+ leaderboard2 = load_leaderboard_data("leaderboard_gaia.csv")
29
+
30
+ # Function to save the uploaded JSON file
31
+ def save_json_file(file_path):
32
+ if not file_path:
33
+ return "No file uploaded."
34
+
35
+ # Define the directory to save uploaded files
36
+ save_dir = "uploaded_jsons"
37
+ os.makedirs(save_dir, exist_ok=True)
38
+
39
+ # Extract the original filename
40
+ original_filename = os.path.basename(file_path)
41
 
42
+ # Define the path to save the file
43
+ save_path = os.path.join(save_dir, original_filename)
44
+
45
+ # Move the uploaded file to the save directory
46
+ shutil.move(file_path, save_path)
47
+
48
+ return f"File saved to {save_path}"
49
+
50
+ # Create the Gradio interface
51
+ with gr.Blocks() as demo:
52
+ gr.Markdown("# 🥇 Leaderboards")
53
  with gr.Row():
54
+ with gr.Column():
55
+ gr.Markdown("## TRAIL-SWE Leaderboard")
56
+ gr.Dataframe(leaderboard1)
57
+ with gr.Column():
58
+ gr.Markdown("## TRAIL-GAIA Leaderboard")
59
+ gr.Dataframe(leaderboard2)
60
+
61
+ """
62
+ gr.Markdown("# Submit Here")
63
+ with gr.Row():
64
+ json_input = gr.File(label="Upload JSON File", type="filepath")
65
+ json_output = gr.JSON(label="Processed Output")
66
+ submit_button = gr.Button("Submit")
67
+ submit_button.click(process_json_file, inputs=json_input, outputs=json_output)
68
+ """
69
+ with gr.Blocks() as submit_page:
70
+ gr.Markdown("## Submit Your JSON File Here")
71
+ file_input = gr.File(label="Upload JSON File", type="filepath", file_types=['.json'])
72
+ submit_button = gr.Button("Submit", interactive=True)
73
+ output = gr.Textbox("")  # Displays the result message, e.g. "Successfully submitted! Thank you for your contribution!"
74
+ submit_button.click(fn=save_json_file, inputs=file_input, outputs=output)
75
+
76
+
77
+ if __name__ == "__main__":
78
+ demo.launch()
79
+
80
+
81
+
82
+ """
83
+ import gradio as gr
84
+ import pandas as pd
85
+ import os
86
+ import json
87
+ import uuid
88
+ import hashlib
89
+ from datetime import datetime
90
+ from huggingface_hub import HfApi, login, HfFolder
91
+
92
+ # Configuration
93
+ LEADERBOARD_CSV = "leaderboard.csv"
94
+ SUBMISSIONS_FOLDER = "submissions"
95
+ CONFIG_FILE = "config.json"
96
+ DEFAULT_COLUMNS = ["rank", "submission_name", "score", "user", "timestamp"]
97
+ VERIFY_USERS = False # Set to True to enable HF authentication
98
+
99
+ # Default configuration
100
+ DEFAULT_CONFIG = {
101
+ "title": "Hugging Face Competition Leaderboard",
102
+ "description": "Submit your results for the competition",
103
+ "metric_name": "Score",
104
+ "higher_is_better": True,
105
+ "max_submissions_per_user": 5,
106
+ "allow_submission_edits": True
107
+ }
108
+
109
+ # Ensure submissions folder exists
110
+ os.makedirs(SUBMISSIONS_FOLDER, exist_ok=True)
111
+
112
+ # Load or create config
113
+ if os.path.exists(CONFIG_FILE):
114
+ with open(CONFIG_FILE, "r") as f:
115
+ config = json.load(f)
116
+ else:
117
+ config = DEFAULT_CONFIG
118
+ with open(CONFIG_FILE, "w") as f:
119
+ json.dump(config, f, indent=2)
120
+
121
+ # Initialize leaderboard if it doesn't exist
122
+ if not os.path.exists(LEADERBOARD_CSV):
123
+ pd.DataFrame(columns=DEFAULT_COLUMNS).to_csv(LEADERBOARD_CSV, index=False)
124
+
125
+ def read_leaderboard():
126
+ #Read the current leaderboard
127
+ if os.path.exists(LEADERBOARD_CSV):
128
+ df = pd.read_csv(LEADERBOARD_CSV)
129
+ return df
130
+ return pd.DataFrame(columns=DEFAULT_COLUMNS)
131
+
132
+ def verify_user(username, token):
133
+ #Verify a user with their Hugging Face token
134
+ if not VERIFY_USERS:
135
+ return True
136
+
137
+ try:
138
+ api = HfApi(token=token)
139
+ user_info = api.whoami()
140
+ return user_info["name"] == username
141
+ except:
142
+ return False
143
+
144
+ def count_user_submissions(username):
145
+ #Count how many submissions a user already has
146
+ df = read_leaderboard()
147
+ return len(df[df["user"] == username])
148
+
149
+ def update_leaderboard():
150
+ #Update the leaderboard based on submissions
151
+ # Read all submissions
152
+ submissions = []
153
+ for filename in os.listdir(SUBMISSIONS_FOLDER):
154
+ if filename.endswith(".json"):
155
+ with open(os.path.join(SUBMISSIONS_FOLDER, filename), "r") as f:
156
+ try:
157
+ data = json.load(f)
158
+ submissions.append(data)
159
+ except json.JSONDecodeError:
160
+ print(f"Error decoding {filename}")
161
+
162
+ if not submissions:
163
+ return pd.DataFrame(columns=DEFAULT_COLUMNS)
164
+
165
+ # Create dataframe and sort by score
166
+ df = pd.DataFrame(submissions)
167
+
168
+ # Sort based on configuration (higher or lower is better)
169
+ ascending = not config.get("higher_is_better", True)
170
+ df = df.sort_values("score", ascending=ascending)
171
+
172
+ # Add rank
173
+ df["rank"] = range(1, len(df) + 1)
174
+
175
+ # Save updated leaderboard
176
+ df.to_csv(LEADERBOARD_CSV, index=False)
177
+ return df
178
+
179
+ def submit(submission_name, score, username, hf_token="", submission_details=None):
180
+ #Add a new submission to the leaderboard
181
+ if not submission_name or not username:
182
+ return "Submission name and username are required", None
183
+
184
+ try:
185
+ score = float(score)
186
+ except ValueError:
187
+ return "Score must be a valid number", None
188
+
189
+ # Verify user if enabled
190
+ if VERIFY_USERS and not verify_user(username, hf_token):
191
+ return "Invalid Hugging Face credentials", None
192
+
193
+ # Check submission limit
194
+ max_submissions = config.get("max_submissions_per_user", 5)
195
+ if count_user_submissions(username) >= max_submissions:
196
+ return f"You've reached the maximum of {max_submissions} submissions", None
197
+
198
+ # Create submission entry
199
+ submission_id = str(uuid.uuid4())[:8]
200
+ submission = {
201
+ "submission_id": submission_id,
202
+ "submission_name": submission_name,
203
+ "score": score,
204
+ "user": username,
205
+ "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
206
+ }
207
+
208
+ # Add optional details
209
+ if submission_details:
210
+ submission["details"] = submission_details
211
+
212
+ # Save submission to file
213
+ filename = f"{username}_{submission_name.replace(' ', '_')}_{submission_id}.json"
214
+ with open(os.path.join(SUBMISSIONS_FOLDER, filename), "w") as f:
215
+ json.dump(submission, f)
216
+
217
+ # Update leaderboard
218
+ leaderboard = update_leaderboard()
219
+ return f"Submission '{submission_name}' added successfully!", leaderboard
220
+
221
+ def render_leaderboard():
222
+ #Display the current leaderboard
223
+ df = update_leaderboard()
224
+ if len(df) == 0:
225
+ return "No submissions yet."
226
+
227
+ # Format the dataframe for display
228
+ display_df = df[DEFAULT_COLUMNS].copy()
229
+ return display_df
230
+
231
+ # Create the Gradio interface
232
+ with gr.Blocks(title=config["title"]) as demo:
233
+ gr.Markdown(f"# {config['title']}")
234
+ gr.Markdown(f"{config['description']}")
235
+
236
+ with gr.Tab("Leaderboard"):
237
+ gr.Markdown("## Current Rankings")
238
+ metric_name = config.get("metric_name", "Score")
239
+ higher_better = "higher is better" if config.get("higher_is_better", True) else "lower is better"
240
+ gr.Markdown(f"*Ranked by {metric_name} ({higher_better})*")
241
+
242
+ leaderboard_output = gr.Dataframe(
243
+ headers=["Rank", "Submission", metric_name, "User", "Timestamp"],
244
+ datatype=["number", "str", "number", "str", "str"],
245
+ interactive=False
246
+ )
247
+ refresh_btn = gr.Button("Refresh Leaderboard")
248
+ refresh_btn.click(render_leaderboard, inputs=[], outputs=[leaderboard_output])
249
+
250
+ with gr.Tab("Submit"):
251
+ gr.Markdown("## Submit Your Results")
252
+ with gr.Row():
253
+ with gr.Column():
254
+ submission_name = gr.Textbox(label="Submission Name", placeholder="MyAwesomeModel v1.0")
255
+ score = gr.Number(label=metric_name, precision=4)
256
+ username = gr.Textbox(label="Username", placeholder="Your Hugging Face username")
257
+
258
+ # Only show token field if verification is enabled
259
+ if VERIFY_USERS:
260
+ hf_token = gr.Textbox(
261
+ label="Hugging Face Token",
262
+ placeholder="hf_...",
263
+ type="password"
264
+ )
265
+ else:
266
+ hf_token = gr.Textbox(visible=False)
267
+
268
+ submission_details = gr.Textbox(
269
+ label="Additional Details (optional)",
270
+ placeholder="Model details, training info, etc.",
271
+ lines=5
272
+ )
273
+ submit_btn = gr.Button("Submit to Leaderboard")
274
+
275
+ submit_output = gr.Markdown()
276
+ submission_leaderboard = gr.Dataframe(
277
+ headers=["Rank", "Submission", metric_name, "User", "Timestamp"],
278
+ datatype=["number", "str", "number", "str", "str"],
279
+ interactive=False
280
+ )
281
+
282
+ submit_btn.click(
283
+ submit,
284
+ inputs=[submission_name, score, username, hf_token, submission_details],
285
+ outputs=[submit_output, submission_leaderboard]
286
+ )
287
+
288
+ # Add admin tab if desired
289
+ with gr.Tab("About"):
290
+ gr.Markdown("## About This Leaderboard")
291
+
292
+ # Initialize the leaderboard on load
293
+ demo.load(render_leaderboard, inputs=[], outputs=[leaderboard_output])
294
+
295
+ if __name__ == "__main__":
296
+ demo.launch()
297
+ """
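
A quick way to smoke-test the rewritten `app.py` above is to call its two helpers directly from the repository root (so the bundled CSVs resolve). Note that `submit_page` is a second `gr.Blocks` defined after `demo` and is never launched, so `demo.launch()` only serves the two leaderboard tables. The sketch below is illustrative; `example_submission.json` is a placeholder for whatever JSON file you want to test with, not a file in this commit.

```python
# Sketch: exercise the helpers defined in the new app.py without launching Gradio.
# Run from the repository root so leaderboard_swe.csv / leaderboard_gaia.csv resolve.
from app import load_leaderboard_data, save_json_file

print(load_leaderboard_data("leaderboard_gaia.csv").head())

# Moves the given file into ./uploaded_jsons/ and returns a confirmation string,
# or "No file uploaded." if the path is empty.
print(save_json_file("example_submission.json"))
```
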
app_old.py ADDED
@@ -0,0 +1,204 @@
1
+ import gradio as gr
2
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
+ import pandas as pd
4
+ from apscheduler.schedulers.background import BackgroundScheduler
5
+ from huggingface_hub import snapshot_download
6
+
7
+ from src.about import (
8
+ CITATION_BUTTON_LABEL,
9
+ CITATION_BUTTON_TEXT,
10
+ EVALUATION_QUEUE_TEXT,
11
+ INTRODUCTION_TEXT,
12
+ LLM_BENCHMARKS_TEXT,
13
+ TITLE,
14
+ )
15
+ from src.display.css_html_js import custom_css
16
+ from src.display.utils import (
17
+ BENCHMARK_COLS,
18
+ COLS,
19
+ EVAL_COLS,
20
+ EVAL_TYPES,
21
+ AutoEvalColumn,
22
+ ModelType,
23
+ fields,
24
+ WeightType,
25
+ Precision
26
+ )
27
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
+ from src.submission.submit import add_new_eval
30
+
31
+
32
+ def restart_space():
33
+ API.restart_space(repo_id=REPO_ID)
34
+
35
+ ### Space initialisation
36
+ try:
37
+ print(EVAL_REQUESTS_PATH)
38
+ snapshot_download(
39
+ repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
40
+ )
41
+ except Exception:
42
+ restart_space()
43
+ try:
44
+ print(EVAL_RESULTS_PATH)
45
+ snapshot_download(
46
+ repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
47
+ )
48
+ except Exception:
49
+ restart_space()
50
+
51
+
52
+ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
53
+
54
+ (
55
+ finished_eval_queue_df,
56
+ running_eval_queue_df,
57
+ pending_eval_queue_df,
58
+ ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
+
60
+ def init_leaderboard(dataframe):
61
+ if dataframe is None or dataframe.empty:
62
+ raise ValueError("Leaderboard DataFrame is empty or None.")
63
+ return Leaderboard(
64
+ value=dataframe,
65
+ datatype=[c.type for c in fields(AutoEvalColumn)],
66
+ select_columns=SelectColumns(
67
+ default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
68
+ cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
+ label="Select Columns to Display:",
70
+ ),
71
+ search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
+ hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
+ filter_columns=[
74
+ ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
+ ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
+ ColumnFilter(
77
+ AutoEvalColumn.params.name,
78
+ type="slider",
79
+ min=0.01,
80
+ max=150,
81
+ label="Select the number of parameters (B)",
82
+ ),
83
+ ColumnFilter(
84
+ AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
+ ),
86
+ ],
87
+ bool_checkboxgroup_label="Hide models",
88
+ interactive=False,
89
+ )
90
+
91
+
92
+ demo = gr.Blocks(css=custom_css)
93
+ with demo:
94
+ gr.HTML(TITLE)
95
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
+
97
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
+ with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
+ leaderboard = init_leaderboard(LEADERBOARD_DF)
100
+
101
+ with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
+
104
+ with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
105
+ with gr.Column():
106
+ with gr.Row():
107
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
108
+
109
+ with gr.Column():
110
+ with gr.Accordion(
111
+ f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
112
+ open=False,
113
+ ):
114
+ with gr.Row():
115
+ finished_eval_table = gr.components.Dataframe(
116
+ value=finished_eval_queue_df,
117
+ headers=EVAL_COLS,
118
+ datatype=EVAL_TYPES,
119
+ row_count=5,
120
+ )
121
+ with gr.Accordion(
122
+ f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
123
+ open=False,
124
+ ):
125
+ with gr.Row():
126
+ running_eval_table = gr.components.Dataframe(
127
+ value=running_eval_queue_df,
128
+ headers=EVAL_COLS,
129
+ datatype=EVAL_TYPES,
130
+ row_count=5,
131
+ )
132
+
133
+ with gr.Accordion(
134
+ f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
135
+ open=False,
136
+ ):
137
+ with gr.Row():
138
+ pending_eval_table = gr.components.Dataframe(
139
+ value=pending_eval_queue_df,
140
+ headers=EVAL_COLS,
141
+ datatype=EVAL_TYPES,
142
+ row_count=5,
143
+ )
144
+ with gr.Row():
145
+ gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
146
+
147
+ with gr.Row():
148
+ with gr.Column():
149
+ model_name_textbox = gr.Textbox(label="Model name")
150
+ revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
151
+ model_type = gr.Dropdown(
152
+ choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
153
+ label="Model type",
154
+ multiselect=False,
155
+ value=None,
156
+ interactive=True,
157
+ )
158
+
159
+ with gr.Column():
160
+ precision = gr.Dropdown(
161
+ choices=[i.value.name for i in Precision if i != Precision.Unknown],
162
+ label="Precision",
163
+ multiselect=False,
164
+ value="float16",
165
+ interactive=True,
166
+ )
167
+ weight_type = gr.Dropdown(
168
+ choices=[i.value.name for i in WeightType],
169
+ label="Weights type",
170
+ multiselect=False,
171
+ value="Original",
172
+ interactive=True,
173
+ )
174
+ base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
175
+
176
+ submit_button = gr.Button("Submit Eval")
177
+ submission_result = gr.Markdown()
178
+ submit_button.click(
179
+ add_new_eval,
180
+ [
181
+ model_name_textbox,
182
+ base_model_name_textbox,
183
+ revision_name_textbox,
184
+ precision,
185
+ weight_type,
186
+ model_type,
187
+ ],
188
+ submission_result,
189
+ )
190
+
191
+ with gr.Row():
192
+ with gr.Accordion("📙 Citation", open=False):
193
+ citation_button = gr.Textbox(
194
+ value=CITATION_BUTTON_TEXT,
195
+ label=CITATION_BUTTON_LABEL,
196
+ lines=20,
197
+ elem_id="citation-button",
198
+ show_copy_button=True,
199
+ )
200
+
201
+ scheduler = BackgroundScheduler()
202
+ scheduler.add_job(restart_space, "interval", seconds=1800)
203
+ scheduler.start()
204
+ demo.queue(default_concurrency_limit=40).launch()
config.json ADDED
@@ -0,0 +1,8 @@
1
+ {
2
+ "title": "TRAIL Performance Leaderboard",
3
+ "description": "Submit your results for the TRAIL Trace Reasoning and Issue Localization competition. Models are evaluated on a combination of Categorical F1 and Location Accuracy (Joint F1)",
4
+ "metric_name": "F1 Score",
5
+ "higher_is_better": true,
6
+ "max_submissions_per_user": 3,
7
+ "allow_submission_edits": false
8
+ }
database.py ADDED
@@ -0,0 +1,98 @@
1
+ import os
2
+ import json
3
+ import datetime
4
+ from pathlib import Path
5
+ import numpy as np
6
+
7
+ class Database:
8
+ def __init__(self, submission_dir="submissions"):
9
+ self.submission_dir = submission_dir
10
+ os.makedirs(submission_dir, exist_ok=True)
11
+
12
+ def add_submission(self, submission):
13
+ """Add a new submission to the database"""
14
+ # Generate a timestamp and ID for the submission
15
+ timestamp = datetime.datetime.now().isoformat()
16
+ submission_id = f"{submission['model_name'].replace(' ', '_')}_{timestamp.replace(':', '-')}"
17
+
18
+ # Add timestamp and ID to submission
19
+ submission['timestamp'] = timestamp
20
+ submission['id'] = submission_id
21
+
22
+ # Save submission to a JSON file
23
+ file_path = os.path.join(self.submission_dir, f"{submission_id}.json")
24
+ with open(file_path, 'w') as f:
25
+ json.dump(submission, f, indent=2)
26
+
27
+ return submission_id
28
+
29
+ def get_submission(self, submission_id):
30
+ """Get a specific submission by ID"""
31
+ file_path = os.path.join(self.submission_dir, f"{submission_id}.json")
32
+ if os.path.exists(file_path):
33
+ with open(file_path, 'r') as f:
34
+ return json.load(f)
35
+ return None
36
+
37
+ def get_all_submissions(self):
38
+ """Get all submissions"""
39
+ submissions = []
40
+ for file_name in os.listdir(self.submission_dir):
41
+ if file_name.endswith('.json'):
42
+ file_path = os.path.join(self.submission_dir, file_name)
43
+ with open(file_path, 'r') as f:
44
+ submissions.append(json.load(f))
45
+ return submissions
46
+
47
+ def get_leaderboard(self, sort_by="score", ascending=False):
48
+ """Get submissions sorted for leaderboard display"""
49
+ submissions = self.get_all_submissions()
50
+
51
+ # Make sure we have submissions to sort
52
+ if not submissions:
53
+ return []
54
+
55
+ # Sort submissions
56
+ if sort_by in submissions[0]:
57
+ submissions.sort(key=lambda x: x.get(sort_by, 0), reverse=not ascending)
58
+
59
+ return submissions
60
+
61
+ def delete_submission(self, submission_id):
62
+ """Delete a submission by ID"""
63
+ file_path = os.path.join(self.submission_dir, f"{submission_id}.json")
64
+ if os.path.exists(file_path):
65
+ os.remove(file_path)
66
+ return True
67
+ return False
68
+
69
+ # Load leaderboard configuration
70
+ def load_config():
71
+ try:
72
+ if os.path.exists("models.json") and os.path.getsize("models.json") > 0:
73
+ with open("models.json", "r") as f:
74
+ return json.load(f)
75
+ else:
76
+ print("models.json file is empty or missing. Creating with default configuration.")
77
+ # Default configuration
78
+ config = {
79
+ "title": "TRAIL Model Leaderboard",
80
+ "description": "Submit and compare model performances",
81
+ "metrics": ["Cat. F1", "Loc. Acc", "Joint F1"],
82
+ "main_metric": "Cat. F1"
83
+ }
84
+ with open("models.json", "w") as f:
85
+ json.dump(config, f, indent=2)
86
+ return config
87
+ except json.JSONDecodeError:
88
+ print("Error parsing models.json. Creating with default configuration.")
89
+ # Default configuration if JSON is invalid
90
+ config = {
91
+ "title": "TRAIL Model Leaderboard",
92
+ "description": "Submit and compare model performances",
93
+ "metrics": ["Cat. F1", "Loc. Acc", "Joint F1"],
94
+ "main_metric": "Cat. F1"
95
+ }
96
+ with open("models.json", "w") as f:
97
+ json.dump(config, f, indent=2)
98
+ return config
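
For reference, a minimal sketch of how the `Database` class and `load_config()` added above might be driven; the model name, score, and user below are placeholder values, not real submissions.

```python
# Sketch: drive the new database.py helpers with placeholder data.
from database import Database, load_config

db = Database(submission_dir="submissions")
submission_id = db.add_submission({
    "model_name": "Example Model",  # placeholder entry
    "score": 0.123,
    "user": "example-user",
})
print(db.get_submission(submission_id))

# get_leaderboard sorts by the given key, highest first by default.
for entry in db.get_leaderboard(sort_by="score"):
    print(entry["model_name"], entry["score"])

# load_config() returns models.json, writing a default config if it is missing or invalid.
print(load_config()["main_metric"])
```
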
leaderboard_gaia.csv ADDED
@@ -0,0 +1,9 @@
1
+ Rank,Model,Joint F1,Categorical F1,Location Accuracy,Date
2
+ 1,Gemini-2.5-Pro-Preview-05-06,0.183,0.389,0.546,2025-05-14
3
+ 2,Gemini-2.5-Flash-Preview-04-17,0.100,0.337,0.372,2025-05-14
4
+ 3,Open AI o3,0.092,0.296,0.535,2025-05-14
5
+ 4,Anthropic Claude-3.7-Sonnet,0.047,0.254,0.204,2025-05-14
6
+ 5,GPT-4.1,0.028,0.218,0.107,2025-05-14
7
+ 6,Open AI o1,0.013,0.138,0.040,2025-05-14
8
+ 7,Llama-4-Maverick-17B-128E-Instruct,0.122,0.023,0.000,2025-05-14
9
+ 8,Llama-4-Scout-17B-16E-Instruct,0.041,0.000,0.000,2025-05-14
leaderboard_swe.csv ADDED
@@ -0,0 +1,10 @@
1
+ Rank,Model,Joint F1,Categorical F1,Location Accuracy,Date
2
+ 1,Gemini-2.5-Pro-Preview-05-06,0.050,0.148,0.238,2025-05-14
3
+ 2,Gemini-2.5-Flash-Preview-04-17,0.000,0.213,0.060,2025-05-14
4
+ 3,Llama-4-Maverick-17B-128E-Instruct,0.000,0.191,0.083,2025-05-14
5
+ 4,GPT-4.1,0.000,0.166,0.000,2025-05-14
6
+ 5,Llama-4-Scout-17B-16E-Instruct,0.000,0.050,0.000,2025-05-14
7
+ 6,Open AI o1,CLE,CLE,CLE,2025-05-14
8
+ 7,Open AI o3,CLE,CLE,CLE,2025-05-14
9
+ 8,Anthropic Claude-3.7-Sonnet,CLE,CLE,CLE,2025-05-14
10
+
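
One practical note on the two CSVs added in this commit: several rows in `leaderboard_swe.csv` carry the placeholder value `CLE` instead of a number, so the metric columns load as strings. A small sketch of coercing them before any numeric sorting:

```python
import pandas as pd

# Load the two result tables added in this commit.
gaia = pd.read_csv("leaderboard_gaia.csv")
swe = pd.read_csv("leaderboard_swe.csv")

# "CLE" rows in the SWE table are non-numeric; coerce them to NaN so the
# metric columns can be sorted numerically.
metrics = ["Joint F1", "Categorical F1", "Location Accuracy"]
swe[metrics] = swe[metrics].apply(pd.to_numeric, errors="coerce")

print(swe.sort_values("Joint F1", ascending=False))
```
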
model ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "title": "TRAIL Performance Leaderboard",
3
+ "description": "This leaderboard tracks and compares model performance across multiple metrics. Submit your model results to see how they stack up!",
4
+ "metrics": ["Cat. F1", "Loc. Acc", "Joint F1"],
5
+ "main_metric": "Cat. F1"
6
+ }
models.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "title": "TRAIL Performance Leaderboard",
3
+ "description": "This leaderboard tracks and compares model performance across multiple metrics. Submit your model results to see how they stack up!",
4
+ "metrics": ["Cat. F1", "Loc. Acc", "Joint F1"],
5
+ "main_metric": "Cat. F1"
6
+ }
requirements.txt CHANGED
@@ -13,4 +13,7 @@ python-dateutil
13
  tqdm
14
  transformers
15
  tokenizers>=0.15.0
16
- sentencepiece
13
  tqdm
14
  transformers
15
  tokenizers>=0.15.0
16
+ sentencepiece
17
+ numpy>=1.24.3
18
+ pandas
19
+ huggingface_hub
setup.py ADDED
@@ -0,0 +1,51 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Setup script to ensure all necessary files and directories are created
4
+ before running the application.
5
+ """
6
+
7
+ import os
8
+ import json
9
+ import sys
10
+
11
+ def setup():
12
+ """Create necessary directories and files if they don't exist."""
13
+ print("Setting up leaderboard application...")
14
+
15
+ # Create submissions directory
16
+ if not os.path.exists("submissions"):
17
+ print("Creating submissions directory...")
18
+ os.makedirs("submissions", exist_ok=True)
19
+
20
+ # Create models.json if it doesn't exist or is empty
21
+ if not os.path.exists("models.json") or os.path.getsize("models.json") == 0:
22
+ print("Creating models.json configuration file...")
23
+ config = {
24
+ "title": "TRAIL Performance Leaderboard",
25
+ "description": "This leaderboard tracks and compares model performance across multiple metrics. Submit your model results to see how they stack up!",
26
+ "metrics": ["Cat. F1", "Loc. Acc", "Joint F1"],
27
+ "main_metric": "Cat. F1"
28
+ }
29
+ with open("models.json", "w") as f:
30
+ json.dump(config, f, indent=2)
31
+ else:
32
+ # Validate JSON format
33
+ try:
34
+ with open("models.json", "r") as f:
35
+ json.load(f)
36
+ print("models.json exists and is valid.")
37
+ except json.JSONDecodeError:
38
+ print("models.json exists but has invalid JSON. Creating new file...")
39
+ config = {
40
+ "title": "Model Performance Leaderboard",
41
+ "description": "This leaderboard tracks and compares model performance across multiple metrics. Submit your model results to see how they stack up!",
42
+ "metrics": ["Cat. F1", "Loc. Acc", "Joint F1"],
43
+ "main_metric": "Cat. F1"
44
+ }
45
+ with open("models.json", "w") as f:
46
+ json.dump(config, f, indent=2)
47
+
48
+ print("Setup complete.")
49
+
50
+ if __name__ == "__main__":
51
+ setup()
start.sh ADDED
@@ -0,0 +1,6 @@
1
+ #!/bin/bash
2
+ # Run setup script first
3
+ python setup.py
4
+
5
+ # Then start the main application
6
+ python app.py