Jerrycool committed on
Commit e4014fe · verified · 1 Parent(s): cc4532a

Update app.py

Files changed (1)
  1. app.py +156 -62
app.py CHANGED
Old version of app.py (lines removed in this commit are prefixed with "-"):

@@ -2,48 +2,56 @@ import gradio as gr
  import pandas as pd
  from apscheduler.schedulers.background import BackgroundScheduler
  # Removed Hugging Face Hub imports as they are not needed for the simplified leaderboard
- # from huggingface_hub import snapshot_download, HfApi
- from src.about import ( # Assuming these still exist and are relevant for other tabs
      CITATION_BUTTON_LABEL,
      CITATION_BUTTON_TEXT,
-     EVALUATION_QUEUE_TEXT,
      INTRODUCTION_TEXT,
      LLM_BENCHMARKS_TEXT,
      TITLE,
  )
- from src.display.css_html_js import custom_css # Keep custom CSS
- # Removed utils imports related to the old leaderboard
- # from src.display.utils import (...)
  from src.envs import REPO_ID # Keep if needed for restart_space or other functions
- # Removed constants related to old data paths and repos if not needed elsewhere
- # from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
- # Removed old data processing functions
- # from src.populate import get_evaluation_queue_df, get_leaderboard_df
- from src.submission.submit import add_new_eval # Keep submission logic

  # --- Elo Leaderboard Configuration ---
- # Data from the table provided by the user
  data = [
-     {'model': 'gpt-4o-mini', 'MLE-Lite_Elo': 753, 'Tabular_Elo': 839, 'NLP_Elo': 758, 'CV_Elo': 754, 'Overall': 778},
-     {'model': 'gpt-4o', 'MLE-Lite_Elo': 830, 'Tabular_Elo': 861, 'NLP_Elo': 903, 'CV_Elo': 761, 'Overall': 841},
-     {'model': 'o3-mini', 'MLE-Lite_Elo': 1108, 'Tabular_Elo': 1019, 'NLP_Elo': 1056, 'CV_Elo': 1207, 'Overall': 1096},
-     # Renamed 'DeepSeek-v3' to match previous list - adjust if needed
-     {'model': 'deepseek-v3', 'MLE-Lite_Elo': 1004, 'Tabular_Elo': 1015, 'NLP_Elo': 1028, 'CV_Elo': 1067, 'Overall': 1023},
-     # Renamed 'DeepSeek-r1' to match previous list - adjust if needed
-     {'model': 'deepseek-r1', 'MLE-Lite_Elo': 1137, 'Tabular_Elo': 1053, 'NLP_Elo': 1103, 'CV_Elo': 1083, 'Overall': 1100},
-     # Renamed 'Gemini-2.0-Flash' to match previous list - adjust if needed
-     {'model': 'gemini-2.0-flash', 'MLE-Lite_Elo': 847, 'Tabular_Elo': 923, 'NLP_Elo': 860, 'CV_Elo': 978, 'Overall': 895},
-     # Renamed 'Gemini-2.0-Pro' to match previous list - adjust if needed
-     {'model': 'gemini-2.0-pro', 'MLE-Lite_Elo': 1064, 'Tabular_Elo': 1139, 'NLP_Elo': 1028, 'CV_Elo': 973, 'Overall': 1054},
-     # Renamed 'Gemini-2.5-Pro' to match previous list - adjust if needed
-     {'model': 'gemini-2.5-pro', 'MLE-Lite_Elo': 1257, 'Tabular_Elo': 1150, 'NLP_Elo': 1266, 'CV_Elo': 1177, 'Overall': 1214},
  ]

  # Create a master DataFrame
  master_df = pd.DataFrame(data)

  # Define categories for selection (user-facing)
- CATEGORIES = ["MLE-Lite", "Tabular", "NLP", "CV", "Overall"]
  DEFAULT_CATEGORY = "Overall" # Set a default category

  # Map user-facing categories to DataFrame column names
@@ -58,33 +66,58 @@ category_to_column = {
  # --- Helper function to update leaderboard ---
  def update_leaderboard(category):
      """
-     Selects the relevant columns for the category, renames the score column
-     to 'Elo Score', sorts by score descending, and returns the DataFrame.
      """
      score_column = category_to_column.get(category)
      if score_column is None or score_column not in master_df.columns:
-         # Fallback if category or column is invalid
          print(f"Warning: Invalid category '{category}' or column '{score_column}'. Falling back to default.")
          score_column = category_to_column[DEFAULT_CATEGORY]
          if score_column not in master_df.columns: # Check fallback column too
-             return pd.DataFrame({"Model": [], "Elo Score": []}) # Return empty if still invalid

-     # Select model and the specific score column
-     df = master_df[['model', score_column]].copy()

      # Rename the score column to 'Elo Score' for consistent display
      df.rename(columns={score_column: 'Elo Score'}, inplace=True)

-     # Sort by 'Elo Score' descending
-     df.sort_values(by='Elo Score', ascending=False, inplace=True)

-     # Reset index for cleaner display (optional)
-     df.reset_index(drop=True, inplace=True)

      return df

  # --- Mock/Placeholder functions/data for other tabs ---
- # (Same as previous version - providing empty data)
  print("Warning: Evaluation queue data fetching is disabled/mocked due to leaderboard changes.")
  finished_eval_queue_df = pd.DataFrame(columns=["Model", "Status", "Requested", "Started"])
  running_eval_queue_df = pd.DataFrame(columns=["Model", "Status", "Requested", "Started"])
@@ -94,38 +127,55 @@ EVAL_TYPES = ["str", "str", "str", "str"] # Define for the dataframe types


  # --- Keep restart function if relevant ---
- # (Same as previous version)
  def restart_space():
      print(f"Attempting to restart space: {REPO_ID}")
-     # Replace with your actual space restart mechanism if needed

  # --- Gradio App Definition ---
- demo = gr.Blocks(css=custom_css)

  with demo:
      gr.HTML(TITLE)
      gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

      with gr.Tabs(elem_classes="tab-buttons") as tabs:
-         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
              with gr.Column():
-                 gr.Markdown("## Model Elo Rankings") # New title for the section
                  category_selector = gr.Radio(
                      choices=CATEGORIES,
-                     label="Select Category to Sort By", # Updated label
-                     value=DEFAULT_CATEGORY, # Default selection
                      interactive=True,
-                     container=False,
                  )
                  leaderboard_df_component = gr.Dataframe(
                      # Initialize with sorted data for the default category
                      value=update_leaderboard(DEFAULT_CATEGORY),
-                     headers=["Model", "Elo Score"],
-                     datatype=["str", "number"],
                      interactive=False,
-                     # Adjust row count based on the number of models
                      row_count=(len(master_df), "fixed"),
-                     col_count=(2, "fixed"),
                  )
                  # Link the radio button change to the update function
                  category_selector.change(
@@ -134,20 +184,60 @@ with demo:
                      outputs=leaderboard_df_component
                  )

-         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-             # (Content unchanged)
              gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

-     with gr.Row():
-         with gr.Accordion("📙 Citation", open=False):
-             # (Content unchanged)
-             citation_button = gr.Textbox(
-                 value=CITATION_BUTTON_TEXT,
-                 label=CITATION_BUTTON_LABEL,
-                 lines=20,
-                 elem_id="citation-button",
-                 show_copy_button=True,
-             )

  # --- Keep scheduler if relevant ---
  # scheduler = BackgroundScheduler()
@@ -155,4 +245,8 @@ with demo:
  # scheduler.start()

  # --- Launch the app ---
- demo.launch()
New version of app.py (lines added in this commit are prefixed with "+"):

  import pandas as pd
  from apscheduler.schedulers.background import BackgroundScheduler
  # Removed Hugging Face Hub imports as they are not needed for the simplified leaderboard
+
+ # --- Make sure these imports work relative to your file structure ---
+ # Option 1: If src is a directory in the same folder as your script:
+ from src.about import (
      CITATION_BUTTON_LABEL,
      CITATION_BUTTON_TEXT,
+     EVALUATION_QUEUE_TEXT, # Keep if used by commented-out submit tab
      INTRODUCTION_TEXT,
      LLM_BENCHMARKS_TEXT,
      TITLE,
  )
+ from src.display.css_html_js import custom_css
  from src.envs import REPO_ID # Keep if needed for restart_space or other functions
+ from src.submission.submit import add_new_eval # Keep if using the submit tab
+
+ # Option 2: If you don't have these files, define placeholders (REMOVE THIS if using Option 1)
+ # print("Warning: Using placeholder values for src module imports.")
+ # CITATION_BUTTON_LABEL="Citation"
+ # CITATION_BUTTON_TEXT="Please cite us if you use this benchmark..."
+ # EVALUATION_QUEUE_TEXT="Current evaluation queue:"
+ # INTRODUCTION_TEXT="Welcome to the MLE-Dojo Benchmark Leaderboard."
+ # LLM_BENCHMARKS_TEXT="Information about the benchmarks..."
+ # TITLE="<h1>🏆 MLE-Dojo Benchmark Leaderboard</h1>"
+ # custom_css=""
+ # REPO_ID="your/space-id" # Replace with actual ID if needed
+ # def add_new_eval(*args): return "Submission placeholder."
+ # --- End Placeholder Definitions ---
+

  # --- Elo Leaderboard Configuration ---
+ # Enhanced data with Rank (placeholder), Organizer, License, and URL
+ # !!! IMPORTANT: Replace placeholder URLs with actual model/project pages. !!!
+ # Verify organizer and license information for accuracy.
  data = [
+     {'model_name': 'gpt-4o-mini', 'url': 'https://openai.com/index/hello-gpt-4o/', 'organizer': 'OpenAI', 'license': 'Proprietary', 'MLE-Lite_Elo': 753, 'Tabular_Elo': 839, 'NLP_Elo': 758, 'CV_Elo': 754, 'Overall': 778},
+     {'model_name': 'gpt-4o', 'url': 'https://openai.com/index/hello-gpt-4o/', 'organizer': 'OpenAI', 'license': 'Proprietary', 'MLE-Lite_Elo': 830, 'Tabular_Elo': 861, 'NLP_Elo': 903, 'CV_Elo': 761, 'Overall': 841},
+     {'model_name': 'o3-mini', 'url': 'https://placeholder.url/o3-mini', 'organizer': 'Unknown', 'license': 'Unknown', 'MLE-Lite_Elo': 1108, 'Tabular_Elo': 1019, 'NLP_Elo': 1056, 'CV_Elo': 1207, 'Overall': 1096}, # Fill details later
+     {'model_name': 'deepseek-v3', 'url': 'https://deepseek.com/', 'organizer': 'DeepSeek AI', 'license': 'DeepSeek License', 'MLE-Lite_Elo': 1004, 'Tabular_Elo': 1015, 'NLP_Elo': 1028, 'CV_Elo': 1067, 'Overall': 1023},
+     {'model_name': 'deepseek-r1', 'url': 'https://deepseek.com/', 'organizer': 'DeepSeek AI', 'license': 'DeepSeek License', 'MLE-Lite_Elo': 1137, 'Tabular_Elo': 1053, 'NLP_Elo': 1103, 'CV_Elo': 1083, 'Overall': 1100},
+     {'model_name': 'gemini-2.0-flash', 'url': 'https://deepmind.google/technologies/gemini/flash/', 'organizer': 'Google', 'license': 'Proprietary (API)', 'MLE-Lite_Elo': 847, 'Tabular_Elo': 923, 'NLP_Elo': 860, 'CV_Elo': 978, 'Overall': 895},
+     {'model_name': 'gemini-2.0-pro', 'url': 'https://deepmind.google/technologies/gemini/#introduction', 'organizer': 'Google', 'license': 'Proprietary (API)', 'MLE-Lite_Elo': 1064, 'Tabular_Elo': 1139, 'NLP_Elo': 1028, 'CV_Elo': 973, 'Overall': 1054},
+     {'model_name': 'gemini-2.5-pro', 'url': 'https://deepmind.google/technologies/gemini/2-5-pro/', 'organizer': 'Google', 'license': 'Proprietary (API)', 'MLE-Lite_Elo': 1257, 'Tabular_Elo': 1150, 'NLP_Elo': 1266, 'CV_Elo': 1177, 'Overall': 1214},
  ]

  # Create a master DataFrame
+ # Note: Columns 'organizer' and 'license' are created in lowercase here.
  master_df = pd.DataFrame(data)

  # Define categories for selection (user-facing)
+ CATEGORIES = ["Overall", "MLE-Lite", "Tabular", "NLP", "CV"] # Overall first
  DEFAULT_CATEGORY = "Overall" # Set a default category

  # Map user-facing categories to DataFrame column names
 
  # --- Helper function to update leaderboard ---
  def update_leaderboard(category):
      """
+     Selects relevant columns, sorts by the chosen category's Elo score,
+     adds Rank, formats model name as a link, and returns the DataFrame.
      """
      score_column = category_to_column.get(category)
      if score_column is None or score_column not in master_df.columns:
          print(f"Warning: Invalid category '{category}' or column '{score_column}'. Falling back to default.")
          score_column = category_to_column[DEFAULT_CATEGORY]
          if score_column not in master_df.columns: # Check fallback column too
+             # Return empty df with correct columns if still invalid
+             # Use lowercase keys here consistent with master_df for the empty case
+             return pd.DataFrame({
+                 "Rank": [],
+                 "Model": [],
+                 "organizer": [], # lowercase
+                 "license": [], # lowercase
+                 "Elo Score": []
+             })
+
+     # Select base columns + the score column for sorting
+     # Ensure 'organizer' and 'license' are selected correctly (lowercase)
+     cols_to_select = ['model_name', 'url', 'organizer', 'license', score_column]
+     df = master_df[cols_to_select].copy()
+
+     # Sort by the selected 'Elo Score' descending
+     df.sort_values(by=score_column, ascending=False, inplace=True)
+
+     # Add Rank based on the sorted order
+     df.reset_index(drop=True, inplace=True)
+     df.insert(0, 'Rank', df.index + 1)

+     # Format Model Name as HTML Hyperlink
+     # The resulting column name will be 'Model' (capitalized)
+     df['Model'] = df.apply(
+         lambda row: f"<a href='{row['url'] if pd.notna(row['url']) else '#'}' target='_blank' style='color: #007bff; text-decoration: none;'>{row['model_name']}</a>",
+         axis=1
+     )

      # Rename the score column to 'Elo Score' for consistent display
      df.rename(columns={score_column: 'Elo Score'}, inplace=True)

+     # Select and reorder columns for final display using the ACTUAL column names in df
+     # Use lowercase 'organizer' and 'license' here because they haven't been renamed.
+     final_columns = ["Rank", "Model", "organizer", "license", "Elo Score"]
+     df = df[final_columns]

+     # Note: The DataFrame returned now has columns:
+     # 'Rank', 'Model', 'organizer', 'license', 'Elo Score'

      return df

  # --- Mock/Placeholder functions/data for other tabs ---
+ # (If the Submit tab is used, ensure these variables are appropriately populated or handled)
  print("Warning: Evaluation queue data fetching is disabled/mocked due to leaderboard changes.")
  finished_eval_queue_df = pd.DataFrame(columns=["Model", "Status", "Requested", "Started"])
  running_eval_queue_df = pd.DataFrame(columns=["Model", "Status", "Requested", "Started"])
 
  # --- Keep restart function if relevant ---
  def restart_space():
+     # Make sure REPO_ID is correctly defined/imported if this function is used
      print(f"Attempting to restart space: {REPO_ID}")
+     # Replace with your actual space restart mechanism if needed (e.g., HfApi().restart_space(REPO_ID))

  # --- Gradio App Definition ---
+ # Add custom CSS rules here or ensure custom_css is imported correctly
+ # Example CSS rules you might want in your custom_css:
+ # table { width: 100%; border-collapse: collapse; }
+ # th, td { padding: 8px 12px; border: 1px solid #ddd; text-align: left; white-space: normal; } /* Allow wrapping */
+ # th { background-color: #f2f2f2; font-weight: bold; }
+ # tr:nth-child(even) { background-color: #f9f9f9; }
+ # tr:hover { background-color: #e9e9e9; }
+ # td a { color: #007bff; text-decoration: none; }
+ # td a:hover { text-decoration: underline; }
+
+ # Use a theme for better default styling
+ demo = gr.Blocks(css=custom_css, theme=gr.themes.Soft())

  with demo:
+     # Use the TITLE variable imported or defined above
      gr.HTML(TITLE)
+     # Use the INTRODUCTION_TEXT variable imported or defined above
      gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

      with gr.Tabs(elem_classes="tab-buttons") as tabs:
+         with gr.TabItem("🏅 MLE-Dojo Benchmark", elem_id="llm-benchmark-tab-table", id=0):
              with gr.Column():
+                 gr.Markdown("## Model Elo Rankings by Category")
                  category_selector = gr.Radio(
                      choices=CATEGORIES,
+                     label="Select Category:",
+                     value=DEFAULT_CATEGORY,
                      interactive=True,
                  )
                  leaderboard_df_component = gr.Dataframe(
                      # Initialize with sorted data for the default category
                      value=update_leaderboard(DEFAULT_CATEGORY),
+                     # Headers for DISPLAY remain capitalized
+                     headers=["Rank", "Model", "Organizer", "License", "Elo Score"],
+                     # Datatype maps to the final df columns: Rank, Model, organizer, license, Elo Score
+                     datatype=["number", "html", "str", "str", "number"],
                      interactive=False,
+                     # --- FIX APPLIED: Removed unsupported 'height' argument ---
+                     # row_count determines the number of rows to display
                      row_count=(len(master_df), "fixed"),
+                     col_count=(5, "fixed"),
+                     wrap=True, # Allow text wrapping
+                     elem_id="leaderboard-table" # CSS hook for custom styling
                  )
                  # Link the radio button change to the update function
                  category_selector.change(

                      outputs=leaderboard_df_component
                  )

+         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-about", id=1):
+             # Use the LLM_BENCHMARKS_TEXT variable imported or defined above
              gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

+         # --- Submit Tab (Commented out as in original request) ---
+         # Make sure EVALUATION_QUEUE_TEXT and add_new_eval are imported/defined if uncommented
+         # with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-submit", id=2):
+         #     with gr.Column():
+         #         with gr.Row():
+         #             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text") # Requires import/definition
+         #         with gr.Column():
+         #             with gr.Accordion(f"✅ Finished Evaluations ({len(finished_eval_queue_df)})", open=False):
+         #                 finished_eval_table = gr.components.Dataframe(
+         #                     value=finished_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5,
+         #                 )
+         #             with gr.Accordion(f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})", open=False):
+         #                 running_eval_table = gr.components.Dataframe(
+         #                     value=running_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5,
+         #                 )
+         #             with gr.Accordion(f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})", open=False):
+         #                 pending_eval_table = gr.components.Dataframe(
+         #                     value=pending_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5,
+         #                 )
+         #         with gr.Row():
+         #             gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+         #         with gr.Row():
+         #             with gr.Column():
+         #                 model_name_textbox = gr.Textbox(label="Model name (on Hugging Face Hub)")
+         #                 revision_name_textbox = gr.Textbox(label="Revision / Commit Hash", placeholder="main")
+         #                 model_type = gr.Dropdown(choices=["Type A", "Type B", "Type C"], label="Model type", multiselect=False, value=None, interactive=True) # Example choices
+         #             with gr.Column():
+         #                 precision = gr.Dropdown(choices=["float16", "bfloat16", "float32", "int8", "auto"], label="Precision", multiselect=False, value="auto", interactive=True)
+         #                 weight_type = gr.Dropdown(choices=["Original", "Adapter", "Delta"], label="Weights type", multiselect=False, value="Original", interactive=True)
+         #                 base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+         #             submit_button = gr.Button("Submit Eval")
+         #             submission_result = gr.Markdown()
+         #             # Ensure add_new_eval is correctly imported/defined and handles these inputs
+         #             submit_button.click(
+         #                 add_new_eval, # Requires import/definition
+         #                 [ model_name_textbox, base_model_name_textbox, revision_name_textbox, precision, weight_type, model_type, ],
+         #                 submission_result,
+         #             )
+
+
+     # --- Citation Row (at the bottom, outside Tabs) ---
+     with gr.Accordion("📙 Citation", open=False):
+         # Use the CITATION_BUTTON_TEXT and CITATION_BUTTON_LABEL variables imported or defined above
+         citation_button = gr.Textbox(
+             value=CITATION_BUTTON_TEXT,
+             label=CITATION_BUTTON_LABEL,
+             lines=10,
+             elem_id="citation-button",
+             show_copy_button=True,
+         )
 
242
  # --- Keep scheduler if relevant ---
243
  # scheduler = BackgroundScheduler()
 
245
  # scheduler.start()
246
 
247
  # --- Launch the app ---
248
+ # Ensures the app launches only when the script is run directly
249
+ if __name__ == "__main__":
250
+ # Ensure you have installed necessary libraries: pip install gradio pandas apscheduler
251
+ # Make sure your src module files (about.py etc.) are accessible OR use the placeholder definitions above.
252
+ demo.launch()
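
A note on the two import options in the new file: Option 1 (the real src package) and Option 2 (the commented-out placeholders) can also be folded into a single try/except fallback, so the app still starts when the src modules are missing. This is only a sketch based on the commit; the placeholder strings are illustrative, not the real contents of src/about.py.

try:
    # Option 1: the helper modules shipped with the Space
    from src.about import (
        CITATION_BUTTON_LABEL,
        CITATION_BUTTON_TEXT,
        EVALUATION_QUEUE_TEXT,
        INTRODUCTION_TEXT,
        LLM_BENCHMARKS_TEXT,
        TITLE,
    )
    from src.display.css_html_js import custom_css
    from src.envs import REPO_ID
    from src.submission.submit import add_new_eval
except ImportError:
    # Option 2: placeholder values, mirroring the commented-out block in the commit
    print("Warning: Using placeholder values for src module imports.")
    CITATION_BUTTON_LABEL = "Citation"
    CITATION_BUTTON_TEXT = "Please cite us if you use this benchmark..."
    EVALUATION_QUEUE_TEXT = "Current evaluation queue:"
    INTRODUCTION_TEXT = "Welcome to the MLE-Dojo Benchmark Leaderboard."
    LLM_BENCHMARKS_TEXT = "Information about the benchmarks..."
    TITLE = "<h1>🏆 MLE-Dojo Benchmark Leaderboard</h1>"
    custom_css = ""
    REPO_ID = "your/space-id"  # Replace with the actual Space ID if needed

    def add_new_eval(*args):
        return "Submission placeholder."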
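
The example CSS rules stay as comments in the commit, and the Dataframe gets elem_id="leaderboard-table" as a styling hook. If the src.display.css_html_js module is not used, one option (an assumption, not something the commit does) is to collect those rules into a local custom_css string, optionally scoped to that elem_id, and pass it to gr.Blocks exactly as the new code already does. Whether these table selectors match Gradio's rendered Dataframe markup depends on the Gradio version.

import gradio as gr

# Sketch only: the commented-out example rules, gathered into one custom_css
# string and scoped to the Dataframe's elem_id (scoping is an assumption).
custom_css = """
#leaderboard-table table { width: 100%; border-collapse: collapse; }
#leaderboard-table th, #leaderboard-table td { padding: 8px 12px; border: 1px solid #ddd; text-align: left; white-space: normal; }
#leaderboard-table th { background-color: #f2f2f2; font-weight: bold; }
#leaderboard-table tr:nth-child(even) { background-color: #f9f9f9; }
#leaderboard-table tr:hover { background-color: #e9e9e9; }
#leaderboard-table td a { color: #007bff; text-decoration: none; }
#leaderboard-table td a:hover { text-decoration: underline; }
"""

demo = gr.Blocks(css=custom_css, theme=gr.themes.Soft())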
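
Because update_leaderboard now builds the Rank column and the HTML model links itself, it can be sanity-checked outside the UI by calling it for every category and confirming the ordering. A small sketch, assuming this file is saved as app.py and that CATEGORIES and update_leaderboard are importable from it as defined in the commit:

from app import CATEGORIES, update_leaderboard

for category in CATEGORIES:
    board = update_leaderboard(category)
    # Columns come back exactly as the gr.Dataframe headers/datatypes expect
    assert list(board.columns) == ["Rank", "Model", "organizer", "license", "Elo Score"]
    # Sorted by the selected category's Elo, best model first, Rank starting at 1
    assert board["Elo Score"].is_monotonic_decreasing
    assert board["Rank"].tolist() == list(range(1, len(board) + 1))
    print(f"{category}: top model -> {board.iloc[0]['Model']}")  # Model cell is an HTML link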
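
restart_space remains a stub and the APScheduler block stays commented out in this commit. If automatic restarts are wanted, one possible wiring is sketched below; it assumes huggingface_hub is installed, that REPO_ID comes from src.envs, and that a write token is available in an HF_TOKEN environment variable. None of this is part of the commit.

import os

from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

from src.envs import REPO_ID  # assumed to hold the Space's repo id

def restart_space():
    # Restart the Space through the Hub API; requires a token with write access to REPO_ID.
    print(f"Attempting to restart space: {REPO_ID}")
    HfApi(token=os.environ.get("HF_TOKEN")).restart_space(repo_id=REPO_ID)

# Mirror the commented-out scheduler block: restart every 30 minutes.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()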