File size: 21,121 Bytes
8a254d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
import gradio as gr
from src.display.formatting import render_leaderboard_html, get_display_model_name
from src.data_utils import get_length_category_list, get_length_category_df
import pandas as pd
import numpy as np

# Inline CSS for the model-name cell, keyed by medal rank
# (1 = gold, 2 = silver, 3 = bronze). Any other rank uses the default style.
_LENGTH_MEDAL_STYLES = {
    1: "color: #ffd700; font-weight: bold; text-shadow: 0 0 4px #fff2;",
    2: "color: #b0b0b0; font-weight: bold;",
    3: "color: #cd7f32; font-weight: bold;",
}
_LENGTH_DEFAULT_NAME_STYLE = "color: #fff; font-weight: 600;"

# Tooltip text shown next to selected column headers.
_LENGTH_HEADER_TOOLTIPS = {
    "Model Name": (
        "Hovering the mouse displays additional details, and clicking the "
        "model name navigates to the corresponding page."
    ),
    "Med. Len.": "Median token length of think and response for the model.",
    "Med. Resp. Len.": "Median token length of the model's responses (excluding think).",
}

# Characters replaced by "_" when deriving the per-row id passed to get_score_stars.
_LENGTH_STAR_ID_TABLE = str.maketrans(" -()", "____")


def _length_medal_style(rank):
    """Return the inline CSS for a model name given its rank (0 means unranked)."""
    return _LENGTH_MEDAL_STYLES.get(int(rank), _LENGTH_DEFAULT_NAME_STYLE)


def _format_length_cell(cell, col, int_cols, float_cols):
    """Format one table cell: rounded int, two decimals, or plain ``str``.

    NaN and blank strings are returned unchanged. Values that cannot be
    converted to a number fall back to ``str(cell)``.
    """
    if pd.isna(cell) or (isinstance(cell, str) and cell.strip() == ""):
        return cell
    try:
        if col in int_cols:
            return str(int(round(float(cell))))
        if col in float_cols:
            return "{:.2f}".format(float(cell))
    except (TypeError, ValueError, OverflowError):
        # Non-numeric content (or inf) in a numeric column: show it verbatim.
        pass
    return str(cell)


def render_length_category_html(df, med_len_map=None):
    """Render the length-category table as an HTML string.

    Rows are ordered by "Overall" in descending order. The model name is
    coloured gold/silver/bronze for ranks 1/2/3 (ties share a rank) and is
    followed by the Type / Model Type / Think badges. No rank column is shown.

    Column order is: Model Name, Overall, Med. Len., Med. Resp. Len., the
    first length-category column present, then every remaining column. The
    columns Rank_Internal, Comment, Group and Link are never displayed.

    Args:
        df: DataFrame to render. ``None`` or an empty frame yields a
            "No data available." placeholder.
        med_len_map: Optional mapping from model name to median length. When
            given, it (re)populates the "Med. Len." column.

    Returns:
        The table wrapped in a scrollable ``<div>``, as a string.

    Notes:
        * Blank "Overall" values sort to the top and are left unranked;
          other missing or non-numeric values sort to the bottom and are also
          unranked. A frame without an "Overall" column is rendered unsorted.
        * "Comment" and "Link" values are HTML-escaped before being placed in
          the ``title`` / ``href`` attributes.
    """
    if df is None or df.empty:
        return "<div>No data available.</div>"

    # Function-scope imports: the project helpers are only needed once there
    # is something to render.
    from html import escape

    from constants import NUMERIC_COLS_CATEGORY, NUMERIC_INT_COLS_CATEGORY
    from src.display.formatting import (
        get_model_type_badge,
        get_score_stars,
        get_think_badge,
        get_type_badge,
    )

    df = df.copy()

    if "Overall" in df.columns:
        overall = df["Overall"]
        numeric = pd.to_numeric(overall, errors="coerce")
        # Competition ranking on the numeric values only; anything that is
        # not a number gets rank 0, i.e. no medal colour.
        df["Rank_Internal"] = (
            numeric.rank(method="min", ascending=False).fillna(0).astype(int).to_numpy()
        )
        sort_keys = numeric.astype(float).to_numpy(copy=True)
        is_blank = overall.map(
            lambda v: isinstance(v, str) and v.strip() == ""
        ).to_numpy(dtype=bool)
        sort_keys[is_blank] = np.inf  # blanks first
        # Stable sort on the negated key: descending, ties keep input order,
        # NaN keys end up last.
        order = np.argsort(-sort_keys, kind="stable")
        df = df.iloc[order].reset_index(drop=True)
    else:
        df["Rank_Internal"] = 0

    # Badge rendering reads these three columns, so make sure they exist.
    for badge_col in ("Type", "Model Type", "Think"):
        if badge_col not in df.columns:
            df[badge_col] = "unknown"

    if med_len_map is not None and "Model Name" in df.columns:
        df["Med. Len."] = df["Model Name"].map(med_len_map)

    hidden_cols = {"Rank_Internal", "Comment", "Group", "Link"}
    base_cols = [col for col in df.columns if col not in hidden_cols]

    # First known length category that is present (e.g. "Short", "Long").
    category_col = next(
        (col for col in get_length_category_list() if col in base_cols), None
    )

    display_cols = [
        col
        for col in ("Model Name", "Overall", "Med. Len.", "Med. Resp. Len.")
        if col in base_cols
    ]
    if category_col and category_col not in display_cols:
        display_cols.append(category_col)
    display_cols.extend(col for col in base_cols if col not in display_cols)

    # Collect fragments and join once instead of repeated string concatenation.
    parts = ['<table class="pretty-leaderboard-table">\n<thead><tr>']
    for col in display_cols:
        tooltip = _LENGTH_HEADER_TOOLTIPS.get(col)
        if tooltip is None:
            parts.append(f'<th>{col}</th>')
        else:
            parts.append(
                f'<th>{col}'
                f'<span class="info-icon" title="{tooltip}">ⓘ</span>'
                '</th>'
            )
    parts.append('</tr></thead>\n<tbody>\n')

    for _, row in df.iterrows():
        parts.append('<tr>')
        for col in display_cols:
            cell = row[col]
            if col == "Model Name":
                style = _length_medal_style(row["Rank_Internal"])
                badge_html = (
                    get_type_badge(row.get("Type", "unknown"))
                    + get_model_type_badge(row.get("Model Type", "unknown"))
                    + get_think_badge(row.get("Think", "unknown"))
                )
                display_name = get_display_model_name(str(cell))

                # Optional tooltip taken from the "Comment" column.
                comment = row.get("Comment")
                comment_text = "" if pd.isna(comment) else str(comment).strip()
                title_attribute = (
                    f' title="{escape(comment_text)}"' if comment_text else ""
                )

                # Optional hyperlink taken from the "Link" column.
                link = row.get("Link")
                has_link = not pd.isna(link) and str(link).strip() != ""
                if has_link:
                    clickable_name = (
                        f'<a href="{escape(str(link))}" target="_blank" '
                        f'rel="noopener noreferrer" style="color:inherit;">'
                        f'{display_name}</a>'
                    )
                else:
                    clickable_name = display_name

                parts.append(
                    f'<td><span style="{style}"{title_attribute}>'
                    f'{clickable_name}</span>{badge_html}</td>'
                )
            elif col == "Overall":
                model_name = row.get("Model Name", None)
                try:
                    unique_id = model_name.translate(_LENGTH_STAR_ID_TABLE)
                    cell_html = get_score_stars(float(cell), unique_id=unique_id)
                except Exception:
                    # Best effort: a blank/non-numeric score or a missing model
                    # name is shown as plain text instead of stars.
                    cell_html = str(cell)
                parts.append(f'<td>{cell_html}</td>')
            else:
                formatted = _format_length_cell(
                    cell, col, NUMERIC_INT_COLS_CATEGORY, NUMERIC_COLS_CATEGORY
                )
                parts.append(f'<td>{formatted}</td>')
        parts.append('</tr>\n')
    parts.append('</tbody></table>')

    table_html = "".join(parts)
    # Wrap in a scrollable div so the header can stay sticky.
    return (
        '<div class="leaderboard-table-container" '
        f'style="max-height:900px;overflow-y:auto;">{table_html}</div>'
    )

def render_length_category_table(leaderboard_df=None):
    """
    Build a category dropdown plus an HTML table of length statistics.

    The table for the chosen category comes from get_length_category_df. When
    leaderboard_df is supplied, its Overall and per-category scores (and the
    Type / Model Type / Think columns, if present) are copied onto the
    length table by model name, and rows lacking either score are dropped.

    Returns a dict with the created "category_selector" and "table_html"
    components.
    """
    import gradio as gr

    categories = get_length_category_list()
    default_category = categories[0] if categories else ""

    def get_merged_df(selected_category):
        """Return the length table for one category, enriched from leaderboard_df."""
        if selected_category:
            length_df = get_length_category_df(selected_category)
        else:
            length_df = None

        if leaderboard_df is None or length_df is None:
            return length_df

        def lookup_for(column):
            # Model name -> value of `column` in the leaderboard.
            return dict(zip(leaderboard_df["Model Name"], leaderboard_df[column]))

        merged = length_df.copy()

        # Scores always come from the leaderboard.
        score_lookups = [
            (column, lookup_for(column)) for column in ("Overall", selected_category)
        ]
        for column, lookup in score_lookups:
            merged[column] = merged["Model Name"].map(lookup)

        # Badge columns are copied only when the leaderboard has them.
        for column in ("Type", "Model Type", "Think"):
            if column in leaderboard_df.columns:
                merged[column] = merged["Model Name"].map(lookup_for(column))

        # Keep only rows that have both an Overall and a category score.
        has_overall = merged["Overall"].notna()
        has_category = merged[selected_category].notna()
        return merged[has_overall & has_category]

    initial_df = get_merged_df(default_category)

    # Median-length lookup, available only if the leaderboard carries the column.
    if leaderboard_df is not None and "Med. Len." in leaderboard_df.columns:
        med_len_map = dict(zip(leaderboard_df["Model Name"], leaderboard_df["Med. Len."]))
    else:
        med_len_map = None

    with gr.Column():
        category_selector = gr.Dropdown(
            choices=categories,
            value=default_category,
            label="Select Category for Length Table",
            interactive=True,
        )

    if initial_df is not None:
        initial_html = render_length_category_html(initial_df, med_len_map=med_len_map)
    else:
        initial_html = "<div>No data available.</div>"
    table_html = gr.HTML(value=initial_html, elem_id="length-category-table")

    def update_table(selected_category):
        """Re-render the table for the newly selected category."""
        return render_length_category_html(
            get_merged_df(selected_category), med_len_map=med_len_map
        )

    category_selector.change(
        fn=update_table,
        inputs=[category_selector],
        outputs=[table_html],
    )

    return {
        "category_selector": category_selector,
        "table_html": table_html,
    }

def _sort_desc_blanks_first(frame, column):
    """Return *frame* stably sorted by *column* in descending order.

    Blank strings are treated as +inf so they come first; values that are
    missing or not numeric sort last. The result has a fresh 0..n-1 index.
    """
    values = frame[column]
    keys = pd.to_numeric(values, errors="coerce").astype(float).to_numpy(copy=True)
    is_blank = (
        values.map(lambda v: isinstance(v, str) and v.strip() == "")
        .astype(bool)
        .to_numpy()
    )
    keys[is_blank] = np.inf
    # Stable sort on the negated key: descending, ties keep input order,
    # NaN keys end up last.
    order = np.argsort(-keys, kind="stable")
    return frame.iloc[order].reset_index(drop=True)


def _with_rank_after_model_name(frame):
    """Return *frame* with a 1-based "Rank" column placed right after "Model Name".

    Rank follows the current row order. If there is no "Model Name" column the
    "Rank" column is left where ``assign`` put it.
    """
    ranked = frame.assign(Rank=np.arange(1, len(frame) + 1))
    cols = ranked.columns.tolist()
    if "Model Name" in cols:
        cols.remove("Rank")
        cols.insert(cols.index("Model Name") + 1, "Rank")
        ranked = ranked[cols]
    return ranked


def create_leaderboard_tab(df, key):
    """Build the leaderboard tab: filters, sort dropdown, table and CSV download.

    Args:
        df: DataFrame to display. It must contain an "Overall" column.
        key: "Category" or "Language"; selects which columns are offered for
            sorting and is forwarded to render_leaderboard_html. It is also
            used in the CSV file name.

    Returns:
        ``None`` when *df* has no "Overall" column. Otherwise a dict holding
        the created components ("type_selector", "model_type_selector",
        "think_selector", "leaderboard_html_comp", "sort_col_dropdown",
        "df_state", "top5_state") and the "unified_filter" callback.
    """
    if "Overall" not in df.columns:
        return None

    df_state = gr.State(df)

    # Frame for the initially rendered table.
    df_badge = df.copy()
    # If Overall values are in the range 0~1, convert them to 0~100. Only the
    # numeric entries are scaled, so blank placeholders survive untouched.
    overall_numeric = pd.to_numeric(df_badge["Overall"], errors="coerce")
    if overall_numeric.max() <= 1.0:
        df_badge["Overall"] = (overall_numeric * 100).where(
            overall_numeric.notna(), df_badge["Overall"]
        )
    # The Group column is never displayed.
    df_badge = df_badge.drop(columns=["Group"], errors="ignore")
    df_badge = _sort_desc_blanks_first(df_badge, "Overall")
    df_badge = _with_rank_after_model_name(df_badge)

    with gr.Row():
        # Type Selector (Open/Proprietary)
        type_choices = ["Open", "Proprietary"]
        type_selector = gr.CheckboxGroup(
            choices=type_choices,
            value=type_choices,
            label="Select Type (Open/Proprietary)"
        )

        # Model Type Selector (Instruct/Think/Hybrid)
        model_type_choices = ["Instruct", "Think", "Hybrid"]
        model_type_selector = gr.CheckboxGroup(
            choices=model_type_choices,
            value=model_type_choices,
            label="Select Model Type (Instruct/Think/Hybrid)"
        )

        # Think Selector (On/Off)
        think_choices = ["On", "Off"]
        think_selector = gr.CheckboxGroup(
            choices=think_choices,
            value=think_choices,
            label="Select Think Mode (On/Off)"
        )

        # Candidate sort columns (sorting is always descending).
        if key == "Language":
            import re
            # Language columns are two-letter upper-case codes (e.g. "VI").
            language_columns = [
                col for col in df_badge.columns
                if isinstance(col, str) and re.fullmatch(r"[A-Z]{2}", col)
            ]
            candidate_sort_columns = [
                "Overall", "Med. Len.", "Med. Resp. Len.", "Parameter Size (B)"
            ] + language_columns
        else:
            candidate_sort_columns = [
                "Overall", "Med. Len.", "Med. Resp. Len.", "Parameter Size (B)",
                "Content Generation", "Editing", "Data Analysis", "Reasoning",
                "Hallucination", "Safety", "Repetition", "Summarization",
                "Translation", "Multi-Turn"
            ]
        # Only offer columns that exist, so a choice can never fail at sort time.
        available_sort_columns = [
            col for col in candidate_sort_columns if col in df_badge.columns
        ]

        sort_col_dropdown = gr.Dropdown(
            choices=available_sort_columns,
            value="Overall",
            label="Sort by",
            interactive=True,
        )

    leaderboard_html = render_leaderboard_html(df_badge.round(3), overall_col="Overall", key=key)
    leaderboard_html_comp = gr.HTML(value=leaderboard_html, elem_id="leaderboard-table")

    def unified_filter(types, model_types, thinks, df, sort_col):
        """Filter, sort and re-render the leaderboard.

        An empty selection in a checkbox group means "no restriction".
        Matching is case-insensitive and ignores surrounding whitespace.

        Returns:
            ``(html, sort_col, top5_models)``: the rendered table, the column
            actually used for sorting, and the first five model names in the
            sorted order.
        """
        if "Overall" not in df.columns:
            html = "<div style='color:red'>No 'Overall' column found in data. Please check your input data.</div>"
            # Three values, matching the three outputs wired to this callback.
            return html, sort_col, []

        filtered = df.copy()
        keep = np.ones(len(filtered), dtype=bool)
        selections = (("Type", types), ("Model Type", model_types), ("Think", thinks))
        for column, selected in selections:
            if column not in filtered.columns:
                continue
            # Normalise first, so missing values become "" rather than NaN.
            filtered[column] = filtered[column].fillna("").astype(str)
            if selected:
                wanted = {str(v).lower().strip() for v in selected}
                keep &= filtered[column].str.lower().str.strip().isin(wanted).to_numpy()
        filtered = filtered[keep]

        # Fall back to "Overall" when the requested column is unavailable.
        if sort_col not in filtered.columns:
            sort_col = "Overall"
        filtered = _sort_desc_blanks_first(filtered, sort_col)
        filtered = _with_rank_after_model_name(filtered)
        filtered = filtered.drop(columns=["Group"], errors="ignore")
        # NOTE(review): presumably read by render_leaderboard_html to know the
        # active sort column; confirm before removing.
        filtered._sort_col = sort_col

        # The frame is already in display order, so the top five are its head.
        if "Model Name" in filtered.columns:
            top5_models = filtered["Model Name"].head(5).tolist()
        else:
            top5_models = []
        return render_leaderboard_html(filtered, overall_col="Overall", key=key), sort_col, top5_models

    def dataframe_to_csv(data):
        """Write *data* to ``truebench_<key>.csv`` and return that path."""
        if isinstance(data, pd.DataFrame):
            # Copy so the frame held in gr.State is not modified.
            csv_df = data.copy()
        else:
            csv_df = pd.DataFrame(data)

        # Export display names rather than raw model identifiers.
        if "Model Name" in csv_df.columns:
            csv_df["Model Name"] = csv_df["Model Name"].apply(get_display_model_name)

        csv_path = f"truebench_{key}.csv"
        csv_df.to_csv(csv_path, index=False)
        return csv_path

    # Download button, pushed to the right by an empty column.
    with gr.Row():
        with gr.Column(scale=1):
            pass  # Empty space
        with gr.Column(scale=0):
            download_btn = gr.DownloadButton(
                label="📥 Download to CSV",
                value=dataframe_to_csv,
                inputs=[df_state],
                visible=True,
                elem_classes=["custom-download-btn"]
            )

    # Add custom CSS
    custom_css = """
    <style>
    .custom-download-btn >>> a {
        background: #e3e6f3 !important;
        color: #222 !important;
        border: 1px solid rgba(0, 0, 0, 0.1) !important;
        border-radius: 6px !important;
        padding: 1px 1px !important;
        font-size: 13px !important;
        font-weight: bold !important;
        text-shadow: 0 1px 1px rgba(0,0,0,0.1) !important;
        margin: 0 3px 3px 0 !important;
    }
    .custom-download-btn:hover {
        background: #f5f6fa !important;
        box-shadow: 0 2px 6px rgba(0, 0, 0, 0.1) !important;
    }
    </style>
    """
    gr.HTML(custom_css)

    # Every control re-runs the same callback with the same inputs/outputs.
    # A single State receives the top-5 list so callers can read it.
    top5_state = gr.State()
    filter_inputs = [type_selector, model_type_selector, think_selector, df_state, sort_col_dropdown]
    filter_outputs = [leaderboard_html_comp, sort_col_dropdown, top5_state]
    for control in (sort_col_dropdown, type_selector, model_type_selector, think_selector):
        control.change(fn=unified_filter, inputs=filter_inputs, outputs=filter_outputs)

    return {
        "type_selector": type_selector,
        "model_type_selector": model_type_selector,
        "think_selector": think_selector,
        "leaderboard_html_comp": leaderboard_html_comp,
        "sort_col_dropdown": sort_col_dropdown,
        "df_state": df_state,
        "top5_state": top5_state,
        "unified_filter": unified_filter  # Exposed for direct external call
    }