Spaces:

SamsungResearch
/

TRUEBench

Running

File size: 21,121 Bytes

8a254d6

import gradio as gr
from src.display.formatting import render_leaderboard_html, get_display_model_name
from src.data_utils import get_length_category_list, get_length_category_df
import pandas as pd
import numpy as np

def render_length_category_html(df, med_len_map=None):
    """
    Render the length-category leaderboard as an HTML table in a scrollable container.

    Rows are ordered by "Overall" (descending) and ranked with ``method="min"``; the rank
    is not shown as a column but colors the Model Name cell (gold/silver/bronze for ranks
    1/2/3). The Model Name cell also carries the Type / Model Type / Think badges, an
    optional tooltip taken from the "Comment" column and an optional hyperlink taken from
    the "Link" column. "Overall" is rendered through ``get_score_stars``; every other cell
    is formatted according to NUMERIC_INT_COLS_CATEGORY / NUMERIC_COLS_CATEGORY.

    Args:
        df: DataFrame with at least an "Overall" column. ``None`` or an empty frame yields
            a "No data available." placeholder.
        med_len_map: Optional mapping of model name -> median length. When given, a
            "Med. Len." column is filled by mapping "Model Name" through it.

    Returns:
        An HTML string.
    """
    if df is None or df.empty:
        return "<div>No data available.</div>"

    # Function-scope imports are resolved once per call (not once per row) and only after
    # the empty-data guard, so the placeholder path has no project dependencies.
    from html import escape

    from constants import NUMERIC_COLS_CATEGORY, NUMERIC_INT_COLS_CATEGORY
    from src.data_utils import get_length_category_list
    from src.display.formatting import (
        get_model_type_badge,
        get_score_stars,
        get_think_badge,
        get_type_badge,
    )

    # Order rows: a pre-pass on "is empty string", then a stable descending sort on Overall.
    df = df.copy()
    df = df.sort_values("Overall", key=lambda x: (x == "").astype(int))
    df = df.sort_values("Overall", ascending=False, kind="mergesort").reset_index(drop=True)
    # rank() leaves NaN for rows without an Overall value; casting NaN to int raises, so
    # place such rows after every real rank instead.
    ranks = df["Overall"].rank(method="min", ascending=False)
    df["Rank_Internal"] = ranks.fillna(len(df) + 1).astype(int)

    # Badge rendering needs these columns; default to "unknown" when the caller did not
    # provide them.
    for badge_col in ("Type", "Model Type", "Think"):
        if badge_col not in df.columns:
            df[badge_col] = "unknown"

    # Optionally add Med. Len. column
    if med_len_map is not None:
        df["Med. Len."] = df["Model Name"].map(med_len_map)

    # Column order: Model Name, Overall, Med. Len., Med. Resp. Len., {Category}, then the
    # rest. Rank_Internal, Comment, Group and Link are never shown as columns.
    hidden_cols = ("Rank_Internal", "Comment", "Group", "Link")
    base_cols = [col for col in df.columns if col not in hidden_cols]
    # The first known length category that is present in the frame (e.g. "Short", "Long").
    category_col = next(
        (col for col in get_length_category_list() if col in base_cols), None
    )
    display_cols = [
        col
        for col in ("Model Name", "Overall", "Med. Len.", "Med. Resp. Len.")
        if col in base_cols
    ]
    if category_col:
        display_cols.append(category_col)
    for col in base_cols:
        if col not in display_cols:
            display_cols.append(col)

    # Header cells; some columns get an info icon with an explanatory tooltip.
    header_tooltips = {
        "Model Name": "Hovering the mouse displays additional details, and clicking the model name navigates to the corresponding page.",
        "Med. Len.": "Median token length of think and response for the model.",
        "Med. Resp. Len.": "Median token length of the model's responses (excluding think).",
    }
    parts = ['<table class="pretty-leaderboard-table">\n<thead><tr>']
    for col in display_cols:
        tooltip = header_tooltips.get(col)
        if tooltip is None:
            parts.append(f'<th>{col}</th>')
        else:
            parts.append(
                f'<th>{col}<span class="info-icon" title="{tooltip}">ⓘ</span></th>'
            )
    parts.append('</tr></thead>\n<tbody>\n')

    def format_leaderboard_cell(cell, col):
        # NaN / blank strings are passed through untouched.
        if pd.isna(cell) or (isinstance(cell, str) and cell.strip() == ""):
            return cell
        try:
            if col in NUMERIC_INT_COLS_CATEGORY:
                # Integer (rounded)
                return str(int(round(float(cell))))
            if col in NUMERIC_COLS_CATEGORY:
                # Two decimal places
                return "{:.2f}".format(float(cell))
        except (TypeError, ValueError, OverflowError):
            # Non-numeric content in a numeric column: show it verbatim.
            pass
        return str(cell)

    rank_styles = {
        1: "color: #ffd700; font-weight: bold; text-shadow: 0 0 4px #fff2;",
        2: "color: #b0b0b0; font-weight: bold;",
        3: "color: #cd7f32; font-weight: bold;",
    }
    default_style = "color: #fff; font-weight: 600;"
    # Characters replaced by "_" when deriving the id passed to get_score_stars.
    id_translation = str.maketrans(" -()", "____")

    for _, row in df.iterrows():
        parts.append('<tr>')
        for col in display_cols:
            cell = row[col]
            if col == "Model Name":
                # Gold/Silver/Bronze for 1/2/3
                style = rank_styles.get(row["Rank_Internal"], default_style)
                badge_html = (
                    get_type_badge(row["Type"])
                    + get_model_type_badge(row["Model Type"])
                    + get_think_badge(row["Think"])
                )
                display_name = get_display_model_name(str(cell))

                # Tooltip from the Comment column. The text lands inside an HTML
                # attribute, so it must be escaped (quotes included).
                title_attribute = ""
                if "Comment" in row and pd.notna(row["Comment"]):
                    comment_value = str(row["Comment"]).strip()
                    if comment_value:
                        title_attribute = f' title="{escape(comment_value, quote=True)}"'

                # Optional hyperlink from the Link column, escaped for the same reason.
                clickable_name = display_name
                if "Link" in row and pd.notna(row["Link"]) and str(row["Link"]).strip() != "":
                    href = escape(str(row["Link"]), quote=True)
                    clickable_name = (
                        f'<a href="{href}" target="_blank" style="color:inherit;">{display_name}</a>'
                    )

                parts.append(
                    f'<td><span style="{style}"{title_attribute}>{clickable_name}</span>{badge_html}</td>'
                )
            elif col == "Overall":
                # Show stars. Deliberately best-effort: any failure (missing or non-string
                # model name, non-numeric score, renderer error) falls back to the raw value.
                try:
                    unique_id = row.get("Model Name", None).translate(id_translation)
                    cell_html = get_score_stars(float(cell), unique_id=unique_id)
                except Exception:
                    cell_html = str(cell)
                parts.append(f'<td>{cell_html}</td>')
            else:
                parts.append(f'<td>{format_leaderboard_cell(cell, col)}</td>')
        parts.append('</tr>\n')
    parts.append('</tbody></table>')

    table_html = "".join(parts)
    # Wrap in scrollable div for sticky header
    return f'<div class="leaderboard-table-container" style="max-height:900px;overflow-y:auto;">{table_html}</div>'

def render_length_category_table(leaderboard_df=None):
    """
    Build a category dropdown and an HTML table of length stats for the chosen category.

    When ``leaderboard_df`` is given, its "Overall", per-category score and (if present)
    "Type" / "Model Type" / "Think" values are copied onto the length table by model name,
    and rows lacking an Overall or category score are dropped. The table is re-rendered
    whenever the dropdown value changes.

    Returns:
        dict with the created components under "category_selector" and "table_html".
    """
    import gradio as gr

    categories = get_length_category_list()
    default_category = categories[0] if categories else ""

    def get_merged_df(selected_category):
        # Length stats for the category; None when no category is selected.
        length_df = get_length_category_df(selected_category) if selected_category else None
        if leaderboard_df is None or length_df is None:
            return length_df

        model_names = leaderboard_df["Model Name"]

        def lookup_for(column):
            # model name -> value of `column` in the main leaderboard
            return dict(zip(model_names, leaderboard_df[column]))

        merged = length_df.copy()
        # Overall and the category score always come from the main leaderboard.
        overall_lookup = lookup_for("Overall")
        category_lookup = lookup_for(selected_category)
        merged["Overall"] = merged["Model Name"].map(overall_lookup)
        merged[selected_category] = merged["Model Name"].map(category_lookup)
        # Badge columns are copied only when the leaderboard has them.
        for badge_col in ("Type", "Model Type", "Think"):
            if badge_col in leaderboard_df.columns:
                merged[badge_col] = merged["Model Name"].map(lookup_for(badge_col))
        # Keep only rows that have both an Overall and a category score.
        has_scores = merged["Overall"].notna() & merged[selected_category].notna()
        return merged[has_scores]

    # Median-length lookup, available only when the leaderboard carries that column.
    if leaderboard_df is not None and "Med. Len." in leaderboard_df.columns:
        med_len_map = dict(zip(leaderboard_df["Model Name"], leaderboard_df["Med. Len."]))
    else:
        med_len_map = None

    df = get_merged_df(default_category)
    if df is not None:
        initial_html = render_length_category_html(df, med_len_map=med_len_map)
    else:
        initial_html = "<div>No data available.</div>"

    with gr.Column():
        category_selector = gr.Dropdown(
            choices=categories,
            value=default_category,
            label="Select Category for Length Table",
            interactive=True,
        )

    table_html = gr.HTML(value=initial_html, elem_id="length-category-table")

    def update_table(selected_category):
        # Re-merge and re-render for the newly selected category.
        return render_length_category_html(
            get_merged_df(selected_category), med_len_map=med_len_map
        )

    category_selector.change(
        fn=update_table,
        inputs=[category_selector],
        outputs=[table_html],
    )

    return {"category_selector": category_selector, "table_html": table_html}

def _lb_filter_by_selection(frame, column, selected):
    """Keep rows of *frame* whose *column* is one of *selected* (case/whitespace-insensitive).

    The column is normalized to strings (missing -> ""). A missing column leaves the frame
    untouched; an empty selection normalizes the column but filters nothing.
    """
    if column not in frame.columns:
        return frame
    normalized = frame[column].fillna("").astype(str)
    frame = frame.assign(**{column: normalized})
    if not selected:
        return frame
    wanted = {str(choice).lower().strip() for choice in selected}
    return frame[normalized.str.lower().str.strip().isin(wanted)]


def _lb_sort_descending(frame, sort_col):
    """Return *frame* stably sorted by *sort_col* descending, empty strings first, index reset."""
    # Empty strings are treated as +inf so that they come out on top of a descending sort.
    sort_key = frame[sort_col].replace('', np.inf).astype(float)
    return (
        frame.assign(sort_col_tmp=sort_key)
        .sort_values('sort_col_tmp', ascending=False, kind="mergesort")
        .drop(columns=['sort_col_tmp'])
        .reset_index(drop=True)
    )


def _lb_add_rank_column(frame):
    """Return *frame* with a 1-based "Rank" column placed right after "Model Name".

    The rank is ``index + 1``, so callers are expected to pass a frame whose index was
    reset after sorting. Without a "Model Name" column, "Rank" keeps its position.
    """
    ranked = frame.assign(Rank=frame.index + 1)
    if "Model Name" not in ranked.columns:
        return ranked
    columns = [col for col in ranked.columns if col != "Rank"]
    columns.insert(columns.index("Model Name") + 1, "Rank")
    return ranked[columns]


def create_leaderboard_tab(df, key):
    """
    Build the leaderboard tab: filter checkboxes, sort dropdown, HTML table and CSV download.

    Args:
        df: DataFrame to display. Must contain an "Overall" column.
        key: "Category" or "Language". Selects the sort-column candidates, is forwarded to
            ``render_leaderboard_html`` and is used in the CSV file name.

    Returns:
        dict with the created components ("type_selector", "model_type_selector",
        "think_selector", "leaderboard_html_comp", "sort_col_dropdown", "df_state") and the
        "unified_filter" callback; ``None`` when *df* has no "Overall" column.
    """
    if "Overall" not in df.columns:
        return  # Or handle error appropriately

    import re

    df_state = gr.State(df)

    # Frame used for the initial render of the table.
    # NOTE(review): the 0~1 -> 0~100 rescale below and the round(3) at render time are
    # applied only to this initial render; unified_filter renders from the unmodified
    # frame held in df_state. Confirm this difference is intended.
    df_badge = df.copy()
    # If Overall values are in the range 0~1, convert to 0~100
    if df_badge["Overall"].max() <= 1.0:
        df_badge["Overall"] = df_badge["Overall"] * 100
    # Group is never displayed.
    df_badge = df_badge.drop(columns=["Group"], errors="ignore")
    # Always sort by "Overall": a pre-pass on "is empty string", then a stable descending sort.
    df_badge = df_badge.sort_values("Overall", key=lambda x: (x == "").astype(int))
    df_badge = df_badge.sort_values("Overall", ascending=False, kind="mergesort").reset_index(drop=True)
    df_badge = _lb_add_rank_column(df_badge)

    with gr.Row():
        # Type Selector (Open/Proprietary)
        type_choices = ["Open", "Proprietary"]
        type_selector = gr.CheckboxGroup(
            choices=type_choices,
            value=type_choices,
            label="Select Type (Open/Proprietary)"
        )

        # Model Type Selector (Instruct/Think/Hybrid)
        model_type_choices = ["Instruct", "Think", "Hybrid"]
        model_type_selector = gr.CheckboxGroup(
            choices=model_type_choices,
            value=model_type_choices,
            label="Select Model Type (Instruct/Think/Hybrid)"
        )
        # Think Selector (On/Off)
        think_choices = ["On", "Off"]
        think_selector = gr.CheckboxGroup(
            choices=think_choices,
            value=think_choices,
            label="Select Think Mode (On/Off)"
        )

        # Sort criteria (always descending). For the language leaderboard the two-letter
        # upper-case columns are picked up dynamically.
        if key == "Language":
            language_columns = [col for col in df_badge.columns if re.fullmatch(r"[A-Z]{2}", col)]
            sort_candidates = ["Overall", "Med. Len.", "Med. Resp. Len.", "Parameter Size (B)"] + language_columns
        else:
            sort_candidates = [
                "Overall", "Med. Len.", "Med. Resp. Len.", "Parameter Size (B)", "Content Generation", "Editing", "Data Analysis", "Reasoning",
                "Hallucination", "Safety", "Repetition", "Summarization", "Translation", "Multi-Turn"
            ]
        # Offer only columns that exist; sorting by a missing column would raise KeyError.
        available_sort_columns = [col for col in sort_candidates if col in df_badge.columns]

        sort_col_dropdown = gr.Dropdown(
            choices=available_sort_columns,
            value="Overall",
            label="Sort by",
            interactive=True,
        )

    # Initial table
    leaderboard_html = render_leaderboard_html(df_badge.round(3), overall_col="Overall", key=key)
    leaderboard_html_comp = gr.HTML(value=leaderboard_html, elem_id="leaderboard-table")

    def unified_filter(types, model_types, thinks, df, sort_col):
        """Filter by the three selectors, sort descending by *sort_col* and re-render.

        Returns a 3-tuple ``(html, sort_col, top5_models)`` on every path, matching the
        three outputs declared on the event listeners.
        """
        filtered = df.copy()
        selections = (("Type", types), ("Model Type", model_types), ("Think", thinks))
        for column, selected in selections:
            filtered = _lb_filter_by_selection(filtered, column, selected)

        # Defensive: always ensure "Overall" exists
        if "Overall" not in filtered.columns:
            html = "<div style='color:red'>No 'Overall' column found in data. Please check your input data.</div>"
            return html, sort_col, []
        # Unknown or missing sort column: fall back to the default instead of raising.
        if sort_col not in filtered.columns:
            sort_col = "Overall"

        filtered = _lb_sort_descending(filtered, sort_col)
        filtered = _lb_add_rank_column(filtered)
        # Always remove Group column
        filtered = filtered.drop(columns=["Group"], errors="ignore")
        # Tag the frame with the active sort column; presumably read by
        # render_leaderboard_html -- TODO confirm. Must be set on the final frame because
        # pandas does not carry ad-hoc attributes over to derived frames.
        filtered._sort_col = sort_col

        # The frame is already sorted, so the top five are simply its first five names.
        top5_models = []
        if "Model Name" in filtered.columns:
            top5_models = filtered["Model Name"].tolist()[:5]
        return render_leaderboard_html(filtered, overall_col="Overall", key=key), sort_col, top5_models

    # Download CSV function
    def dataframe_to_csv(data):
        # Work on a copy so the frame held in state is not modified.
        if isinstance(data, pd.DataFrame):
            csv_df = data.copy()
        else:
            csv_df = pd.DataFrame(data)

        # Export display names rather than raw model identifiers.
        if "Model Name" in csv_df.columns:
            csv_df["Model Name"] = csv_df["Model Name"].apply(get_display_model_name)

        csv_path = f"truebench_{key}.csv"
        csv_df.to_csv(csv_path, index=False)
        return csv_path

    # Add DownloadButton (using CSS class)
    with gr.Row():
        with gr.Column(scale=1):
            pass  # Empty space
        with gr.Column(scale=0):
            download_btn = gr.DownloadButton(
                label="📥 Download to CSV",
                value=dataframe_to_csv,
                inputs=[df_state],
                visible=True,
                elem_classes=["custom-download-btn"]
            )

    # Add custom CSS
    # NOTE(review): ">>>" is not a standard CSS combinator; confirm the first rule below
    # actually takes effect in the target browsers.
    custom_css = """
    <style>
    .custom-download-btn >>> a {
        background: #e3e6f3 !important;
        color: #222 !important;
        border: 1px solid rgba(0, 0, 0, 0.1) !important;
        border-radius: 6px !important;
        padding: 1px 1px !important;
        font-size: 13px !important;
        font-weight: bold !important;
        text-shadow: 0 1px 1px rgba(0,0,0,0.1) !important;
        margin: 0 3px 3px 0 !important;
    }
    .custom-download-btn:hover {
        background: #f5f6fa !important;
        box-shadow: 0 2px 6px rgba(0, 0, 0, 0.1) !important;
    }
    </style>
    """
    gr.HTML(custom_css)

    # Every selector triggers the same callback with the same inputs and outputs; the
    # third output receives top5_models.
    top5_state = gr.State()
    filter_inputs = [type_selector, model_type_selector, think_selector, df_state, sort_col_dropdown]
    filter_outputs = [leaderboard_html_comp, sort_col_dropdown, top5_state]
    for trigger in (sort_col_dropdown, type_selector, model_type_selector, think_selector):
        trigger.change(fn=unified_filter, inputs=filter_inputs, outputs=filter_outputs)

    return {
        "type_selector": type_selector,
        "model_type_selector": model_type_selector,
        "think_selector": think_selector,
        "leaderboard_html_comp": leaderboard_html_comp,
        "sort_col_dropdown": sort_col_dropdown,
        "df_state": df_state,
        "unified_filter": unified_filter  # Exposed for direct external call
    }