# Spaces: SamsungResearch / TRUEBench (Running)
# File size: 27,616 Bytes
# 8a254d6

import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.graph_objs._figure import Figure
from typing import Optional, List, Dict, Any
from src.display.formatting import get_display_model_name

# Translates a metric label into the DataFrame column name used when the radar
# charts pick their default top-N models; labels missing from this map are
# used as column names unchanged (see the `.get(metric_type, metric_type)` calls).
SORT_COLUMN_MAP: Dict[str, str] = {
    "Average Accuracy": "Avg AC",
    "Tool Selection Quality": "Avg TSQ",
    "Session Cost": "Avg Total Cost",
}

def get_theme_colors(theme: str = "light") -> Dict[str, Any]:
    """Return the chart color settings for ``theme``.

    Only the paper/plot background depends on the theme: "dark" uses a darker
    blue-gray, every other value (including "light") uses a deep blue-gray.
    Legend and annotation colors are the same for both.
    """
    background = "#181c3a" if theme == "dark" else "#23244a"
    return {
        "paper_bg": background,
        "plot_bg": background,
        "legend_font_color": "#F5F6F7",
        "legend_bg": "rgba(35,36,74,0.92)",
        "annotation_color": "#F5F6F7",
    }

def create_empty_radar_chart(message: str) -> Figure:
    """Create an empty placeholder chart that displays ``message``.

    The figure has no traces: it carries a title, a centered boxed annotation
    with the message, and a small "TRUEBench" watermark in the bottom-right
    corner.

    Args:
        message: Text shown (prefixed with a chart emoji) in the middle of
            the figure.

    Returns:
        An 800x800 plotly figure containing only layout and annotations.
    """
    fig = go.Figure()
    fig.update_layout(
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=800,
        width=800,
        margin=dict(t=100, b=80, l=80, r=80),
        title=dict(
            text="<b>Domain Performance Chart</b>",
            x=0.5,
            y=0.97,
            font=dict(
                size=22,
                family="Verdana, sans-serif",
                color="#F5F6F7",
                weight=700
            ),
        ),
    )
    # Both annotations are appended with add_annotation. Passing
    # `annotations=[...]` to update_layout after an annotation already exists
    # merges the new dict into the existing one, which replaced the message
    # text and position with the watermark's.
    fig.add_annotation(
        text=f"📊 {message}",
        xref="paper", yref="paper",
        x=0.5, y=0.5,
        xanchor='center', yanchor='middle',
        font=dict(
            size=18,
            color="#94A3B8",
            family="Verdana, sans-serif"
        ),
        showarrow=False,
        bgcolor="rgba(245, 246, 247, 0.05)",
        bordercolor="rgba(245, 246, 247, 0.2)",
        borderwidth=1,
        borderpad=20
    )
    fig.add_annotation(
        text="TRUEBench",
        xref="paper", yref="paper",
        x=0.98, y=0.02,
        xanchor='right', yanchor='bottom',
        font=dict(size=10, color='#64748B'),
        showarrow=False
    )
    return fig

def create_len_overall_scatter(
    df: pd.DataFrame,
    selected_models: Optional[List[str]] = None,
    max_models: int = 30,
    y_col: str = "Overall",
    length_data: Optional[dict] = None,
    theme: str = "light",
    x_axis_data_source: str = "Med. Len."
) -> Figure:
    """
    Create a scatter plot of a length statistic (x) against the ``y_col`` score (y).

    Points are split into traces by thinking mode (the 'Think' column) and by
    marker type: rows with a numeric 'Parameter Size (B)' are drawn as circles
    whose size grows with log10 of the parameter size; rows without one are
    drawn as fixed-size stars. Rows whose 'Type' is 'Proprietary' are dropped
    when a 'Type' column exists. Dashed lines mark the median of each axis.

    Args:
        df: Leaderboard data. Must contain 'Model Name', 'Med. Len.',
            'Med. Resp. Len.', ``y_col`` and 'Think'.
        selected_models: Model names to plot. When None or empty, the top
            ``max_models`` rows by 'Overall' are used (by ``y_col`` if the
            frame has no 'Overall' column).
        max_models: Row limit applied only when no models are selected.
        y_col: Score column plotted on the y-axis.
        length_data: Optional nested mapping read as
            ``length_data[model][y_col]['Med' | 'Med Resp']``. Used for the
            x values when ``y_col`` is not "Overall"; models missing from it
            get 0.
        theme: "light" or "dark" (default: "light").
        x_axis_data_source: DataFrame column used for the x-axis. "Med. Len."
            selects the 'Med' key of ``length_data``; any other value selects
            'Med Resp'.

    Returns:
        The scatter figure, or the placeholder built by
        ``create_empty_radar_chart`` when a required column is missing or no
        rows remain to plot.
    """
    # Defensive: check required columns ('Think' is checked last, after the
    # length/score columns).
    required_cols = ['Model Name', 'Med. Len.', 'Med. Resp. Len.', y_col, 'Think']
    for col in required_cols:
        if col not in df.columns:
            return create_empty_radar_chart(f"Column '{col}' not found in data")

    empty_message = f"No data available for {x_axis_data_source} vs {y_col} analysis"

    # Filter by selected_models, or fall back to the top-N rows.
    if selected_models is not None and len(selected_models) > 0:
        df_filtered = df[df['Model Name'].isin(selected_models)].copy()
    else:
        sort_col = 'Overall' if 'Overall' in df.columns else y_col
        df_filtered = df.sort_values(sort_col, ascending=False).head(max_models).copy()
    if df_filtered.empty:
        return create_empty_radar_chart(empty_message)

    # Determine x-axis data based on x_axis_data_source
    x_axis_col_name = x_axis_data_source  # Use this for the DataFrame column
    length_data_key = 'Med' if x_axis_data_source == "Med. Len." else 'Med Resp'

    if y_col != "Overall" and length_data:
        # Per-category lengths come from length_data rather than the frame.
        x_values = df_filtered['Model Name'].apply(
            lambda name: length_data.get(name, {}).get(y_col, {}).get(length_data_key, 0)
        )
    else:
        # 'Overall', or no length_data supplied: read the DataFrame column.
        x_values = df_filtered[x_axis_col_name]
    df_filtered[x_axis_col_name] = pd.to_numeric(x_values, errors='coerce')
    df_filtered[y_col] = pd.to_numeric(df_filtered[y_col], errors='coerce')

    if 'Type' in df_filtered.columns:
        df_filtered = df_filtered[df_filtered['Type'] != 'Proprietary'].copy()
        if df_filtered.empty:
            # Nothing left to plot once proprietary rows are removed.
            return create_empty_radar_chart(empty_message)

    # Marker sizing. A missing 'Parameter Size (B)' column is treated like a
    # column of unknown sizes, so every row falls into the fixed-size 'star'
    # group below instead of raising KeyError.
    min_size = 20
    max_size = 80
    star_size = 30
    size_floor = 5  # sizes at or below this (and unknown sizes) map to min_size
    if 'Parameter Size (B)' in df_filtered.columns:
        param_sizes = pd.to_numeric(df_filtered['Parameter Size (B)'], errors='coerce')
    else:
        param_sizes = pd.Series(np.nan, index=df_filtered.index, dtype=float)
    df_filtered['Parameter Size (B)'] = param_sizes

    # Clip to the floor before taking logs so small models cannot produce a
    # negative marker size and non-positive values cannot hit log10(<=0).
    floored_sizes = param_sizes.fillna(size_floor).clip(lower=size_floor)
    log_floor = np.log10(size_floor)
    log_span = np.log10(floored_sizes.max()) - log_floor
    if log_span > 0:
        size_fraction = (np.log10(floored_sizes) - log_floor) / log_span
    else:
        # Every size is at or below the floor: avoid dividing by zero.
        size_fraction = pd.Series(0.0, index=df_filtered.index)
    marker_sizes = min_size + size_fraction * (max_size - min_size)

    legend_name_map = {
        'On': 'Thinking',
        'Off': 'Non-Thinking'
    }
    color_palette = {
        "Thinking": "#FCE39B",
        "Non-Thinking": "#FF9185"
    }
    prefix_map = {
        'circle': 'Open',
        'star': 'Proprietary'
    }
    marker_order = {'circle': 0, 'star': 1}
    think_order = {'Thinking': 0, 'Non-Thinking': 1}

    df_filtered['MarkerType'] = df_filtered['Parameter Size (B)'].apply(
        lambda size: 'circle' if pd.notna(size) else 'star'
    )
    df_filtered['ThinkDisplay'] = df_filtered['Think'].map(legend_name_map).fillna(df_filtered['Think'])

    # One trace per distinct (think, marker) pair: circles before stars,
    # 'Thinking' before 'Non-Thinking', unknown values last.
    combinations = sorted(
        df_filtered[['ThinkDisplay', 'MarkerType']]
        .drop_duplicates()
        .itertuples(index=False, name=None),
        key=lambda pair: (marker_order.get(pair[1], 99), think_order.get(pair[0], 99)),
    )

    fig = go.Figure()
    median_x = df_filtered[x_axis_col_name].median()
    median_y = df_filtered[y_col].median()

    x_axis_display_name = x_axis_data_source.replace("Med.", "Median").replace("Len.", "Length")

    fig.add_vline(
        x=median_x,
        line_dash="dash",
        line_color="#64748B",
        opacity=0.6,
        line_width=1.5,
        annotation_text=f"{x_axis_display_name}",
        annotation_position="top right",
        annotation_font=dict(size=10, color="#64748B")
    )
    fig.add_hline(
        y=median_y,
        line_dash="dash",
        line_color="#64748B",
        opacity=0.6,
        line_width=1.5,
        annotation_text=f"Median {y_col}",
        annotation_position="bottom right",
        annotation_font=dict(size=10, color="#64748B")
    )

    for think, marker_type in combinations:
        is_circle = marker_type == 'circle'
        prefix = prefix_map.get(marker_type, '')
        legend_name = f"{prefix} {think}"
        group_mask = (
            (df_filtered['ThinkDisplay'] == think) &
            (df_filtered['MarkerType'] == marker_type)
        )
        sub_df = df_filtered[group_mask]
        color = color_palette.get(think, "#1098F7")
        # Only circles are scaled by parameter size; stars use a fixed size.
        sub_marker_sizes = marker_sizes[group_mask] if is_circle else [star_size] * len(sub_df)
        fig.add_trace(go.Scatter(
            x=sub_df[x_axis_col_name],
            y=sub_df[y_col],
            mode='markers+text',
            name=legend_name,
            legendgroup=legend_name,
            # Each (think, marker) pair is unique, so every trace owns its legend entry.
            showlegend=True,
            marker_symbol=marker_type,
            marker=dict(
                size=sub_marker_sizes,
                color=color,
                opacity=0.85,
                line=dict(width=2, color='#01091A')
            ),
            text=sub_df['Model Name'].apply(get_display_model_name),
            textposition="top center",
            textfont=dict(size=10, color='#94A3B8'),
            hovertemplate="<b>%{text}</b><br>" +
                        f"{x_axis_display_name}: "+"%{x:.2f}<br>" +
                        f"{y_col}: "+"%{y:.2f}<br>" +
                        f"Think: {legend_name}<br>" +
                        ("Parameter Size: %{customdata}B<br>" if is_circle else "") +
                        "<extra></extra>",
            customdata=sub_df['Parameter Size (B)'].values if is_circle else None
        ))

    # Theme colors
    theme_colors = get_theme_colors(theme)
    fig.update_layout(
        title=dict(
            text=f"<b>{y_col} {x_axis_display_name} vs Category Score</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="Verdana, sans-serif", color=theme_colors["legend_font_color"], weight=700)
        ),
        xaxis=dict(
            title=dict(
                text=f"<b>{y_col} {x_axis_display_name}</b>",
                font=dict(size=16, color=theme_colors["legend_font_color"])
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)"
        ),
        yaxis=dict(
            title=dict(
                text=f"<b>{y_col} Score</b>",
                font=dict(size=16, color=theme_colors["legend_font_color"])
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)"
        ),
        paper_bgcolor=theme_colors["paper_bg"],
        plot_bgcolor=theme_colors["plot_bg"],
        height=900,
        width=1450,
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1,
            xanchor="center",
            x=0.5,
            font=dict(size=12, family="Verdana, sans-serif", color=theme_colors["legend_font_color"]),
            bgcolor=theme_colors["legend_bg"],
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1
        ),
        margin=dict(t=100, b=80, l=80, r=80)
    )
    return fig

def create_language_radar_chart(
    df: pd.DataFrame,
    metric_type: str,
    selected_models: Optional[List[str]] = None,
    max_models: int = 5,
    theme: str = "light"
) -> Figure:
    """
    Create a radar chart of per-language scores for the selected models.

    Args:
        df: Leaderboard data with a 'Model Name' column and, ideally, one
            column per language code (KO, EN, JA, ZH, PL, DE, PT, ES, FR, IT,
            RU, VI). Missing, empty or non-numeric cells are plotted as 0.
        metric_type: Metric label; translated through ``SORT_COLUMN_MAP`` and
            used only to rank models when ``selected_models`` is empty.
        selected_models: Model names to plot. When None or empty, the top
            ``max_models`` rows by the metric column are used, or the first
            ``max_models`` rows if that column is absent.
        max_models: Maximum number of models drawn.
        theme: "light" or "dark" (default: "light").

    Returns:
        A plotly figure with one closed polar trace per model found in ``df``.
    """
    language_domains = ['KO', 'EN', 'JA', 'ZH', 'PL', 'DE', 'PT', 'ES', 'FR', 'IT', 'RU', 'VI']
    if selected_models is None or len(selected_models) == 0:
        actual_metric_type = SORT_COLUMN_MAP.get(metric_type, metric_type)
        if actual_metric_type in df.columns:
            selected_models = df.nlargest(max_models, actual_metric_type)['Model Name'].tolist()
        else:
            selected_models = df.head(max_models)['Model Name'].tolist()
    selected_models = selected_models[:max_models]
    harmonious_palette_light = [
        {'fill': 'rgba(79,143,198,0.25)', 'line': '#4F8FC6', 'name': 'BlueGray'},
        {'fill': 'rgba(109,213,237,0.25)', 'line': '#6DD5ED', 'name': 'SkyBlue'},
        {'fill': 'rgba(162,89,247,0.25)', 'line': '#A259F7', 'name': 'Violet'},
        {'fill': 'rgba(67,233,123,0.25)', 'line': '#43E97B', 'name': 'Mint'},
        {'fill': 'rgba(255,215,0,0.20)', 'line': '#FFD700', 'name': 'Gold'}
    ]
    harmonious_palette_dark = [
        {'fill': 'rgba(144,202,249,0.25)', 'line': '#90CAF9', 'name': 'LightBlue'},
        {'fill': 'rgba(128,203,196,0.25)', 'line': '#80CBC4', 'name': 'Mint'},
        {'fill': 'rgba(179,157,219,0.25)', 'line': '#B39DDB', 'name': 'Lavender'},
        {'fill': 'rgba(244,143,177,0.25)', 'line': '#F48FB1', 'name': 'Pink'},
        {'fill': 'rgba(255,213,79,0.20)', 'line': '#FFD54F', 'name': 'Gold'}
    ]
    palette = harmonious_palette_light if theme == "light" else harmonious_palette_dark
    # Repeat the first axis at the end so each polygon closes.
    domains_plot = language_domains + [language_domains[0]]
    fig = go.Figure()
    for idx, model_name in enumerate(selected_models):
        model_data = df[df['Model Name'] == model_name]
        if model_data.empty:
            continue
        # reindex yields NaN for absent language columns; to_numeric(coerce)
        # turns '' and other non-numeric cells into NaN; all NaN become 0.
        language_scores = model_data.iloc[0].reindex(language_domains)
        values = pd.to_numeric(language_scores, errors='coerce').fillna(0.0).astype(float).tolist()
        values_plot = values + [values[0]]
        colors = palette[idx % len(palette)]
        fig.add_trace(
            go.Scatterpolar(
                r=values_plot,
                theta=domains_plot,
                fill='toself',
                fillcolor=colors['fill'],
                line=dict(
                    color=colors['line'],
                    width=3,
                    shape='spline',
                    smoothing=0.5
                ),
                marker=dict(
                    size=10,
                    color=colors['line'],
                    symbol='circle',
                    line=dict(width=2, color='#01091A' if theme == "light" else '#e3e6f3')
                ),
                name=get_display_model_name(model_name),
                mode="lines+markers",
                hovertemplate="<b>%{fullData.name}</b><br>" +
                             "<span style='color: #94A3B8'>%{theta}</span><br>" +
                             "<b style='font-size: 12px'>%{r:.3f}</b><br>" +
                             "<extra></extra>",
                hoverlabel=dict(
                    bgcolor="rgba(1, 9, 26, 0.95)" if theme == "dark" else "rgba(227,230,243,0.95)",
                    bordercolor=colors['line'],
                    font=dict(color="#F5F6F7" if theme == "dark" else "#23244a", size=12, family="Verdana, sans-serif")
                )
            )
        )
    max_range = 100.0
    tick_vals = [i * max_range / 5 for i in range(6)]
    tick_text = [f"{val:.2f}" for val in tick_vals]
    theme_colors = get_theme_colors(theme)
    fig.update_layout(
        polar=dict(
            bgcolor=theme_colors["plot_bg"],
            domain=dict(x=[0,1], y=[0,1]),
            radialaxis=dict(
                visible=True,
                range=[0, max_range],
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.1)',
                gridwidth=1,
                tickvals=tick_vals,
                ticktext=tick_text,
                tickfont=dict(
                    size=11,
                    color='#94A3B8',
                    family="'Geist Mono', monospace"
                ),
                tickangle=0
            ),
            # No ticktext here: the angular labels are the language codes
            # supplied as theta. Category labels do not belong on this chart.
            angularaxis=dict(
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.08)',
                tickfont=dict(
                    size=14,
                    family="Verdana, sans-serif",
                    color=theme_colors["legend_font_color"],
                    weight=600
                ),
                rotation=90,
                direction="clockwise",
            ),
        ),
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.15,
            xanchor="center",
            x=0.5,
            font=dict(
                size=12,
                family="Verdana, sans-serif",
                color=theme_colors["legend_font_color"]
            ),
            bgcolor=theme_colors["legend_bg"],
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1,
            itemsizing='constant',
            itemwidth=30
        ),
        title=dict(
            text="<b>Language Performance</b>",
            x=0.5,
            y=0.97,
            font=dict(
                size=22,
                family="Verdana, sans-serif",
                color=theme_colors["legend_font_color"],
                weight=700
            ),
        ),
        paper_bgcolor=theme_colors["paper_bg"],
        plot_bgcolor=theme_colors["plot_bg"],
        height=900,
        width=1450,
        margin=dict(t=100, b=80, l=80, r=80),
        annotations=[
            dict(
                text="TRUEBench",
                xref="paper", yref="paper",
                x=0.98, y=0.02,
                xanchor='right', yanchor='bottom',
                font=dict(size=10, color=theme_colors["annotation_color"]),
                showarrow=False
            )
        ]
    )
    return fig

def load_leaderboard_data() -> pd.DataFrame:
    """Return the category leaderboard frame from ``src.data_loader``.

    Calls ``get_category_dataframe(processed=True)``; the loader is imported
    inside the function rather than at module level.
    """
    from src.data_loader import get_category_dataframe

    category_df = get_category_dataframe(processed=True)
    return category_df

def load_leaderboard_language_data() -> pd.DataFrame:
    """Return the language leaderboard frame from ``src.data_loader``.

    Calls ``get_language_dataframe(processed=True)``; the loader is imported
    inside the function rather than at module level.
    """
    from src.data_loader import get_language_dataframe

    language_df = get_language_dataframe(processed=True)
    return language_df

def create_domain_radar_chart(
    df: pd.DataFrame,
    metric_type: str,
    selected_models: Optional[List[str]] = None,
    max_models: int = 5,
    theme: str = "light"
) -> Figure:
    """
    Create a radar chart of per-category scores for the selected models.

    Args:
        df: Leaderboard data with a 'Model Name' column and, ideally, one
            column per category (Content Generation, Editing, Data Analysis,
            Reasoning, Hallucination, Safety, Repetition, Summarization,
            Translation, Multi-Turn). Missing, empty or non-numeric cells are
            plotted as 0.
        metric_type: Metric label, translated through ``SORT_COLUMN_MAP``.
            The translated name must be one of the supported metrics below;
            it is also used to rank models when ``selected_models`` is empty.
        selected_models: Model names to plot. When None or empty, the top
            ``max_models`` rows by the metric column are used, or the first
            ``max_models`` rows if that column is absent.
        max_models: Maximum number of models drawn.
        theme: "light" or "dark" (default: "light").

    Returns:
        A plotly figure with one closed polar trace per model found in
        ``df``, or the placeholder from ``create_empty_radar_chart`` when the
        metric is not supported.
    """
    actual_metric_type = SORT_COLUMN_MAP.get(metric_type, metric_type)
    # Every supported metric is broken down over the same category columns.
    supported_metrics = (
        'Avg AC',
        'Avg TSQ',
        'Avg Total Cost',
        'Avg Session Duration',
        'Avg Turns',
    )
    domains = [
        'Content Generation',
        'Editing',
        'Data Analysis',
        'Reasoning',
        'Hallucination',
        'Safety',
        'Repetition',
        'Summarization',
        'Translation',
        'Multi-Turn',
    ]
    if actual_metric_type not in supported_metrics:
        return create_empty_radar_chart(f"Domain breakdown not available for {metric_type}")
    if selected_models is None or len(selected_models) == 0:
        if actual_metric_type in df.columns:
            selected_models = df.nlargest(max_models, actual_metric_type)['Model Name'].tolist()
        else:
            selected_models = df.head(max_models)['Model Name'].tolist()
    selected_models = selected_models[:max_models]
    harmonious_palette_light = [
        {'fill': 'rgba(79,143,198,0.25)', 'line': '#4F8FC6', 'name': 'BlueGray'},
        {'fill': 'rgba(109,213,237,0.25)', 'line': '#6DD5ED', 'name': 'SkyBlue'},
        {'fill': 'rgba(162,89,247,0.25)', 'line': '#A259F7', 'name': 'Violet'},
        {'fill': 'rgba(67,233,123,0.25)', 'line': '#43E97B', 'name': 'Mint'},
        {'fill': 'rgba(255,215,0,0.20)', 'line': '#FFD700', 'name': 'Gold'}
    ]
    harmonious_palette_dark = [
        {'fill': 'rgba(144,202,249,0.25)', 'line': '#90CAF9', 'name': 'LightBlue'},
        {'fill': 'rgba(128,203,196,0.25)', 'line': '#80CBC4', 'name': 'Mint'},
        {'fill': 'rgba(179,157,219,0.25)', 'line': '#B39DDB', 'name': 'Lavender'},
        {'fill': 'rgba(244,143,177,0.25)', 'line': '#F48FB1', 'name': 'Pink'},
        {'fill': 'rgba(255,213,79,0.20)', 'line': '#FFD54F', 'name': 'Gold'}
    ]
    palette = harmonious_palette_light if theme == "light" else harmonious_palette_dark
    # Repeat the first axis at the end so each polygon closes.
    domains_plot = domains + [domains[0]]
    fig = go.Figure()
    for idx, model_name in enumerate(selected_models):
        model_data = df[df['Model Name'] == model_name]
        if model_data.empty:
            continue
        # reindex yields NaN for absent category columns; to_numeric(coerce)
        # turns '' and other non-numeric cells into NaN; all NaN become 0.
        domain_scores = model_data.iloc[0].reindex(domains)
        values = pd.to_numeric(domain_scores, errors='coerce').fillna(0.0).astype(float).tolist()
        values_plot = values + [values[0]]
        colors = palette[idx % len(palette)]
        fig.add_trace(
            go.Scatterpolar(
                r=values_plot,
                theta=domains_plot,
                fill='toself',
                fillcolor=colors['fill'],
                line=dict(
                    color=colors['line'],
                    width=3,
                    shape='spline',
                    smoothing=0.5
                ),
                marker=dict(
                    size=10,
                    color=colors['line'],
                    symbol='circle',
                    line=dict(width=2, color='#01091A' if theme == "light" else '#e3e6f3')
                ),
                name=get_display_model_name(model_name),
                mode="lines+markers",
                hovertemplate="<b>%{fullData.name}</b><br>" +
                             "<span style='color: #94A3B8'>%{theta}</span><br>" +
                             "<b style='font-size: 12px'>%{r:.3f}</b><br>" +
                             "<extra></extra>",
                hoverlabel=dict(
                    bgcolor="rgba(1, 9, 26, 0.95)" if theme == "dark" else "rgba(227,230,243,0.95)",
                    bordercolor=colors['line'],
                    font=dict(color="#F5F6F7" if theme == "dark" else "#23244a", size=12, family="Verdana, sans-serif")
                )
            )
        )
    max_range = 100.0
    tick_vals = [i * max_range / 5 for i in range(6)]
    tick_text = [f"{val:.2f}" for val in tick_vals]
    theme_colors = get_theme_colors(theme)
    fig.update_layout(
        polar=dict(
            bgcolor=theme_colors["plot_bg"],
            radialaxis=dict(
                visible=True,
                range=[0, max_range],
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.1)',
                gridwidth=1,
                tickvals=tick_vals,
                ticktext=tick_text,
                tickfont=dict(
                    size=11,
                    color='#94A3B8',
                    family="'Geist Mono', monospace"
                ),
                tickangle=0
            ),
            angularaxis=dict(
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.08)',
                tickfont=dict(
                    size=14,
                    family="Verdana, sans-serif",
                    color=theme_colors["legend_font_color"],
                    weight=600
                ),
                rotation=90,
                direction="clockwise",
            ),
        ),
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.15,
            xanchor="center",
            x=0.5,
            font=dict(
                size=12,
                family="Verdana, sans-serif",
                color=theme_colors["legend_font_color"]
            ),
            bgcolor=theme_colors["legend_bg"],
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1,
            itemsizing='constant',
            itemwidth=30
        ),
        title=dict(
            text="<b>Category Performance</b>",
            x=0.5,
            y=0.97,
            font=dict(
                size=22,
                family="Verdana, sans-serif",
                color=theme_colors["legend_font_color"],
                weight=700
            ),
        ),
        paper_bgcolor=theme_colors["paper_bg"],
        plot_bgcolor=theme_colors["plot_bg"],
        height=900,
        width=1450,
        margin=dict(t=100, b=80, l=80, r=80),
        annotations=[
            dict(
                text="TRUEBench",
                xref="paper", yref="paper",
                x=0.98, y=0.02,
                xanchor='right', yanchor='bottom',
                font=dict(size=10, color=theme_colors["annotation_color"]),
                showarrow=False
            )
        ]
    )
    return fig