"""Plotly chart builders for the leaderboard: length-vs-score scatter and radar charts."""

from typing import Optional, List, Dict, Any

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.graph_objs._figure import Figure

from src.display.formatting import get_display_model_name

# Maps the metric names shown in the UI to the DataFrame column that holds them.
SORT_COLUMN_MAP = {
    "Average Accuracy": "Avg AC",
    "Tool Selection Quality": "Avg TSQ",
    "Session Cost": "Avg Total Cost",
}

# Axes of the language radar chart (DataFrame column names).
_LANGUAGE_AXES = ['KO', 'EN', 'JA', 'ZH', 'PL', 'DE', 'PT', 'ES', 'FR', 'IT', 'RU', 'VI']

# Axes of the domain radar chart (DataFrame column names); identical for every
# supported metric.
_DOMAIN_AXES = [
    'Content Generation',
    'Editing',
    'Data Analysis',
    'Reasoning',
    'Hallucination',
    'Safety',
    'Repetition',
    'Summarization',
    'Translation',
    'Multi-Turn',
]

# Metric columns for which a per-domain breakdown is offered.
_DOMAIN_METRICS = frozenset(
    {'Avg AC', 'Avg TSQ', 'Avg Total Cost', 'Avg Session Duration', 'Avg Turns'}
)

# Fill/line colour pairs cycled through by the radar charts, per theme.
_RADAR_PALETTES = {
    "light": [
        {'fill': 'rgba(79,143,198,0.25)', 'line': '#4F8FC6', 'name': 'BlueGray'},
        {'fill': 'rgba(109,213,237,0.25)', 'line': '#6DD5ED', 'name': 'SkyBlue'},
        {'fill': 'rgba(162,89,247,0.25)', 'line': '#A259F7', 'name': 'Violet'},
        {'fill': 'rgba(67,233,123,0.25)', 'line': '#43E97B', 'name': 'Mint'},
        {'fill': 'rgba(255,215,0,0.20)', 'line': '#FFD700', 'name': 'Gold'},
    ],
    "dark": [
        {'fill': 'rgba(144,202,249,0.25)', 'line': '#90CAF9', 'name': 'LightBlue'},
        {'fill': 'rgba(128,203,196,0.25)', 'line': '#80CBC4', 'name': 'Mint'},
        {'fill': 'rgba(179,157,219,0.25)', 'line': '#B39DDB', 'name': 'Lavender'},
        {'fill': 'rgba(244,143,177,0.25)', 'line': '#F48FB1', 'name': 'Pink'},
        {'fill': 'rgba(255,213,79,0.20)', 'line': '#FFD54F', 'name': 'Gold'},
    ],
}

# NOTE(review): the line breaks and the trailing empty literal of this template
# were garbled in the file; "<br>" and "<extra></extra>" are the presumed
# originals -- confirm against version control.
_RADAR_HOVERTEMPLATE = (
    "%{fullData.name}<br>"
    "%{theta}<br>"
    "%{r:.3f}<br>"
    "<extra></extra>"
)

# Marker size used for models without a known parameter count.
_FIXED_MARKER_SIZE = 30


def get_theme_colors(theme: str = "light") -> Dict[str, Any]:
    """Return color settings for the given theme.

    Any value other than "dark" selects the "light" settings. Both themes
    currently use dark backgrounds with light text; only the background
    shade differs.
    """
    if theme == "dark":
        return {
            "paper_bg": "#181c3a",  # darker blue-gray
            "plot_bg": "#181c3a",
            "legend_font_color": "#F5F6F7",
            "legend_bg": 'rgba(35,36,74,0.92)',  # slightly lighter than bg, but still dark
            "annotation_color": '#F5F6F7',
        }
    return {
        "paper_bg": "#23244a",  # deep blue-gray
        "plot_bg": "#23244a",
        "legend_font_color": "#F5F6F7",
        "legend_bg": 'rgba(35,36,74,0.92)',  # match bg for harmony
        "annotation_color": '#F5F6F7',
    }


def create_empty_radar_chart(message: str) -> Figure:
    """Create an empty placeholder chart that displays ``message``.

    Used by the chart builders as the fallback when the input data cannot
    be plotted.
    """
    fig = go.Figure()
    fig.update_layout(
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=800,
        width=800,
        margin=dict(t=100, b=80, l=80, r=80),
        title=dict(
            text="Domain Performance Chart",
            x=0.5,
            y=0.97,
            font=dict(
                size=22,
                family="Verdana, sans-serif",
                color="#F5F6F7",
                weight=700,
            ),
        ),
    )
    # Both annotations are appended with add_annotation. Passing
    # ``annotations=[...]`` to update_layout after an annotation already exists
    # merges the list element-wise into the existing entries, which replaced
    # the message with the watermark text.
    fig.add_annotation(
        text=f"📊 {message}",
        xref="paper",
        yref="paper",
        x=0.5,
        y=0.5,
        xanchor='center',
        yanchor='middle',
        font=dict(
            size=18,
            color="#94A3B8",
            family="Verdana, sans-serif",
        ),
        showarrow=False,
        bgcolor="rgba(245, 246, 247, 0.05)",
        bordercolor="rgba(245, 246, 247, 0.2)",
        borderwidth=1,
        borderpad=20,
    )
    fig.add_annotation(
        text="TRUEBench",
        xref="paper",
        yref="paper",
        x=0.98,
        y=0.02,
        xanchor='right',
        yanchor='bottom',
        font=dict(size=10, color='#64748B'),
        showarrow=False,
    )
    return fig


def _scale_marker_sizes(
    param_sizes: pd.Series,
    min_size: float = 20.0,
    max_size: float = 80.0,
    floor: float = 5.0,
) -> pd.Series:
    """Map parameter counts to marker sizes on a log scale within [min_size, max_size].

    Missing values and values below ``floor`` are treated as ``floor`` so the
    result never drops below ``min_size`` (a negative size is rejected by
    Plotly). When no value exceeds ``floor`` every marker gets ``min_size``
    instead of dividing by zero.
    """
    clipped = param_sizes.fillna(floor).clip(lower=floor)
    log_sizes = np.log10(clipped)
    log_floor = np.log10(floor)
    span = log_sizes.max() - log_floor
    if not span > 0:  # also true for NaN, i.e. an empty series
        return pd.Series(min_size, index=param_sizes.index, dtype=float)
    return min_size + (log_sizes - log_floor) / span * (max_size - min_size)


def create_len_overall_scatter(
    df: pd.DataFrame,
    selected_models: Optional[List[str]] = None,
    max_models: int = 30,
    y_col: str = "Overall",
    length_data: Optional[dict] = None,
    theme: str = "light",
    x_axis_data_source: str = "Med. Len.",
) -> Figure:
    """
    Create a scatter plot of response length (x) against the ``y_col`` score (y).

    Each dot is one model, colored by its 'Think' value and grouped in the
    legend by Think mode and marker type. Models with a known
    'Parameter Size (B)' are drawn as circles sized by parameter count; models
    without one (or all models, when the column is absent) are drawn as
    fixed-size stars. Rows whose 'Type' is 'Proprietary' are removed when a
    'Type' column exists.

    Args:
        df: Leaderboard data. Must contain 'Model Name', 'Med. Len.',
            'Med. Resp. Len.', 'Think' and ``y_col``.
        selected_models: Model names to plot. When empty or None, the top
            ``max_models`` rows are used, ranked by 'Overall' (or by ``y_col``
            when there is no 'Overall' column).
        max_models: Number of models used for the default selection.
        y_col: Score column plotted on the y axis.
        length_data: Optional nested mapping
            ``{model name: {category: {'Med' | 'Med Resp': value}}}``. Used for
            the x values when ``y_col`` is not "Overall"; missing entries
            count as 0.
        theme: "light" or "dark" (default: "light").
        x_axis_data_source: Column providing the x values. "Med. Len." selects
            the 'Med' key of ``length_data``; any other value selects
            'Med Resp'.

    Returns:
        The scatter figure, or a placeholder figure carrying an explanatory
        message when required data is missing or nothing is left to plot.
    """
    for col in ('Model Name', 'Med. Len.', 'Med. Resp. Len.', y_col, 'Think'):
        if col not in df.columns:
            return create_empty_radar_chart(f"Column '{col}' not found in data")

    if selected_models is not None and len(selected_models) > 0:
        df_filtered = df[df['Model Name'].isin(selected_models)].copy()
    else:
        # Default: top-N by Overall; fall back to the plotted score when the
        # data has no 'Overall' column.
        rank_col = 'Overall' if 'Overall' in df.columns else y_col
        df_filtered = df.sort_values(rank_col, ascending=False).head(max_models).copy()

    empty_message = f"No data available for {x_axis_data_source} vs {y_col} analysis"
    if df_filtered.empty:
        return create_empty_radar_chart(empty_message)

    x_col = x_axis_data_source
    length_data_key = 'Med' if x_axis_data_source == "Med. Len." else 'Med Resp'
    if y_col != "Overall" and length_data:
        # Per-category lengths are not in the DataFrame; look them up.
        looked_up = df_filtered['Model Name'].apply(
            lambda name: length_data.get(name, {}).get(y_col, {}).get(length_data_key, 0)
        )
        df_filtered[x_col] = pd.to_numeric(looked_up, errors='coerce')
    else:
        if x_col not in df_filtered.columns:
            return create_empty_radar_chart(f"Column '{x_col}' not found in data")
        df_filtered[x_col] = pd.to_numeric(df_filtered[x_col], errors='coerce')

    df_filtered[y_col] = pd.to_numeric(df_filtered[y_col], errors='coerce')

    if 'Type' in df_filtered.columns:
        df_filtered = df_filtered[df_filtered['Type'] != 'Proprietary'].copy()
        if df_filtered.empty:
            return create_empty_radar_chart(empty_message)

    # A missing size column is handled like a column of missing values.
    if 'Parameter Size (B)' in df_filtered.columns:
        param_sizes = pd.to_numeric(df_filtered['Parameter Size (B)'], errors='coerce')
    else:
        param_sizes = pd.Series(np.nan, index=df_filtered.index, dtype=float)
    df_filtered['Parameter Size (B)'] = param_sizes

    has_size = param_sizes.notna()
    df_filtered['MarkerType'] = np.where(has_size, 'circle', 'star')
    df_filtered['MarkerSize'] = np.where(
        has_size, _scale_marker_sizes(param_sizes), _FIXED_MARKER_SIZE
    )

    legend_name_map = {'On': 'Thinking', 'Off': 'Non-Thinking'}
    color_palette = {"Thinking": "#FCE39B", "Non-Thinking": "#FF9185"}
    prefix_map = {'circle': 'Open', 'star': 'Proprietary'}
    marker_order = {'circle': 0, 'star': 1}
    think_order = {'Thinking': 0, 'Non-Thinking': 1}

    df_filtered['ThinkDisplay'] = (
        df_filtered['Think'].map(legend_name_map).fillna(df_filtered['Think'])
    )

    # One trace per (think mode, marker type) pair, circles before stars and
    # 'Thinking' before 'Non-Thinking'.
    combinations = df_filtered[['ThinkDisplay', 'MarkerType']].drop_duplicates()
    ordered_combinations = sorted(
        combinations.itertuples(index=False, name=None),
        key=lambda combo: (marker_order.get(combo[1], 99), think_order.get(combo[0], 99)),
    )

    fig = go.Figure()

    x_axis_display_name = x_axis_data_source.replace("Med.", "Median").replace("Len.", "Length")
    median_x = df_filtered[x_col].median()
    median_y = df_filtered[y_col].median()

    # The median is NaN when no value could be parsed; skip the guide line then.
    if pd.notna(median_x):
        fig.add_vline(
            x=median_x,
            line_dash="dash",
            line_color="#64748B",
            opacity=0.6,
            line_width=1.5,
            annotation_text=f"{x_axis_display_name}",
            annotation_position="top right",
            annotation_font=dict(size=10, color="#64748B"),
        )
    if pd.notna(median_y):
        fig.add_hline(
            y=median_y,
            line_dash="dash",
            line_color="#64748B",
            opacity=0.6,
            line_width=1.5,
            annotation_text=f"Median {y_col}",
            annotation_position="bottom right",
            annotation_font=dict(size=10, color="#64748B"),
        )

    for think, marker_type in ordered_combinations:
        sub_df = df_filtered[
            (df_filtered['ThinkDisplay'] == think)
            & (df_filtered['MarkerType'] == marker_type)
        ]
        if sub_df.empty:
            # Happens only when 'Think' is missing for a row (NaN never
            # compares equal); there is nothing to draw for that pair.
            continue

        is_circle = marker_type == 'circle'
        legend_name = f"{prefix_map.get(marker_type, '')} {think}"

        # NOTE(review): the line breaks and the trailing empty literal of this
        # template were garbled in the file; "<br>" and "<extra></extra>" are
        # the presumed originals -- confirm against version control.
        hover_lines = [
            "%{text}",
            f"{x_axis_display_name}: " + "%{x:.2f}",
            f"{y_col}: " + "%{y:.2f}",
            f"Think: {legend_name}",
        ]
        if is_circle:
            hover_lines.append("Parameter Size: %{customdata}B")
        hovertemplate = "".join(f"{line}<br>" for line in hover_lines) + "<extra></extra>"

        fig.add_trace(go.Scatter(
            x=sub_df[x_col],
            y=sub_df[y_col],
            mode='markers+text',
            name=legend_name,
            legendgroup=legend_name,
            showlegend=True,
            marker=dict(
                symbol=marker_type,
                size=sub_df['MarkerSize'].tolist(),
                color=color_palette.get(think, "#1098F7"),
                opacity=0.85,
                line=dict(width=2, color='#01091A'),
            ),
            text=sub_df['Model Name'].apply(get_display_model_name),
            textposition="top center",
            textfont=dict(size=10, color='#94A3B8'),
            hovertemplate=hovertemplate,
            customdata=sub_df['Parameter Size (B)'].values if is_circle else None,
        ))

    theme_colors = get_theme_colors(theme)
    fig.update_layout(
        title=dict(
            text=f"{y_col} {x_axis_display_name} vs Category Score",
            x=0.5,
            y=0.97,
            font=dict(
                size=22,
                family="Verdana, sans-serif",
                color=theme_colors["legend_font_color"],
                weight=700,
            ),
        ),
        xaxis=dict(
            title=dict(
                text=f"{y_col} {x_axis_display_name}",
                font=dict(size=16, color=theme_colors["legend_font_color"]),
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)",
        ),
        yaxis=dict(
            title=dict(
                text=f"{y_col} Score",
                font=dict(size=16, color=theme_colors["legend_font_color"]),
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)",
        ),
        paper_bgcolor=theme_colors["paper_bg"],
        plot_bgcolor=theme_colors["plot_bg"],
        height=900,
        width=1450,
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1,
            xanchor="center",
            x=0.5,
            font=dict(
                size=12,
                family="Verdana, sans-serif",
                color=theme_colors["legend_font_color"],
            ),
            bgcolor=theme_colors["legend_bg"],
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1,
        ),
        margin=dict(t=100, b=80, l=80, r=80),
    )
    return fig


def _default_models(df: pd.DataFrame, metric_column: str, max_models: int) -> List[str]:
    """Return the names of the top ``max_models`` models by ``metric_column``.

    Falls back to the first ``max_models`` rows when the column is absent.
    """
    if metric_column in df.columns:
        return df.nlargest(max_models, metric_column)['Model Name'].tolist()
    return df.head(max_models)['Model Name'].tolist()


def _coerce_score(value: Any) -> float:
    """Convert a table cell to a float score; blank, NaN or non-numeric cells count as 0."""
    if pd.isna(value) or value == '':
        return 0.0
    try:
        return float(value)
    except (TypeError, ValueError):
        # Placeholder text in a score cell is plotted like a missing value.
        return 0.0


def _add_radar_traces(
    fig: Figure,
    df: pd.DataFrame,
    selected_models: List[str],
    axes: List[str],
    theme: str,
) -> None:
    """Add one closed radar trace per selected model found in ``df``.

    Models absent from ``df`` are skipped; columns absent from ``df`` are
    plotted as 0.
    """
    palette = _RADAR_PALETTES["light"] if theme == "light" else _RADAR_PALETTES["dark"]
    marker_outline = '#01091A' if theme == "light" else '#e3e6f3'
    is_dark = theme == "dark"
    # Repeat the first axis so the polygon closes.
    theta = axes + [axes[0]]

    for idx, model_name in enumerate(selected_models):
        model_data = df[df['Model Name'] == model_name]
        if model_data.empty:
            continue
        model_row = model_data.iloc[0]

        values = [
            _coerce_score(model_row[axis]) if axis in model_row else 0
            for axis in axes
        ]
        colors = palette[idx % len(palette)]

        fig.add_trace(
            go.Scatterpolar(
                r=values + [values[0]],
                theta=theta,
                fill='toself',
                fillcolor=colors['fill'],
                line=dict(
                    color=colors['line'],
                    width=3,
                    shape='spline',
                    smoothing=0.5,
                ),
                marker=dict(
                    size=10,
                    color=colors['line'],
                    symbol='circle',
                    line=dict(width=2, color=marker_outline),
                ),
                name=get_display_model_name(model_name),
                mode="lines+markers",
                hovertemplate=_RADAR_HOVERTEMPLATE,
                hoverlabel=dict(
                    bgcolor="rgba(1, 9, 26, 0.95)" if is_dark else "rgba(227,230,243,0.95)",
                    bordercolor=colors['line'],
                    font=dict(
                        color="#F5F6F7" if is_dark else "#23244a",
                        size=12,
                        family="Verdana, sans-serif",
                    ),
                ),
            )
        )


def _apply_radar_layout(
    fig: Figure,
    title: str,
    theme: str,
    polar_domain: Optional[Dict[str, Any]] = None,
) -> None:
    """Apply the shared radar-chart layout (0-100 radial axis, legend, watermark)."""
    theme_colors = get_theme_colors(theme)
    max_range = 100.0
    tick_vals = [i * max_range / 5 for i in range(6)]
    tick_text = [f"{val:.2f}" for val in tick_vals]

    polar = dict(
        bgcolor=theme_colors["plot_bg"],
        radialaxis=dict(
            visible=True,
            range=[0, max_range],
            showline=True,
            linewidth=2,
            linecolor='rgba(245, 246, 247, 0.2)',
            gridcolor='rgba(245, 246, 247, 0.1)',
            gridwidth=1,
            tickvals=tick_vals,
            ticktext=tick_text,
            tickfont=dict(
                size=11,
                color='#94A3B8',
                family="'Geist Mono', monospace",
            ),
            tickangle=0,
        ),
        angularaxis=dict(
            showline=True,
            linewidth=2,
            linecolor='rgba(245, 246, 247, 0.2)',
            gridcolor='rgba(245, 246, 247, 0.08)',
            tickfont=dict(
                size=14,
                family="Verdana, sans-serif",
                color=theme_colors["legend_font_color"],
                weight=600,
            ),
            rotation=90,
            direction="clockwise",
        ),
    )
    if polar_domain is not None:
        polar["domain"] = polar_domain

    fig.update_layout(
        polar=polar,
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.15,
            xanchor="center",
            x=0.5,
            font=dict(
                size=12,
                family="Verdana, sans-serif",
                color=theme_colors["legend_font_color"],
            ),
            bgcolor=theme_colors["legend_bg"],
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1,
            itemsizing='constant',
            itemwidth=30,
        ),
        title=dict(
            text=title,
            x=0.5,
            y=0.97,
            font=dict(
                size=22,
                family="Verdana, sans-serif",
                color=theme_colors["legend_font_color"],
                weight=700,
            ),
        ),
        paper_bgcolor=theme_colors["paper_bg"],
        plot_bgcolor=theme_colors["plot_bg"],
        height=900,
        width=1450,
        margin=dict(t=100, b=80, l=80, r=80),
        # The figure has no annotations yet at this point, so this list is
        # stored as given.
        annotations=[
            dict(
                text="TRUEBench",
                xref="paper",
                yref="paper",
                x=0.98,
                y=0.02,
                xanchor='right',
                yanchor='bottom',
                font=dict(size=10, color=theme_colors["annotation_color"]),
                showarrow=False,
            )
        ],
    )


def create_language_radar_chart(
    df: pd.DataFrame,
    metric_type: str,
    selected_models: Optional[List[str]] = None,
    max_models: int = 5,
    theme: str = "light",
) -> Figure:
    """
    Create a radar chart showing model performance across languages for the selected models.

    Args:
        df: Language leaderboard data with a 'Model Name' column and one
            column per language code; language columns that are absent are
            plotted as 0.
        metric_type: Metric used to pick the default models; translated
            through SORT_COLUMN_MAP when it is a display name.
        selected_models: Model names to plot. When empty or None, the top
            ``max_models`` models by the metric are used (or the first rows
            when the metric column is absent).
        max_models: Maximum number of models drawn.
        theme: "light" or "dark" (default: "light").
    """
    if selected_models is None or len(selected_models) == 0:
        actual_metric_type = SORT_COLUMN_MAP.get(metric_type, metric_type)
        selected_models = _default_models(df, actual_metric_type, max_models)
    selected_models = selected_models[:max_models]

    fig = go.Figure()
    _add_radar_traces(fig, df, selected_models, _LANGUAGE_AXES, theme)
    # The angular axis is labelled with the language codes themselves; the
    # category-name tick labels previously listed here belonged to the domain
    # chart and did not match the twelve language axes.
    _apply_radar_layout(
        fig,
        "Language Performance",
        theme,
        polar_domain=dict(x=[0, 1], y=[0, 1]),
    )
    return fig


def load_leaderboard_data() -> pd.DataFrame:
    """Load and prepare the leaderboard data (Category)."""
    from src.data_loader import get_category_dataframe
    return get_category_dataframe(processed=True)


def load_leaderboard_language_data() -> pd.DataFrame:
    """Load and prepare the leaderboard data (Language)."""
    from src.data_loader import get_language_dataframe
    return get_language_dataframe(processed=True)


def create_domain_radar_chart(
    df: pd.DataFrame,
    metric_type: str,
    selected_models: Optional[List[str]] = None,
    max_models: int = 5,
    theme: str = "light",
) -> Figure:
    """
    Create a radar chart showing model performance across domains for the selected metric.

    Args:
        df: Category leaderboard data with a 'Model Name' column and one
            column per domain; domain columns that are absent are plotted
            as 0.
        metric_type: Metric name; translated through SORT_COLUMN_MAP when it
            is a display name. A placeholder figure is returned for metrics
            without a domain breakdown.
        selected_models: Model names to plot. When empty or None, the top
            ``max_models`` models by the metric are used (or the first rows
            when the metric column is absent).
        max_models: Maximum number of models drawn.
        theme: "light" or "dark" (default: "light").
    """
    actual_metric_type = SORT_COLUMN_MAP.get(metric_type, metric_type)
    if actual_metric_type not in _DOMAIN_METRICS:
        return create_empty_radar_chart(f"Domain breakdown not available for {metric_type}")

    if selected_models is None or len(selected_models) == 0:
        selected_models = _default_models(df, actual_metric_type, max_models)
    selected_models = selected_models[:max_models]

    fig = go.Figure()
    _add_radar_traces(fig, df, selected_models, _DOMAIN_AXES, theme)
    _apply_radar_layout(fig, "Category Performance", theme)
    return fig