Spaces:
Running
Running
import pandas as pd | |
import numpy as np | |
import plotly.graph_objects as go | |
from plotly.graph_objs._figure import Figure | |
from typing import Optional, List, Dict, Any | |
from src.display.formatting import get_display_model_name | |
# Maps a metric's display name to the DataFrame column used to rank models
# by that metric. Names without an entry are looked up as-is by the callers.
SORT_COLUMN_MAP: Dict[str, str] = {
    "Average Accuracy": "Avg AC",
    "Tool Selection Quality": "Avg TSQ",
    "Session Cost": "Avg Total Cost",
}
def get_theme_colors(theme: str = "light") -> Dict[str, Any]:
    """Return the colour settings used by the charts for ``theme``.

    Any value other than "dark" is treated as the light theme. A new dict is
    built on every call, so callers may mutate the result freely.
    """
    # The two themes only differ in their background colour: "dark" uses a
    # darker blue-gray, everything else the deep blue-gray. Legend and
    # annotation colours are shared.
    background = "#181c3a" if theme == "dark" else "#23244a"
    return {
        "paper_bg": background,
        "plot_bg": background,
        "legend_font_color": "#F5F6F7",
        "legend_bg": 'rgba(35,36,74,0.92)',
        "annotation_color": '#F5F6F7',
    }
def create_empty_radar_chart(message: str) -> Figure:
    """Create a placeholder figure (no traces) that displays ``message``.

    Args:
        message: Text shown in the centre of the figure, prefixed with a
            chart emoji.

    Returns:
        An 800x800 figure titled "Domain Performance Chart" containing the
        centred message box and a small "TRUEBench" watermark in the
        bottom-right corner.
    """
    fig = go.Figure()

    fig.update_layout(
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=800,
        width=800,
        margin=dict(t=100, b=80, l=80, r=80),
        title=dict(
            text="<b>Domain Performance Chart</b>",
            x=0.5,
            y=0.97,
            font=dict(
                size=22,
                family="Verdana, sans-serif",
                color="#F5F6F7",
                weight=700
            ),
        ),
    )

    # Both annotations are added with add_annotation. Passing the watermark
    # through update_layout(annotations=[...]) would merge it element-wise
    # into the already existing message annotation and replace the message
    # text and position with the watermark's.
    fig.add_annotation(
        text=f"📊 {message}",
        xref="paper", yref="paper",
        x=0.5, y=0.5,
        xanchor='center', yanchor='middle',
        font=dict(
            size=18,
            color="#94A3B8",
            family="Verdana, sans-serif"
        ),
        showarrow=False,
        bgcolor="rgba(245, 246, 247, 0.05)",
        bordercolor="rgba(245, 246, 247, 0.2)",
        borderwidth=1,
        borderpad=20
    )
    fig.add_annotation(
        text="TRUEBench",
        xref="paper", yref="paper",
        x=0.98, y=0.02,
        xanchor='right', yanchor='bottom',
        font=dict(size=10, color='#64748B'),
        showarrow=False
    )
    return fig
def create_len_overall_scatter(
    df: pd.DataFrame,
    selected_models: Optional[List[str]] = None,
    max_models: int = 30,
    y_col: str = "Overall",
    length_data: Optional[dict] = None,
    theme: str = "light",
    x_axis_data_source: str = "Med. Len."
) -> Figure:
    """
    Create a scatter plot of a length metric (x) against the ``y_col`` score (y).

    One trace is drawn per (Think, marker type) combination. Rows with a known
    'Parameter Size (B)' are drawn as circles whose size scales with log10 of
    the parameter size; rows without one (or all rows, when the column is
    absent) are drawn as fixed-size stars. Rows whose 'Type' is 'Proprietary'
    are dropped. Dashed lines mark the median of each axis.

    Args:
        df: Leaderboard data. Must contain 'Model Name', 'Med. Len.',
            'Med. Resp. Len.', 'Think' and ``y_col``. 'Type' and
            'Parameter Size (B)' are optional.
        selected_models: Model names to plot. When None or empty, the top
            ``max_models`` rows by 'Overall' (or by ``y_col`` if there is no
            'Overall' column) are used.
        max_models: Size of the default selection. It does not cap an
            explicit ``selected_models`` list.
        y_col: Score column plotted on the y-axis.
        length_data: Optional mapping of model name -> category ->
            {'Med': ..., 'Med Resp': ...}. Used for the x values when
            ``y_col`` is not "Overall"; missing entries become 0.
        theme: "light" or "dark" (default: "light").
        x_axis_data_source: Column used for the x-axis ("Med. Len." by
            default). Any other value selects the 'Med Resp' entry of
            ``length_data``.

    Returns:
        The scatter figure, or the placeholder from create_empty_radar_chart
        when a required column is missing or no rows remain to plot.
    """
    param_col = 'Parameter Size (B)'

    # Defensive: check required columns ('Think' drives colour and legend).
    required_cols = ['Model Name', 'Med. Len.', 'Med. Resp. Len.', y_col, 'Think']
    for col in required_cols:
        if col not in df.columns:
            return create_empty_radar_chart(f"Column '{col}' not found in data")

    # The x column is only read from the DataFrame when length_data is not
    # used; in that case it has to exist as well.
    reads_x_from_df = y_col == "Overall" or not length_data
    if reads_x_from_df and x_axis_data_source not in df.columns:
        return create_empty_radar_chart(f"Column '{x_axis_data_source}' not found in data")

    empty_message = f"No data available for {x_axis_data_source} vs {y_col} analysis"

    # Filter by selected_models, or fall back to the top-N rows.
    if selected_models is not None and len(selected_models) > 0:
        df_filtered = df[df['Model Name'].isin(selected_models)].copy()
    else:
        # 'Overall' is not a required column when y_col differs, so fall back
        # to y_col instead of raising a KeyError.
        sort_col = 'Overall' if 'Overall' in df.columns else y_col
        df_filtered = df.sort_values(sort_col, ascending=False).head(max_models).copy()
    if df_filtered.empty:
        return create_empty_radar_chart(empty_message)

    # Determine x-axis data based on x_axis_data_source.
    x_axis_col_name = x_axis_data_source
    length_data_key = 'Med' if x_axis_data_source == "Med. Len." else 'Med Resp'
    if reads_x_from_df:
        df_filtered[x_axis_col_name] = pd.to_numeric(df_filtered[x_axis_col_name], errors='coerce')
    else:
        # Per-category lengths come from length_data; unknown entries become 0.
        df_filtered[x_axis_col_name] = df_filtered['Model Name'].apply(
            lambda name: length_data.get(name, {}).get(y_col, {}).get(length_data_key, 0)
        )
    df_filtered[y_col] = pd.to_numeric(df_filtered[y_col], errors='coerce')

    if 'Type' in df_filtered.columns:
        df_filtered = df_filtered[df_filtered['Type'] != 'Proprietary'].copy()
        if df_filtered.empty:
            return create_empty_radar_chart(empty_message)

    # Marker sizes: log-scaled between min_size and max_size. Sizes below
    # size_floor (and unknown sizes) are treated as size_floor so the result
    # never drops below min_size, and a zero-width scale maps to min_size
    # instead of dividing by zero.
    min_size = 20
    max_size = 80
    size_floor = 5
    if param_col in df_filtered.columns:
        df_filtered[param_col] = pd.to_numeric(df_filtered[param_col], errors='coerce')
        has_known_size = df_filtered[param_col].notna()
        floored_sizes = df_filtered[param_col].fillna(size_floor).clip(lower=size_floor)
        log_sizes = np.log10(floored_sizes)
        log_min = np.log10(size_floor)
        log_span = np.log10(floored_sizes.max()) - log_min
        if log_span > 0:
            size_ratio = (log_sizes - log_min) / log_span
        else:
            size_ratio = pd.Series(0.0, index=df_filtered.index)
        marker_sizes = min_size + size_ratio * (max_size - min_size)
    else:
        # Without the column every row counts as "size unknown".
        has_known_size = pd.Series(False, index=df_filtered.index)
        marker_sizes = pd.Series(30.0, index=df_filtered.index)

    legend_name_map = {
        'On': 'Thinking',
        'Off': 'Non-Thinking'
    }
    color_palette = {
        "Thinking": "#FCE39B",
        "Non-Thinking": "#FF9185"
    }
    prefix_map = {
        'circle': 'Open',
        'star': 'Proprietary'
    }
    marker_order = {'circle': 0, 'star': 1}
    think_order = {'Thinking': 0, 'Non-Thinking': 1}

    df_filtered['MarkerType'] = np.where(has_known_size, 'circle', 'star')
    df_filtered['ThinkDisplay'] = df_filtered['Think'].map(legend_name_map).fillna(df_filtered['Think'])

    # One trace per (ThinkDisplay, MarkerType) pair: circles before stars,
    # Thinking before Non-Thinking, anything unrecognised last.
    combinations = df_filtered[['ThinkDisplay', 'MarkerType']].drop_duplicates()
    combinations = combinations.assign(
        marker_rank=combinations['MarkerType'].map(marker_order).fillna(99),
        think_rank=combinations['ThinkDisplay'].map(think_order).fillna(99),
    ).sort_values(['marker_rank', 'think_rank'])

    fig = go.Figure()

    median_x = df_filtered[x_axis_col_name].median()
    median_y = df_filtered[y_col].median()
    x_axis_display_name = x_axis_data_source.replace("Med.", "Median").replace("Len.", "Length")

    # A median is NaN when every value failed numeric coercion; skip the
    # reference line in that case.
    if pd.notna(median_x):
        fig.add_vline(
            x=median_x,
            line_dash="dash",
            line_color="#64748B",
            opacity=0.6,
            line_width=1.5,
            annotation_text=f"{x_axis_display_name}",
            annotation_position="top right",
            annotation_font=dict(size=10, color="#64748B")
        )
    if pd.notna(median_y):
        fig.add_hline(
            y=median_y,
            line_dash="dash",
            line_color="#64748B",
            opacity=0.6,
            line_width=1.5,
            annotation_text=f"Median {y_col}",
            annotation_position="bottom right",
            annotation_font=dict(size=10, color="#64748B")
        )

    legend_shown = set()
    for _, combo in combinations.iterrows():
        think = combo['ThinkDisplay']
        marker_type = combo['MarkerType']
        is_circle = marker_type == 'circle'
        legend_name = f"{prefix_map.get(marker_type, '')} {think}"

        # The same boolean mask selects the rows and their marker sizes, so
        # the two stay aligned even if the index has duplicate labels.
        in_group = (
            (df_filtered['ThinkDisplay'] == think) &
            (df_filtered['MarkerType'] == marker_type)
        )
        sub_df = df_filtered[in_group]
        sub_marker_sizes = marker_sizes[in_group] if is_circle else [30] * len(sub_df)

        show_legend = legend_name not in legend_shown
        legend_shown.add(legend_name)

        fig.add_trace(go.Scatter(
            x=sub_df[x_axis_col_name],
            y=sub_df[y_col],
            mode='markers+text',
            name=legend_name,
            legendgroup=legend_name,
            showlegend=show_legend,
            marker_symbol=marker_type,
            marker=dict(
                size=sub_marker_sizes,
                color=color_palette.get(think, "#1098F7"),
                opacity=0.85,
                line=dict(width=2, color='#01091A')
            ),
            text=sub_df['Model Name'].apply(get_display_model_name),
            textposition="top center",
            textfont=dict(size=10, color='#94A3B8'),
            hovertemplate="<b>%{text}</b><br>" +
            f"{x_axis_display_name}: "+"%{x:.2f}<br>" +
            f"{y_col}: "+"%{y:.2f}<br>" +
            f"Think: {legend_name}<br>" +
            ("Parameter Size: %{customdata}B<br>" if is_circle else "") +
            "<extra></extra>",
            # Circles only exist when the parameter-size column is present.
            customdata=sub_df[param_col].values if is_circle else None
        ))

    # Theme colors
    theme_colors = get_theme_colors(theme)
    fig.update_layout(
        title=dict(
            text=f"<b>{y_col} {x_axis_display_name} vs Category Score</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="Verdana, sans-serif", color=theme_colors["legend_font_color"], weight=700)
        ),
        xaxis=dict(
            title=dict(
                text=f"<b>{y_col} {x_axis_display_name}</b>",
                font=dict(size=16, color=theme_colors["legend_font_color"])
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)"
        ),
        yaxis=dict(
            title=dict(
                text=f"<b>{y_col} Score</b>",
                font=dict(size=16, color=theme_colors["legend_font_color"])
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)"
        ),
        paper_bgcolor=theme_colors["paper_bg"],
        plot_bgcolor=theme_colors["plot_bg"],
        height=900,
        width=1450,
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1,
            xanchor="center",
            x=0.5,
            font=dict(size=12, family="Verdana, sans-serif", color=theme_colors["legend_font_color"]),
            bgcolor=theme_colors["legend_bg"],
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1
        ),
        margin=dict(t=100, b=80, l=80, r=80)
    )
    return fig
def create_language_radar_chart(
    df: pd.DataFrame,
    metric_type: str,
    selected_models: Optional[List[str]] = None,
    max_models: int = 5,
    theme: str = "light"
) -> Figure:
    """
    Create a radar chart showing model performance across languages.

    Args:
        df: Data with a 'Model Name' column and one column per language code
            (KO, EN, JA, ZH, PL, DE, PT, ES, FR, IT, RU, VI). Missing columns
            and empty/NaN cells are plotted as 0.
        metric_type: Metric used to pick the default models. It is translated
            through SORT_COLUMN_MAP; if the resulting column does not exist
            the first rows of ``df`` are used instead.
        selected_models: Model names to plot. When None or empty, the top
            ``max_models`` models are chosen as described above.
        max_models: Maximum number of models drawn; longer selections are
            truncated.
        theme: "light" or "dark" (default: "light").

    Returns:
        A polar figure with one closed trace per model found in ``df``. The
        radial axis is fixed to the range 0-100.
    """
    language_domains = ['KO', 'EN', 'JA', 'ZH', 'PL', 'DE', 'PT', 'ES', 'FR', 'IT', 'RU', 'VI']

    if selected_models is None or len(selected_models) == 0:
        actual_metric_type = SORT_COLUMN_MAP.get(metric_type, metric_type)
        if actual_metric_type in df.columns:
            selected_models = df.nlargest(max_models, actual_metric_type)['Model Name'].tolist()
        else:
            selected_models = df.head(max_models)['Model Name'].tolist()
    selected_models = selected_models[:max_models]

    harmonious_palette_light = [
        {'fill': 'rgba(79,143,198,0.25)', 'line': '#4F8FC6', 'name': 'BlueGray'},
        {'fill': 'rgba(109,213,237,0.25)', 'line': '#6DD5ED', 'name': 'SkyBlue'},
        {'fill': 'rgba(162,89,247,0.25)', 'line': '#A259F7', 'name': 'Violet'},
        {'fill': 'rgba(67,233,123,0.25)', 'line': '#43E97B', 'name': 'Mint'},
        {'fill': 'rgba(255,215,0,0.20)', 'line': '#FFD700', 'name': 'Gold'}
    ]
    harmonious_palette_dark = [
        {'fill': 'rgba(144,202,249,0.25)', 'line': '#90CAF9', 'name': 'LightBlue'},
        {'fill': 'rgba(128,203,196,0.25)', 'line': '#80CBC4', 'name': 'Mint'},
        {'fill': 'rgba(179,157,219,0.25)', 'line': '#B39DDB', 'name': 'Lavender'},
        {'fill': 'rgba(244,143,177,0.25)', 'line': '#F48FB1', 'name': 'Pink'},
        {'fill': 'rgba(255,213,79,0.20)', 'line': '#FFD54F', 'name': 'Gold'}
    ]
    palette = harmonious_palette_light if theme == "light" else harmonious_palette_dark

    # The first axis is repeated at the end so every polygon closes.
    domains_plot = language_domains + [language_domains[0]]

    fig = go.Figure()
    for idx, model_name in enumerate(selected_models):
        model_data = df[df['Model Name'] == model_name]
        if model_data.empty:
            # Unknown model names are skipped silently.
            continue
        model_row = model_data.iloc[0]

        values = []
        for lang in language_domains:
            val = model_row.get(lang, 0)
            if pd.isna(val) or val == '':
                val = 0
            else:
                val = float(val)
            values.append(val)
        values_plot = values + [values[0]]

        # Colours cycle through the palette by position in selected_models.
        colors = palette[idx % len(palette)]
        fig.add_trace(
            go.Scatterpolar(
                r=values_plot,
                theta=domains_plot,
                fill='toself',
                fillcolor=colors['fill'],
                line=dict(
                    color=colors['line'],
                    width=3,
                    shape='spline',
                    smoothing=0.5
                ),
                marker=dict(
                    size=10,
                    color=colors['line'],
                    symbol='circle',
                    line=dict(width=2, color='#01091A' if theme == "light" else '#e3e6f3')
                ),
                name=get_display_model_name(model_name),
                mode="lines+markers",
                hovertemplate="<b>%{fullData.name}</b><br>" +
                "<span style='color: #94A3B8'>%{theta}</span><br>" +
                "<b style='font-size: 12px'>%{r:.3f}</b><br>" +
                "<extra></extra>",
                hoverlabel=dict(
                    bgcolor="rgba(1, 9, 26, 0.95)" if theme == "dark" else "rgba(227,230,243,0.95)",
                    bordercolor=colors['line'],
                    font=dict(color="#F5F6F7" if theme == "dark" else "#23244a", size=12, family="Verdana, sans-serif")
                )
            )
        )

    max_range = 100.0
    tick_vals = [i * max_range / 5 for i in range(6)]
    tick_text = [f"{val:.2f}" for val in tick_vals]
    theme_colors = get_theme_colors(theme)

    fig.update_layout(
        polar=dict(
            bgcolor=theme_colors["plot_bg"],
            domain=dict(x=[0, 1], y=[0, 1]),
            radialaxis=dict(
                visible=True,
                range=[0, max_range],
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.1)',
                gridwidth=1,
                tickvals=tick_vals,
                ticktext=tick_text,
                tickfont=dict(
                    size=11,
                    color='#94A3B8',
                    family="'Geist Mono', monospace"
                ),
                tickangle=0
            ),
            # No ticktext here: the axis labels are the language codes passed
            # as theta. The previous list held ten category labels copied
            # from the domain chart, which do not match the twelve languages.
            angularaxis=dict(
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.08)',
                tickfont=dict(
                    size=14,
                    family="Verdana, sans-serif",
                    color=theme_colors["legend_font_color"],
                    weight=600
                ),
                rotation=90,
                direction="clockwise",
            ),
        ),
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.15,
            xanchor="center",
            x=0.5,
            font=dict(
                size=12,
                family="Verdana, sans-serif",
                color=theme_colors["legend_font_color"]
            ),
            bgcolor=theme_colors["legend_bg"],
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1,
            itemsizing='constant',
            itemwidth=30
        ),
        title=dict(
            text="<b>Language Performance</b>",
            x=0.5,
            y=0.97,
            font=dict(
                size=22,
                family="Verdana, sans-serif",
                color=theme_colors["legend_font_color"],
                weight=700
            ),
        ),
        paper_bgcolor=theme_colors["paper_bg"],
        plot_bgcolor=theme_colors["plot_bg"],
        height=900,
        width=1450,
        margin=dict(t=100, b=80, l=80, r=80),
        annotations=[
            dict(
                text="TRUEBench",
                xref="paper", yref="paper",
                x=0.98, y=0.02,
                xanchor='right', yanchor='bottom',
                font=dict(size=10, color=theme_colors["annotation_color"]),
                showarrow=False
            )
        ]
    )
    return fig
def load_leaderboard_data() -> pd.DataFrame:
    """Return the processed category leaderboard DataFrame.

    Thin wrapper around ``src.data_loader.get_category_dataframe`` called
    with ``processed=True``. The loader is imported at call time.
    """
    from src.data_loader import get_category_dataframe as _load_category_frame

    category_frame = _load_category_frame(processed=True)
    return category_frame
def load_leaderboard_language_data() -> pd.DataFrame:
    """Return the processed language leaderboard DataFrame.

    Thin wrapper around ``src.data_loader.get_language_dataframe`` called
    with ``processed=True``. The loader is imported at call time.
    """
    from src.data_loader import get_language_dataframe as _load_language_frame

    language_frame = _load_language_frame(processed=True)
    return language_frame
def create_domain_radar_chart(
    df: pd.DataFrame,
    metric_type: str,
    selected_models: Optional[List[str]] = None,
    max_models: int = 5,
    theme: str = "light"
) -> Figure:
    """
    Create a radar chart showing model performance across categories.

    Args:
        df: Data with a 'Model Name' column and one column per category
            (see ``domains`` below). Missing columns and empty/NaN cells are
            plotted as 0.
        metric_type: Metric name, translated through SORT_COLUMN_MAP. Only
            'Avg AC', 'Avg TSQ', 'Avg Total Cost', 'Avg Session Duration' and
            'Avg Turns' have a category breakdown. It also selects the
            ranking column for the default model selection.
        selected_models: Model names to plot. When None or empty, the top
            ``max_models`` models by the metric column are used, or the first
            rows of ``df`` if that column does not exist.
        max_models: Maximum number of models drawn; longer selections are
            truncated.
        theme: "light" or "dark" (default: "light").

    Returns:
        A polar figure with one closed trace per model found in ``df``, or
        the placeholder from create_empty_radar_chart when the metric has no
        category breakdown. The radial axis is fixed to the range 0-100.
    """
    actual_metric_type = SORT_COLUMN_MAP.get(metric_type, metric_type)

    # Every supported metric uses the same category columns. Only the column
    # names are needed: they are both the lookup key in df and the axis label.
    supported_metrics = {
        'Avg AC',
        'Avg TSQ',
        'Avg Total Cost',
        'Avg Session Duration',
        'Avg Turns',
    }
    domains = [
        'Content Generation',
        'Editing',
        'Data Analysis',
        'Reasoning',
        'Hallucination',
        'Safety',
        'Repetition',
        'Summarization',
        'Translation',
        'Multi-Turn',
    ]

    if actual_metric_type not in supported_metrics:
        return create_empty_radar_chart(f"Domain breakdown not available for {metric_type}")

    if selected_models is None or len(selected_models) == 0:
        if actual_metric_type in df.columns:
            selected_models = df.nlargest(max_models, actual_metric_type)['Model Name'].tolist()
        else:
            selected_models = df.head(max_models)['Model Name'].tolist()
    selected_models = selected_models[:max_models]

    harmonious_palette_light = [
        {'fill': 'rgba(79,143,198,0.25)', 'line': '#4F8FC6', 'name': 'BlueGray'},
        {'fill': 'rgba(109,213,237,0.25)', 'line': '#6DD5ED', 'name': 'SkyBlue'},
        {'fill': 'rgba(162,89,247,0.25)', 'line': '#A259F7', 'name': 'Violet'},
        {'fill': 'rgba(67,233,123,0.25)', 'line': '#43E97B', 'name': 'Mint'},
        {'fill': 'rgba(255,215,0,0.20)', 'line': '#FFD700', 'name': 'Gold'}
    ]
    harmonious_palette_dark = [
        {'fill': 'rgba(144,202,249,0.25)', 'line': '#90CAF9', 'name': 'LightBlue'},
        {'fill': 'rgba(128,203,196,0.25)', 'line': '#80CBC4', 'name': 'Mint'},
        {'fill': 'rgba(179,157,219,0.25)', 'line': '#B39DDB', 'name': 'Lavender'},
        {'fill': 'rgba(244,143,177,0.25)', 'line': '#F48FB1', 'name': 'Pink'},
        {'fill': 'rgba(255,213,79,0.20)', 'line': '#FFD54F', 'name': 'Gold'}
    ]
    palette = harmonious_palette_light if theme == "light" else harmonious_palette_dark

    # The first axis is repeated at the end so every polygon closes.
    domains_plot = domains + [domains[0]]

    fig = go.Figure()
    for idx, model_name in enumerate(selected_models):
        model_data = df[df['Model Name'] == model_name]
        if model_data.empty:
            # Unknown model names are skipped silently.
            continue
        model_row = model_data.iloc[0]

        values = []
        for domain in domains:
            val = model_row.get(domain, 0)
            if pd.isna(val) or val == '':
                val = 0
            else:
                val = float(val)
            values.append(val)
        values_plot = values + [values[0]]

        # Colours cycle through the palette by position in selected_models.
        colors = palette[idx % len(palette)]
        fig.add_trace(
            go.Scatterpolar(
                r=values_plot,
                theta=domains_plot,
                fill='toself',
                fillcolor=colors['fill'],
                line=dict(
                    color=colors['line'],
                    width=3,
                    shape='spline',
                    smoothing=0.5
                ),
                marker=dict(
                    size=10,
                    color=colors['line'],
                    symbol='circle',
                    line=dict(width=2, color='#01091A' if theme == "light" else '#e3e6f3')
                ),
                name=get_display_model_name(model_name),
                mode="lines+markers",
                hovertemplate="<b>%{fullData.name}</b><br>" +
                "<span style='color: #94A3B8'>%{theta}</span><br>" +
                "<b style='font-size: 12px'>%{r:.3f}</b><br>" +
                "<extra></extra>",
                hoverlabel=dict(
                    bgcolor="rgba(1, 9, 26, 0.95)" if theme == "dark" else "rgba(227,230,243,0.95)",
                    bordercolor=colors['line'],
                    font=dict(color="#F5F6F7" if theme == "dark" else "#23244a", size=12, family="Verdana, sans-serif")
                )
            )
        )

    max_range = 100.0
    tick_vals = [i * max_range / 5 for i in range(6)]
    tick_text = [f"{val:.2f}" for val in tick_vals]
    theme_colors = get_theme_colors(theme)

    fig.update_layout(
        polar=dict(
            bgcolor=theme_colors["plot_bg"],
            radialaxis=dict(
                visible=True,
                range=[0, max_range],
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.1)',
                gridwidth=1,
                tickvals=tick_vals,
                ticktext=tick_text,
                tickfont=dict(
                    size=11,
                    color='#94A3B8',
                    family="'Geist Mono', monospace"
                ),
                tickangle=0
            ),
            angularaxis=dict(
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.08)',
                tickfont=dict(
                    size=14,
                    family="Verdana, sans-serif",
                    color=theme_colors["legend_font_color"],
                    weight=600
                ),
                rotation=90,
                direction="clockwise",
            ),
        ),
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.15,
            xanchor="center",
            x=0.5,
            font=dict(
                size=12,
                family="Verdana, sans-serif",
                color=theme_colors["legend_font_color"]
            ),
            bgcolor=theme_colors["legend_bg"],
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1,
            itemsizing='constant',
            itemwidth=30
        ),
        title=dict(
            text="<b>Category Performance</b>",
            x=0.5,
            y=0.97,
            font=dict(
                size=22,
                family="Verdana, sans-serif",
                color=theme_colors["legend_font_color"],
                weight=700
            ),
        ),
        paper_bgcolor=theme_colors["paper_bg"],
        plot_bgcolor=theme_colors["plot_bg"],
        height=900,
        width=1450,
        margin=dict(t=100, b=80, l=80, r=80),
        annotations=[
            dict(
                text="TRUEBench",
                xref="paper", yref="paper",
                x=0.98, y=0.02,
                xanchor='right', yanchor='bottom',
                font=dict(size=10, color=theme_colors["annotation_color"]),
                showarrow=False
            )
        ]
    )
    return fig