Spaces:

SamsungResearch
/

TRUEBench

Running

TRUEBench / vis_utils.py

송종윤/AI Productivity팀(SR)/삼성전자

Initial commit

8a254d6 6 days ago

27.6 kB

	import pandas as pd
	import numpy as np
	import plotly.graph_objects as go
	from plotly.graph_objs._figure import Figure
	from typing import Optional, List, Dict, Any
	from src.display.formatting import get_display_model_name

	# Translates metric labels into the DataFrame column that stores them.
	# The radar-chart builders in this file look a label up with ``.get`` and
	# fall back to the label itself when it is not listed here.
	SORT_COLUMN_MAP: Dict[str, str] = {
	    "Average Accuracy": "Avg AC",
	    "Tool Selection Quality": "Avg TSQ",
	    "Session Cost": "Avg Total Cost",
	}

	def get_theme_colors(theme: str = "light") -> Dict[str, Any]:
	    """Look up the chart color settings for ``theme``.

	    Only the background differs between themes: ``"dark"`` gets the darker
	    blue-gray, every other value gets the deep blue-gray. Legend, font and
	    annotation colors are identical for both.
	    """
	    background = "#181c3a" if theme == "dark" else "#23244a"
	    foreground = "#F5F6F7"
	    return {
	        "paper_bg": background,
	        "plot_bg": background,
	        "legend_font_color": foreground,
	        "legend_bg": "rgba(35,36,74,0.92)",
	        "annotation_color": foreground,
	    }

	def create_empty_radar_chart(message: str) -> Figure:
	    """Create a trace-less placeholder figure that displays ``message``.

	    Args:
	        message: Text shown (prefixed with a chart emoji) in the center of
	            the figure.

	    Returns:
	        An 800x800 figure with a fixed title, the centered message box and a
	        small "TRUEBench" watermark in the bottom-right corner.
	    """
	    fig = go.Figure()
	    fig.update_layout(
	        paper_bgcolor="#01091A",
	        plot_bgcolor="rgba(245, 246, 247, 0.02)",
	        height=800,
	        width=800,
	        margin=dict(t=100, b=80, l=80, r=80),
	        title=dict(
	            text="<b>Domain Performance Chart</b>",
	            x=0.5,
	            y=0.97,
	            font=dict(
	                size=22,
	                family="Verdana, sans-serif",
	                color="#F5F6F7",
	                weight=700
	            ),
	        ),
	    )
	    # Both annotations are added with add_annotation *after* the layout
	    # update. Passing ``annotations=[...]`` to update_layout once the message
	    # annotation exists applies the new list on top of it, so the watermark
	    # settings would clobber the message and it would never be displayed.
	    fig.add_annotation(
	        text=f"📊 {message}",
	        xref="paper", yref="paper",
	        x=0.5, y=0.5,
	        xanchor='center', yanchor='middle',
	        font=dict(
	            size=18,
	            color="#94A3B8",
	            family="Verdana, sans-serif"
	        ),
	        showarrow=False,
	        bgcolor="rgba(245, 246, 247, 0.05)",
	        bordercolor="rgba(245, 246, 247, 0.2)",
	        borderwidth=1,
	        borderpad=20
	    )
	    fig.add_annotation(
	        text="TRUEBench",
	        xref="paper", yref="paper",
	        x=0.98, y=0.02,
	        xanchor='right', yanchor='bottom',
	        font=dict(size=10, color='#64748B'),
	        showarrow=False
	    )
	    return fig

	def _scaled_marker_sizes(
	    param_sizes: pd.Series,
	    min_size: float = 20,
	    max_size: float = 80,
	    floor: float = 5,
	) -> pd.Series:
	    """Scale parameter counts to marker sizes on a log10 axis.

	    Missing values and values below ``floor`` are treated as ``floor`` so the
	    result never drops below ``min_size``. When every value collapses to
	    ``floor`` the log span is zero; ``min_size`` is returned for all rows
	    instead of dividing by zero.
	    """
	    sizes = param_sizes.fillna(floor).clip(lower=floor)
	    log_floor = np.log10(floor)
	    span = np.log10(sizes.max()) - log_floor
	    if not np.isfinite(span) or span <= 0:
	        return pd.Series(float(min_size), index=param_sizes.index)
	    return min_size + (np.log10(sizes) - log_floor) / span * (max_size - min_size)

	def create_len_overall_scatter(
	    df: pd.DataFrame,
	    selected_models: Optional[List[str]] = None,
	    max_models: int = 30,
	    y_col: str = "Overall",
	    length_data: Optional[dict] = None,
	    theme: str = "light",
	    x_axis_data_source: str = "Med. Len."
	) -> Figure:
	    """
	    Create a scatter plot of a length column (x) against the ``y_col`` score (y).

	    One trace is drawn per (Think, marker type) combination. Rows with a
	    numeric 'Parameter Size (B)' are drawn as circles sized by that value
	    ("Open"); rows without one are drawn as fixed-size stars ("Proprietary").
	    Dashed lines mark the median x and y values. Rows whose 'Type' column
	    equals 'Proprietary' are dropped when that column exists.

	    df: must include 'Model Name', 'Med. Len.', 'Med. Resp. Len.', 'Think',
	        ``y_col`` and ``x_axis_data_source``; otherwise a placeholder chart
	        naming the missing column is returned.
	    selected_models: model names to plot; when empty or None the top
	        ``max_models`` rows by 'Overall' (or by ``y_col`` if 'Overall' is
	        absent) are used.
	    max_models: size of the default selection (not applied to ``selected_models``).
	    y_col: score column plotted on the y-axis.
	    length_data: nested mapping model -> category -> {'Med' | 'Med Resp': value};
	        used for the x values when ``y_col`` is not "Overall".
	    theme: "light" or "dark" (default: "light")
	    x_axis_data_source: DataFrame column used for the x-axis.
	    """
	    param_col = 'Parameter Size (B)'
	    default_marker_size = 30

	    # Defensive: every column read below must exist.
	    required_cols = ['Model Name', 'Med. Len.', 'Med. Resp. Len.', y_col, 'Think', x_axis_data_source]
	    for col in required_cols:
	        if col not in df.columns:
	            return create_empty_radar_chart(f"Column '{col}' not found in data")

	    # Filter by selected_models, or default to the top-N rows.
	    if selected_models is not None and len(selected_models) > 0:
	        df_filtered = df[df['Model Name'].isin(selected_models)].copy()
	    else:
	        sort_col = 'Overall' if 'Overall' in df.columns else y_col
	        df_filtered = df.sort_values(sort_col, ascending=False).head(max_models).copy()

	    empty_message = f"No data available for {x_axis_data_source} vs {y_col} analysis"
	    if df_filtered.empty:
	        return create_empty_radar_chart(empty_message)

	    # Determine x-axis data based on x_axis_data_source
	    x_axis_col_name = x_axis_data_source
	    length_data_key = 'Med' if x_axis_data_source == "Med. Len." else 'Med Resp'

	    if y_col != "Overall" and length_data:
	        # Per-category lengths come from length_data; missing entries become 0.
	        looked_up = df_filtered['Model Name'].apply(
	            lambda name: length_data.get(name, {}).get(y_col, {}).get(length_data_key, 0)
	        )
	        df_filtered[x_axis_col_name] = pd.to_numeric(looked_up, errors='coerce')
	    else:
	        # 'Overall' (or no length_data): read the DataFrame column directly.
	        df_filtered[x_axis_col_name] = pd.to_numeric(df_filtered[x_axis_col_name], errors='coerce')

	    df_filtered[y_col] = pd.to_numeric(df_filtered[y_col], errors='coerce')

	    if 'Type' in df_filtered.columns:
	        df_filtered = df_filtered[df_filtered['Type'] != 'Proprietary'].copy()
	        # The filter can remove every row; bail out before computing medians.
	        if df_filtered.empty:
	            return create_empty_radar_chart(empty_message)

	    has_param_sizes = param_col in df_filtered.columns
	    if has_param_sizes:
	        df_filtered[param_col] = pd.to_numeric(df_filtered[param_col], errors='coerce')
	        marker_sizes = _scaled_marker_sizes(df_filtered[param_col])
	        df_filtered['MarkerType'] = np.where(df_filtered[param_col].notna(), 'circle', 'star')
	    else:
	        # Without the column nothing can be sized or flagged; draw plain circles.
	        marker_sizes = pd.Series(default_marker_size, index=df_filtered.index)
	        df_filtered['MarkerType'] = 'circle'

	    legend_name_map = {
	        'On': 'Thinking',
	        'Off': 'Non-Thinking'
	    }
	    color_palette = {
	        "Thinking": "#FCE39B",
	        "Non-Thinking": "#FF9185"
	    }
	    prefix_map = {
	        'circle': 'Open',
	        'star': 'Proprietary'
	    }
	    marker_order = {'circle': 0, 'star': 1}
	    think_order = {'Thinking': 0, 'Non-Thinking': 1}

	    df_filtered['ThinkDisplay'] = df_filtered['Think'].map(legend_name_map).fillna(df_filtered['Think'])

	    # Unique (think, marker) pairs, circles first and "Thinking" first.
	    combinations = sorted(
	        df_filtered[['ThinkDisplay', 'MarkerType']].drop_duplicates().itertuples(index=False, name=None),
	        key=lambda pair: (marker_order.get(pair[1], 99), think_order.get(pair[0], 99))
	    )

	    fig = go.Figure()
	    median_x = df_filtered[x_axis_col_name].median()
	    median_y = df_filtered[y_col].median()

	    x_axis_display_name = x_axis_data_source.replace("Med.", "Median").replace("Len.", "Length")

	    # A median is NaN when the whole column failed numeric coercion.
	    if pd.notna(median_x):
	        fig.add_vline(
	            x=median_x,
	            line_dash="dash",
	            line_color="#64748B",
	            opacity=0.6,
	            line_width=1.5,
	            annotation_text=f"{x_axis_display_name}",
	            annotation_position="top right",
	            annotation_font=dict(size=10, color="#64748B")
	        )
	    if pd.notna(median_y):
	        fig.add_hline(
	            y=median_y,
	            line_dash="dash",
	            line_color="#64748B",
	            opacity=0.6,
	            line_width=1.5,
	            annotation_text=f"Median {y_col}",
	            annotation_position="bottom right",
	            annotation_font=dict(size=10, color="#64748B")
	        )

	    for think, marker_type in combinations:
	        # Boolean mask (rather than index labels) keeps sizes aligned even
	        # if the DataFrame index contains duplicates.
	        mask = (df_filtered['ThinkDisplay'] == think) & (df_filtered['MarkerType'] == marker_type)
	        sub_df = df_filtered[mask]
	        if sub_df.empty:
	            # Happens for NaN 'Think' values, which never compare equal.
	            continue
	        legend_name = f"{prefix_map.get(marker_type, '')} {think}"
	        sized_by_params = has_param_sizes and marker_type == 'circle'
	        sub_marker_sizes = (
	            marker_sizes[mask]
	            if sized_by_params
	            else [default_marker_size] * len(sub_df)
	        )
	        fig.add_trace(go.Scatter(
	            x=sub_df[x_axis_col_name],
	            y=sub_df[y_col],
	            mode='markers+text',
	            name=legend_name,
	            legendgroup=legend_name,
	            showlegend=True,
	            marker=dict(
	                symbol=marker_type,
	                size=sub_marker_sizes,
	                color=color_palette.get(think, "#1098F7"),
	                opacity=0.85,
	                line=dict(width=2, color='#01091A')
	            ),
	            text=sub_df['Model Name'].apply(get_display_model_name),
	            textposition="top center",
	            textfont=dict(size=10, color='#94A3B8'),
	            hovertemplate="<b>%{text}</b><br>" +
	                f"{x_axis_display_name}: "+"%{x:.2f}<br>" +
	                f"{y_col}: "+"%{y:.2f}<br>" +
	                f"Think: {legend_name}<br>" +
	                ("Parameter Size: %{customdata}B<br>" if sized_by_params else "") +
	                "<extra></extra>",
	            customdata=sub_df[param_col].values if sized_by_params else None
	        ))

	    # Theme colors
	    theme_colors = get_theme_colors(theme)
	    fig.update_layout(
	        title=dict(
	            text=f"<b>{y_col} {x_axis_display_name} vs Category Score</b>",
	            x=0.5,
	            y=0.97,
	            font=dict(size=22, family="Verdana, sans-serif", color=theme_colors["legend_font_color"], weight=700)
	        ),
	        xaxis=dict(
	            title=dict(
	                text=f"<b>{y_col} {x_axis_display_name}</b>",
	                font=dict(size=16, color=theme_colors["legend_font_color"])
	            ),
	            tickfont=dict(size=12, color="#94A3B8"),
	            gridcolor="rgba(245, 246, 247, 0.1)",
	            zerolinecolor="rgba(245, 246, 247, 0.2)"
	        ),
	        yaxis=dict(
	            title=dict(
	                text=f"<b>{y_col} Score</b>",
	                font=dict(size=16, color=theme_colors["legend_font_color"])
	            ),
	            tickfont=dict(size=12, color="#94A3B8"),
	            gridcolor="rgba(245, 246, 247, 0.1)",
	            zerolinecolor="rgba(245, 246, 247, 0.2)"
	        ),
	        paper_bgcolor=theme_colors["paper_bg"],
	        plot_bgcolor=theme_colors["plot_bg"],
	        height=900,
	        width=1450,
	        showlegend=True,
	        legend=dict(
	            orientation="h",
	            yanchor="bottom",
	            y=1,
	            xanchor="center",
	            x=0.5,
	            font=dict(size=12, family="Verdana, sans-serif", color=theme_colors["legend_font_color"]),
	            bgcolor=theme_colors["legend_bg"],
	            bordercolor='rgba(245, 246, 247, 0.2)',
	            borderwidth=1
	        ),
	        margin=dict(t=100, b=80, l=80, r=80)
	    )
	    return fig

	def create_language_radar_chart(
	    df: pd.DataFrame,
	    metric_type: str,
	    selected_models: Optional[List[str]] = None,
	    max_models: int = 5,
	    theme: str = "light"
	) -> Figure:
	    """
	    Create a radar chart showing model performance across languages for the selected models.

	    df: one row per model with a 'Model Name' column and one column per
	        language code; a missing, empty or non-numeric cell is plotted as 0.
	    metric_type: metric label or column name (translated through
	        SORT_COLUMN_MAP); only used to pick the default models.
	    selected_models: model names to plot; when empty or None the
	        ``max_models`` rows with the largest metric value are used, or the
	        first ``max_models`` rows if the metric column is absent.
	    max_models: maximum number of models drawn.
	    theme: "light" or "dark" (default: "light")
	    """
	    language_domains = ['KO', 'EN', 'JA', 'ZH', 'PL', 'DE', 'PT', 'ES', 'FR', 'IT', 'RU', 'VI']
	    if selected_models is None or len(selected_models) == 0:
	        actual_metric_type = SORT_COLUMN_MAP.get(metric_type, metric_type)
	        if actual_metric_type in df.columns:
	            selected_models = df.nlargest(max_models, actual_metric_type)['Model Name'].tolist()
	        else:
	            selected_models = df.head(max_models)['Model Name'].tolist()
	    selected_models = selected_models[:max_models]
	    harmonious_palette_light = [
	        {'fill': 'rgba(79,143,198,0.25)', 'line': '#4F8FC6', 'name': 'BlueGray'},
	        {'fill': 'rgba(109,213,237,0.25)', 'line': '#6DD5ED', 'name': 'SkyBlue'},
	        {'fill': 'rgba(162,89,247,0.25)', 'line': '#A259F7', 'name': 'Violet'},
	        {'fill': 'rgba(67,233,123,0.25)', 'line': '#43E97B', 'name': 'Mint'},
	        {'fill': 'rgba(255,215,0,0.20)', 'line': '#FFD700', 'name': 'Gold'}
	    ]
	    harmonious_palette_dark = [
	        {'fill': 'rgba(144,202,249,0.25)', 'line': '#90CAF9', 'name': 'LightBlue'},
	        {'fill': 'rgba(128,203,196,0.25)', 'line': '#80CBC4', 'name': 'Mint'},
	        {'fill': 'rgba(179,157,219,0.25)', 'line': '#B39DDB', 'name': 'Lavender'},
	        {'fill': 'rgba(244,143,177,0.25)', 'line': '#F48FB1', 'name': 'Pink'},
	        {'fill': 'rgba(255,213,79,0.20)', 'line': '#FFD54F', 'name': 'Gold'}
	    ]
	    palette = harmonious_palette_light if theme == "light" else harmonious_palette_dark
	    fig = go.Figure()
	    for idx, model_name in enumerate(selected_models):
	        model_data = df[df['Model Name'] == model_name]
	        if model_data.empty:
	            continue
	        model_row = model_data.iloc[0]
	        values = []
	        for lang in language_domains:
	            raw = model_row[lang] if lang in model_row else 0
	            # Anything that is not a usable number ('' / None / NaN / text)
	            # is plotted as 0 instead of raising.
	            try:
	                val = float(raw)
	            except (TypeError, ValueError):
	                val = 0
	            if pd.isna(val):
	                val = 0
	            values.append(val)
	        # Repeat the first point so the polygon closes.
	        values_plot = values + [values[0]]
	        domains_plot = language_domains + [language_domains[0]]
	        colors = palette[idx % len(palette)]
	        fig.add_trace(
	            go.Scatterpolar(
	                r=values_plot,
	                theta=domains_plot,
	                fill='toself',
	                fillcolor=colors['fill'],
	                line=dict(
	                    color=colors['line'],
	                    width=3,
	                    shape='spline',
	                    smoothing=0.5
	                ),
	                marker=dict(
	                    size=10,
	                    color=colors['line'],
	                    symbol='circle',
	                    line=dict(width=2, color='#01091A' if theme == "light" else '#e3e6f3')
	                ),
	                name=get_display_model_name(model_name),
	                mode="lines+markers",
	                hovertemplate="<b>%{fullData.name}</b><br>" +
	                    "<span style='color: #94A3B8'>%{theta}</span><br>" +
	                    "<b style='font-size: 12px'>%{r:.3f}</b><br>" +
	                    "<extra></extra>",
	                hoverlabel=dict(
	                    bgcolor="rgba(1, 9, 26, 0.95)" if theme == "dark" else "rgba(227,230,243,0.95)",
	                    bordercolor=colors['line'],
	                    font=dict(color="#F5F6F7" if theme == "dark" else "#23244a", size=12, family="Verdana, sans-serif")
	                )
	            )
	        )
	    max_range = 100.0
	    tick_vals = [i * max_range / 5 for i in range(6)]
	    tick_text = [f"{val:.2f}" for val in tick_vals]
	    theme_colors = get_theme_colors(theme)
	    fig.update_layout(
	        polar=dict(
	            bgcolor=theme_colors["plot_bg"],
	            domain=dict(x=[0,1], y=[0,1]),
	            radialaxis=dict(
	                visible=True,
	                range=[0, max_range],
	                showline=True,
	                linewidth=2,
	                linecolor='rgba(245, 246, 247, 0.2)',
	                gridcolor='rgba(245, 246, 247, 0.1)',
	                gridwidth=1,
	                tickvals=tick_vals,
	                ticktext=tick_text,
	                tickfont=dict(
	                    size=11,
	                    color='#94A3B8',
	                    family="'Geist Mono', monospace"
	                ),
	                tickangle=0
	            ),
	            # No ticktext here: the axis labels are the language codes passed
	            # as theta. The previous ticktext listed ten category names for
	            # twelve languages and, lacking tickvals, was never applied.
	            angularaxis=dict(
	                showline=True,
	                linewidth=2,
	                linecolor='rgba(245, 246, 247, 0.2)',
	                gridcolor='rgba(245, 246, 247, 0.08)',
	                tickfont=dict(
	                    size=14,
	                    family="Verdana, sans-serif",
	                    color=theme_colors["legend_font_color"],
	                    weight=600
	                ),
	                rotation=90,
	                direction="clockwise",
	            ),
	        ),
	        showlegend=True,
	        legend=dict(
	            orientation="h",
	            yanchor="bottom",
	            y=-0.15,
	            xanchor="center",
	            x=0.5,
	            font=dict(
	                size=12,
	                family="Verdana, sans-serif",
	                color=theme_colors["legend_font_color"]
	            ),
	            bgcolor=theme_colors["legend_bg"],
	            bordercolor='rgba(245, 246, 247, 0.2)',
	            borderwidth=1,
	            itemsizing='constant',
	            itemwidth=30
	        ),
	        title=dict(
	            text="<b>Language Performance</b>",
	            x=0.5,
	            y=0.97,
	            font=dict(
	                size=22,
	                family="Verdana, sans-serif",
	                color=theme_colors["legend_font_color"],
	                weight=700
	            ),
	        ),
	        paper_bgcolor=theme_colors["paper_bg"],
	        plot_bgcolor=theme_colors["plot_bg"],
	        height=900,
	        width=1450,
	        margin=dict(t=100, b=80, l=80, r=80),
	        annotations=[
	            dict(
	                text="TRUEBench",
	                xref="paper", yref="paper",
	                x=0.98, y=0.02,
	                xanchor='right', yanchor='bottom',
	                font=dict(size=10, color=theme_colors["annotation_color"]),
	                showarrow=False
	            )
	        ]
	    )
	    return fig

	def load_leaderboard_data() -> pd.DataFrame:
	    """Return the category leaderboard DataFrame.

	    Delegates to ``src.data_loader.get_category_dataframe`` with
	    ``processed=True``; the loader is imported at call time.
	    """
	    from src.data_loader import get_category_dataframe

	    category_frame = get_category_dataframe(processed=True)
	    return category_frame

	def load_leaderboard_language_data() -> pd.DataFrame:
	    """Return the language leaderboard DataFrame.

	    Delegates to ``src.data_loader.get_language_dataframe`` with
	    ``processed=True``; the loader is imported at call time.
	    """
	    from src.data_loader import get_language_dataframe

	    language_frame = get_language_dataframe(processed=True)
	    return language_frame

	def create_domain_radar_chart(
	    df: pd.DataFrame,
	    metric_type: str,
	    selected_models: Optional[List[str]] = None,
	    max_models: int = 5,
	    theme: str = "light"
	) -> Figure:
	    """
	    Create a radar chart showing model performance across domains for the selected metric.

	    df: one row per model with a 'Model Name' column and one column per
	        domain; a missing, empty or non-numeric cell is plotted as 0.
	    metric_type: metric label or column name (translated through
	        SORT_COLUMN_MAP). Metrics outside the supported set yield a
	        placeholder chart.
	    selected_models: model names to plot; when empty or None the
	        ``max_models`` rows with the largest metric value are used, or the
	        first ``max_models`` rows if the metric column is absent.
	    max_models: maximum number of models drawn.
	    theme: "light" or "dark" (default: "light")
	    """
	    actual_metric_type = SORT_COLUMN_MAP.get(metric_type, metric_type)
	    # Every supported metric is broken down over the same domain columns.
	    supported_metrics = {
	        'Avg AC',
	        'Avg TSQ',
	        'Avg Total Cost',
	        'Avg Session Duration',
	        'Avg Turns',
	    }
	    domains = [
	        'Content Generation',
	        'Editing',
	        'Data Analysis',
	        'Reasoning',
	        'Hallucination',
	        'Safety',
	        'Repetition',
	        'Summarization',
	        'Translation',
	        'Multi-Turn',
	    ]
	    if actual_metric_type not in supported_metrics:
	        return create_empty_radar_chart(f"Domain breakdown not available for {metric_type}")
	    if selected_models is None or len(selected_models) == 0:
	        if actual_metric_type in df.columns:
	            selected_models = df.nlargest(max_models, actual_metric_type)['Model Name'].tolist()
	        else:
	            selected_models = df.head(max_models)['Model Name'].tolist()
	    selected_models = selected_models[:max_models]
	    harmonious_palette_light = [
	        {'fill': 'rgba(79,143,198,0.25)', 'line': '#4F8FC6', 'name': 'BlueGray'},
	        {'fill': 'rgba(109,213,237,0.25)', 'line': '#6DD5ED', 'name': 'SkyBlue'},
	        {'fill': 'rgba(162,89,247,0.25)', 'line': '#A259F7', 'name': 'Violet'},
	        {'fill': 'rgba(67,233,123,0.25)', 'line': '#43E97B', 'name': 'Mint'},
	        {'fill': 'rgba(255,215,0,0.20)', 'line': '#FFD700', 'name': 'Gold'}
	    ]
	    harmonious_palette_dark = [
	        {'fill': 'rgba(144,202,249,0.25)', 'line': '#90CAF9', 'name': 'LightBlue'},
	        {'fill': 'rgba(128,203,196,0.25)', 'line': '#80CBC4', 'name': 'Mint'},
	        {'fill': 'rgba(179,157,219,0.25)', 'line': '#B39DDB', 'name': 'Lavender'},
	        {'fill': 'rgba(244,143,177,0.25)', 'line': '#F48FB1', 'name': 'Pink'},
	        {'fill': 'rgba(255,213,79,0.20)', 'line': '#FFD54F', 'name': 'Gold'}
	    ]
	    palette = harmonious_palette_light if theme == "light" else harmonious_palette_dark
	    fig = go.Figure()
	    for idx, model_name in enumerate(selected_models):
	        model_data = df[df['Model Name'] == model_name]
	        if model_data.empty:
	            continue
	        model_row = model_data.iloc[0]
	        values = []
	        for domain in domains:
	            raw = model_row[domain] if domain in model_row.index else 0
	            # Anything that is not a usable number ('' / None / NaN / text)
	            # is plotted as 0 instead of raising.
	            try:
	                val = float(raw)
	            except (TypeError, ValueError):
	                val = 0
	            if pd.isna(val):
	                val = 0
	            values.append(val)
	        # Repeat the first point so the polygon closes.
	        values_plot = values + [values[0]]
	        domains_plot = domains + [domains[0]]
	        colors = palette[idx % len(palette)]
	        fig.add_trace(
	            go.Scatterpolar(
	                r=values_plot,
	                theta=domains_plot,
	                fill='toself',
	                fillcolor=colors['fill'],
	                line=dict(
	                    color=colors['line'],
	                    width=3,
	                    shape='spline',
	                    smoothing=0.5
	                ),
	                marker=dict(
	                    size=10,
	                    color=colors['line'],
	                    symbol='circle',
	                    line=dict(width=2, color='#01091A' if theme == "light" else '#e3e6f3')
	                ),
	                name=get_display_model_name(model_name),
	                mode="lines+markers",
	                hovertemplate="<b>%{fullData.name}</b><br>" +
	                    "<span style='color: #94A3B8'>%{theta}</span><br>" +
	                    "<b style='font-size: 12px'>%{r:.3f}</b><br>" +
	                    "<extra></extra>",
	                hoverlabel=dict(
	                    bgcolor="rgba(1, 9, 26, 0.95)" if theme == "dark" else "rgba(227,230,243,0.95)",
	                    bordercolor=colors['line'],
	                    font=dict(color="#F5F6F7" if theme == "dark" else "#23244a", size=12, family="Verdana, sans-serif")
	                )
	            )
	        )
	    max_range = 100.0
	    tick_vals = [i * max_range / 5 for i in range(6)]
	    tick_text = [f"{val:.2f}" for val in tick_vals]
	    theme_colors = get_theme_colors(theme)
	    fig.update_layout(
	        polar=dict(
	            bgcolor=theme_colors["plot_bg"],
	            radialaxis=dict(
	                visible=True,
	                range=[0, max_range],
	                showline=True,
	                linewidth=2,
	                linecolor='rgba(245, 246, 247, 0.2)',
	                gridcolor='rgba(245, 246, 247, 0.1)',
	                gridwidth=1,
	                tickvals=tick_vals,
	                ticktext=tick_text,
	                tickfont=dict(
	                    size=11,
	                    color='#94A3B8',
	                    family="'Geist Mono', monospace"
	                ),
	                tickangle=0
	            ),
	            angularaxis=dict(
	                showline=True,
	                linewidth=2,
	                linecolor='rgba(245, 246, 247, 0.2)',
	                gridcolor='rgba(245, 246, 247, 0.08)',
	                tickfont=dict(
	                    size=14,
	                    family="Verdana, sans-serif",
	                    color=theme_colors["legend_font_color"],
	                    weight=600
	                ),
	                rotation=90,
	                direction="clockwise",
	            ),
	        ),
	        showlegend=True,
	        legend=dict(
	            orientation="h",
	            yanchor="bottom",
	            y=-0.15,
	            xanchor="center",
	            x=0.5,
	            font=dict(
	                size=12,
	                family="Verdana, sans-serif",
	                color=theme_colors["legend_font_color"]
	            ),
	            bgcolor=theme_colors["legend_bg"],
	            bordercolor='rgba(245, 246, 247, 0.2)',
	            borderwidth=1,
	            itemsizing='constant',
	            itemwidth=30
	        ),
	        title=dict(
	            text="<b>Category Performance</b>",
	            x=0.5,
	            y=0.97,
	            font=dict(
	                size=22,
	                family="Verdana, sans-serif",
	                color=theme_colors["legend_font_color"],
	                weight=700
	            ),
	        ),
	        paper_bgcolor=theme_colors["paper_bg"],
	        plot_bgcolor=theme_colors["plot_bg"],
	        height=900,
	        width=1450,
	        margin=dict(t=100, b=80, l=80, r=80),
	        annotations=[
	            dict(
	                text="TRUEBench",
	                xref="paper", yref="paper",
	                x=0.98, y=0.02,
	                xanchor='right', yanchor='bottom',
	                font=dict(size=10, color=theme_colors["annotation_color"]),
	                showarrow=False
	            )
	        ]
	    )
	    return fig