Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
import gradio as gr | |
import pandas as pd | |
import plotly.graph_objects as go | |
# Utility functions (moved from utils.py) | |
def get_chart_colors():
    """Return the colour palette shared by the leaderboard charts and badges.

    A new dictionary (and a new ``performance_bands`` list) is built on every
    call, so callers can modify the result without affecting later calls.

    Returns:
        dict: ``"Private"`` and ``"Open source"`` map to the hex colours of
        the two model types, ``"performance_bands"`` to a list of three hex
        fills, ``"text"`` and ``"background"`` to hex colours, and ``"grid"``
        to an ``(r, g, b, a)`` tuple.
    """
    palette = {}
    palette["Private"] = "#1098F7"  # Airglow Blue for Proprietary
    palette["Open source"] = "#58BC82"  # Green for Open source
    palette["performance_bands"] = ["#DCFCE7", "#FEF9C3", "#FEE2E2"]
    palette["text"] = "#F5F6F7"
    palette["background"] = "#01091A"
    palette["grid"] = (0, 0, 0, 0.1)  # RGBA tuple for grid
    return palette
def get_rank_badge(rank):
    """Build the HTML badge shown in the leaderboard's rank column.

    Args:
        rank: Position in the table. Ranks 1, 2 and 3 get a gold, silver or
            bronze gradient pill labelled "1st" / "2nd" / "3rd"; any other
            value is rendered as plain muted text.

    Returns:
        str: An inline-styled ``<div>`` fragment.
    """
    podium = {
        1: ("1st", "linear-gradient(145deg, #ffd700, #ffc400)", "#000"),
        2: ("2nd", "linear-gradient(145deg, #9ca3af, #787C7E)", "#fff"),
        3: ("3rd", "linear-gradient(145deg, #CD7F32, #b36a1d)", "#fff"),
    }
    medal = podium.get(rank)

    # Everything below the podium is shown as an unadorned number.
    if medal is None:
        return f"""
    <div style="
        display: inline-flex;
        align-items: center;
        justify-content: center;
        min-width: 28px;
        color: #a1a1aa;
        font-weight: 500;
    ">
        {rank}
    </div>
    """

    label, gradient, text_color = medal
    return f"""
    <div style="
        display: inline-flex;
        align-items: center;
        justify-content: center;
        min-width: 48px;
        padding: 4px 12px;
        background: {gradient};
        color: {text_color};
        border-radius: 6px;
        font-weight: 600;
        font-size: 0.9em;
        box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
    ">
        {label}
    </div>
    """
def get_type_badge(model_type):
    """Generate the HTML badge for a model's type.

    Args:
        model_type: Model type label. "Private" and "Proprietary" get the
            proprietary colour from ``get_chart_colors()``, "Open source"
            gets the open-source colour, and any other value falls back to
            indigo (``#4F46E5``).

    Returns:
        str: An inline-styled ``<div>`` fragment containing ``model_type``.
    """
    palette = get_chart_colors()
    badge_colors = {
        "Private": palette["Private"],
        # The palette names this colour "Private", but the leaderboard filter
        # compares the 'Model Type' column against 'Proprietary'; accept both
        # spellings so proprietary rows do not fall through to the default.
        "Proprietary": palette["Private"],
        "Open source": palette["Open source"],
    }
    bg_color = badge_colors.get(model_type, "#4F46E5")
    return f"""
    <div style="
        display: inline-flex;
        align-items: center;
        padding: 4px 8px;
        background: {bg_color};
        color: white;
        border-radius: 4px;
        font-size: 0.85em;
        font-weight: 500;
    ">
        {model_type}
    </div>
    """
def get_output_type_badge(output_type):
    """Build the HTML badge for a model's output type.

    Args:
        output_type: Output type label. "Reasoning" is drawn on purple;
            every other value is drawn on gray.

    Returns:
        str: An inline-styled ``<div>`` fragment containing ``output_type``.
    """
    # Purple for reasoning, gray for normal.
    badge_fill = "#9333ea" if output_type == "Reasoning" else "#6b7280"
    return f"""
    <div style="
        display: inline-flex;
        align-items: center;
        gap: 4px;
        padding: 4px 8px;
        background: {badge_fill};
        color: white;
        border-radius: 4px;
        font-size: 0.85em;
        font-weight: 500;
    ">
        {output_type}
    </div>
    """
def get_score_bar(score):
    """Generate HTML for a gradient score bar followed by the numeric score.

    The filled portion of the bar is ``score * 100`` percent wide, so a score
    of 0.5 fills half of the track. The fill is clamped to the 0-100 % range
    (NaN is drawn as an empty bar) so that out-of-range scores never produce
    an invalid CSS width; the printed number is always the unclamped score.

    Args:
        score: Numeric score (int or float).

    Returns:
        str: An HTML fragment with the bar and the score formatted to three
        decimal places.
    """
    width = score * 100
    # `width != width` is only true for NaN, which cannot be ordered.
    if width != width or width < 0:
        width = 0
    elif width > 100:
        width = 100
    return f"""
    <div style="display: flex; align-items: center; gap: 12px; width: 100%;">
        <div style="
            flex-grow: 1;
            height: 8px;
            background: rgba(245, 246, 247, 0.1);
            border-radius: 4px;
            overflow: hidden;
            max-width: 200px;
        ">
            <div style="
                width: {width}%;
                height: 100%;
                background: linear-gradient(90deg, #E35454, #1098F7);
                border-radius: 4px;
                transition: width 0.3s ease;
            "></div>
        </div>
        <span style="
            font-family: 'SF Mono', monospace;
            font-weight: 600;
            color: #F5F6F7;
            min-width: 60px;
        ">{score:.3f}</span>
    </div>
    """
# Maps the sort labels shown to the user (keys) to the DataFrame column
# names they refer to (values). Defined once at module level so every
# function resolves labels the same way; labels that are not listed here are
# looked up with SORT_COLUMN_MAP.get(label, label), i.e. used unchanged.
SORT_COLUMN_MAP = {
    "Avg Action Completion": "Avg AC",
    "Avg Tool Selection Quality": "Avg TSQ",
    "Avg Session Cost": "Avg Total Cost",
}
def create_leaderboard_v2_tab(): | |
"""Create the main leaderboard v2 tab with interactive table""" | |
def load_leaderboard_data():
    """Read ``results_v2.csv`` and return it prepared for display.

    The summary metric columns that are present are coerced to numbers
    (unparseable values become missing) and rounded to three decimals, after
    which every missing value in the frame is replaced by an empty string.

    Returns:
        pandas.DataFrame: The leaderboard data with ``''`` in place of NaN.
    """
    frame = pd.read_csv('results_v2.csv').copy()

    # Round the summary metrics for nicer display; skip any that are absent.
    summary_metrics = ('Avg AC', 'Avg TSQ', 'Avg Total Cost', 'Avg Session Duration', 'Avg Turns')
    for metric in summary_metrics:
        if metric not in frame.columns:
            continue
        frame[metric] = pd.to_numeric(frame[metric], errors='coerce').round(3)

    # Blank out missing values so downstream code can test against ''.
    return frame.fillna('')
def generate_html_table(filtered_df, domain_filter):
    """Generate the styled HTML leaderboard table with rank badges and score bars.

    Rows are ranked by their position in ``filtered_df`` (the first row is
    rank 1). Blank, missing or non-numeric metric cells are rendered as "-"
    in both the overall and the per-domain view.

    Args:
        filtered_df: DataFrame of models, already filtered and sorted. Each
            row must provide 'Model', 'Model Type' and 'Vendor'; 'Output Type'
            defaults to 'Normal' when the column is absent.
        domain_filter: "All" to show the overall averages ('Avg AC',
            'Avg TSQ', 'Avg Total Cost', 'Avg Session Duration', 'Avg Turns'),
            otherwise a domain name whose '<domain> AC', '<domain> TSQ',
            '<domain> Cost', '<domain> Duration' and '<domain> Turns' columns
            are shown instead.

    Returns:
        str: The ``<style>`` block and table markup as one HTML string.
    """

    def _to_float(value):
        """Return ``value`` as a float, or None for blank/NaN/non-numeric cells."""
        if value is None or (isinstance(value, str) and not value.strip()):
            return None
        try:
            number = float(value)
        except (TypeError, ValueError):
            return None
        # NaN is the only float that is not equal to itself.
        return None if number != number else number

    def _first_present(row, column_names):
        """Return the cell of the first listed column that exists in ``row``."""
        for column_name in column_names:
            if column_name in row:
                return row[column_name]
        return ''

    def _score_cell(value):
        """Render a score as a bar cell, or a dash cell when it is missing."""
        number = _to_float(value)
        if number is None:
            return '<td class="numeric-cell">-</td>'
        return f'<td class="score-cell">{get_score_bar(number)}</td>'

    def _number_text(value, spec, prefix=''):
        """Format a numeric cell with ``spec``, or return '-' when it is missing."""
        number = _to_float(value)
        if number is None:
            return '-'
        return f'{prefix}{number:{spec}}'

    # Resolve once which columns feed each displayed metric.
    if domain_filter != "All":
        ac_cols = (f'{domain_filter} AC',)
        tsq_cols = (f'{domain_filter} TSQ',)
        cost_cols = (f'{domain_filter} Cost',)
        duration_cols = (f'{domain_filter} Duration',)
        turns_cols = (f'{domain_filter} Turns',)
    else:
        ac_cols = ('Avg AC',)
        tsq_cols = ('Avg TSQ',)
        # The second name in each pair is only used when the first is absent.
        cost_cols = ('Avg Total Cost', 'Cost ($)')
        duration_cols = ('Avg Session Duration', 'Duration (s)')
        turns_cols = ('Avg Turns', 'Turns')

    # Collect fragments in a list and join once instead of repeated `+=`.
    parts = ["""
    <style>
        /* Dark theme table styling */
        .v2-table-container {
            background: var(--bg-card);
            border-radius: 16px;
            overflow: hidden;
            border: 1px solid var(--border-subtle);
            margin-top: 20px;
        }
        .v2-styled-table {
            width: 100%;
            border-collapse: collapse;
            font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif;
            background: var(--bg-card);
            color: var(--text-primary);
        }
        .v2-styled-table thead {
            position: sticky;
            top: 0;
            background: rgba(227, 84, 84, 0.1);
            z-index: 1;
        }
        .v2-styled-table th {
            padding: 14px 12px;
            text-align: left;
            font-weight: 600;
            color: var(--text-primary);
            border-bottom: 2px solid var(--accent-primary);
            font-size: 13px;
            text-transform: uppercase;
            letter-spacing: 0.05em;
        }
        .v2-styled-table td {
            padding: 12px;
            border-bottom: 1px solid var(--border-subtle);
            color: var(--text-primary);
            transition: all 0.2s ease;
        }
        .v2-styled-table tbody tr {
            transition: all 0.3s ease;
        }
        .v2-styled-table tbody tr:hover {
            background: rgba(227, 84, 84, 0.15) !important;
            box-shadow: 0 0 20px rgba(227, 84, 84, 0.3), inset 0 0 20px rgba(227, 84, 84, 0.1);
            transform: scale(1.01);
        }
        .v2-styled-table tbody tr:nth-child(even) {
            background: var(--bg-secondary);
        }
        .model-name {
            font-weight: 500;
            color: var(--accent-primary);
            transition: color 0.2s ease;
        }
        /* Keep model name color consistent on hover to emphasize row highlight */
        .v2-styled-table tr:hover .model-name {
            color: var(--accent-secondary);
        }
        .numeric-cell {
            font-family: 'Geist Mono', monospace;
            font-size: 13px;
            text-align: center;
        }
        /* Score bar specific styling */
        .score-cell {
            min-width: 180px;
        }
    </style>
    <div class="v2-table-container">
        <table class="v2-styled-table">
            <thead>
                <tr>
                    <th style="width: 80px;">Rank</th>
                    <th>Model</th>
                    <th style="width: 120px;">Type</th>
                    <th style="width: 120px;">Output Type</th>
                    <th>Vendor</th>
                    <th style="width: 200px;">Avg Action Completion</th>
                    <th style="width: 200px;">Avg Tool Selection Quality</th>
                    <th>Avg Cost ($)</th>
                    <th>Avg Duration (s)</th>
                    <th>Avg Turns</th>
                </tr>
            </thead>
            <tbody>
    """]

    # Generate table rows; rank follows the order of the incoming frame.
    for rank, (_, row) in enumerate(filtered_df.iterrows(), start=1):
        # The dollar sign is only attached to real values, never to the dash.
        cost_display = _number_text(_first_present(row, cost_cols), '.3f', prefix='$')
        duration_display = _number_text(_first_present(row, duration_cols), '.1f')
        turns_display = _number_text(_first_present(row, turns_cols), '.1f')

        parts.append(f"""
                <tr>
                    <td>{get_rank_badge(rank)}</td>
                    <td class="model-name">{row['Model']}</td>
                    <td>{get_type_badge(row['Model Type'])}</td>
                    <td>{get_output_type_badge(row.get('Output Type', 'Normal'))}</td>
                    <td>{row['Vendor']}</td>
                    {_score_cell(_first_present(row, ac_cols))}
                    {_score_cell(_first_present(row, tsq_cols))}
                    <td class="numeric-cell">{cost_display}</td>
                    <td class="numeric-cell">{duration_display}</td>
                    <td class="numeric-cell">{turns_display}</td>
                </tr>
        """)

    parts.append("""
            </tbody>
        </table>
    </div>
    """)
    return "".join(parts)
def update_leaderboard_title(domain_filter):
    """Build the leaderboard heading for the selected domain.

    The domain is taken from the last whitespace-separated word of the label
    (e.g. "<emoji> Banking" -> "Banking") rather than from the emoji prefix,
    so the result does not depend on how the emoji characters are encoded.
    Labels that do not end in a known domain name are shown unchanged.

    Args:
        domain_filter: Selected domain label, with or without an emoji prefix.

    Returns:
        str: HTML for the heading. The fragment opens the ``dark-container``
        and ``dataframe-container`` divs without closing them (presumably
        they are closed by markup emitted elsewhere - verify against caller).
    """
    known_domains = ("All", "Banking", "Healthcare", "Insurance", "Investment", "Telecom")
    words = domain_filter.split()
    if words and words[-1] in known_domains:
        domain_filter_clean = words[-1]
    else:
        domain_filter_clean = domain_filter

    # NOTE(review): the section-icon glyph below looks mis-encoded in this
    # copy of the file; it is reproduced as found - confirm the intended emoji.
    return f"""
    <div class="dark-container pulse" style="margin-bottom: 24px;">
        <div class="section-header">
            <span class="section-icon" style="color: var(--accent-primary);">π</span>
            <h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
                Agent Leaderboard for {domain_filter_clean}
            </h3>
        </div>
        <div class="dataframe-container">
    """
def filter_and_sort_data(domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order):
    """Filter and sort the leaderboard data, then render it as an HTML table.

    Args:
        domain_filter: Selected domain label, with or without an emoji prefix
            (e.g. "<emoji> Banking"). The domain is taken from the label's
            last word so it does not depend on emoji encoding; labels that do
            not end in a known domain name are used unchanged.
        model_type_filter: "All", "Open Source" or "Proprietary". Any other
            value applies no model-type filter.
        reasoning_filter: "All", "Reasoning" or "Normal". Any other value
            applies no output-type filter.
        sort_by: Display label or column name to sort on; labels are resolved
            through ``SORT_COLUMN_MAP``. When a domain is selected, the
            overall-average columns are swapped for that domain's columns.
        sort_order: "Ascending" sorts ascending; anything else descending.

    Returns:
        str: The HTML produced by ``generate_html_table`` for the result.
    """
    # load_leaderboard_data() reads the CSV on every call, so the frame is
    # already private to this call and needs no defensive copy.
    filtered_df = load_leaderboard_data()

    known_domains = ("Banking", "Healthcare", "Insurance", "Investment", "Telecom")
    words = domain_filter.split()
    if words and (words[-1] == "All" or words[-1] in known_domains):
        domain_filter_clean = words[-1]
    else:
        domain_filter_clean = domain_filter

    # Domain filtering: keep only models that have an AC value for the domain.
    if domain_filter_clean in known_domains:
        domain_col = f"{domain_filter_clean} AC"
        if domain_col in filtered_df.columns:
            filtered_df = filtered_df[filtered_df[domain_col] != '']

    # Model type filtering: the UI label differs in case from the data value.
    model_type_values = {"Open Source": "Open source", "Proprietary": "Proprietary"}
    wanted_type = model_type_values.get(model_type_filter)
    if wanted_type is not None:
        filtered_df = filtered_df[filtered_df['Model Type'] == wanted_type]

    # Reasoning filtering.
    if reasoning_filter in ("Reasoning", "Normal"):
        if 'Output Type' in filtered_df.columns:
            filtered_df = filtered_df[filtered_df['Output Type'] == reasoning_filter]
        elif reasoning_filter == "Reasoning":
            # The table renders a missing 'Output Type' as 'Normal', so with
            # no such column there are no reasoning models to show.
            filtered_df = filtered_df.iloc[0:0]

    # Map display name to actual column name using the shared mapping.
    actual_sort_column = SORT_COLUMN_MAP.get(sort_by, sort_by)

    # With a domain selected, sort on that domain's column, not the average.
    domain_suffix = {
        "Avg AC": "AC",
        "Avg TSQ": "TSQ",
        "Avg Total Cost": "Cost",
        "Avg Session Duration": "Duration",
        "Avg Turns": "Turns",
    }
    if domain_filter_clean != "All" and actual_sort_column in domain_suffix:
        actual_sort_column = f"{domain_filter_clean} {domain_suffix[actual_sort_column]}"

    if actual_sort_column and actual_sort_column in filtered_df.columns:

        def _numeric_sort_key(series):
            """Sort numerically when possible so '' placeholders do not break ordering."""
            numeric = pd.to_numeric(series, errors='coerce')
            # Missing values were replaced by '' on load; coercing them back
            # to NaN lets na_position='last' push them to the end instead of
            # raising on a str/float comparison. Columns with no numeric
            # content at all (e.g. names) keep their natural ordering.
            return numeric if numeric.notna().any() else series

        filtered_df = filtered_df.sort_values(
            by=actual_sort_column,
            ascending=(sort_order == "Ascending"),
            na_position='last',
            key=_numeric_sort_key,
        )

    # Generate HTML table.
    return generate_html_table(filtered_df, domain_filter_clean)
# Load initial data | |
initial_table = filter_and_sort_data("π All", "All", "All", "Avg AC", "Descending") | |
initial_df = load_leaderboard_data() # Load raw data for model selector | |
# Custom CSS for Galileo dark theme | |
custom_css = """ | |
<style> | |
/* Import Geist fonts */ | |
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap'); | |
@font-face { | |
font-family: 'Geist'; | |
src: url('https://raw.githubusercontent.com/vercel/geist-font/main/packages/next/dist/fonts/geist-sans/Geist-Variable.woff2') format('woff2'); | |
font-weight: 100 900; | |
font-style: normal; | |
} | |
@font-face { | |
font-family: 'Geist Mono'; | |
src: url('https://raw.githubusercontent.com/vercel/geist-font/main/packages/next/dist/fonts/geist-mono/GeistMono-Variable.woff2') format('woff2'); | |
font-weight: 100 900; | |
font-style: normal; | |
} | |
/* Root variables for enhanced color scheme */ | |
:root { | |
--bg-primary: #01091A; | |
--bg-secondary: rgba(245, 246, 247, 0.03); | |
--bg-card: rgba(245, 246, 247, 0.02); | |
--border-subtle: rgba(245, 246, 247, 0.08); | |
--border-default: rgba(245, 246, 247, 0.12); | |
--border-strong: rgba(245, 246, 247, 0.2); | |
--text-primary: #F5F6F7; | |
--text-secondary: #94A3B8; | |
--text-muted: #64748B; | |
--accent-primary: #E35454; | |
--accent-secondary: #1098F7; | |
--accent-tertiary: #F5F6F7; | |
--glow-primary: rgba(227, 84, 84, 0.4); | |
--glow-secondary: rgba(16, 152, 247, 0.4); | |
--glow-tertiary: rgba(245, 246, 247, 0.3); | |
} | |
/* Global font and background */ | |
.gradio-container { | |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, 'Inter', sans-serif !important; | |
background: var(--bg-primary) !important; | |
color: var(--text-primary) !important; | |
} | |
/* Headers and text */ | |
h1, h2, h3, h4 { | |
color: var(--text-primary) !important; | |
font-weight: 700 !important; | |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important; | |
} | |
p, span, div { | |
color: var(--text-primary) !important; | |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important; | |
} | |
/* Labels and info text */ | |
label { | |
color: var(--text-primary) !important; | |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important; | |
} | |
.gr-box label { | |
color: var(--text-primary) !important; | |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important; | |
} | |
.gr-info { | |
color: var(--text-secondary) !important; | |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important; | |
} | |
/* Simple metric cards */ | |
.metric-card { | |
background: var(--bg-card); | |
border-radius: 16px; | |
padding: 24px; | |
position: relative; | |
border: 1px solid var(--border-subtle); | |
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1); | |
} | |
.metric-card:hover { | |
transform: translateY(-4px); | |
border-color: var(--accent-primary); | |
box-shadow: 0 8px 24px rgba(227, 84, 84, 0.2); | |
} | |
/* Metric icon with glow effect */ | |
.metric-icon { | |
width: 48px; | |
height: 48px; | |
display: flex; | |
align-items: center; | |
justify-content: center; | |
font-size: 2rem; | |
margin-bottom: 16px; | |
filter: drop-shadow(0 0 20px currentColor); | |
transition: all 0.3s ease; | |
} | |
.metric-card:hover .metric-icon { | |
transform: scale(1.1); | |
filter: drop-shadow(0 0 30px currentColor); | |
} | |
/* Metric values and labels */ | |
.metric-card .metric-label { | |
font-family: 'Geist Mono', monospace !important; | |
letter-spacing: 0.1em !important; | |
color: var(--text-secondary) !important; | |
font-size: 0.875rem !important; | |
text-transform: uppercase !important; | |
margin-bottom: 8px !important; | |
} | |
.metric-card .metric-value { | |
font-family: 'Geist', sans-serif !important; | |
font-weight: 700 !important; | |
font-size: 1.25rem !important; | |
color: var(--text-primary) !important; | |
margin-bottom: 8px !important; | |
} | |
.metric-card .metric-description { | |
color: var(--text-secondary) !important; | |
font-size: 0.875rem !important; | |
line-height: 1.5 !important; | |
} | |
/* Enhanced radio buttons with primary accent */ | |
input[type="radio"] { | |
background-color: var(--bg-secondary) !important; | |
border-color: var(--border-default) !important; | |
} | |
input[type="radio"]:checked { | |
background-color: var(--accent-primary) !important; | |
border-color: var(--accent-primary) !important; | |
box-shadow: 0 0 10px var(--glow-primary) !important; | |
} | |
.gr-check-radio label { | |
color: var(--text-primary) !important; | |
transition: color 0.2s ease !important; | |
} | |
.gr-check-radio:hover label { | |
color: var(--accent-primary) !important; | |
} | |
/* Gradio's selected radio button styling - comprehensive targeting */ | |
.gr-radio .wrap > label.selected, | |
.gr-radio .wrap > label:has(input:checked), | |
input[type="radio"]:checked ~ span, | |
label:has(> input[type="radio"]:checked) { | |
background: transparent !important; | |
border-color: var(--accent-primary) !important; | |
color: var(--text-primary) !important; | |
font-weight: 600 !important; | |
} | |
/* Enhanced dropdown styling */ | |
.dropdown { | |
border-color: var(--border-default) !important; | |
background: var(--bg-card) !important; | |
color: var(--text-primary) !important; | |
transition: all 0.2s ease !important; | |
} | |
.dropdown:hover { | |
border-color: var(--accent-primary) !important; | |
box-shadow: 0 0 15px var(--glow-primary) !important; | |
} | |
select, .gr-dropdown { | |
background: var(--bg-card) !important; | |
color: var(--text-primary) !important; | |
border: 1px solid var(--border-default) !important; | |
transition: all 0.2s ease !important; | |
} | |
select:hover, .gr-dropdown:hover { | |
border-color: var(--accent-primary) !important; | |
box-shadow: 0 0 15px var(--glow-primary) !important; | |
} | |
select option, .gr-dropdown option { | |
background: var(--bg-primary) !important; | |
color: var(--text-primary) !important; | |
} | |
/* Enhanced table styling with gradient accents */ | |
.dataframe { | |
background: var(--bg-card) !important; | |
border-radius: 16px !important; | |
overflow: hidden !important; | |
border: 1px solid var(--border-subtle) !important; | |
font-size: 14px !important; | |
max-height: 600px !important; | |
overflow-y: auto !important; | |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important; | |
box-shadow: 0 4px 16px rgba(0, 0, 0, 0.3) !important; | |
} | |
/* Fixed table layout for better column control */ | |
.dataframe table { | |
table-layout: fixed !important; | |
width: 100% !important; | |
} | |
.dataframe th { | |
background: rgba(227, 84, 84, 0.1) !important; | |
color: var(--text-primary) !important; | |
font-weight: 600 !important; | |
padding: 14px 8px !important; | |
text-align: left !important; | |
border-bottom: 2px solid var(--accent-primary) !important; | |
position: relative !important; | |
white-space: nowrap !important; | |
overflow: hidden !important; | |
text-overflow: ellipsis !important; | |
font-size: 13px !important; | |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important; | |
text-transform: uppercase !important; | |
letter-spacing: 0.05em !important; | |
} | |
/* Column-specific widths */ | |
.dataframe th:nth-child(2), /* Model */ | |
.dataframe td:nth-child(2) { | |
min-width: 200px !important; | |
max-width: 250px !important; | |
} | |
.dataframe th:nth-child(3), /* Model Type */ | |
.dataframe td:nth-child(3) { | |
min-width: 100px !important; | |
max-width: 120px !important; | |
} | |
.dataframe th:nth-child(4), /* Output Type */ | |
.dataframe td:nth-child(4) { | |
min-width: 100px !important; | |
max-width: 120px !important; | |
} | |
.dataframe th:nth-child(5), /* Vendor */ | |
.dataframe td:nth-child(5) { | |
min-width: 100px !important; | |
max-width: 120px !important; | |
} | |
/* Numeric columns - smaller width */ | |
.dataframe th:nth-child(6), .dataframe th:nth-child(7), | |
.dataframe th:nth-child(8), .dataframe th:nth-child(9), | |
.dataframe th:nth-child(10), | |
.dataframe td:nth-child(6), .dataframe td:nth-child(7), | |
.dataframe td:nth-child(8), .dataframe td:nth-child(9), | |
.dataframe td:nth-child(10) { | |
min-width: 80px !important; | |
max-width: 100px !important; | |
text-align: center !important; | |
font-family: 'Geist Mono', monospace !important; | |
font-size: 13px !important; | |
} | |
.dataframe td { | |
padding: 12px 8px !important; | |
border-bottom: 1px solid var(--border-subtle) !important; | |
color: var(--text-primary) !important; | |
white-space: nowrap !important; | |
overflow: hidden !important; | |
text-overflow: ellipsis !important; | |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important; | |
transition: all 0.2s ease !important; | |
} | |
/* Model names - keep consistent color on hover */ | |
.dataframe td:nth-child(2) { | |
font-weight: 500 !important; | |
color: var(--accent-primary) !important; | |
transition: all 0.2s ease !important; | |
} | |
/* Keep model name color consistent to emphasize row highlight */ | |
.dataframe tr:hover td:nth-child(2) { | |
color: var(--accent-secondary) !important; | |
} | |
.dataframe tbody tr { | |
transition: all 0.3s ease !important; | |
} | |
.dataframe tr:hover { | |
background: rgba(227, 84, 84, 0.15) !important; | |
box-shadow: 0 0 20px rgba(227, 84, 84, 0.3), inset 0 0 20px rgba(227, 84, 84, 0.1) !important; | |
transform: scale(1.01) !important; | |
} | |
.dataframe tr:nth-child(even) { | |
background: var(--bg-secondary) !important; | |
} | |
/* Tooltip on hover for truncated text */ | |
.dataframe td:hover, | |
.dataframe th:hover { | |
overflow: visible !important; | |
position: relative !important; | |
z-index: 10 !important; | |
} | |
/* Horizontal scroll styling */ | |
.dataframe-container { | |
overflow-x: auto !important; | |
overflow-y: visible !important; | |
max-width: 100% !important; | |
-webkit-overflow-scrolling: touch !important; | |
position: relative !important; | |
} | |
/* Simple scrollbar */ | |
.dataframe-container::-webkit-scrollbar { | |
height: 10px !important; | |
} | |
.dataframe-container::-webkit-scrollbar-track { | |
background: var(--bg-secondary) !important; | |
border-radius: 5px !important; | |
} | |
.dataframe-container::-webkit-scrollbar-thumb { | |
background: var(--accent-secondary) !important; | |
border-radius: 4px !important; | |
} | |
.dataframe-container::-webkit-scrollbar-thumb:hover { | |
background: var(--accent-primary) !important; | |
} | |
/* Responsive design for smaller screens */ | |
@media (max-width: 1200px) { | |
.dataframe th:nth-child(9), /* Vendor column */ | |
.dataframe td:nth-child(9), | |
.dataframe th:nth-child(10), /* Last columns */ | |
.dataframe td:nth-child(10) { | |
display: none !important; | |
} | |
} | |
@media (max-width: 900px) { | |
.dataframe th { | |
font-size: 12px !important; | |
padding: 8px 4px !important; | |
} | |
.dataframe td { | |
font-size: 12px !important; | |
padding: 8px 4px !important; | |
} | |
.dataframe th:nth-child(2), | |
.dataframe td:nth-child(2) { | |
min-width: 150px !important; | |
max-width: 200px !important; | |
} | |
} | |
/* Simple button styling */ | |
button { | |
background: var(--bg-card) !important; | |
color: var(--text-primary) !important; | |
border: 1px solid var(--border-default) !important; | |
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important; | |
} | |
button:hover { | |
transform: translateY(-2px) !important; | |
border-color: var(--accent-primary) !important; | |
box-shadow: 0 4px 16px rgba(227, 84, 84, 0.2) !important; | |
} | |
/* Enhanced info boxes */ | |
.info-box { | |
background: var(--bg-card); | |
border: 1px solid var(--border-subtle); | |
border-radius: 12px; | |
padding: 20px; | |
margin: 8px 0; | |
backdrop-filter: blur(10px); | |
position: relative; | |
overflow: hidden; | |
transition: all 0.3s ease; | |
} | |
.info-box::before { | |
content: ''; | |
position: absolute; | |
top: 0; | |
left: -100%; | |
width: 100%; | |
height: 100%; | |
background: linear-gradient(90deg, transparent, rgba(227, 84, 84, 0.1), transparent); | |
transition: left 0.6s ease; | |
} | |
.info-box:hover::before { | |
left: 100%; | |
} | |
.info-box:hover { | |
border-color: var(--accent-primary); | |
box-shadow: 0 4px 20px var(--glow-primary); | |
} | |
/* Enhanced dark containers */ | |
.dark-container { | |
background: var(--bg-card); | |
border: 1px solid var(--border-subtle); | |
border-radius: 20px; | |
padding: 28px; | |
box-shadow: 0 8px 32px rgba(0, 0, 0, 0.4); | |
backdrop-filter: blur(10px); | |
position: relative; | |
overflow: hidden; | |
} | |
.dark-container::after { | |
content: ''; | |
position: absolute; | |
top: -50%; | |
right: -50%; | |
width: 200%; | |
height: 200%; | |
background: radial-gradient(circle, var(--glow-primary) 0%, transparent 70%); | |
opacity: 0.05; | |
pointer-events: none; | |
} | |
/* Section headers with glow */ | |
.section-header { | |
display: flex; | |
align-items: center; | |
gap: 12px; | |
margin-bottom: 24px; | |
} | |
.section-icon { | |
filter: drop-shadow(0 0 12px currentColor); | |
transition: all 0.3s ease; | |
} | |
.dark-container:hover .section-icon { | |
filter: drop-shadow(0 0 20px currentColor); | |
transform: scale(1.1); | |
} | |
/* Text effects */ | |
/* Simple scrollbar styling */ | |
::-webkit-scrollbar { | |
width: 8px; | |
height: 8px; | |
} | |
::-webkit-scrollbar-track { | |
background: var(--bg-secondary); | |
border-radius: 4px; | |
} | |
::-webkit-scrollbar-thumb { | |
background: var(--accent-secondary); | |
border-radius: 4px; | |
} | |
::-webkit-scrollbar-thumb:hover { | |
background: var(--accent-primary); | |
} | |
/* Pulse animation for important elements */ | |
@keyframes pulse-glow { | |
0% { box-shadow: 0 0 0 0 var(--glow-primary); } | |
70% { box-shadow: 0 0 0 10px transparent; } | |
100% { box-shadow: 0 0 0 0 transparent; } | |
} | |
.pulse { | |
animation: pulse-glow 2s infinite; | |
} | |
/* Center align charts */ | |
.chart-container { | |
display: flex; | |
justify-content: center; | |
align-items: center; | |
width: 100%; | |
margin: 0 auto; | |
} | |
.chart-container > div { | |
width: 100%; | |
max-width: 1400px; | |
margin: 0 auto; | |
} | |
/* Ensure plots are centered */ | |
.plot-container { | |
margin: 0 auto !important; | |
display: flex !important; | |
justify-content: center !important; | |
} | |
.js-plotly-plot { | |
margin: 0 auto !important; | |
} | |
</style> | |
<script> | |
// Function to update radio button styling | |
function updateRadioStyling() { | |
// Remove selected class from all labels first | |
document.querySelectorAll('.selected').forEach(function(label) { | |
label.classList.remove('selected'); | |
}); | |
// Apply selected class to checked radio buttons | |
document.querySelectorAll('input[type="radio"]:checked').forEach(function(input) { | |
var label = input.closest('label'); | |
if (label) { | |
label.classList.add('selected'); | |
// For domain radio buttons, apply special styling | |
if (label.closest('.domain-radio')) { | |
label.style.background = 'linear-gradient(145deg, rgba(227, 84, 84, 0.2), rgba(227, 84, 84, 0.1))'; | |
label.style.borderColor = 'var(--accent-primary)'; | |
label.style.transform = 'scale(1.05)'; | |
label.style.fontWeight = '600'; | |
} | |
} | |
}); | |
} | |
// Wait for Gradio to initialize | |
function initializeRadioStyles() { | |
updateRadioStyling(); | |
// Create observer to watch for changes | |
var observer = new MutationObserver(function(mutations) { | |
mutations.forEach(function(mutation) { | |
if (mutation.type === 'attributes' && mutation.attributeName === 'checked') { | |
updateRadioStyling(); | |
} | |
}); | |
}); | |
// Observe all radio inputs | |
document.querySelectorAll('input[type="radio"]').forEach(function(radio) { | |
observer.observe(radio, { attributes: true }); | |
}); | |
} | |
// Try multiple initialization strategies | |
document.addEventListener('DOMContentLoaded', function() { | |
setTimeout(initializeRadioStyles, 100); | |
setTimeout(initializeRadioStyles, 500); | |
setTimeout(initializeRadioStyles, 1000); | |
}); | |
// Also check when window loads | |
window.addEventListener('load', function() { | |
setTimeout(initializeRadioStyles, 100); | |
}); | |
// Listen for Gradio's custom events | |
document.addEventListener('gradio:loaded', initializeRadioStyles); | |
</script> | |
""" | |
gr.HTML(custom_css) | |
# Header button above title | |
gr.HTML(""" | |
<style> | |
/* Enhanced button styling with better gradio compatibility */ | |
.custom-button-container { | |
text-align: center; | |
padding: 20px 0 10px 0; | |
margin-bottom: 10px; | |
} | |
.header-action-button { | |
display: inline-block !important; | |
padding: 14px 28px !important; | |
background: linear-gradient(135deg, #E35454 0%, #C84545 100%) !important; | |
color: #FFFFFF !important; | |
text-decoration: none !important; | |
border-radius: 16px !important; | |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important; | |
font-weight: 700 !important; | |
font-size: 1.1rem !important; | |
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important; | |
border: none !important; | |
cursor: pointer !important; | |
box-shadow: 0 8px 24px rgba(227, 84, 84, 0.4), 0 4px 12px rgba(0, 0, 0, 0.3) !important; | |
position: relative !important; | |
overflow: hidden !important; | |
text-shadow: 0 1px 2px rgba(0, 0, 0, 0.3) !important; | |
} | |
.header-action-button::before { | |
content: ''; | |
position: absolute; | |
top: 0; | |
left: -100%; | |
width: 100%; | |
height: 100%; | |
background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.2), transparent); | |
transition: left 0.6s; | |
} | |
.header-action-button:hover::before { | |
left: 100%; | |
} | |
.header-action-button:hover { | |
transform: translateY(-3px) !important; | |
box-shadow: 0 12px 32px rgba(227, 84, 84, 0.5), 0 8px 16px rgba(0, 0, 0, 0.4) !important; | |
background: linear-gradient(135deg, #F46464 0%, #D84F4F 100%) !important; | |
color: #FFFFFF !important; | |
text-decoration: none !important; | |
} | |
.header-action-button:active { | |
transform: translateY(-1px) !important; | |
} | |
.action-button-icon { | |
font-size: 1.2rem !important; | |
margin-right: 8px !important; | |
filter: drop-shadow(0 0 8px rgba(255, 255, 255, 0.3)); | |
} | |
/* Navigation buttons styling */ | |
.nav-buttons-container { | |
display: flex; | |
justify-content: center; | |
align-items: center; | |
gap: 16px; | |
flex-wrap: wrap; | |
margin: 24px 0; | |
padding: 0 20px; | |
} | |
.nav-link-button { | |
display: inline-flex !important; | |
align-items: center !important; | |
gap: 8px !important; | |
padding: 12px 20px !important; | |
background: rgba(1, 9, 26, 0.8) !important; | |
color: #F5F6F7 !important; | |
text-decoration: none !important; | |
border-radius: 12px !important; | |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important; | |
font-weight: 600 !important; | |
font-size: 0.95rem !important; | |
transition: all 0.3s ease !important; | |
border: 2px solid rgba(245, 246, 247, 0.15) !important; | |
backdrop-filter: blur(10px) !important; | |
-webkit-backdrop-filter: blur(10px) !important; | |
position: relative !important; | |
overflow: hidden !important; | |
box-shadow: 0 4px 16px rgba(0, 0, 0, 0.3) !important; | |
} | |
.nav-link-button::before { | |
content: ''; | |
position: absolute; | |
top: 0; | |
left: 0; | |
right: 0; | |
bottom: 0; | |
background: linear-gradient(135deg, rgba(227, 84, 84, 0.1) 0%, rgba(16, 152, 247, 0.1) 100%); | |
opacity: 0; | |
transition: opacity 0.3s ease; | |
} | |
.nav-link-button:hover::before { | |
opacity: 1; | |
} | |
.nav-link-button:hover { | |
transform: translateY(-3px) scale(1.02) !important; | |
border-color: #E35454 !important; | |
box-shadow: 0 8px 24px rgba(227, 84, 84, 0.3), 0 4px 12px rgba(0, 0, 0, 0.4) !important; | |
text-decoration: none !important; | |
color: #FFFFFF !important; | |
} | |
.nav-link-button.primary-nav { | |
background: linear-gradient(135deg, #1098F7 0%, #0A6BC4 100%) !important; | |
border-color: #1098F7 !important; | |
color: #FFFFFF !important; | |
font-weight: 700 !important; | |
} | |
.nav-link-button.primary-nav:hover { | |
background: linear-gradient(135deg, #2AA8FF 0%, #0550A0 100%) !important; | |
border-color: #2AA8FF !important; | |
box-shadow: 0 8px 24px rgba(16, 152, 247, 0.4), 0 4px 12px rgba(0, 0, 0, 0.4) !important; | |
color: #FFFFFF !important; | |
} | |
.nav-button-icon { | |
font-size: 1.1rem !important; | |
filter: drop-shadow(0 0 6px currentColor); | |
} | |
/* Responsive design */ | |
@media (max-width: 768px) { | |
.nav-buttons-container { | |
gap: 12px; | |
padding: 0 10px; | |
} | |
.nav-link-button { | |
font-size: 0.85rem !important; | |
padding: 10px 16px !important; | |
} | |
.header-action-button { | |
font-size: 1rem !important; | |
padding: 12px 24px !important; | |
} | |
} | |
@media (max-width: 480px) { | |
.nav-buttons-container { | |
flex-direction: column; | |
gap: 8px; | |
} | |
.nav-link-button { | |
width: 200px; | |
justify-content: center; | |
} | |
} | |
</style> | |
<div class="custom-button-container"> | |
<a href="https://app.galileo.ai/sign-up?utm_medium=referral&utm_source=HF&utm_campaign=agent_leaderboard_v2" target="_blank" class="header-action-button"> | |
<span class="action-button-icon">π</span>Evaluate your GenAI for free | |
</a> | |
</div> | |
""") | |
gr.HTML(""" | |
<div style="text-align: center; padding: 20px 0;"> | |
<h1 style="font-size: 3rem; margin-bottom: 12px; color: var(--text-primary); | |
text-shadow: 0 0 20px rgba(227, 84, 84, 0.3); font-family: 'Geist', sans-serif; font-weight: 800;"> | |
π Galileo Agent Leaderboard v2 | |
</h1> | |
<p style="color: var(--text-secondary); font-size: 1.2rem; margin-top: 0; font-family: 'Geist', sans-serif;"> | |
Comprehensive performance metrics for LLM agents across business domains | |
</p> | |
</div> | |
""") | |
# Links section below title | |
gr.HTML(""" | |
<div class="nav-buttons-container"> | |
<a href="http://galileo.ai/blog/agent-leaderboard-v2" target="_blank" class="nav-link-button"> | |
<span class="nav-button-icon">π</span> | |
Blog | |
</a> | |
<a href="https://github.com/rungalileo/agent-leaderboard" target="_blank" class="nav-link-button"> | |
<span class="nav-button-icon">π</span> | |
GitHub | |
</a> | |
<a href="https://huggingface.co/datasets/galileo-ai/agent-leaderboard-v2" target="_blank" class="nav-link-button"> | |
<span class="nav-button-icon">π€</span> | |
Dataset | |
</a> | |
<a href="https://huggingface.co/spaces/galileo-ai/agent-leaderboard/discussions/new" target="_blank" class="nav-link-button"> | |
<span class="nav-button-icon">β</span> | |
Add Model | |
</a> | |
</div> | |
""") | |
# Metrics overview cards with insights | |
gr.HTML(""" | |
<div style="margin-bottom: 40px;"> | |
<!-- Ultra-modern metric cards with advanced styling --> | |
<style> | |
.insight-card { | |
background: linear-gradient(145deg, rgba(245, 246, 247, 0.03) 0%, rgba(227, 84, 84, 0.08) 100%); | |
border-radius: 16px; | |
padding: 20px; | |
position: relative; | |
border: 1px solid var(--border-subtle); | |
transition: all 0.4s cubic-bezier(0.4, 0, 0.2, 1); | |
overflow: hidden; | |
backdrop-filter: blur(20px); | |
-webkit-backdrop-filter: blur(20px); | |
} | |
.insight-card::before { | |
content: ''; | |
position: absolute; | |
inset: 0; | |
border-radius: 24px; | |
padding: 1px; | |
background: linear-gradient(145deg, var(--border-subtle), var(--border-default)); | |
-webkit-mask: linear-gradient(#fff 0 0) content-box, linear-gradient(#fff 0 0); | |
-webkit-mask-composite: source-out; | |
mask-composite: subtract; | |
pointer-events: none; | |
} | |
.insight-card::after { | |
content: ''; | |
position: absolute; | |
top: -100%; | |
left: -100%; | |
width: 300%; | |
height: 300%; | |
background: radial-gradient(circle at center, var(--glow-primary) 0%, transparent 70%); | |
opacity: 0; | |
transition: opacity 0.6s ease, transform 0.6s ease; | |
pointer-events: none; | |
} | |
.insight-card:hover::after { | |
opacity: 0.15; | |
transform: translate(50%, 50%); | |
} | |
.insight-card:hover { | |
transform: translateY(-8px); | |
border-color: var(--accent-primary); | |
box-shadow: | |
0 24px 48px rgba(227, 84, 84, 0.2), | |
0 12px 24px rgba(0, 0, 0, 0.3), | |
inset 0 1px 0 rgba(255, 255, 255, 0.1); | |
} | |
.insight-card.secondary-accent:hover { | |
border-color: var(--accent-primary); | |
} | |
.insight-card.secondary-accent::after { | |
background: radial-gradient(circle at center, var(--glow-primary) 0%, transparent 70%); | |
} | |
.insight-card.tertiary-accent:hover { | |
border-color: var(--accent-primary); | |
} | |
.insight-card.tertiary-accent::after { | |
background: radial-gradient(circle at center, var(--glow-primary) 0%, transparent 70%); | |
} | |
.card-header { | |
display: flex; | |
align-items: center; | |
gap: 8px; | |
margin-bottom: 12px; | |
} | |
.card-icon { | |
display: flex; | |
align-items: center; | |
justify-content: center; | |
font-size: 2rem; | |
margin-right: 8px; | |
} | |
.card-title { | |
flex: 1; | |
} | |
.card-label { | |
font-family: 'Geist Mono', monospace; | |
font-size: 0.7rem; | |
letter-spacing: 0.05em; | |
text-transform: uppercase; | |
color: var(--text-secondary); | |
margin-bottom: 2px; | |
} | |
.card-value { | |
font-family: 'Geist', sans-serif; | |
font-size: 1.1rem; | |
font-weight: 700; | |
color: var(--text-primary); | |
line-height: 1.1; | |
} | |
.insight-list { | |
list-style: none; | |
padding: 0; | |
margin: 0; | |
} | |
.insight-list li { | |
margin-bottom: 8px; | |
} | |
.insight-item { | |
display: flex; | |
align-items: center; | |
gap: 8px; | |
padding: 8px 10px; | |
background: rgba(245, 246, 247, 0.03); | |
border-radius: 8px; | |
border: 1px solid var(--border-subtle); | |
transition: all 0.3s ease; | |
} | |
.insight-item:hover { | |
background: rgba(227, 84, 84, 0.1); | |
border-color: var(--accent-primary); | |
transform: translateX(4px); | |
} | |
.insight-icon { | |
font-size: 1rem; | |
flex-shrink: 0; | |
} | |
.insight-text { | |
flex: 1; | |
font-size: 0.85rem; | |
line-height: 1.3; | |
color: var(--text-secondary); | |
} | |
.highlight { | |
color: var(--text-primary); | |
font-weight: 600; | |
} | |
.badge-row { | |
display: flex; | |
gap: 6px; | |
margin-top: 10px; | |
flex-wrap: wrap; | |
} | |
.badge { | |
padding: 4px 10px; | |
background: rgba(245, 246, 247, 0.05); | |
border: 1px solid var(--border-subtle); | |
border-radius: 16px; | |
font-size: 0.75rem; | |
color: var(--text-secondary); | |
transition: all 0.2s ease; | |
display: flex; | |
align-items: center; | |
gap: 4px; | |
} | |
.badge:hover { | |
background: rgba(227, 84, 84, 0.15); | |
border-color: var(--accent-primary); | |
color: var(--text-primary); | |
transform: scale(1.05); | |
} | |
.badge-icon { | |
font-size: 0.85rem; | |
} | |
@keyframes float { | |
0%, 100% { transform: translateY(0); } | |
50% { transform: translateY(-5px); } | |
} | |
.floating-icon { | |
animation: float 3s ease-in-out infinite; | |
} | |
/* Tertiary color for special elements */ | |
.tertiary-color { | |
color: var(--accent-tertiary); | |
} | |
</style> | |
<!-- First row: Five key insight cards --> | |
<div style="display: grid; grid-template-columns: repeat(5, 1fr); gap: 16px;"> | |
<div class="insight-card"> | |
<div class="card-header"> | |
<div class="card-icon floating-icon" style="color: var(--accent-primary);"> | |
π― | |
</div> | |
</div> | |
<div class="card-value">Task Completion</div> | |
<div class="insight-text" style="margin-top: 16px; color: var(--text-secondary); font-size: 0.9rem; line-height: 1.5;"> | |
Compare models based on their ability to complete real-world business tasks accurately and efficiently | |
</div> | |
</div> | |
<div class="insight-card"> | |
<div class="card-header"> | |
<div class="card-icon floating-icon" style="color: var(--accent-primary);"> | |
π‘ | |
</div> | |
</div> | |
<div class="card-value">Tool Selection</div> | |
<div class="insight-text" style="margin-top: 16px; color: var(--text-secondary); font-size: 0.9rem; line-height: 1.5;"> | |
Analyze how precisely models choose the right tools for each task and make optimal decisions | |
</div> | |
</div> | |
<div class="insight-card"> | |
<div class="card-header"> | |
<div class="card-icon floating-icon" style="color: var(--accent-primary);"> | |
π° | |
</div> | |
</div> | |
<div class="card-value">Cost Efficiency</div> | |
<div class="insight-text" style="margin-top: 16px; color: var(--text-secondary); font-size: 0.9rem; line-height: 1.5;"> | |
Find models that deliver the best performance per dollar spent and optimize your ROI | |
</div> | |
</div> | |
<div class="insight-card"> | |
<div class="card-header"> | |
<div class="card-icon floating-icon" style="color: var(--accent-primary);"> | |
ποΈ | |
</div> | |
</div> | |
<div class="card-value">Domain Coverage</div> | |
<div class="insight-text" style="margin-top: 16px; color: var(--text-secondary); font-size: 0.9rem; line-height: 1.5;"> | |
Banking, Healthcare, Insurance, Investment, and Telecom industries analyzed for specialized performance | |
</div> | |
</div> | |
<div class="insight-card"> | |
<div class="card-header"> | |
<div class="card-icon floating-icon" style="color: var(--accent-primary);"> | |
π | |
</div> | |
</div> | |
<div class="card-value">Speed vs Accuracy</div> | |
<div class="insight-text" style="margin-top: 16px; color: var(--text-secondary); font-size: 0.9rem; line-height: 1.5;"> | |
Understand the trade-offs between response time and accuracy to find the right balance | |
</div> | |
</div> | |
</div> | |
<!-- Second row: Key features showcase --> | |
<div style="display: grid; grid-template-columns: repeat(3, 1fr); gap: 16px; margin-top: 16px;"> | |
<div class="insight-card" style="background: linear-gradient(145deg, rgba(227, 84, 84, 0.05) 0%, rgba(245, 246, 247, 0.05) 100%);"> | |
<div class="card-value">Model Capabilities</div> | |
<div class="badge-row" style="margin-top: 16px;"> | |
<div class="badge"> | |
<span class="badge-icon">π</span> | |
<span>Open Source</span> | |
</div> | |
<div class="badge"> | |
<span class="badge-icon">π</span> | |
<span>Proprietary</span> | |
</div> | |
<div class="badge"> | |
<span class="badge-icon">π§ </span> | |
<span>Reasoning</span> | |
</div> | |
</div> | |
</div> | |
<div class="insight-card" style="background: linear-gradient(145deg, rgba(227, 84, 84, 0.05) 0%, rgba(245, 246, 247, 0.05) 100%);"> | |
<div class="card-value">Interactive Visualizations</div> | |
<div class="badge-row" style="margin-top: 16px;"> | |
<div class="badge"> | |
<span class="badge-icon">πΈοΈ</span> | |
<span>Radar Charts</span> | |
</div> | |
<div class="badge"> | |
<span class="badge-icon">π</span> | |
<span>Heatmaps</span> | |
</div> | |
<div class="badge"> | |
<span class="badge-icon">π</span> | |
<span>Scatter Plots</span> | |
</div> | |
</div> | |
</div> | |
<div class="insight-card" style="background: linear-gradient(145deg, rgba(227, 84, 84, 0.05) 0%, rgba(245, 246, 247, 0.05) 100%);"> | |
<div class="card-value">Real-World Performance</div> | |
<div class="badge-row" style="margin-top: 16px;"> | |
<div class="badge"> | |
<span class="badge-icon">πΌ</span> | |
<span>Business Tasks</span> | |
</div> | |
<div class="badge"> | |
<span class="badge-icon">π</span> | |
<span>Multi-Turn</span> | |
</div> | |
<div class="badge"> | |
<span class="badge-icon">π</span> | |
<span>Benchmarks</span> | |
</div> | |
</div> | |
</div> | |
</div> | |
</div> | |
""") | |
# Domain filter section with enhanced styling | |
gr.HTML(""" | |
<style> | |
/* Enhanced domain selector styling */ | |
.domain-selector-container { | |
background: linear-gradient(145deg, rgba(227, 84, 84, 0.05) 0%, rgba(245, 246, 247, 0.05) 100%); | |
border-radius: 20px; | |
padding: 32px; | |
margin-bottom: 32px; | |
border: 1px solid var(--border-subtle); | |
position: relative; | |
overflow: hidden; | |
box-shadow: | |
0 8px 32px rgba(0, 0, 0, 0.3), | |
inset 0 1px 0 rgba(255, 255, 255, 0.05); | |
} | |
.domain-selector-container::before { | |
content: ''; | |
position: absolute; | |
top: -50%; | |
left: -50%; | |
width: 200%; | |
height: 200%; | |
background: radial-gradient(circle at center, var(--glow-primary) 0%, transparent 70%); | |
opacity: 0.1; | |
animation: pulse 4s ease-in-out infinite; | |
} | |
.domain-header { | |
text-align: center; | |
margin-bottom: 28px; | |
position: relative; | |
z-index: 1; | |
} | |
.domain-title { | |
font-size: 2rem; | |
font-weight: 800; | |
background: linear-gradient(90deg, var(--accent-primary), var(--accent-secondary)); | |
-webkit-background-clip: text; | |
-webkit-text-fill-color: transparent; | |
margin-bottom: 8px; | |
text-shadow: 0 0 30px var(--glow-primary); | |
} | |
.domain-subtitle { | |
color: var(--text-secondary); | |
font-size: 1.2rem; | |
font-family: 'Geist', sans-serif; | |
} | |
/* Custom radio button styling */ | |
.domain-radio { | |
display: flex !important; | |
gap: 12px !important; | |
flex-wrap: wrap !important; | |
justify-content: center !important; | |
position: relative; | |
z-index: 1; | |
} | |
/* Gradio radio button wrapper */ | |
.domain-radio .wrap { | |
display: flex !important; | |
gap: 12px !important; | |
flex-wrap: wrap !important; | |
justify-content: center !important; | |
width: 100% !important; | |
} | |
.domain-radio label, | |
.domain-radio .wrap > label { | |
flex: 1 !important; | |
min-width: 160px !important; | |
max-width: 200px !important; | |
padding: 16px 24px !important; | |
background: var(--bg-card) !important; | |
border: 2px solid var(--border-default) !important; | |
border-radius: 16px !important; | |
cursor: pointer !important; | |
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important; | |
text-align: center !important; | |
position: relative !important; | |
overflow: hidden !important; | |
} | |
.domain-radio label::before { | |
content: ''; | |
position: absolute; | |
top: 0; | |
left: 0; | |
right: 0; | |
bottom: 0; | |
background: linear-gradient(145deg, transparent, var(--glow-primary)); | |
opacity: 0; | |
transition: opacity 0.3s ease; | |
pointer-events: none; | |
} | |
.domain-radio label:hover { | |
transform: translateY(-2px) !important; | |
border-color: var(--accent-primary) !important; | |
box-shadow: | |
0 8px 24px rgba(227, 84, 84, 0.3), | |
inset 0 0 20px rgba(227, 84, 84, 0.1) !important; | |
} | |
.domain-radio label:hover::before { | |
opacity: 0.1; | |
} | |
.domain-radio input[type="radio"] { | |
display: none !important; | |
} | |
.domain-radio input[type="radio"]:checked + label, | |
.domain-radio .wrap > label:has(input[type="radio"]:checked), | |
.domain-radio label.selected { | |
background: transparent !important; | |
border-color: var(--accent-primary) !important; | |
color: var(--text-primary) !important; | |
font-weight: 600 !important; | |
transform: scale(1.05) !important; | |
box-shadow: | |
0 12px 32px rgba(227, 84, 84, 0.4), | |
0 0 60px rgba(227, 84, 84, 0.2) !important; | |
} | |
.domain-radio input[type="radio"]:checked + label::before { | |
opacity: 0.2; | |
} | |
/* Domain icons */ | |
.domain-icon { | |
font-size: 1.5rem; | |
margin-bottom: 4px; | |
display: block; | |
filter: drop-shadow(0 0 10px currentColor); | |
} | |
.domain-name { | |
font-size: 0.95rem; | |
font-weight: 500; | |
margin-top: 4px; | |
} | |
/* Badge for domain counts */ | |
.domain-count { | |
position: absolute; | |
top: 8px; | |
right: 8px; | |
background: var(--accent-primary); | |
color: white; | |
font-size: 0.75rem; | |
padding: 2px 8px; | |
border-radius: 12px; | |
font-weight: 600; | |
opacity: 0.8; | |
} | |
/* Filter radio buttons styling - smaller for better fit */ | |
.filter-radio { | |
max-width: 100% !important; | |
} | |
.filter-radio .gr-row { | |
gap: 8px !important; | |
} | |
.filter-radio .gr-column { | |
min-width: 0 !important; | |
flex: 1 !important; | |
} | |
.filter-radio .gr-form { | |
min-width: 0 !important; | |
} | |
.filter-radio .gr-radio-group { | |
gap: 4px !important; | |
} | |
.filter-radio .domain-radio { | |
display: flex !important; | |
gap: 4px !important; | |
flex-wrap: nowrap !important; | |
justify-content: center !important; | |
} | |
.filter-radio .domain-radio label { | |
min-width: auto !important; | |
max-width: 120px !important; | |
padding: 8px 12px !important; | |
font-size: 0.8rem !important; | |
white-space: nowrap !important; | |
overflow: hidden !important; | |
text-overflow: ellipsis !important; | |
} | |
/* Additional targeting for the specific filter components */ | |
.filter-radio .gr-box { | |
padding: 8px !important; | |
} | |
.filter-radio .gr-radio { | |
gap: 4px !important; | |
} | |
.filter-radio .gr-input-label { | |
font-size: 0.85rem !important; | |
margin-bottom: 4px !important; | |
} | |
/* Force compact layout for the filters */ | |
@media (max-width: 1400px) { | |
.filter-radio .domain-radio label { | |
padding: 6px 10px !important; | |
font-size: 0.75rem !important; | |
} | |
} | |
/* Compact filter row styling */ | |
.compact-filter-row { | |
margin-bottom: 20px !important; | |
} | |
.compact-filter-row .gr-column { | |
padding: 0 8px !important; | |
} | |
.compact-filter-row .gr-box { | |
padding: 0 !important; | |
} | |
/* Compact radio button styling */ | |
.compact-radio { | |
width: 100% !important; | |
} | |
.compact-radio > label { | |
font-size: 0.85rem !important; | |
margin-bottom: 8px !important; | |
font-weight: 600 !important; | |
color: var(--text-primary) !important; | |
display: block !important; | |
} | |
.compact-radio .wrap { | |
display: flex !important; | |
flex-wrap: nowrap !important; | |
gap: 4px !important; | |
justify-content: center !important; | |
} | |
.compact-radio .wrap > label { | |
display: inline-flex !important; | |
align-items: center !important; | |
justify-content: center !important; | |
padding: 6px 10px !important; | |
margin: 0 !important; | |
background: var(--bg-card) !important; | |
border: 1px solid var(--border-default) !important; | |
border-radius: 8px !important; | |
cursor: pointer !important; | |
transition: all 0.2s ease !important; | |
font-size: 0.75rem !important; | |
white-space: nowrap !important; | |
flex: 1 !important; | |
min-width: 0 !important; | |
overflow: hidden !important; | |
text-overflow: ellipsis !important; | |
} | |
.compact-radio .wrap > label:has(input[type="radio"]:checked) { | |
background: transparent !important; | |
border-color: var(--accent-primary) !important; | |
color: var(--text-primary) !important; | |
font-weight: 600 !important; | |
} | |
.compact-radio .wrap > label:hover { | |
background: rgba(227, 84, 84, 0.1) !important; | |
border-color: var(--accent-primary) !important; | |
transform: scale(1.02) !important; | |
} | |
.compact-radio input[type="radio"] { | |
display: none !important; | |
} | |
/* Target Gradio's data attributes for selected state */ | |
.compact-radio label[data-selected="true"], | |
.compact-radio label[aria-checked="true"], | |
.domain-radio label[data-selected="true"], | |
.domain-radio label[aria-checked="true"] { | |
background: transparent !important; | |
border-color: var(--accent-primary) !important; | |
color: var(--text-primary) !important; | |
font-weight: 600 !important; | |
} | |
/* Sort by radio buttons */ | |
.sort-by-radio .domain-radio { | |
display: flex !important; | |
gap: 10px !important; | |
flex-wrap: wrap !important; | |
justify-content: flex-start !important; | |
} | |
.sort-by-radio .domain-radio .wrap { | |
display: flex !important; | |
gap: 10px !important; | |
flex-wrap: wrap !important; | |
justify-content: flex-start !important; | |
width: 100% !important; | |
} | |
.sort-by-radio .domain-radio label, | |
.sort-by-radio .domain-radio .wrap > label { | |
min-width: 180px !important; | |
max-width: 220px !important; | |
padding: 12px 20px !important; | |
font-size: 0.95rem !important; | |
} | |
</style> | |
<div class="domain-selector-container"> | |
<div class="domain-header"> | |
<h2 class="domain-title">ποΈ Select Business Domain</h2> | |
<p class="domain-subtitle">Choose a domain to see specialized agent performance</p> | |
</div> | |
""") | |
# Creating a custom radio with better visual design | |
domain_choices = [ | |
("All", "π", "All Domains"), | |
("Banking", "π¦", "Banking"), | |
("Healthcare", "π₯", "Healthcare"), | |
("Insurance", "π‘οΈ", "Insurance"), | |
("Investment", "π°", "Investment"), | |
("Telecom", "π±", "Telecom") | |
] | |
with gr.Row(): | |
domain_filter = gr.Radio( | |
choices=["π All", "π¦ Banking", "π₯ Healthcare", "π‘οΈ Insurance", "π° Investment", "π± Telecom"], | |
value="π All", | |
label="", | |
interactive=True, | |
elem_classes=["domain-radio"] | |
) | |
gr.HTML(""" | |
</div> | |
""") | |
# Filter controls with enhanced styling | |
gr.HTML(""" | |
<div class="dark-container" style="margin-bottom: 24px;"> | |
<div class="section-header"> | |
<span class="section-icon" style="color: var(--accent-secondary);">π</span> | |
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;"> | |
Filters & Sorting | |
</h3> | |
</div> | |
""") | |
# First row: Model filters and sort order | |
with gr.Row(elem_classes=["compact-filter-row"]): | |
with gr.Column(scale=1): | |
model_type_filter = gr.Radio( | |
choices=["All", "Open Source", "Proprietary"], | |
value="All", | |
label="π Model Access", | |
elem_classes=["compact-radio"] | |
) | |
with gr.Column(scale=1): | |
reasoning_filter = gr.Radio( | |
choices=["All", "Reasoning", "Normal"], | |
value="All", | |
label="π§ Output Type", | |
elem_classes=["compact-radio"] | |
) | |
with gr.Column(scale=1): | |
sort_order = gr.Radio( | |
choices=["Descending", "Ascending"], | |
value="Descending", | |
label="π Sort Order", | |
elem_classes=["compact-radio"] | |
) | |
# Second row: Sort by options | |
gr.HTML("""<div style="margin-top: 20px; margin-bottom: 10px;"> | |
<h4 style="color: var(--text-primary); font-size: 1.1rem; font-weight: 600; margin: 0;">π Sort By</h4> | |
</div>""") | |
gr.HTML('<div class="sort-by-radio">') | |
sort_by = gr.Radio( | |
choices=["Avg Action Completion", "Avg Tool Selection Quality", "Avg Session Cost", "Avg Session Duration", "Avg Turns"], | |
value="Avg Action Completion", | |
label="", | |
elem_classes=["domain-radio"] | |
) | |
gr.HTML('</div>') | |
gr.HTML("</div>") | |
# Main leaderboard table with dynamic title | |
leaderboard_title = gr.HTML(""" | |
<div class="dark-container pulse" style="margin-bottom: 24px;"> | |
<div class="section-header"> | |
<span class="section-icon" style="color: var(--accent-primary);">π</span> | |
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;"> | |
Agent Leaderboard for All | |
</h3> | |
</div> | |
<div class="dataframe-container"> | |
""") | |
leaderboard_table = gr.HTML(initial_table) | |
gr.HTML(""" | |
</div> | |
</div>""") | |
# Column Info Section | |
gr.HTML(""" | |
<div class="dark-container" style="margin-top: 24px; margin-bottom: 32px;"> | |
<div class="section-header"> | |
<span class="section-icon" style="color: var(--accent-secondary);">π</span> | |
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.3rem; font-family: 'Geist', sans-serif; font-weight: 700;"> | |
Column Explanations | |
</h3> | |
</div> | |
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 20px; margin-top: 20px;"> | |
<!-- Performance Metrics --> | |
<div class="info-box" style="background: linear-gradient(145deg, rgba(227, 84, 84, 0.05) 0%, rgba(245, 246, 247, 0.03) 100%);"> | |
<h4 style="color: var(--accent-primary); margin-top: 0; margin-bottom: 16px; font-size: 1.1rem; font-family: 'Geist', sans-serif; font-weight: 600; display: flex; align-items: center; gap: 8px;"> | |
<span style="font-size: 1.3rem;">π―</span> | |
Performance Metrics | |
</h4> | |
<div style="space-y: 12px;"> | |
<div style="margin-bottom: 12px;"> | |
<div style="font-weight: 600; color: var(--text-primary); margin-bottom: 4px; font-family: 'Geist Mono', monospace; font-size: 0.9rem;"> | |
π Action Completion | |
</div> | |
<div style="color: var(--text-secondary); font-size: 0.9rem; line-height: 1.4; margin-bottom: 6px;"> | |
Measures how well the agent accomplishes user goals and completes tasks successfully. | |
</div> | |
<a href="https://v2docs.galileo.ai/concepts/metrics/agentic/action-completion" target="_blank" | |
style="color: var(--accent-primary); text-decoration: none; font-size: 0.85rem; display: inline-flex; align-items: center; gap: 4px;"> | |
π Learn more about Action Completion | |
<span style="font-size: 0.7rem;">β</span> | |
</a> | |
</div> | |
<div style="border-top: 1px solid var(--border-subtle); padding-top: 12px;"> | |
<div style="font-weight: 600; color: var(--text-primary); margin-bottom: 4px; font-family: 'Geist Mono', monospace; font-size: 0.9rem;"> | |
π οΈ Tool Selection Quality | |
</div> | |
<div style="color: var(--text-secondary); font-size: 0.9rem; line-height: 1.4; margin-bottom: 6px;"> | |
Evaluates the accuracy of selecting the right tools and using them with correct parameters. | |
</div> | |
<a href="https://v2docs.galileo.ai/concepts/metrics/agentic/tool-selection-quality" target="_blank" | |
style="color: var(--accent-primary); text-decoration: none; font-size: 0.85rem; display: inline-flex; align-items: center; gap: 4px;"> | |
π Learn more about Tool Selection Quality | |
<span style="font-size: 0.7rem;">β</span> | |
</a> | |
</div> | |
</div> | |
</div> | |
<!-- Session-Level Metrics --> | |
<div class="info-box" style="background: linear-gradient(145deg, rgba(16, 152, 247, 0.05) 0%, rgba(245, 246, 247, 0.03) 100%);"> | |
<h4 style="color: var(--accent-secondary); margin-top: 0; margin-bottom: 16px; font-size: 1.1rem; font-family: 'Geist', sans-serif; font-weight: 600; display: flex; align-items: center; gap: 8px;"> | |
<span style="font-size: 1.3rem;">π</span> | |
Session-Level Metrics | |
</h4> | |
<div style="space-y: 10px;"> | |
<div style="margin-bottom: 10px;"> | |
<div style="font-weight: 600; color: var(--text-primary); margin-bottom: 4px; font-family: 'Geist Mono', monospace; font-size: 0.9rem;"> | |
π° Avg Cost ($) | |
</div> | |
<div style="color: var(--text-secondary); font-size: 0.9rem; line-height: 1.4;"> | |
Average cost per conversation session, including all API calls and processing. | |
</div> | |
</div> | |
<div style="margin-bottom: 10px; border-top: 1px solid var(--border-subtle); padding-top: 10px;"> | |
<div style="font-weight: 600; color: var(--text-primary); margin-bottom: 4px; font-family: 'Geist Mono', monospace; font-size: 0.9rem;"> | |
β‘ Avg Duration (s) | |
</div> | |
<div style="color: var(--text-secondary); font-size: 0.9rem; line-height: 1.4;"> | |
Average time taken to complete a full conversation session from start to finish. | |
</div> | |
</div> | |
<div style="border-top: 1px solid var(--border-subtle); padding-top: 10px;"> | |
<div style="font-weight: 600; color: var(--text-primary); margin-bottom: 4px; font-family: 'Geist Mono', monospace; font-size: 0.9rem;"> | |
π¬ Avg Turns | |
</div> | |
<div style="color: var(--text-secondary); font-size: 0.9rem; line-height: 1.4;"> | |
Average number of back-and-forth exchanges needed to complete a task. | |
</div> | |
</div> | |
</div> | |
</div> | |
</div> | |
<!-- Additional Notes --> | |
<div style="margin-top: 24px; padding: 16px; background: rgba(245, 246, 247, 0.03); border: 1px solid var(--border-subtle); border-radius: 12px;"> | |
<div style="display: flex; align-items: center; gap: 8px; margin-bottom: 8px;"> | |
<span style="font-size: 1.1rem;">π‘</span> | |
<span style="font-weight: 600; color: var(--text-primary); font-size: 0.95rem;">Default Sorting</span> | |
</div> | |
<div style="color: var(--text-secondary); font-size: 0.9rem; line-height: 1.4;"> | |
The table is sorted by <strong style="color: var(--text-primary);">Action Completion</strong> in descending order by default, showing the best-performing models first. You can change the sorting using the filters above. | |
</div> | |
</div> | |
</div> | |
""") | |
# Radar Chart Section | |
gr.HTML(""" | |
<div class="dark-container" style="margin-bottom: 24px;"> | |
<div class="section-header"> | |
<span class="section-icon" style="color: var(--accent-primary);">πΈοΈ</span> | |
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;"> | |
Domain Performance Analysis | |
</h3> | |
</div> | |
<p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">Compare model performance across different business domains</p> | |
""") | |
with gr.Row(): | |
with gr.Column(scale=1): | |
model_selector = gr.Dropdown( | |
choices=initial_df['Model'].tolist()[:10], | |
value=initial_df['Model'].tolist()[:5], | |
multiselect=True, | |
label="π― Select Models for Comparison", | |
info="Choose up to 5 models to visualize", | |
elem_classes=["dropdown"] | |
) | |
# Radar chart plot - wrapped in centered container | |
gr.HTML('<div class="chart-container">') | |
radar_chart = gr.Plot( | |
label="", | |
value=create_domain_radar_chart( | |
load_leaderboard_data(), | |
"Avg AC", | |
initial_df['Model'].tolist()[:5] | |
), | |
elem_classes=["radar-chart", "plot-container"] | |
) | |
gr.HTML('</div>') | |
gr.HTML("</div>") | |
# Update functions | |
def get_optimal_sort_order(sort_by_value):
    """Pick the natural sort direction for a leaderboard metric.

    Session cost, session duration and turn count are "lower is better"
    metrics, so they sort ascending. The quality metrics (action completion,
    tool selection quality) and any unrecognised value sort descending.

    Args:
        sort_by_value: Display name of the metric chosen in the "Sort By" radio.

    Returns:
        "Ascending" or "Descending".
    """
    lower_is_better = ("Avg Session Cost", "Avg Session Duration", "Avg Turns")
    return "Ascending" if sort_by_value in lower_is_better else "Descending"
def update_sort_order_automatically(sort_by_value):
    """Return the sort direction that suits the newly selected metric.

    Thin adapter around get_optimal_sort_order(); the returned string is one
    of the sort-order radio's values.
    """
    return get_optimal_sort_order(sort_by_value)
def update_table(*args):
    """Rebuild the leaderboard heading and table from the current controls.

    Args:
        *args: The filter/sort control values, forwarded unchanged to
            filter_and_sort_data(). The first one is the domain filter and
            is also used to build the heading.

    Returns:
        A (title, table) pair: the result of update_leaderboard_title() for
        the domain, followed by the result of filter_and_sort_data().
    """
    domain_choice = args[0]  # the domain filter is always the first control
    heading = update_leaderboard_title(domain_choice)
    rows = filter_and_sort_data(*args)
    return heading, rows
def _normalize_domain_label(domain_filter):
    """Return the bare domain name for a radio label such as "<emoji> Banking".

    The label is matched by its trailing domain name rather than by its
    leading emoji, so the mapping keeps working if the emoji is changed or
    its bytes are damaged. A missing or empty selection is treated as "All";
    a label that matches no known domain is returned stripped but otherwise
    unchanged.
    """
    known_labels = ("All", "Banking", "Healthcare", "Insurance", "Investment", "Telecom")
    label = (domain_filter or "All").strip()
    for name in known_labels:
        if label.endswith(name):
            return name
    return label


def update_radar_chart(domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order, selected_models):
    """Refresh the model dropdown and the radar chart after a control change.

    The leaderboard data is filtered and sorted with the same rules as the
    table; the first 15 models of that result become the dropdown choices.

    Args:
        domain_filter: Domain radio value, e.g. "<emoji> Banking".
        model_type_filter: "All", "Open Source" or "Proprietary".
        reasoning_filter: "All", "Reasoning" or "Normal".
        sort_by: Display name of the metric to sort by; also passed to the
            chart builder as the metric to plot.
        sort_order: "Ascending" sorts ascending; anything else descending.
        selected_models: Models currently chosen in the dropdown (may be
            empty or None).

    Returns:
        A (gr.Dropdown, chart) pair: the dropdown update carrying the new
        choices and selection, and the value returned by
        create_domain_radar_chart().
    """
    df = load_leaderboard_data()
    # Every operation below returns a new frame, so `df` itself stays intact
    # and can be handed to the chart builder without loading the data again.
    filtered_df = df

    domain = _normalize_domain_label(domain_filter)

    # Domain filter: keep only models that have a value for this domain.
    if domain in ("Banking", "Healthcare", "Insurance", "Investment", "Telecom"):
        filtered_df = filtered_df[filtered_df[f"{domain} AC"] != '']

    # Radio label -> value stored in the 'Model Type' column.
    model_type_values = {"Open Source": "Open source", "Proprietary": "Proprietary"}
    if model_type_filter in model_type_values:
        wanted_type = model_type_values[model_type_filter]
        filtered_df = filtered_df[filtered_df['Model Type'] == wanted_type]

    if reasoning_filter in ("Reasoning", "Normal"):
        filtered_df = filtered_df[filtered_df['Output Type'] == reasoning_filter]

    # Map the display name to the real column via the shared mapping.
    sort_column = SORT_COLUMN_MAP.get(sort_by, sort_by)

    # With a specific domain selected, sort by that domain's column instead
    # of the cross-domain average.
    domain_suffix = {
        "Avg AC": "AC",
        "Avg TSQ": "TSQ",
        "Avg Total Cost": "Cost",
        "Avg Session Duration": "Duration",
        "Avg Turns": "Turns",
    }
    if domain != "All" and sort_column in domain_suffix:
        sort_column = f"{domain} {domain_suffix[sort_column]}"

    if sort_column and sort_column in filtered_df.columns:
        filtered_df = filtered_df.sort_values(
            by=sort_column,
            ascending=(sort_order == "Ascending"),
            na_position='last',
        )

    # Top 15 of the filtered result become the dropdown choices.
    available_models = filtered_df['Model'].tolist()[:15]

    # Keep the still-available part of the current selection; if nothing
    # survives (or nothing was selected), fall back to the top 5.
    valid_selected = [m for m in (selected_models or []) if m in available_models]
    if not valid_selected:
        valid_selected = available_models[:5]

    chart = create_domain_radar_chart(df, sort_by, valid_selected)
    return gr.Dropdown(choices=available_models, value=valid_selected), chart
def update_radar_only(domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order, selected_models):
    """Rebuild the radar chart after the model selection changes.

    The current filters are applied through ``apply_filters`` (the same
    domain / model-type / reasoning logic this callback used to duplicate),
    the result is sorted by the selected metric, and the user's selection is
    reduced to the models that survive the filters.

    Args:
        domain_filter: Domain dropdown label (may carry an emoji prefix).
        model_type_filter: "All", "Open Source" or "Proprietary".
        reasoning_filter: "All", "Reasoning" or "Normal".
        sort_by: Display name of the sort metric; translated to a column
            name through ``SORT_COLUMN_MAP`` when an entry exists.
        sort_order: "Ascending" sorts ascending; anything else descending.
        selected_models: Models currently picked in the selector (may be
            empty or ``None``).

    Returns:
        The figure produced by ``create_domain_radar_chart`` for the valid
        selection, or for the first five filtered models when none of the
        selected models remain.
    """
    df = load_leaderboard_data()
    filtered_df = apply_filters(df, domain_filter, model_type_filter, reasoning_filter)

    # Map display name to actual column name using shared mapping
    actual_sort_column = SORT_COLUMN_MAP.get(sort_by, sort_by)
    if actual_sort_column and actual_sort_column in filtered_df.columns:
        filtered_df = filtered_df.sort_values(
            by=actual_sort_column,
            ascending=(sort_order == "Ascending"),
            na_position='last',
        )

    # Materialise the model list once; the previous version rebuilt it for
    # every element of the selection while testing membership.
    ranked_models = filtered_df['Model'].tolist()
    ranked_lookup = set(ranked_models)

    valid_selected = [m for m in (selected_models or []) if m in ranked_lookup]
    if not valid_selected:
        # Nothing selected, or nothing selected survived the filters.
        valid_selected = ranked_models[:5]

    # apply_filters works on a copy, so the loaded frame can be handed to the
    # chart builder as-is instead of loading the data a second time.
    return create_domain_radar_chart(df, sort_by, valid_selected)
# Update table when filters change | |
filter_inputs = [domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order] | |
for input_component in filter_inputs: | |
input_component.change( | |
fn=update_table, | |
inputs=filter_inputs, | |
outputs=[leaderboard_title, leaderboard_table] | |
) | |
# Also update radar chart when filters change | |
input_component.change( | |
fn=update_radar_chart, | |
inputs=filter_inputs + [model_selector], | |
outputs=[model_selector, radar_chart] | |
) | |
# Update radar chart when model selection changes | |
model_selector.change( | |
fn=update_radar_only, | |
inputs=filter_inputs + [model_selector], | |
outputs=[radar_chart] | |
) | |
# Automatically update sort order when sort_by changes | |
sort_by.change( | |
fn=update_sort_order_automatically, | |
inputs=[sort_by], | |
outputs=[sort_order] | |
) | |
# Performance insights section | |
gr.HTML(""" | |
<div class="dark-container" style="margin-top: 32px;"> | |
<div class="section-header"> | |
<span class="section-icon" style="color: var(--accent-secondary);">π</span> | |
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;"> | |
Key Insights | |
</h3> | |
</div> | |
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(280px, 1fr)); gap: 24px; margin-top: 24px;"> | |
<div class="info-box"> | |
<h4 style="color: var(--accent-primary); margin-top: 0; font-size: 1.2rem; font-family: 'Geist', sans-serif; font-weight: 600;">π Top Performers</h4> | |
<ul style="color: var(--text-secondary); margin: 0; padding-left: 20px; line-height: 1.6; font-family: 'Geist', sans-serif;"> | |
<li>Highest AC scores indicate best action completion</li> | |
<li>Superior TSQ shows optimal tool selection</li> | |
<li>Balance cost-effectiveness with performance</li> | |
</ul> | |
</div> | |
<div class="info-box"> | |
<h4 style="color: var(--accent-secondary); margin-top: 0; font-size: 1.2rem; font-family: 'Geist', sans-serif; font-weight: 600;">π Filter Features</h4> | |
<ul style="color: var(--text-secondary); margin: 0; padding-left: 20px; line-height: 1.6; font-family: 'Geist', sans-serif;"> | |
<li>Domain-specific performance analysis</li> | |
<li>Compare open source vs private models</li> | |
<li>Reasoning vs standard model comparison</li> | |
</ul> | |
</div> | |
<div class="info-box"> | |
<h4 style="color: var(--accent-primary); margin-top: 0; font-size: 1.2rem; font-family: 'Geist', sans-serif; font-weight: 600;">π Visualization</h4> | |
<ul style="color: var(--text-secondary); margin: 0; padding-left: 20px; line-height: 1.6; font-family: 'Geist', sans-serif;"> | |
<li>Interactive radar charts for domain breakdown</li> | |
<li>Compare up to 5 models simultaneously</li> | |
<li>Hover for detailed performance metrics</li> | |
</ul> | |
</div> | |
</div> | |
</div> | |
""") | |
# NEW VISUALIZATIONS START HERE | |
# 1. Cost-Performance Efficiency Scatter Plot | |
gr.HTML(""" | |
<div class="dark-container" style="margin-top: 32px;"> | |
<div class="section-header"> | |
<span class="section-icon" style="color: var(--accent-primary);">π‘</span> | |
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;"> | |
Cost-Performance Efficiency Analysis | |
</h3> | |
</div> | |
<p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;"> | |
Identify models that deliver the best performance per dollar spent | |
</p> | |
""") | |
with gr.Row(): | |
with gr.Column(scale=1): | |
efficiency_metric = gr.Dropdown( | |
choices=["Avg Action Completion", "Avg Tool Selection Quality"], | |
value="Avg Action Completion", | |
label="π Performance Metric", | |
info="Select which performance metric to analyze against cost", | |
elem_classes=["dropdown"] | |
) | |
gr.HTML('<div class="chart-container">') | |
cost_performance_plot = gr.Plot( | |
label="", | |
value=create_cost_performance_scatter(load_leaderboard_data(), "Avg AC"), | |
elem_classes=["efficiency-chart", "plot-container"] | |
) | |
gr.HTML('</div>') | |
gr.HTML("</div>") | |
# 2. Speed vs Accuracy Trade-off Chart | |
gr.HTML(""" | |
<div class="dark-container" style="margin-top: 32px;"> | |
<div class="section-header"> | |
<span class="section-icon" style="color: var(--accent-secondary);">β‘</span> | |
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;"> | |
Speed vs Accuracy Trade-off | |
</h3> | |
</div> | |
<p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;"> | |
Find the sweet spot between response time and accuracy | |
</p> | |
""") | |
gr.HTML('<div class="chart-container">') | |
speed_accuracy_plot = gr.Plot( | |
label="", | |
value=create_speed_accuracy_plot(load_leaderboard_data(), "Avg AC"), | |
elem_classes=["speed-accuracy-chart", "plot-container"] | |
) | |
gr.HTML('</div>') | |
gr.HTML("</div>") | |
# 3. Performance Heatmap | |
gr.HTML(""" | |
<div class="dark-container" style="margin-top: 32px;"> | |
<div class="section-header"> | |
<span class="section-icon" style="color: var(--accent-primary);">π₯</span> | |
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;"> | |
Comprehensive Performance Heatmap | |
</h3> | |
</div> | |
<p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;"> | |
All metrics at a glance - darker colors indicate better performance | |
</p> | |
""") | |
gr.HTML('<div class="chart-container">') | |
performance_heatmap = gr.Plot( | |
label="", | |
value=create_performance_heatmap(load_leaderboard_data()), | |
elem_classes=["heatmap-chart", "plot-container"] | |
) | |
gr.HTML('</div>') | |
gr.HTML("</div>") | |
# 4. Domain Specialization Matrix | |
gr.HTML(""" | |
<div class="dark-container" style="margin-top: 32px;"> | |
<div class="section-header"> | |
<span class="section-icon" style="color: var(--accent-primary);">π―</span> | |
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;"> | |
Domain Specialization Matrix | |
</h3> | |
</div> | |
<p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;"> | |
Bubble size shows performance level, color intensity shows specialization strength | |
</p> | |
""") | |
with gr.Row(): | |
with gr.Column(scale=1): | |
specialization_metric = gr.Dropdown( | |
choices=["AC (Action Completion)", "TSQ (Tool Selection Quality)"], | |
value="AC (Action Completion)", | |
label="π Metric Type", | |
info="Choose which metric to analyze for domain specialization", | |
elem_classes=["dropdown"] | |
) | |
gr.HTML('<div class="chart-container">') | |
domain_specialization_plot = gr.Plot( | |
label="", | |
value=create_domain_specialization_matrix(load_leaderboard_data(), "AC"), | |
elem_classes=["specialization-chart", "plot-container"] | |
) | |
gr.HTML('</div>') | |
gr.HTML("</div>") | |
# 5. Performance Gap Analysis | |
gr.HTML(""" | |
<div class="dark-container" style="margin-top: 32px;"> | |
<div class="section-header"> | |
<span class="section-icon" style="color: var(--accent-secondary);">π</span> | |
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;"> | |
Performance Gap Analysis by Domain | |
</h3> | |
</div> | |
<p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;"> | |
Visualize the performance range across models for each domain | |
</p> | |
""") | |
gr.HTML('<div class="chart-container">') | |
performance_gap_plot = gr.Plot( | |
label="", | |
value=create_performance_gap_analysis(load_leaderboard_data(), "AC"), | |
elem_classes=["gap-analysis-chart", "plot-container"] | |
) | |
gr.HTML('</div>') | |
gr.HTML("</div>") | |
# Update functions for new visualizations | |
def update_cost_performance(efficiency_metric):
    """Redraw the cost-performance scatter for the chosen performance metric.

    The dropdown label is translated to a dataframe column through
    ``SORT_COLUMN_MAP``; a label without an entry is passed through unchanged.
    """
    metric_column = SORT_COLUMN_MAP.get(efficiency_metric, efficiency_metric)
    leaderboard = load_leaderboard_data()
    return create_cost_performance_scatter(leaderboard, metric_column)
def update_speed_accuracy(efficiency_metric):
    """Redraw the speed-vs-accuracy plot for the chosen performance metric.

    Uses the same label-to-column translation (``SORT_COLUMN_MAP`` with
    pass-through fallback) as the cost-performance callback.
    """
    metric_column = SORT_COLUMN_MAP.get(efficiency_metric, efficiency_metric)
    leaderboard = load_leaderboard_data()
    return create_speed_accuracy_plot(leaderboard, metric_column)
def update_domain_specialization(specialization_metric):
    """Redraw the domain specialization matrix for the selected metric type.

    Any label containing "AC" selects the AC matrix; every other label
    selects TSQ.
    """
    if "AC" in specialization_metric:
        metric_type = "AC"
    else:
        metric_type = "TSQ"
    return create_domain_specialization_matrix(load_leaderboard_data(), metric_type)
def update_performance_gap(specialization_metric):
    """Redraw the per-domain performance gap analysis for the selected metric type.

    Any label containing "AC" selects the AC analysis; every other label
    selects TSQ.
    """
    if "AC" in specialization_metric:
        metric_type = "AC"
    else:
        metric_type = "TSQ"
    return create_performance_gap_analysis(load_leaderboard_data(), metric_type)
def update_all_visualizations(domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order):
    """Update all new visualizations when filters change.

    Returns a 3-tuple of figures in the order: cost-performance scatter,
    speed-vs-accuracy plot, performance heatmap. ``sort_order`` is accepted
    because the callback receives every filter input, but it is not used.
    """
    visible_rows = apply_filters(
        load_leaderboard_data(), domain_filter, model_type_filter, reasoning_filter
    )

    # Only the two performance metrics make sense on the performance axis;
    # any other sort choice falls back to average action completion.
    if sort_by in ("Avg Action Completion", "Avg Tool Selection Quality"):
        metric_column = SORT_COLUMN_MAP.get(sort_by, sort_by)
    else:
        metric_column = "Avg AC"

    return (
        create_cost_performance_scatter(visible_rows, metric_column),
        create_speed_accuracy_plot(visible_rows, metric_column),
        create_performance_heatmap(visible_rows),
    )
def apply_filters(df, domain_filter, model_type_filter, reasoning_filter):
    """Apply the domain, model-type and reasoning filters to a dataframe.

    Args:
        df: Leaderboard dataframe. It is copied, never modified in place.
        domain_filter: Domain dropdown label. A leading emoji is translated
            to the bare domain name; any other label is used as given.
        model_type_filter: "Open Source" or "Proprietary" restrict the
            'Model Type' column; any other value (e.g. "All") keeps all rows.
        reasoning_filter: "Reasoning" or "Normal" restrict the 'Output Type'
            column; any other value (e.g. "All") keeps all rows.

    Returns:
        A new dataframe containing only the rows that pass every filter.

    Raises:
        KeyError: If a column required by an active filter is missing.
    """
    filtered_df = df.copy()

    # Emoji prefixes are written as code-point escapes so that each option has
    # a distinct prefix regardless of how this file is encoded or displayed.
    emoji_to_domain = (
        ("\U0001F310", "All"),         # globe with meridians
        ("\U0001F3E6", "Banking"),     # bank
        ("\U0001F3E5", "Healthcare"),  # hospital
        ("\U0001F6E1", "Insurance"),   # shield (with or without variation selector)
        ("\U0001F4B0", "Investment"),  # money bag
        ("\U0001F4F1", "Telecom"),     # mobile phone
    )
    domain_filter_clean = next(
        (domain for emoji, domain in emoji_to_domain if domain_filter.startswith(emoji)),
        domain_filter,
    )

    # Domain filtering: keep only models that have an AC score for the domain.
    # Missing scores are treated as either '' or NaN, matching how the
    # performance card decides whether a value is available.
    if domain_filter_clean in ("Banking", "Healthcare", "Insurance", "Investment", "Telecom"):
        domain_scores = filtered_df[f"{domain_filter_clean} AC"]
        filtered_df = filtered_df[domain_scores.notna() & (domain_scores != '')]

    # Model type filtering (the dropdown says "Open Source", the data "Open source")
    model_type_values = {"Open Source": "Open source", "Proprietary": "Proprietary"}
    if model_type_filter in model_type_values:
        filtered_df = filtered_df[filtered_df['Model Type'] == model_type_values[model_type_filter]]

    # Reasoning filtering
    if reasoning_filter in ("Reasoning", "Normal"):
        filtered_df = filtered_df[filtered_df['Output Type'] == reasoning_filter]

    return filtered_df
# Connect update functions to components | |
efficiency_metric.change( | |
fn=update_cost_performance, | |
inputs=[efficiency_metric], | |
outputs=[cost_performance_plot] | |
) | |
efficiency_metric.change( | |
fn=update_speed_accuracy, | |
inputs=[efficiency_metric], | |
outputs=[speed_accuracy_plot] | |
) | |
specialization_metric.change( | |
fn=update_domain_specialization, | |
inputs=[specialization_metric], | |
outputs=[domain_specialization_plot] | |
) | |
specialization_metric.change( | |
fn=update_performance_gap, | |
inputs=[specialization_metric], | |
outputs=[performance_gap_plot] | |
) | |
# Update new visualizations when main filters change | |
for input_component in filter_inputs: | |
input_component.change( | |
fn=update_all_visualizations, | |
inputs=filter_inputs, | |
outputs=[cost_performance_plot, speed_accuracy_plot, performance_heatmap] | |
) | |
# Define generate_performance_card function before using it | |
def generate_performance_card(model_name):
    """Generate HTML for the model performance card.

    Args:
        model_name: Value of the 'Model' column identifying the model. A
            falsy value yields a "please select a model" placeholder.

    Returns:
        An HTML string: a placeholder when no model is given or the model is
        not in the leaderboard data, otherwise the full card with the overall
        rank (by 'Avg AC', descending), headline metrics and per-domain AC
        scores. Missing values are rendered as "N/A".
    """
    if not model_name:
        return """<div style="text-align: center; color: var(--text-secondary); padding: 40px;">
        Please select a model to generate its performance card
        </div>"""

    # Get model data
    df = load_leaderboard_data()
    model_data = df[df['Model'] == model_name]
    if model_data.empty:
        return """<div style="text-align: center; color: var(--text-secondary); padding: 40px;">
        Model not found in the database
        </div>"""
    row = model_data.iloc[0]

    # Overall rank: position among models that have an 'Avg AC' value.
    df_with_ac = df[df['Avg AC'] != ''].copy()
    df_with_ac['Avg AC'] = pd.to_numeric(df_with_ac['Avg AC'], errors='coerce')
    df_sorted = df_with_ac.sort_values('Avg AC', ascending=False).reset_index(drop=True)
    ranked_rows = df_sorted[df_sorted['Model'] == model_name]
    # An explicit emptiness check replaces the former bare ``except``; a model
    # without a rank is shown as "N/A" rather than "#N/A".
    rank_display = "N/A" if ranked_rows.empty else f"#{ranked_rows.index[0] + 1}"

    def format_value(val, decimals=3, prefix='', suffix=''):
        """Format a numeric cell with fixed decimals, or 'N/A' when missing."""
        if pd.isna(val) or val == '':
            return 'N/A'
        return f"{prefix}{float(val):.{decimals}f}{suffix}"

    def get_performance_stars(value, max_val=1.0):
        """Translate a score into one to five star glyphs (none when missing)."""
        if pd.isna(value) or value == '':
            return ''
        score = float(value) / max_val
        if score >= 0.9:
            return 'β' * 5
        elif score >= 0.7:
            return 'β' * 4
        elif score >= 0.5:
            return 'β' * 3
        elif score >= 0.3:
            return 'β' * 2
        else:
            return 'β' * 1

    # The card is collected piecewise and joined once at the end.
    card_parts = [f"""
    <div class="performance-card">
    <div class="card-header">
    <h1 class="card-model-name">{model_name}</h1>
    <div class="card-stars">
    {get_performance_stars(row['Avg AC'])}
    </div>
    </div>
    <div class="metrics-grid" style="margin-bottom: 24px;">
    <div class="metric-item">
    <div class="metric-icon" style="color: var(--accent-primary);">π</div>
    <div class="metric-label">Overall Rank</div>
    <div class="metric-value">{rank_display}</div>
    </div>
    <div class="metric-item">
    <div class="metric-icon" style="color: var(--accent-primary);">π―</div>
    <div class="metric-label">Action Completion</div>
    <div class="metric-value">{format_value(row['Avg AC'])}</div>
    </div>
    <div class="metric-item">
    <div class="metric-icon" style="color: var(--accent-secondary);">π οΈ</div>
    <div class="metric-label">Tool Selection</div>
    <div class="metric-value">{format_value(row['Avg TSQ'])}</div>
    </div>
    <div class="metric-item">
    <div class="metric-icon" style="color: #F5F6F7;">π°</div>
    <div class="metric-label">Avg Cost</div>
    <div class="metric-value">{format_value(row['Avg Total Cost'], 3, '$')}</div>
    </div>
    <div class="metric-item">
    <div class="metric-icon" style="color: #F5F6F7;">β‘</div>
    <div class="metric-label">Avg Duration</div>
    <div class="metric-value">{format_value(row['Avg Session Duration'], 1, '', 's')}</div>
    </div>
    <div class="metric-item">
    <div class="metric-icon" style="color: #F5F6F7;">π¬</div>
    <div class="metric-label">Avg Turns</div>
    <div class="metric-value">{format_value(row['Avg Turns'], 1)}</div>
    </div>
    </div>
    <div class="domains-section" style="margin-top: 24px;">
    <h3 class="domains-title">ποΈ Domain Performance</h3>
    <div class="domains-grid">
    """]

    # Add domain scores
    domains = [
        ('π¦', 'Banking'),
        ('π₯', 'Healthcare'),
        ('π‘οΈ', 'Insurance'),
        ('π°', 'Investment'),
        ('π±', 'Telecom')
    ]
    for domain_icon, domain_name in domains:
        ac_value = row.get(f'{domain_name} AC', '')
        if ac_value != '' and not pd.isna(ac_value):
            score_display = f"{float(ac_value):.3f}"
            score_color = "var(--accent-primary)"
        else:
            score_display = "N/A"
            score_color = "var(--text-muted)"
        card_parts.append(f"""
    <div class="domain-item">
    <div class="domain-name">{domain_icon}</div>
    <div style="font-size: 0.7rem; color: var(--text-secondary); margin-bottom: 2px;">{domain_name}</div>
    <div class="domain-score" style="color: {score_color};">{score_display}</div>
    </div>
    """)

    card_parts.append("""
    </div>
    </div>
    <div class="card-footer">
    <div class="card-url">
    <strong>https://galileo.ai/agent-leaderboard</strong>
    </div>
    </div>
    </div>
    """)
    return "".join(card_parts)
# MODEL PERFORMANCE CARD SECTION | |
gr.HTML(""" | |
<div class="dark-container" style="margin-top: 32px;"> | |
<div class="section-header"> | |
<span class="section-icon" style="color: var(--accent-primary);">π―</span> | |
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;"> | |
Model Performance Card | |
</h3> | |
</div> | |
<p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;"> | |
Comprehensive performance card for any model - perfect for presentations and reports | |
</p> | |
<div style="display: flex; gap: 24px; align-items: flex-start;"> | |
<!-- Controls Column --> | |
<div style="flex: 0 0 280px;"> | |
<div style="background: rgba(245, 246, 247, 0.03); border: 1px solid var(--border-subtle); | |
border-radius: 16px; padding: 20px; position: sticky; top: 20px;"> | |
""") | |
card_model_selector = gr.Dropdown( | |
choices=initial_df['Model'].tolist(), | |
value=initial_df['Model'].tolist()[0] if len(initial_df) > 0 else None, | |
label="π€ Select Model", | |
info="Choose a model to view its performance card", | |
elem_classes=["dropdown"] | |
) | |
download_card_btn = gr.Button( | |
"π₯ Download Card as PNG", | |
variant="secondary", | |
elem_classes=["download-button"], | |
elem_id="download-card-btn" | |
) | |
gr.HTML(""" | |
</div> | |
</div> | |
<!-- Card Display Column --> | |
<div style="flex: 1; min-width: 0;" id="card-display-container"> | |
""") | |
# Card display area - generate initial card | |
initial_model = initial_df['Model'].tolist()[0] if len(initial_df) > 0 else None | |
initial_card_html = generate_performance_card(initial_model) if initial_model else "" | |
card_display = gr.HTML(value=initial_card_html, elem_id="performance-card-html") | |
gr.HTML(""" | |
</div> | |
</div> | |
</div>""") | |
# Add custom CSS for the performance card | |
gr.HTML(""" | |
<style> | |
/* Performance Card Styles */ | |
.performance-card { | |
background: linear-gradient(145deg, rgba(1, 9, 26, 0.98) 0%, rgba(227, 84, 84, 0.05) 100%); | |
border: 2px solid var(--accent-primary); | |
border-radius: 24px; | |
padding: 32px; | |
max-width: 700px; | |
margin: 0 auto; | |
position: relative; | |
overflow: hidden; | |
box-shadow: | |
0 20px 40px rgba(0, 0, 0, 0.5), | |
0 0 80px rgba(227, 84, 84, 0.2), | |
inset 0 0 120px rgba(227, 84, 84, 0.05); | |
} | |
.performance-card::before { | |
content: ''; | |
position: absolute; | |
top: -50%; | |
left: -50%; | |
width: 200%; | |
height: 200%; | |
background: radial-gradient(circle at center, var(--glow-primary) 0%, transparent 70%); | |
opacity: 0.1; | |
animation: pulse 4s ease-in-out infinite; | |
} | |
.card-header { | |
text-align: center; | |
margin-bottom: 24px; | |
position: relative; | |
z-index: 1; | |
} | |
.card-badges { | |
display: flex; | |
justify-content: center; | |
gap: 12px; | |
margin-bottom: 16px; | |
} | |
.card-model-name { | |
font-size: 2rem; | |
font-weight: 800; | |
background: linear-gradient(135deg, var(--accent-primary) 0%, var(--accent-secondary) 100%); | |
-webkit-background-clip: text; | |
-webkit-text-fill-color: transparent; | |
margin-bottom: 8px; | |
text-shadow: 0 0 40px var(--glow-primary); | |
line-height: 1.2; | |
} | |
.card-stars { | |
font-size: 1.2rem; | |
margin: 8px 0; | |
display: flex; | |
justify-content: center; | |
align-items: center; | |
gap: 2px; | |
} | |
.card-vendor { | |
font-size: 1.2rem; | |
color: var(--text-secondary); | |
font-weight: 500; | |
margin-top: 4px; | |
} | |
.metrics-grid { | |
display: grid; | |
grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); | |
gap: 16px; | |
margin-bottom: 24px; | |
position: relative; | |
z-index: 1; | |
} | |
.metric-item { | |
background: rgba(245, 246, 247, 0.05); | |
border: 1px solid var(--border-subtle); | |
border-radius: 16px; | |
padding: 16px; | |
text-align: center; | |
transition: all 0.3s ease; | |
} | |
.metric-item:hover { | |
transform: translateY(-4px); | |
border-color: var(--accent-primary); | |
box-shadow: 0 8px 24px rgba(227, 84, 84, 0.2); | |
} | |
.metric-icon { | |
font-size: 1.5rem; | |
margin-bottom: 6px; | |
filter: drop-shadow(0 0 20px currentColor); | |
} | |
.metric-label { | |
font-size: 0.75rem; | |
color: var(--text-secondary); | |
text-transform: uppercase; | |
letter-spacing: 0.05em; | |
margin-bottom: 4px; | |
} | |
.metric-value { | |
font-size: 1.4rem; | |
font-weight: 700; | |
color: var(--text-primary); | |
font-family: 'Geist Mono', monospace; | |
} | |
.domains-section { | |
margin-top: 32px; | |
position: relative; | |
z-index: 1; | |
} | |
.domains-title { | |
font-size: 1.1rem; | |
font-weight: 600; | |
color: var(--text-primary); | |
margin-bottom: 16px; | |
text-align: center; | |
} | |
.domains-grid { | |
display: grid; | |
grid-template-columns: repeat(5, 1fr); | |
gap: 12px; | |
} | |
.domain-item { | |
background: rgba(245, 246, 247, 0.05); | |
border: 1px solid var(--border-subtle); | |
border-radius: 12px; | |
padding: 12px; | |
text-align: center; | |
} | |
.domain-name { | |
font-size: 1.4rem; | |
margin-bottom: 4px; | |
} | |
.domain-score { | |
font-size: 1rem; | |
font-weight: 600; | |
color: var(--accent-primary); | |
} | |
.card-footer { | |
text-align: center; | |
margin-top: 24px; | |
padding-top: 20px; | |
border-top: 1px solid var(--border-subtle); | |
position: relative; | |
z-index: 1; | |
} | |
.card-badge { | |
display: inline-flex; | |
align-items: center; | |
gap: 8px; | |
padding: 8px 16px; | |
background: rgba(245, 246, 247, 0.05); | |
border: 1px solid var(--border-subtle); | |
border-radius: 20px; | |
font-size: 0.9rem; | |
color: var(--text-secondary); | |
margin: 0 4px; | |
} | |
.card-url { | |
margin-top: 12px; | |
font-size: 0.75rem; | |
color: var(--text-muted); | |
font-family: 'Geist Mono', monospace; | |
} | |
.primary-button { | |
background: linear-gradient(135deg, var(--accent-primary) 0%, #B94545 100%) !important; | |
color: white !important; | |
border: none !important; | |
padding: 10px 20px !important; | |
font-weight: 600 !important; | |
transition: all 0.3s ease !important; | |
} | |
.primary-button:hover { | |
transform: translateY(-2px) !important; | |
box-shadow: 0 8px 24px rgba(227, 84, 84, 0.4) !important; | |
} | |
/* Download button styling */ | |
.download-button { | |
background: linear-gradient(135deg, var(--accent-secondary) 0%, #0A6BC4 100%) !important; | |
color: white !important; | |
border: none !important; | |
padding: 10px 20px !important; | |
font-weight: 600 !important; | |
transition: all 0.3s ease !important; | |
} | |
.download-button:hover { | |
transform: translateY(-2px) !important; | |
box-shadow: 0 8px 24px rgba(16, 152, 247, 0.4) !important; | |
} | |
/* Responsive layout for performance card section */ | |
@media (max-width: 1200px) { | |
.performance-card { | |
padding: 24px !important; | |
} | |
.card-model-name { | |
font-size: 1.7rem !important; | |
} | |
.metric-value { | |
font-size: 1.2rem !important; | |
} | |
} | |
@media (max-width: 900px) { | |
/* Stack the controls above the card on smaller screens */ | |
#card-display-container { | |
margin-top: 20px; | |
} | |
.performance-card { | |
padding: 20px !important; | |
} | |
.card-model-name { | |
font-size: 1.5rem !important; | |
} | |
.metric-value { | |
font-size: 1.1rem !important; | |
} | |
.domains-grid { | |
grid-template-columns: repeat(3, 1fr) !important; | |
} | |
} | |
/* Button states */ | |
.download-button:disabled { | |
opacity: 0.6 !important; | |
cursor: not-allowed !important; | |
} | |
</style> | |
<!-- Include html2canvas library --> | |
<script src="https://cdn.jsdelivr.net/npm/html2canvas@1.4.1/dist/html2canvas.min.js"></script> | |
""") | |
# Wire up the card generator to selection change | |
card_model_selector.change( | |
fn=generate_performance_card, | |
inputs=[card_model_selector], | |
outputs=[card_display] | |
) | |
# Wire up download button with improved functionality | |
download_card_btn.click( | |
fn=None, | |
js=""" | |
() => { | |
// Wait a bit to ensure the card is fully rendered | |
setTimeout(() => { | |
const card = document.querySelector('.performance-card'); | |
if (!card) { | |
alert('Performance card not found. Please select a model first.'); | |
return; | |
} | |
// Check if html2canvas is loaded | |
if (typeof html2canvas === 'undefined') { | |
// Try to load html2canvas dynamically | |
const script = document.createElement('script'); | |
script.src = 'https://cdn.jsdelivr.net/npm/html2canvas@1.4.1/dist/html2canvas.min.js'; | |
script.onload = () => { | |
captureCard(); | |
}; | |
script.onerror = () => { | |
alert('Failed to load html2canvas library. Please try again.'); | |
}; | |
document.head.appendChild(script); | |
} else { | |
captureCard(); | |
} | |
function captureCard() { | |
// Show loading indicator | |
const btn = document.getElementById('download-card-btn'); | |
const originalText = btn.textContent; | |
btn.textContent = 'Generating...'; | |
btn.disabled = true; | |
html2canvas(card, { | |
backgroundColor: '#01091A', | |
scale: 2, | |
logging: false, | |
useCORS: true, | |
allowTaint: true | |
}).then(canvas => { | |
// Create download link | |
const link = document.createElement('a'); | |
const modelName = card.querySelector('.card-model-name')?.textContent || 'model'; | |
const timestamp = new Date().toISOString().slice(0,10); | |
const fileName = `${modelName.replace(/[^a-z0-9]/gi, '-').toLowerCase()}-performance-${timestamp}.png`; | |
link.download = fileName; | |
link.href = canvas.toDataURL('image/png'); | |
document.body.appendChild(link); | |
link.click(); | |
document.body.removeChild(link); | |
// Restore button | |
btn.textContent = originalText; | |
btn.disabled = false; | |
}).catch(error => { | |
console.error('Error capturing card:', error); | |
alert('Failed to capture performance card. Please try again.'); | |
btn.textContent = originalText; | |
btn.disabled = false; | |
}); | |
} | |
}, 100); | |
} | |
""" | |
) | |
# Also update card when filters change to keep model selector in sync.
# The handler is defined once, before the loop, instead of being re-created
# for every filter component.
def update_dropdown_and_card(*args):
    """Refresh the card's model dropdown from the current filter values.

    Receives the values of ``filter_inputs`` positionally; only the first
    three (domain, model type, reasoning) influence the available models.
    Returns a ``gr.Dropdown`` update whose choices are the filtered models
    and whose value is the first of them (``None`` when nothing matches).
    """
    domain_value, model_type_value, reasoning_value = args[:3]
    filtered_df = apply_filters(
        load_leaderboard_data(), domain_value, model_type_value, reasoning_value
    )
    choices = filtered_df['Model'].tolist()
    # Select first model from filtered list
    value = choices[0] if choices else None
    return gr.Dropdown(choices=choices, value=value)

for input_component in filter_inputs:
    input_component.change(
        fn=update_dropdown_and_card,
        inputs=filter_inputs,
        outputs=[card_model_selector]
    )
return leaderboard_table | |
def create_leaderboard_v2_interface():
    """Build the full leaderboard v2 interface.

    Thin entry point that delegates to ``create_leaderboard_v2_tab`` and
    hands back whatever that builder returns.
    """
    interface = create_leaderboard_v2_tab()
    return interface
def _radar_values_for_row(model_row, domain_columns):
    """Return one float per domain column, using 0.0 for unusable cells.

    A cell is unusable when the column is absent, the value is empty/NaN, or
    the value cannot be parsed as a number.
    """
    values = []
    for col in domain_columns:
        if col not in model_row.index:
            values.append(0.0)
            continue
        number = pd.to_numeric(model_row[col], errors='coerce')
        values.append(0.0 if pd.isna(number) else float(number))
    return values


def create_domain_radar_chart(df, metric_type, selected_models=None, max_models=5):
    """Create a radar chart of per-domain values for the selected metric.

    Args:
        df: Leaderboard DataFrame with a 'Model' column and per-domain
            columns named '<Domain> <suffix>' (e.g. 'Banking AC').
        metric_type: Metric name; translated through ``SORT_COLUMN_MAP`` and
            used unchanged when it has no entry there.
        selected_models: Model names to plot. When empty or None, the top
            ``max_models`` rows by the metric column are used (or the first
            ``max_models`` rows if that column is missing).
        max_models: Upper bound on the number of plotted models.

    Returns:
        A plotly Figure. When the metric has no domain breakdown, the figure
        returned by ``create_empty_radar_chart`` is returned instead.
    """
    # Map the metric_type to actual column name using shared mapping
    actual_metric_type = SORT_COLUMN_MAP.get(metric_type, metric_type)

    if selected_models is None or len(selected_models) == 0:
        # Default to the top models by the selected metric if available
        if actual_metric_type in df.columns:
            selected_models = df.nlargest(max_models, actual_metric_type)['Model'].tolist()
        else:
            selected_models = df.head(max_models)['Model'].tolist()

    # Limit to max_models for readability
    selected_models = selected_models[:max_models]

    # Per-domain columns are '<Domain> <suffix>', keyed by the average column
    domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom']
    column_suffix_by_metric = {
        'Avg AC': 'AC',
        'Avg TSQ': 'TSQ',
        'Avg Total Cost': 'Cost',
        'Avg Session Duration': 'Duration',
        'Avg Turns': 'Turns',
    }

    # Metrics outside the mapping have no domain breakdown to draw
    if actual_metric_type not in column_suffix_by_metric:
        return create_empty_radar_chart(f"Domain breakdown not available for {metric_type}")

    suffix = column_suffix_by_metric[actual_metric_type]
    domain_columns = [f"{domain} {suffix}" for domain in domains]

    fig = go.Figure()

    # Galileo dark theme color scheme
    galileo_dark_colors = [
        {'fill': 'rgba(227, 84, 84, 0.25)', 'line': '#E35454', 'name': 'Vanguard'},   # Vanguard Red
        {'fill': 'rgba(16, 152, 247, 0.15)', 'line': '#1098F7', 'name': 'Airglow'},   # Airglow Blue
        {'fill': 'rgba(245, 246, 247, 0.15)', 'line': '#F5F6F7', 'name': 'Mercury'},  # Light Mercury
        {'fill': 'rgba(227, 84, 84, 0.35)', 'line': '#B94545', 'name': 'Deep Red'},   # Darker Vanguard
        {'fill': 'rgba(16, 152, 247, 0.25)', 'line': '#0A6BC4', 'name': 'Deep Blue'}  # Darker Airglow
    ]

    # Every plotted value is remembered so the axis range can be derived
    # without scanning the DataFrame a second time.
    plotted_values = []

    for idx, model_name in enumerate(selected_models):
        model_data = df[df['Model'] == model_name]
        if model_data.empty:
            continue

        values = _radar_values_for_row(model_data.iloc[0], domain_columns)
        plotted_values.extend(values)

        # Close the radar polygon by repeating the first point
        values_plot = values + [values[0]]
        domains_plot = domains + [domains[0]]

        colors = galileo_dark_colors[idx % len(galileo_dark_colors)]

        fig.add_trace(
            go.Scatterpolar(
                r=values_plot,
                theta=domains_plot,
                fill='toself',
                fillcolor=colors['fill'],
                line=dict(
                    color=colors['line'],
                    width=3,
                    shape='spline',
                    smoothing=0.8
                ),
                marker=dict(
                    size=10,
                    color=colors['line'],
                    symbol='circle',
                    line=dict(width=2, color='#01091A')
                ),
                name=model_name,
                mode="lines+markers",
                hovertemplate="<b>%{fullData.name}</b><br>" +
                              "<span style='color: #94A3B8'>%{theta}</span><br>" +
                              "<b style='font-size: 14px; color: #F5F6F7'>%{r:.3f}</b><br>" +
                              "<extra></extra>",
                hoverlabel=dict(
                    bgcolor="rgba(1, 9, 26, 0.95)",
                    bordercolor=colors['line'],
                    font=dict(color="#F5F6F7", size=12, family="'Geist', sans-serif")
                )
            )
        )

    # AC and TSQ use a fixed 0-1 axis; the other metrics scale to the data
    if actual_metric_type in ('Avg AC', 'Avg TSQ'):
        max_range = 1.0
    else:
        peak = max(plotted_values, default=0.0)
        # Fall back to 1.0 when nothing positive was plotted, so the radial
        # axis never collapses to a zero-width [0, 0] range.
        max_range = peak * 1.1 if peak > 0 else 1.0

    # Create custom tick values for better readability
    tick_vals = [i * max_range / 5 for i in range(6)]
    tick_text = [f"{val:.2f}" for val in tick_vals]

    fig.update_layout(
        polar=dict(
            bgcolor='rgba(245, 246, 247, 0.03)',
            radialaxis=dict(
                visible=True,
                range=[0, max_range],
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.1)',
                gridwidth=1,
                tickvals=tick_vals,
                ticktext=tick_text,
                tickfont=dict(
                    size=11,
                    color='#94A3B8',
                    family="'Geist Mono', monospace"
                ),
                tickangle=0
            ),
            angularaxis=dict(
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.08)',
                tickfont=dict(
                    size=14,
                    family="'Geist', sans-serif",
                    color='#F5F6F7',
                    weight=600
                ),
                rotation=90,
                direction="clockwise",
            ),
        ),
        showlegend=True,
        legend=dict(
            orientation="v",
            yanchor="middle",
            y=0.5,
            xanchor="left",
            x=1.05,
            font=dict(
                size=12,
                family="'Geist', sans-serif",
                color='#F5F6F7'
            ),
            bgcolor='rgba(1, 9, 26, 0.8)',
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1,
            itemsizing='constant',
            itemwidth=30
        ),
        title=dict(
            text=f"<b>Domain Performance: {metric_type}</b>",
            x=0.5,
            y=0.97,
            font=dict(
                size=22,
                family="'Geist', sans-serif",
                color="#F5F6F7",
                weight=700
            ),
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=900,
        width=1450,
        margin=dict(t=100, b=80, l=80, r=200),
        annotations=[
            dict(
                text="Galileo Agent Leaderboard",
                xref="paper", yref="paper",
                x=0.98, y=0.02,
                xanchor='right', yanchor='bottom',
                font=dict(size=10, color='#64748B'),
                showarrow=False
            )
        ]
    )

    return fig
def create_empty_radar_chart(message):
    """Create a placeholder radar figure that shows ``message`` in its center.

    Args:
        message: Text displayed in a bordered box in the middle of the figure.

    Returns:
        A plotly Figure with the dark theme, the generic chart title, the
        centered message and the bottom-right watermark.
    """
    fig = go.Figure()

    fig.update_layout(
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=900,
        width=1450,
        margin=dict(t=100, b=80, l=80, r=200),
        title=dict(
            text="<b>Domain Performance Chart</b>",
            x=0.5,
            y=0.97,
            font=dict(
                size=22,
                family="'Geist', sans-serif",
                color="#F5F6F7",
                weight=700
            ),
        ),
    )

    # Both annotations are appended with add_annotation. Passing
    # annotations=[...] to update_layout after add_annotation would update the
    # existing annotation in place (replacing the message text with the
    # watermark) instead of adding a second one.
    fig.add_annotation(
        text=f"π {message}",
        xref="paper", yref="paper",
        x=0.5, y=0.5,
        xanchor='center', yanchor='middle',
        font=dict(
            size=18,
            color="#94A3B8",
            family="'Geist', sans-serif"
        ),
        showarrow=False,
        bgcolor="rgba(245, 246, 247, 0.05)",
        bordercolor="rgba(245, 246, 247, 0.2)",
        borderwidth=1,
        borderpad=20
    )

    # Watermark in the bottom-right corner
    fig.add_annotation(
        text="Galileo Agent Leaderboard",
        xref="paper", yref="paper",
        x=0.98, y=0.02,
        xanchor='right', yanchor='bottom',
        font=dict(size=10, color='#64748B'),
        showarrow=False
    )

    return fig
# NEW VISUALIZATION FUNCTIONS | |
def create_cost_performance_scatter(df, metric="Avg AC"):
    """Create a scatter plot of a performance metric (x) against session cost (y).

    One trace is drawn per distinct 'Model Type'. Marker size is three times
    'Avg Turns'; dashed lines mark the medians of both axes.

    Args:
        df: Leaderboard DataFrame with 'Model', 'Model Type', 'Avg Total Cost',
            'Avg Turns' and the ``metric`` column.
        metric: Column plotted on the x axis. "Avg AC" is titled
            "Action Completion"; any other value is titled
            "Tool Selection Quality".

    Returns:
        A plotly Figure, or the figure from ``create_empty_chart`` when no row
        has both a cost and a metric value.
    """
    # Filter out models without cost or performance data
    df_filtered = df[(df['Avg Total Cost'] != '') & (df[metric] != '')].copy()

    if df_filtered.empty:
        return create_empty_chart("No data available for cost-performance analysis")

    # Convert to numeric; unparseable cells become NaN
    for col in ('Avg Total Cost', metric, 'Avg Turns'):
        df_filtered[col] = pd.to_numeric(df_filtered[col], errors='coerce')

    # Color per model type; unknown types fall back to light grey below.
    color_map = {
        'Proprietary': '#1098F7',  # Airglow Blue for Proprietary
        # NOTE(review): the file's shared palette keys this color as "Private";
        # accept both spellings — confirm which one the data actually uses.
        'Private': '#1098F7',
        'Open source': '#58BC82'   # Green for Open source
    }

    fig = go.Figure()

    # Add scatter points
    for model_type in df_filtered['Model Type'].unique():
        df_type = df_filtered[df_filtered['Model Type'] == model_type]

        fig.add_trace(go.Scatter(
            x=df_type[metric],
            y=df_type['Avg Total Cost'],
            mode='markers+text',
            name=model_type,
            text=df_type['Model'],
            textposition="top center",
            textfont=dict(size=10, color='#94A3B8'),
            # Raw turn counts travel as customdata so the hover shows the real
            # value rather than the scaled marker size.
            customdata=df_type[['Avg Turns']].to_numpy(),
            marker=dict(
                size=df_type['Avg Turns'] * 3,  # Size based on number of turns
                color=color_map.get(model_type, '#F5F6F7'),
                opacity=0.8,
                line=dict(width=2, color='#01091A')
            ),
            hovertemplate="<b>%{text}</b><br>" +
                          f"{metric}: %{{x:.3f}}<br>" +
                          "Cost: $%{y:.3f}<br>" +
                          "Turns: %{customdata[0]:.1f}<br>" +
                          "<extra></extra>"
        ))

    # Add quadrant lines
    median_x = df_filtered[metric].median()
    median_y = df_filtered['Avg Total Cost'].median()

    fig.add_hline(y=median_y, line_dash="dash", line_color="#64748B", opacity=0.5)
    fig.add_vline(x=median_x, line_dash="dash", line_color="#64748B", opacity=0.5)

    # Add quadrant labels
    fig.add_annotation(x=0.95, y=0.05, text="π High Performance<br>Low Cost",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="#F5F6F7"), bgcolor="rgba(245, 246, 247, 0.1)")
    fig.add_annotation(x=0.05, y=0.95, text="β οΈ Low Performance<br>High Cost",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="#E35454"), bgcolor="rgba(227, 84, 84, 0.1)")

    metric_display = "Action Completion" if metric == "Avg AC" else "Tool Selection Quality"

    fig.update_layout(
        title=dict(
            text=f"<b>Cost-Performance Efficiency: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="#F5F6F7", weight=700)
        ),
        xaxis=dict(
            title=dict(
                text=f"<b>{metric_display}</b>",
                font=dict(size=16, color="#F5F6F7")
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)"
        ),
        yaxis=dict(
            title=dict(
                text="<b>Average Session Cost ($)</b>",
                font=dict(size=16, color="#F5F6F7")
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)"
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=900,
        width=1450,
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
            font=dict(size=12, family="'Geist', sans-serif", color='#F5F6F7'),
            bgcolor='rgba(1, 9, 26, 0.8)',
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1
        ),
        margin=dict(t=100, b=80, l=80, r=80)
    )

    return fig
def create_speed_accuracy_plot(df, metric="Avg AC"):
    """Plot session duration (y) against a performance metric (x).

    Every model is one labelled marker whose color encodes 'Avg Total Cost'.
    Dashed lines mark the medians of both axes. Returns the figure from
    ``create_empty_chart`` when no row has both a duration and a metric value.
    """
    duration_col = 'Avg Session Duration'
    cost_col = 'Avg Total Cost'

    # Keep only rows that carry both a duration and a metric value
    has_values = (df[duration_col] != '') & (df[metric] != '')
    df_filtered = df[has_values].copy()
    if df_filtered.empty:
        return create_empty_chart("No data available for speed-accuracy analysis")

    # Coerce the plotted columns to numbers (cost drives the color scale)
    for column in (duration_col, metric, cost_col):
        df_filtered[column] = pd.to_numeric(df_filtered[column], errors='coerce')

    cost_colorbar = {
        'title': {'text': "Cost ($)", 'font': {'color': "#F5F6F7"}},
        'tickfont': {'color': "#94A3B8"},
        'bgcolor': "rgba(1, 9, 26, 0.8)",
        'bordercolor': "rgba(245, 246, 247, 0.2)",
        'borderwidth': 1,
        'x': 1.02,
    }
    marker_style = {
        'size': 12,
        'color': df_filtered[cost_col],
        'colorscale': [[0, '#01091A'], [0.5, '#1098F7'], [1, '#E35454']],
        'showscale': True,
        'colorbar': cost_colorbar,
        'line': {'width': 2, 'color': '#01091A'},
    }
    hover_lines = [
        "<b>%{text}</b><br>",
        f"{metric}: %{{x:.3f}}<br>",
        "Duration: %{y:.1f}s<br>",
        "Cost: $%{marker.color:.3f}<br>",
        "<extra></extra>",
    ]

    fig = go.Figure()
    points = go.Scatter(
        x=df_filtered[metric],
        y=df_filtered[duration_col],
        mode='markers+text',
        text=df_filtered['Model'],
        textposition="top center",
        textfont={'size': 9, 'color': '#94A3B8'},
        marker=marker_style,
        hovertemplate="".join(hover_lines),
    )
    fig.add_trace(points)

    # Median guides split the plot into quadrants
    fig.add_hline(
        y=df_filtered[duration_col].median(),
        line_dash="dash", line_color="#64748B", opacity=0.5,
    )
    fig.add_vline(
        x=df_filtered[metric].median(),
        line_dash="dash", line_color="#64748B", opacity=0.5,
    )

    # Corner labels for the best and worst quadrants
    corner_labels = (
        (0.95, 0.05, "β‘ Fast & Accurate", "#F5F6F7"),
        (0.05, 0.95, "π Slow & Inaccurate", "#E35454"),
    )
    for x_pos, y_pos, label, label_color in corner_labels:
        fig.add_annotation(
            x=x_pos, y=y_pos, text=label,
            showarrow=False, xref="paper", yref="paper",
            font={'size': 12, 'color': label_color, 'weight': 600},
        )

    if metric == "Avg AC":
        metric_display = "Action Completion"
    else:
        metric_display = "Tool Selection Quality"

    def axis_style(title_text):
        # Both axes share everything except their title text
        return {
            'title': {'text': title_text, 'font': {'size': 16, 'color': "#F5F6F7"}},
            'tickfont': {'size': 12, 'color': "#94A3B8"},
            'gridcolor': "rgba(245, 246, 247, 0.1)",
            'zerolinecolor': "rgba(245, 246, 247, 0.2)",
        }

    fig.update_layout(
        title={
            'text': f"<b>Speed vs Accuracy Trade-off: {metric_display}</b>",
            'x': 0.5,
            'y': 0.97,
            'font': {
                'size': 22,
                'family': "'Geist', sans-serif",
                'color': "#F5F6F7",
                'weight': 700,
            },
        },
        xaxis=axis_style(f"<b>{metric_display}</b>"),
        yaxis=axis_style("<b>Average Session Duration (seconds)</b>"),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=900,
        width=1450,
        margin={'t': 100, 'b': 80, 'l': 80, 'r': 120},
    )
    return fig
def create_performance_heatmap(df):
    """Create a heatmap of min-max normalized metrics (rows) per model (columns).

    Each metric is scaled to 0-1 across the models that have an 'Avg AC'
    value. Cost, duration and turns are inverted so that 1 is always the best
    score. A metric whose values are all identical (for example when only one
    model is present) has no spread to normalize; its cells are set to the
    neutral midpoint 0.5 instead of dividing by zero.

    Args:
        df: Leaderboard DataFrame with 'Model', 'Avg AC', 'Avg TSQ',
            'Avg Total Cost', 'Avg Session Duration' and 'Avg Turns'.

    Returns:
        A plotly Figure, or the figure from ``create_empty_chart`` when no row
        has an 'Avg AC' value.
    """
    # Select relevant columns
    metrics = ['Avg AC', 'Avg TSQ', 'Avg Total Cost', 'Avg Session Duration', 'Avg Turns']
    # For these metrics a lower raw value is better, so the scale is inverted
    lower_is_better = {'Avg Total Cost', 'Avg Session Duration', 'Avg Turns'}
    label_map = {
        'Avg AC': 'Action Completion',
        'Avg TSQ': 'Tool Selection',
        'Avg Total Cost': 'Cost Efficiency',
        'Avg Session Duration': 'Speed',
        'Avg Turns': 'Conversation Efficiency'
    }

    # Filter models with data
    df_filtered = df[df['Avg AC'] != ''].copy()

    if df_filtered.empty:
        return create_empty_chart("No data available for performance heatmap")

    normalized_data = []
    metric_labels = []

    for col in metrics:
        series = pd.to_numeric(df_filtered[col], errors='coerce')
        low = series.min()
        span = series.max() - low

        if span == 0:
            # No spread between models: keep NaN cells, mark the rest neutral
            normalized = series.where(series.isna(), 0.5)
        else:
            # An all-NaN column yields a NaN span and stays all-NaN here
            normalized = (series - low) / span
            if col in lower_is_better:
                normalized = 1 - normalized

        normalized_data.append(normalized.to_numpy())
        metric_labels.append(label_map.get(col, col))

    # Cells without a value get an empty label instead of the text "nan"
    cell_text = [
        ["" if pd.isna(val) else f"{val:.2f}" for val in row]
        for row in normalized_data
    ]

    # Create heatmap
    fig = go.Figure(data=go.Heatmap(
        z=normalized_data,
        x=df_filtered['Model'].tolist(),
        y=metric_labels,
        colorscale=[[0, '#01091A'], [0.5, '#1098F7'], [1, '#E35454']],
        hovertemplate="<b>%{x}</b><br>" +
                      "%{y}: %{z:.2f}<br>" +
                      "<extra></extra>",
        text=cell_text,
        texttemplate="%{text}",
        textfont={"size": 10, "color": "#F5F6F7"},
        showscale=True,
        colorbar=dict(
            title=dict(
                text="Performance<br>Score",
                font=dict(color="#F5F6F7")
            ),
            tickfont=dict(color="#94A3B8"),
            bgcolor="rgba(1, 9, 26, 0.8)",
            bordercolor="rgba(245, 246, 247, 0.2)",
            borderwidth=1
        )
    ))

    fig.update_layout(
        title=dict(
            text="<b>Comprehensive Performance Heatmap</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="#F5F6F7", weight=700)
        ),
        xaxis=dict(
            side="bottom",
            tickfont=dict(size=11, color="#94A3B8"),
            tickangle=-45
        ),
        yaxis=dict(
            tickfont=dict(size=13, color="#F5F6F7", weight=600)
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=700,
        width=1550,
        margin=dict(t=100, b=120, l=170, r=120)
    )

    return fig
def create_domain_specialization_matrix(df, metric_type="AC"):
    """Create a bubble chart of per-domain scores for every model.

    Bubble size is the domain score times 30. Bubble color is the
    "specialization": the domain score minus the model's own
    'Avg <metric_type>' value, so positive means the model does better in
    that domain than on average.

    Args:
        df: Leaderboard DataFrame with 'Model', 'Model Type',
            'Avg <metric_type>' and '<Domain> <metric_type>' columns.
        metric_type: Column suffix to plot. "AC" is titled
            "Action Completion"; any other value is titled
            "Tool Selection Quality".

    Returns:
        A plotly Figure, or the figure from ``create_empty_chart`` when no
        usable (model, domain) value exists.
    """
    domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom']
    avg_col = f'Avg {metric_type}'

    # Collect one record per (model, domain) pair that has a numeric score
    data = []
    for _, model in df.iterrows():
        if model['Model'] == '':
            continue

        model_avg = pd.to_numeric(model[avg_col], errors='coerce')
        if pd.isna(model_avg):
            continue

        for domain in domains:
            domain_col = f'{domain} {metric_type}'
            if domain_col not in model or model[domain_col] == '':
                continue

            domain_val = pd.to_numeric(model[domain_col], errors='coerce')
            if pd.isna(domain_val):
                continue

            data.append({
                'Model': model['Model'],
                'Domain': domain,
                'Performance': domain_val,
                # Specialization strength = deviation from the model average
                'Specialization': domain_val - model_avg,
                'Model Type': model['Model Type']
            })

    if not data:
        return create_empty_chart("No domain specialization data available")

    df_plot = pd.DataFrame(data)

    # Create bubble chart
    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=df_plot['Domain'],
        y=df_plot['Model'],
        mode='markers',
        marker=dict(
            size=df_plot['Performance'] * 30,  # Size based on absolute performance
            color=df_plot['Specialization'],
            colorscale=[[0, '#1098F7'], [0.5, '#F5F6F7'], [1, '#E35454']],
            # Center the diverging scale on zero so the neutral middle color
            # means "no deviation from the model average" even when the
            # deviations are not symmetric around zero.
            cmid=0,
            showscale=True,
            colorbar=dict(
                title=dict(
                    text="Specialization<br>Strength",
                    font=dict(color="#F5F6F7")
                ),
                tickfont=dict(color="#94A3B8"),
                bgcolor="rgba(1, 9, 26, 0.8)",
                bordercolor="rgba(245, 246, 247, 0.2)",
                borderwidth=1
            ),
            line=dict(width=2, color='#01091A'),
            opacity=0.8
        ),
        text=[f"Performance: {p:.3f}<br>Specialization: {s:+.3f}"
              for p, s in zip(df_plot['Performance'], df_plot['Specialization'])],
        hovertemplate="<b>%{y}</b><br>" +
                      "Domain: %{x}<br>" +
                      "%{text}<br>" +
                      "<extra></extra>"
    ))

    metric_display = "Action Completion" if metric_type == "AC" else "Tool Selection Quality"

    fig.update_layout(
        title=dict(
            text=f"<b>Domain Specialization Matrix: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="#F5F6F7", weight=700)
        ),
        xaxis=dict(
            title=dict(
                text="<b>Business Domains</b>",
                font=dict(size=16, color="#F5F6F7")
            ),
            tickfont=dict(size=13, color="#F5F6F7"),
            gridcolor="rgba(245, 246, 247, 0.1)"
        ),
        yaxis=dict(
            title=dict(
                text="<b>Models</b>",
                font=dict(size=16, color="#F5F6F7")
            ),
            tickfont=dict(size=11, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)"
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=1100,
        width=1450,
        margin=dict(t=100, b=80, l=220, r=120)
    )

    return fig
def create_performance_gap_analysis(df, metric_type="AC"):
    """Plot, per business domain, the spread of a metric across all models.

    For each domain that has a '<Domain> <metric_type>' column with at least
    one numeric value, the chart draws the min-max line, the Q1-Q3 segment,
    a diamond at the median and dots at both extremes. Domains are ordered by
    their max-min gap. Returns the figure from ``create_empty_chart`` when no
    domain has data.
    """
    # Summary statistics per domain
    stats_rows = []
    for domain in ('Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom'):
        column = f'{domain} {metric_type}'
        if column not in df.columns:
            continue
        scores = pd.to_numeric(df[column], errors='coerce').dropna()
        if len(scores) == 0:
            continue
        stats_rows.append({
            'Domain': domain,
            'Min': scores.min(),
            'Max': scores.max(),
            'Median': scores.median(),
            'Q1': scores.quantile(0.25),
            'Q3': scores.quantile(0.75),
            'Gap': scores.max() - scores.min(),
        })

    if not stats_rows:
        return create_empty_chart("No data available for gap analysis")

    df_gap = pd.DataFrame(stats_rows).sort_values('Gap', ascending=True)

    fig = go.Figure()

    # First pass: range line, IQR outline and median marker for each domain
    for _, stats in df_gap.iterrows():
        domain = stats['Domain']
        low, high = stats['Min'], stats['Max']
        q1, q3 = stats['Q1'], stats['Q3']

        full_range = go.Scatter(
            x=[low, high],
            y=[domain, domain],
            mode='lines',
            line={'color': '#64748B', 'width': 2},
            showlegend=False,
            hoverinfo='skip',
        )
        fig.add_trace(full_range)

        # Closed Q1 -> Q3 -> Q1 path drawn on the domain's own row
        iqr_outline = go.Scatter(
            x=[q1, q3, q3, q1, q1],
            y=[domain] * 5,
            fill='toself',
            fillcolor='rgba(227, 84, 84, 0.3)',
            line={'color': '#E35454', 'width': 2},
            showlegend=False,
            hoverinfo='skip',
            mode='lines',
        )
        fig.add_trace(iqr_outline)

        hover_lines = [
            f"<b>{domain}</b><br>",
            f"Min: {low:.3f}<br>",
            f"Q1: {q1:.3f}<br>",
            f"Median: {stats['Median']:.3f}<br>",
            f"Q3: {q3:.3f}<br>",
            f"Max: {high:.3f}<br>",
            f"Gap: {stats['Gap']:.3f}<br>",
            "<extra></extra>",
        ]
        median_marker = go.Scatter(
            x=[stats['Median']],
            y=[domain],
            mode='markers',
            marker={
                'size': 12,
                'color': '#E35454',
                'symbol': 'diamond',
                'line': {'width': 2, 'color': '#01091A'},
            },
            showlegend=False,
            hovertemplate="".join(hover_lines),
        )
        fig.add_trace(median_marker)

    # Second pass: end-point dots are added last so they come after every
    # other trace in the figure
    for _, stats in df_gap.iterrows():
        fig.add_trace(go.Scatter(
            x=[stats['Min'], stats['Max']],
            y=[stats['Domain']] * 2,
            mode='markers',
            marker={
                'size': 8,
                'color': '#F5F6F7',
                'line': {'width': 2, 'color': '#01091A'},
            },
            showlegend=False,
            hoverinfo='skip',
        ))

    if metric_type == "AC":
        metric_display = "Action Completion"
    else:
        metric_display = "Tool Selection Quality"

    # Score metrics get a fixed 0-1 axis; anything else is auto-ranged
    x_range = [0, 1] if metric_type in ['AC', 'TSQ'] else None

    fig.update_layout(
        title={
            'text': f"<b>Performance Gap Analysis by Domain: {metric_display}</b>",
            'x': 0.5,
            'y': 0.97,
            'font': {
                'size': 22,
                'family': "'Geist', sans-serif",
                'color': "#F5F6F7",
                'weight': 700,
            },
        },
        xaxis={
            'title': {
                'text': f"<b>{metric_display} Score</b>",
                'font': {'size': 16, 'color': "#F5F6F7"},
            },
            'tickfont': {'size': 12, 'color': "#94A3B8"},
            'gridcolor': "rgba(245, 246, 247, 0.1)",
            'range': x_range,
        },
        yaxis={
            'title': {
                'text': "<b>Business Domain</b>",
                'font': {'size': 16, 'color': "#F5F6F7"},
            },
            'tickfont': {'size': 13, 'color': "#F5F6F7"},
            'gridcolor': "rgba(245, 246, 247, 0.1)",
        },
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=800,
        width=1450,
        margin={'t': 100, 'b': 80, 'l': 140, 'r': 80},
        showlegend=False,
    )

    # Hand-written legend, since the traces themselves are hidden from it
    fig.add_annotation(
        text="β Median β IQR β Full Range",
        xref="paper", yref="paper",
        x=0.98, y=0.02,
        xanchor='right', yanchor='bottom',
        font={'size': 12, 'color': '#94A3B8'},
        showarrow=False,
    )

    return fig
def create_empty_chart(message): | |
"""Create an empty chart with a message""" | |
fig = go.Figure() | |
fig.add_annotation( | |
text=f"π {message}", | |
xref="paper", yref="paper", | |
x=0.5, y=0.5, | |
xanchor='center', yanchor='middle', | |
font=dict( | |
size=18, | |
color="#94A3B8", | |
family="'Geist', sans-serif" | |
), | |
showarrow=False, | |
bgcolor="rgba(245, 246, 247, 0.05)", | |
bordercolor="rgba(245, 246, 247, 0.2)", | |
borderwidth=1, | |
borderpad=20 | |
) | |
fig.update_layout( | |
paper_bgcolor="#01091A", | |
plot_bgcolor="rgba(245, 246, 247, 0.02)", | |
height=700, | |
width=1450, | |
margin=dict(t=80, b=80, l=80, r=80) | |
) |