agent-leaderboard / tabs /leaderboard_v2.py
Pratik Bhavsar
updated link
0aebfb6
import gradio as gr
import pandas as pd
import plotly.graph_objects as go
# Utility functions (moved from utils.py)
def get_chart_colors():
    """Return the colour palette shared by the leaderboard charts and badges.

    Returns:
        dict: A newly built mapping holding hex colours for the "Private" and
        "Open source" model types, a list of three "performance_bands" fills,
        the "text" and "background" colours, and a "grid" RGBA tuple.
    """
    palette = {}
    palette["Private"] = "#1098F7"  # Airglow Blue, used for proprietary models
    palette["Open source"] = "#58BC82"  # Green, used for open-source models
    palette["performance_bands"] = ["#DCFCE7", "#FEF9C3", "#FEE2E2"]
    palette["text"] = "#F5F6F7"
    palette["background"] = "#01091A"
    palette["grid"] = (0, 0, 0, 0.1)  # RGBA tuple for grid lines
    return palette
def get_rank_badge(rank):
    """Build the HTML badge shown in the leaderboard's rank column.

    Ranks 1, 2 and 3 are drawn as a gradient pill labelled "1st", "2nd" or
    "3rd"; every other rank is rendered as plain muted text.

    Args:
        rank: The 1-based position of the row.

    Returns:
        str: HTML markup for the badge.
    """
    # rank -> (label, background gradient, text colour)
    podium = {
        1: ("1st", "linear-gradient(145deg, #ffd700, #ffc400)", "#000"),
        2: ("2nd", "linear-gradient(145deg, #9ca3af, #787C7E)", "#fff"),
        3: ("3rd", "linear-gradient(145deg, #CD7F32, #b36a1d)", "#fff"),
    }
    medal = podium.get(rank)

    # Everything outside the top three gets the unstyled number.
    if medal is None:
        return f"""
<div style="
display: inline-flex;
align-items: center;
justify-content: center;
min-width: 28px;
color: #a1a1aa;
font-weight: 500;
">
{rank}
</div>
"""

    caption, fill, ink = medal
    return f"""
<div style="
display: inline-flex;
align-items: center;
justify-content: center;
min-width: 48px;
padding: 4px 12px;
background: {fill};
color: {ink};
border-radius: 6px;
font-weight: 600;
font-size: 0.9em;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
">
{caption}
</div>
"""
def get_type_badge(model_type):
    """Generate HTML for the model type badge.

    Args:
        model_type: Model type label as it appears in the data, for example
            "Private", "Proprietary" or "Open source".

    Returns:
        str: HTML markup for an inline badge whose background encodes the
        model type. Labels without a dedicated colour use an indigo fallback.
    """
    palette = get_chart_colors()
    badge_colors = {
        "Private": palette["Private"],
        # The model-type filter selects rows labelled "Proprietary", so that
        # label must share the proprietary colour instead of hitting the
        # fallback below.
        "Proprietary": palette["Private"],
        "Open source": palette["Open source"],
    }
    bg_color = badge_colors.get(model_type, "#4F46E5")
    return f"""
<div style="
display: inline-flex;
align-items: center;
padding: 4px 8px;
background: {bg_color};
color: white;
border-radius: 4px;
font-size: 0.85em;
font-weight: 500;
">
{model_type}
</div>
"""
def get_output_type_badge(output_type):
    """Build the HTML badge that labels a model's output type.

    Args:
        output_type: The label to display. "Reasoning" is drawn on purple;
            any other value is drawn on gray.

    Returns:
        str: HTML markup for the badge.
    """
    # Purple for reasoning, gray for normal.
    fill = "#9333ea" if output_type == "Reasoning" else "#6b7280"
    declarations = (
        "display: inline-flex;",
        "align-items: center;",
        "gap: 4px;",
        "padding: 4px 8px;",
        f"background: {fill};",
        "color: white;",
        "border-radius: 4px;",
        "font-size: 0.85em;",
        "font-weight: 500;",
    )
    style_attr = "\n".join(declarations)
    return f'\n<div style="\n{style_attr}\n">\n{output_type}\n</div>\n'
def get_score_bar(score):
    """Generate HTML for a score bar with gradient styling.

    Args:
        score: The score to display, expected on a 0..1 scale. Numbers and
            numeric strings are accepted.

    Returns:
        str: HTML markup with a horizontal bar filled in proportion to the
        score, followed by the score printed with three decimals. When the
        score is blank, None, NaN or not numeric, a muted "-" placeholder is
        returned instead of raising.
    """
    import math

    try:
        value = float(score)
    except (TypeError, ValueError):
        value = math.nan

    # Missing scores reach this function as '' or NaN; show a placeholder
    # rather than failing while formatting the number.
    if math.isnan(value):
        return '<span style="color: #a1a1aa;">-</span>'

    # Clamp so an out-of-range score cannot produce an invalid CSS width.
    width = min(max(value * 100, 0.0), 100.0)
    return f"""
<div style="display: flex; align-items: center; gap: 12px; width: 100%;">
<div style="
flex-grow: 1;
height: 8px;
background: rgba(245, 246, 247, 0.1);
border-radius: 4px;
overflow: hidden;
max-width: 200px;
">
<div style="
width: {width:.1f}%;
height: 100%;
background: linear-gradient(90deg, #E35454, #1098F7);
border-radius: 4px;
transition: width 0.3s ease;
"></div>
</div>
<span style="
font-family: 'SF Mono', monospace;
font-weight: 600;
color: #F5F6F7;
min-width: 60px;
">{value:.3f}</span>
</div>
"""
# Display names of sortable metrics paired with the DataFrame columns that
# hold them. Defined once at module level so every function shares the same
# mapping; names that are not listed are looked up as column names unchanged.
SORT_COLUMN_MAP = dict(
    (
        ("Avg Action Completion", "Avg AC"),
        ("Avg Tool Selection Quality", "Avg TSQ"),
        ("Avg Session Cost", "Avg Total Cost"),
    )
)
def create_leaderboard_v2_tab():
"""Create the main leaderboard v2 tab with interactive table"""
def load_leaderboard_data():
    """Read results_v2.csv and prepare it for display.

    The averaged metric columns that exist in the file are coerced to numbers
    (entries that cannot be parsed become missing) and rounded to three
    decimals. Afterwards every missing value in the frame is replaced with an
    empty string.

    Returns:
        pandas.DataFrame: The prepared leaderboard rows.
    """
    frame = pd.read_csv('results_v2.csv')

    metric_columns = ('Avg AC', 'Avg TSQ', 'Avg Total Cost', 'Avg Session Duration', 'Avg Turns')
    for name in (column for column in metric_columns if column in frame.columns):
        as_numbers = pd.to_numeric(frame[name], errors='coerce')
        frame[name] = as_numbers.round(3)

    # Blank out missing values; downstream code tests cells against ''.
    return frame.fillna('')
def generate_html_table(filtered_df, domain_filter):
    """Generate the styled HTML leaderboard table with rank badges and score bars.

    Args:
        filtered_df: DataFrame of leaderboard rows, already filtered and
            sorted; a row's rank is its 1-based position in this frame.
        domain_filter: "All" to show the overall "Avg ..." columns, or a
            domain name used as the prefix of the per-domain columns (for
            example "Banking" selects "Banking AC" and "Banking Cost").

    Returns:
        str: A <style> block followed by the complete table markup. Metric
        cells whose value is blank, NaN, non-numeric or whose column is
        absent are rendered as "-".
    """

    def _as_number(value):
        """Return value as a float, or None when it is blank, NaN or non-numeric."""
        try:
            number = float(value)
        except (TypeError, ValueError):
            return None
        # NaN is the only float that differs from itself.
        return None if number != number else number

    def _first_present(row, names):
        """Return the value of the first listed column that exists in row, else ''."""
        for name in names:
            if name in row:
                return row[name]
        return ''

    def _score_cell(value):
        """Render a score-bar cell, or a dash cell when the score is missing."""
        number = _as_number(value)
        if number is None:
            return '<td class="numeric-cell">-</td>'
        return f'<td class="score-cell">{get_score_bar(number)}</td>'

    def _numeric_cell(value, spec, prefix=''):
        """Render a formatted number cell; the prefix is omitted for missing values."""
        number = _as_number(value)
        text = '-' if number is None else f'{prefix}{number:{spec}}'
        return f'<td class="numeric-cell">{text}</td>'

    table_head = """
<style>
/* Dark theme table styling */
.v2-table-container {
background: var(--bg-card);
border-radius: 16px;
overflow: hidden;
border: 1px solid var(--border-subtle);
margin-top: 20px;
}
.v2-styled-table {
width: 100%;
border-collapse: collapse;
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif;
background: var(--bg-card);
color: var(--text-primary);
}
.v2-styled-table thead {
position: sticky;
top: 0;
background: rgba(227, 84, 84, 0.1);
z-index: 1;
}
.v2-styled-table th {
padding: 14px 12px;
text-align: left;
font-weight: 600;
color: var(--text-primary);
border-bottom: 2px solid var(--accent-primary);
font-size: 13px;
text-transform: uppercase;
letter-spacing: 0.05em;
}
.v2-styled-table td {
padding: 12px;
border-bottom: 1px solid var(--border-subtle);
color: var(--text-primary);
transition: all 0.2s ease;
}
.v2-styled-table tbody tr {
transition: all 0.3s ease;
}
.v2-styled-table tbody tr:hover {
background: rgba(227, 84, 84, 0.15) !important;
box-shadow: 0 0 20px rgba(227, 84, 84, 0.3), inset 0 0 20px rgba(227, 84, 84, 0.1);
transform: scale(1.01);
}
.v2-styled-table tbody tr:nth-child(even) {
background: var(--bg-secondary);
}
.model-name {
font-weight: 500;
color: var(--accent-primary);
transition: color 0.2s ease;
}
/* Keep model name color consistent on hover to emphasize row highlight */
.v2-styled-table tr:hover .model-name {
color: var(--accent-secondary);
}
.numeric-cell {
font-family: 'Geist Mono', monospace;
font-size: 13px;
text-align: center;
}
/* Score bar specific styling */
.score-cell {
min-width: 180px;
}
</style>
<div class="v2-table-container">
<table class="v2-styled-table">
<thead>
<tr>
<th style="width: 80px;">Rank</th>
<th>Model</th>
<th style="width: 120px;">Type</th>
<th style="width: 120px;">Output Type</th>
<th>Vendor</th>
<th style="width: 200px;">Avg Action Completion</th>
<th style="width: 200px;">Avg Tool Selection Quality</th>
<th>Avg Cost ($)</th>
<th>Avg Duration (s)</th>
<th>Avg Turns</th>
</tr>
</thead>
<tbody>
"""
    table_tail = """
</tbody>
</table>
</div>
"""

    # Column names depend only on the domain, so resolve them once rather
    # than per row. Each entry lists candidate columns in priority order.
    if domain_filter != "All":
        metric_columns = {
            'ac': (f'{domain_filter} AC',),
            'tsq': (f'{domain_filter} TSQ',),
            'cost': (f'{domain_filter} Cost',),
            'duration': (f'{domain_filter} Duration',),
            'turns': (f'{domain_filter} Turns',),
        }
    else:
        metric_columns = {
            'ac': ('Avg AC',),
            'tsq': ('Avg TSQ',),
            'cost': ('Avg Total Cost', 'Cost ($)'),
            'duration': ('Avg Session Duration', 'Duration (s)'),
            'turns': ('Avg Turns', 'Turns'),
        }

    # Collect the pieces and join once instead of growing a string in the loop.
    pieces = [table_head]
    for rank, (_, row) in enumerate(filtered_df.iterrows(), start=1):
        cells = [
            f'<td>{get_rank_badge(rank)}</td>',
            f'<td class="model-name">{row["Model"]}</td>',
            f'<td>{get_type_badge(row["Model Type"])}</td>',
            f'<td>{get_output_type_badge(row.get("Output Type", "Normal"))}</td>',
            f'<td>{row["Vendor"]}</td>',
            _score_cell(_first_present(row, metric_columns['ac'])),
            _score_cell(_first_present(row, metric_columns['tsq'])),
            _numeric_cell(_first_present(row, metric_columns['cost']), '.3f', prefix='$'),
            _numeric_cell(_first_present(row, metric_columns['duration']), '.1f'),
            _numeric_cell(_first_present(row, metric_columns['turns']), '.1f'),
        ]
        pieces.append('\n<tr>\n' + '\n'.join(cells) + '\n</tr>\n')
    pieces.append(table_tail)
    return ''.join(pieces)
def update_leaderboard_title(domain_filter):
    """Build the leaderboard title HTML for the selected domain.

    Args:
        domain_filter: Domain label, optionally preceded by an emoji (for
            example "🌐 All" or "Banking"). An empty or None value is
            treated as "All".

    Returns:
        str: Opening markup for the title section. The two container <div>
        elements it opens are intentionally left unclosed here.
    """
    # Identify the domain by the name at the end of the label rather than by
    # its emoji prefix, so the result does not depend on how the emoji
    # characters are encoded.
    known_domains = ("All", "Banking", "Healthcare", "Insurance", "Investment", "Telecom")
    label = (domain_filter or "All").strip()
    domain_filter_clean = next(
        (name for name in known_domains if label.endswith(name)),
        label,  # unrecognised labels are shown as given
    )
    return f"""
<div class="dark-container pulse" style="margin-bottom: 24px;">
<div class="section-header">
<span class="section-icon" style="color: var(--accent-primary);">📈</span>
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
Agent Leaderboard for {domain_filter_clean}
</h3>
</div>
<div class="dataframe-container">
"""
def filter_and_sort_data(domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order):
    """Filter and sort the leaderboard data, then render it as an HTML table.

    Args:
        domain_filter: Domain label, optionally preceded by an emoji (for
            example "🌐 All" or "Banking"). An empty or None value is
            treated as "All".
        model_type_filter: "Open Source" or "Proprietary" to restrict the
            rows by model type; any other value keeps every model type.
        reasoning_filter: "Reasoning" or "Normal" to restrict the rows by
            output type; any other value keeps every output type.
        sort_by: Display name (see SORT_COLUMN_MAP) or column name of the
            metric to sort by. With a specific domain selected, the overall
            metrics are replaced by that domain's columns.
        sort_order: "Ascending" for ascending order; any other value sorts
            descending.

    Returns:
        str: The HTML produced by generate_html_table for the selected rows.
    """
    filtered_df = load_leaderboard_data()

    # Identify the domain by the name at the end of the label rather than by
    # its emoji prefix, so the result does not depend on how the emoji
    # characters are encoded.
    known_domains = ("All", "Banking", "Healthcare", "Insurance", "Investment", "Telecom")
    label = (domain_filter or "All").strip()
    domain_filter_clean = next(
        (name for name in known_domains if label.endswith(name)),
        label,
    )

    # Domain filtering: only show models that have data for this domain.
    if domain_filter_clean != "All":
        domain_col = f"{domain_filter_clean} AC"
        if domain_col in filtered_df.columns:
            filtered_df = filtered_df[filtered_df[domain_col] != '']

    # Model type filtering. Proprietary models are accepted under either
    # label: "Proprietary", or "Private" as used by the colour palette.
    model_type_values = {
        "Open Source": ("Open source",),
        "Proprietary": ("Proprietary", "Private"),
    }
    accepted_types = model_type_values.get(model_type_filter)
    if accepted_types:
        filtered_df = filtered_df[filtered_df['Model Type'].isin(accepted_types)]

    # Reasoning filtering
    if reasoning_filter in ("Reasoning", "Normal"):
        filtered_df = filtered_df[filtered_df['Output Type'] == reasoning_filter]

    # Map display name to actual column name using shared mapping
    actual_sort_column = SORT_COLUMN_MAP.get(sort_by, sort_by)

    # With a domain selected, sort by that domain's column for the metric.
    domain_suffix = {
        "Avg AC": "AC",
        "Avg TSQ": "TSQ",
        "Avg Total Cost": "Cost",
        "Avg Session Duration": "Duration",
        "Avg Turns": "Turns",
    }
    if domain_filter_clean != "All" and actual_sort_column in domain_suffix:
        actual_sort_column = f"{domain_filter_clean} {domain_suffix[actual_sort_column]}"

    if actual_sort_column and actual_sort_column in filtered_df.columns:

        def _sort_key(series):
            """Compare values numerically whenever the column holds any numbers."""
            numeric = pd.to_numeric(series, errors='coerce')
            return numeric if numeric.notna().any() else series

        # Missing values are stored as '' and cannot be ordered against
        # floats. Sorting on the numeric view turns them into NaN, which
        # na_position then places last.
        filtered_df = filtered_df.sort_values(
            by=actual_sort_column,
            ascending=(sort_order == "Ascending"),
            na_position='last',
            key=_sort_key,
        )

    # Generate HTML table
    return generate_html_table(filtered_df, domain_filter_clean)
# Load initial data
initial_table = filter_and_sort_data("🌐 All", "All", "All", "Avg AC", "Descending")
initial_df = load_leaderboard_data() # Load raw data for model selector
# Custom CSS for Galileo dark theme
custom_css = """
<style>
/* Import Geist fonts */
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
@font-face {
font-family: 'Geist';
src: url('https://raw.githubusercontent.com/vercel/geist-font/main/packages/next/dist/fonts/geist-sans/Geist-Variable.woff2') format('woff2');
font-weight: 100 900;
font-style: normal;
}
@font-face {
font-family: 'Geist Mono';
src: url('https://raw.githubusercontent.com/vercel/geist-font/main/packages/next/dist/fonts/geist-mono/GeistMono-Variable.woff2') format('woff2');
font-weight: 100 900;
font-style: normal;
}
/* Root variables for enhanced color scheme */
:root {
--bg-primary: #01091A;
--bg-secondary: rgba(245, 246, 247, 0.03);
--bg-card: rgba(245, 246, 247, 0.02);
--border-subtle: rgba(245, 246, 247, 0.08);
--border-default: rgba(245, 246, 247, 0.12);
--border-strong: rgba(245, 246, 247, 0.2);
--text-primary: #F5F6F7;
--text-secondary: #94A3B8;
--text-muted: #64748B;
--accent-primary: #E35454;
--accent-secondary: #1098F7;
--accent-tertiary: #F5F6F7;
--glow-primary: rgba(227, 84, 84, 0.4);
--glow-secondary: rgba(16, 152, 247, 0.4);
--glow-tertiary: rgba(245, 246, 247, 0.3);
}
/* Global font and background */
.gradio-container {
font-family: 'Geist', -apple-system, BlinkMacSystemFont, 'Inter', sans-serif !important;
background: var(--bg-primary) !important;
color: var(--text-primary) !important;
}
/* Headers and text */
h1, h2, h3, h4 {
color: var(--text-primary) !important;
font-weight: 700 !important;
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
}
p, span, div {
color: var(--text-primary) !important;
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
}
/* Labels and info text */
label {
color: var(--text-primary) !important;
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
}
.gr-box label {
color: var(--text-primary) !important;
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
}
.gr-info {
color: var(--text-secondary) !important;
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
}
/* Simple metric cards */
.metric-card {
background: var(--bg-card);
border-radius: 16px;
padding: 24px;
position: relative;
border: 1px solid var(--border-subtle);
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
}
.metric-card:hover {
transform: translateY(-4px);
border-color: var(--accent-primary);
box-shadow: 0 8px 24px rgba(227, 84, 84, 0.2);
}
/* Metric icon with glow effect */
.metric-icon {
width: 48px;
height: 48px;
display: flex;
align-items: center;
justify-content: center;
font-size: 2rem;
margin-bottom: 16px;
filter: drop-shadow(0 0 20px currentColor);
transition: all 0.3s ease;
}
.metric-card:hover .metric-icon {
transform: scale(1.1);
filter: drop-shadow(0 0 30px currentColor);
}
/* Metric values and labels */
.metric-card .metric-label {
font-family: 'Geist Mono', monospace !important;
letter-spacing: 0.1em !important;
color: var(--text-secondary) !important;
font-size: 0.875rem !important;
text-transform: uppercase !important;
margin-bottom: 8px !important;
}
.metric-card .metric-value {
font-family: 'Geist', sans-serif !important;
font-weight: 700 !important;
font-size: 1.25rem !important;
color: var(--text-primary) !important;
margin-bottom: 8px !important;
}
.metric-card .metric-description {
color: var(--text-secondary) !important;
font-size: 0.875rem !important;
line-height: 1.5 !important;
}
/* Enhanced radio buttons with primary accent */
input[type="radio"] {
background-color: var(--bg-secondary) !important;
border-color: var(--border-default) !important;
}
input[type="radio"]:checked {
background-color: var(--accent-primary) !important;
border-color: var(--accent-primary) !important;
box-shadow: 0 0 10px var(--glow-primary) !important;
}
.gr-check-radio label {
color: var(--text-primary) !important;
transition: color 0.2s ease !important;
}
.gr-check-radio:hover label {
color: var(--accent-primary) !important;
}
/* Gradio's selected radio button styling - comprehensive targeting */
.gr-radio .wrap > label.selected,
.gr-radio .wrap > label:has(input:checked),
input[type="radio"]:checked ~ span,
label:has(> input[type="radio"]:checked) {
background: transparent !important;
border-color: var(--accent-primary) !important;
color: var(--text-primary) !important;
font-weight: 600 !important;
}
/* Enhanced dropdown styling */
.dropdown {
border-color: var(--border-default) !important;
background: var(--bg-card) !important;
color: var(--text-primary) !important;
transition: all 0.2s ease !important;
}
.dropdown:hover {
border-color: var(--accent-primary) !important;
box-shadow: 0 0 15px var(--glow-primary) !important;
}
select, .gr-dropdown {
background: var(--bg-card) !important;
color: var(--text-primary) !important;
border: 1px solid var(--border-default) !important;
transition: all 0.2s ease !important;
}
select:hover, .gr-dropdown:hover {
border-color: var(--accent-primary) !important;
box-shadow: 0 0 15px var(--glow-primary) !important;
}
select option, .gr-dropdown option {
background: var(--bg-primary) !important;
color: var(--text-primary) !important;
}
/* Enhanced table styling with gradient accents */
.dataframe {
background: var(--bg-card) !important;
border-radius: 16px !important;
overflow: hidden !important;
border: 1px solid var(--border-subtle) !important;
font-size: 14px !important;
max-height: 600px !important;
overflow-y: auto !important;
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
box-shadow: 0 4px 16px rgba(0, 0, 0, 0.3) !important;
}
/* Fixed table layout for better column control */
.dataframe table {
table-layout: fixed !important;
width: 100% !important;
}
.dataframe th {
background: rgba(227, 84, 84, 0.1) !important;
color: var(--text-primary) !important;
font-weight: 600 !important;
padding: 14px 8px !important;
text-align: left !important;
border-bottom: 2px solid var(--accent-primary) !important;
position: relative !important;
white-space: nowrap !important;
overflow: hidden !important;
text-overflow: ellipsis !important;
font-size: 13px !important;
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
text-transform: uppercase !important;
letter-spacing: 0.05em !important;
}
/* Column-specific widths */
.dataframe th:nth-child(2), /* Model */
.dataframe td:nth-child(2) {
min-width: 200px !important;
max-width: 250px !important;
}
.dataframe th:nth-child(3), /* Model Type */
.dataframe td:nth-child(3) {
min-width: 100px !important;
max-width: 120px !important;
}
.dataframe th:nth-child(4), /* Output Type */
.dataframe td:nth-child(4) {
min-width: 100px !important;
max-width: 120px !important;
}
.dataframe th:nth-child(5), /* Vendor */
.dataframe td:nth-child(5) {
min-width: 100px !important;
max-width: 120px !important;
}
/* Numeric columns - smaller width */
.dataframe th:nth-child(6), .dataframe th:nth-child(7),
.dataframe th:nth-child(8), .dataframe th:nth-child(9),
.dataframe th:nth-child(10),
.dataframe td:nth-child(6), .dataframe td:nth-child(7),
.dataframe td:nth-child(8), .dataframe td:nth-child(9),
.dataframe td:nth-child(10) {
min-width: 80px !important;
max-width: 100px !important;
text-align: center !important;
font-family: 'Geist Mono', monospace !important;
font-size: 13px !important;
}
.dataframe td {
padding: 12px 8px !important;
border-bottom: 1px solid var(--border-subtle) !important;
color: var(--text-primary) !important;
white-space: nowrap !important;
overflow: hidden !important;
text-overflow: ellipsis !important;
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
transition: all 0.2s ease !important;
}
/* Model names - keep consistent color on hover */
.dataframe td:nth-child(2) {
font-weight: 500 !important;
color: var(--accent-primary) !important;
transition: all 0.2s ease !important;
}
/* Keep model name color consistent to emphasize row highlight */
.dataframe tr:hover td:nth-child(2) {
color: var(--accent-secondary) !important;
}
.dataframe tbody tr {
transition: all 0.3s ease !important;
}
.dataframe tr:hover {
background: rgba(227, 84, 84, 0.15) !important;
box-shadow: 0 0 20px rgba(227, 84, 84, 0.3), inset 0 0 20px rgba(227, 84, 84, 0.1) !important;
transform: scale(1.01) !important;
}
.dataframe tr:nth-child(even) {
background: var(--bg-secondary) !important;
}
/* Tooltip on hover for truncated text */
.dataframe td:hover,
.dataframe th:hover {
overflow: visible !important;
position: relative !important;
z-index: 10 !important;
}
/* Horizontal scroll styling */
.dataframe-container {
overflow-x: auto !important;
overflow-y: visible !important;
max-width: 100% !important;
-webkit-overflow-scrolling: touch !important;
position: relative !important;
}
/* Simple scrollbar */
.dataframe-container::-webkit-scrollbar {
height: 10px !important;
}
.dataframe-container::-webkit-scrollbar-track {
background: var(--bg-secondary) !important;
border-radius: 5px !important;
}
.dataframe-container::-webkit-scrollbar-thumb {
background: var(--accent-secondary) !important;
border-radius: 4px !important;
}
.dataframe-container::-webkit-scrollbar-thumb:hover {
background: var(--accent-primary) !important;
}
/* Responsive design for smaller screens */
@media (max-width: 1200px) {
.dataframe th:nth-child(9), /* Vendor column */
.dataframe td:nth-child(9),
.dataframe th:nth-child(10), /* Last columns */
.dataframe td:nth-child(10) {
display: none !important;
}
}
@media (max-width: 900px) {
.dataframe th {
font-size: 12px !important;
padding: 8px 4px !important;
}
.dataframe td {
font-size: 12px !important;
padding: 8px 4px !important;
}
.dataframe th:nth-child(2),
.dataframe td:nth-child(2) {
min-width: 150px !important;
max-width: 200px !important;
}
}
/* Simple button styling */
button {
background: var(--bg-card) !important;
color: var(--text-primary) !important;
border: 1px solid var(--border-default) !important;
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
}
button:hover {
transform: translateY(-2px) !important;
border-color: var(--accent-primary) !important;
box-shadow: 0 4px 16px rgba(227, 84, 84, 0.2) !important;
}
/* Enhanced info boxes */
.info-box {
background: var(--bg-card);
border: 1px solid var(--border-subtle);
border-radius: 12px;
padding: 20px;
margin: 8px 0;
backdrop-filter: blur(10px);
position: relative;
overflow: hidden;
transition: all 0.3s ease;
}
.info-box::before {
content: '';
position: absolute;
top: 0;
left: -100%;
width: 100%;
height: 100%;
background: linear-gradient(90deg, transparent, rgba(227, 84, 84, 0.1), transparent);
transition: left 0.6s ease;
}
.info-box:hover::before {
left: 100%;
}
.info-box:hover {
border-color: var(--accent-primary);
box-shadow: 0 4px 20px var(--glow-primary);
}
/* Enhanced dark containers */
.dark-container {
background: var(--bg-card);
border: 1px solid var(--border-subtle);
border-radius: 20px;
padding: 28px;
box-shadow: 0 8px 32px rgba(0, 0, 0, 0.4);
backdrop-filter: blur(10px);
position: relative;
overflow: hidden;
}
.dark-container::after {
content: '';
position: absolute;
top: -50%;
right: -50%;
width: 200%;
height: 200%;
background: radial-gradient(circle, var(--glow-primary) 0%, transparent 70%);
opacity: 0.05;
pointer-events: none;
}
/* Section headers with glow */
.section-header {
display: flex;
align-items: center;
gap: 12px;
margin-bottom: 24px;
}
.section-icon {
filter: drop-shadow(0 0 12px currentColor);
transition: all 0.3s ease;
}
.dark-container:hover .section-icon {
filter: drop-shadow(0 0 20px currentColor);
transform: scale(1.1);
}
/* Text effects */
/* Simple scrollbar styling */
::-webkit-scrollbar {
width: 8px;
height: 8px;
}
::-webkit-scrollbar-track {
background: var(--bg-secondary);
border-radius: 4px;
}
::-webkit-scrollbar-thumb {
background: var(--accent-secondary);
border-radius: 4px;
}
::-webkit-scrollbar-thumb:hover {
background: var(--accent-primary);
}
/* Pulse animation for important elements */
@keyframes pulse-glow {
0% { box-shadow: 0 0 0 0 var(--glow-primary); }
70% { box-shadow: 0 0 0 10px transparent; }
100% { box-shadow: 0 0 0 0 transparent; }
}
.pulse {
animation: pulse-glow 2s infinite;
}
/* Center align charts */
.chart-container {
display: flex;
justify-content: center;
align-items: center;
width: 100%;
margin: 0 auto;
}
.chart-container > div {
width: 100%;
max-width: 1400px;
margin: 0 auto;
}
/* Ensure plots are centered */
.plot-container {
margin: 0 auto !important;
display: flex !important;
justify-content: center !important;
}
.js-plotly-plot {
margin: 0 auto !important;
}
</style>
<script>
// Function to update radio button styling
function updateRadioStyling() {
// Remove selected class from all labels first
document.querySelectorAll('.selected').forEach(function(label) {
label.classList.remove('selected');
});
// Apply selected class to checked radio buttons
document.querySelectorAll('input[type="radio"]:checked').forEach(function(input) {
var label = input.closest('label');
if (label) {
label.classList.add('selected');
// For domain radio buttons, apply special styling
if (label.closest('.domain-radio')) {
label.style.background = 'linear-gradient(145deg, rgba(227, 84, 84, 0.2), rgba(227, 84, 84, 0.1))';
label.style.borderColor = 'var(--accent-primary)';
label.style.transform = 'scale(1.05)';
label.style.fontWeight = '600';
}
}
});
}
// Wait for Gradio to initialize
function initializeRadioStyles() {
updateRadioStyling();
// Create observer to watch for changes
var observer = new MutationObserver(function(mutations) {
mutations.forEach(function(mutation) {
if (mutation.type === 'attributes' && mutation.attributeName === 'checked') {
updateRadioStyling();
}
});
});
// Observe all radio inputs
document.querySelectorAll('input[type="radio"]').forEach(function(radio) {
observer.observe(radio, { attributes: true });
});
}
// Try multiple initialization strategies
document.addEventListener('DOMContentLoaded', function() {
setTimeout(initializeRadioStyles, 100);
setTimeout(initializeRadioStyles, 500);
setTimeout(initializeRadioStyles, 1000);
});
// Also check when window loads
window.addEventListener('load', function() {
setTimeout(initializeRadioStyles, 100);
});
// Listen for Gradio's custom events
document.addEventListener('gradio:loaded', initializeRadioStyles);
</script>
"""
gr.HTML(custom_css)
# Header button above title
gr.HTML("""
<style>
/* Enhanced button styling with better gradio compatibility */
.custom-button-container {
text-align: center;
padding: 20px 0 10px 0;
margin-bottom: 10px;
}
.header-action-button {
display: inline-block !important;
padding: 14px 28px !important;
background: linear-gradient(135deg, #E35454 0%, #C84545 100%) !important;
color: #FFFFFF !important;
text-decoration: none !important;
border-radius: 16px !important;
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
font-weight: 700 !important;
font-size: 1.1rem !important;
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
border: none !important;
cursor: pointer !important;
box-shadow: 0 8px 24px rgba(227, 84, 84, 0.4), 0 4px 12px rgba(0, 0, 0, 0.3) !important;
position: relative !important;
overflow: hidden !important;
text-shadow: 0 1px 2px rgba(0, 0, 0, 0.3) !important;
}
.header-action-button::before {
content: '';
position: absolute;
top: 0;
left: -100%;
width: 100%;
height: 100%;
background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.2), transparent);
transition: left 0.6s;
}
.header-action-button:hover::before {
left: 100%;
}
.header-action-button:hover {
transform: translateY(-3px) !important;
box-shadow: 0 12px 32px rgba(227, 84, 84, 0.5), 0 8px 16px rgba(0, 0, 0, 0.4) !important;
background: linear-gradient(135deg, #F46464 0%, #D84F4F 100%) !important;
color: #FFFFFF !important;
text-decoration: none !important;
}
.header-action-button:active {
transform: translateY(-1px) !important;
}
.action-button-icon {
font-size: 1.2rem !important;
margin-right: 8px !important;
filter: drop-shadow(0 0 8px rgba(255, 255, 255, 0.3));
}
/* Navigation buttons styling */
.nav-buttons-container {
display: flex;
justify-content: center;
align-items: center;
gap: 16px;
flex-wrap: wrap;
margin: 24px 0;
padding: 0 20px;
}
.nav-link-button {
display: inline-flex !important;
align-items: center !important;
gap: 8px !important;
padding: 12px 20px !important;
background: rgba(1, 9, 26, 0.8) !important;
color: #F5F6F7 !important;
text-decoration: none !important;
border-radius: 12px !important;
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
font-weight: 600 !important;
font-size: 0.95rem !important;
transition: all 0.3s ease !important;
border: 2px solid rgba(245, 246, 247, 0.15) !important;
backdrop-filter: blur(10px) !important;
-webkit-backdrop-filter: blur(10px) !important;
position: relative !important;
overflow: hidden !important;
box-shadow: 0 4px 16px rgba(0, 0, 0, 0.3) !important;
}
.nav-link-button::before {
content: '';
position: absolute;
top: 0;
left: 0;
right: 0;
bottom: 0;
background: linear-gradient(135deg, rgba(227, 84, 84, 0.1) 0%, rgba(16, 152, 247, 0.1) 100%);
opacity: 0;
transition: opacity 0.3s ease;
}
.nav-link-button:hover::before {
opacity: 1;
}
.nav-link-button:hover {
transform: translateY(-3px) scale(1.02) !important;
border-color: #E35454 !important;
box-shadow: 0 8px 24px rgba(227, 84, 84, 0.3), 0 4px 12px rgba(0, 0, 0, 0.4) !important;
text-decoration: none !important;
color: #FFFFFF !important;
}
.nav-link-button.primary-nav {
background: linear-gradient(135deg, #1098F7 0%, #0A6BC4 100%) !important;
border-color: #1098F7 !important;
color: #FFFFFF !important;
font-weight: 700 !important;
}
.nav-link-button.primary-nav:hover {
background: linear-gradient(135deg, #2AA8FF 0%, #0550A0 100%) !important;
border-color: #2AA8FF !important;
box-shadow: 0 8px 24px rgba(16, 152, 247, 0.4), 0 4px 12px rgba(0, 0, 0, 0.4) !important;
color: #FFFFFF !important;
}
.nav-button-icon {
font-size: 1.1rem !important;
filter: drop-shadow(0 0 6px currentColor);
}
/* Responsive design */
@media (max-width: 768px) {
.nav-buttons-container {
gap: 12px;
padding: 0 10px;
}
.nav-link-button {
font-size: 0.85rem !important;
padding: 10px 16px !important;
}
.header-action-button {
font-size: 1rem !important;
padding: 12px 24px !important;
}
}
@media (max-width: 480px) {
.nav-buttons-container {
flex-direction: column;
gap: 8px;
}
.nav-link-button {
width: 200px;
justify-content: center;
}
}
</style>
<div class="custom-button-container">
<a href="https://app.galileo.ai/sign-up?utm_medium=referral&utm_source=HF&utm_campaign=agent_leaderboard_v2" target="_blank" class="header-action-button">
<span class="action-button-icon">πŸš€</span>Evaluate your GenAI for free
</a>
</div>
""")
gr.HTML("""
<div style="text-align: center; padding: 20px 0;">
<h1 style="font-size: 3rem; margin-bottom: 12px; color: var(--text-primary);
text-shadow: 0 0 20px rgba(227, 84, 84, 0.3); font-family: 'Geist', sans-serif; font-weight: 800;">
πŸš€ Galileo Agent Leaderboard v2
</h1>
<p style="color: var(--text-secondary); font-size: 1.2rem; margin-top: 0; font-family: 'Geist', sans-serif;">
Comprehensive performance metrics for LLM agents across business domains
</p>
</div>
""")
# Links section below title
gr.HTML("""
<div class="nav-buttons-container">
<a href="http://galileo.ai/blog/agent-leaderboard-v2" target="_blank" class="nav-link-button">
<span class="nav-button-icon">πŸ“</span>
Blog
</a>
<a href="https://github.com/rungalileo/agent-leaderboard" target="_blank" class="nav-link-button">
<span class="nav-button-icon">πŸ™</span>
GitHub
</a>
<a href="https://huggingface.co/datasets/galileo-ai/agent-leaderboard-v2" target="_blank" class="nav-link-button">
<span class="nav-button-icon">πŸ€—</span>
Dataset
</a>
<a href="https://huggingface.co/spaces/galileo-ai/agent-leaderboard/discussions/new" target="_blank" class="nav-link-button">
<span class="nav-button-icon">βž•</span>
Add Model
</a>
</div>
""")
# Metrics overview cards with insights
gr.HTML("""
<div style="margin-bottom: 40px;">
<!-- Ultra-modern metric cards with advanced styling -->
<style>
.insight-card {
background: linear-gradient(145deg, rgba(245, 246, 247, 0.03) 0%, rgba(227, 84, 84, 0.08) 100%);
border-radius: 16px;
padding: 20px;
position: relative;
border: 1px solid var(--border-subtle);
transition: all 0.4s cubic-bezier(0.4, 0, 0.2, 1);
overflow: hidden;
backdrop-filter: blur(20px);
-webkit-backdrop-filter: blur(20px);
}
.insight-card::before {
content: '';
position: absolute;
inset: 0;
border-radius: 24px;
padding: 1px;
background: linear-gradient(145deg, var(--border-subtle), var(--border-default));
-webkit-mask: linear-gradient(#fff 0 0) content-box, linear-gradient(#fff 0 0);
-webkit-mask-composite: source-out;
mask-composite: subtract;
pointer-events: none;
}
.insight-card::after {
content: '';
position: absolute;
top: -100%;
left: -100%;
width: 300%;
height: 300%;
background: radial-gradient(circle at center, var(--glow-primary) 0%, transparent 70%);
opacity: 0;
transition: opacity 0.6s ease, transform 0.6s ease;
pointer-events: none;
}
.insight-card:hover::after {
opacity: 0.15;
transform: translate(50%, 50%);
}
.insight-card:hover {
transform: translateY(-8px);
border-color: var(--accent-primary);
box-shadow:
0 24px 48px rgba(227, 84, 84, 0.2),
0 12px 24px rgba(0, 0, 0, 0.3),
inset 0 1px 0 rgba(255, 255, 255, 0.1);
}
.insight-card.secondary-accent:hover {
border-color: var(--accent-primary);
}
.insight-card.secondary-accent::after {
background: radial-gradient(circle at center, var(--glow-primary) 0%, transparent 70%);
}
.insight-card.tertiary-accent:hover {
border-color: var(--accent-primary);
}
.insight-card.tertiary-accent::after {
background: radial-gradient(circle at center, var(--glow-primary) 0%, transparent 70%);
}
.card-header {
display: flex;
align-items: center;
gap: 8px;
margin-bottom: 12px;
}
.card-icon {
display: flex;
align-items: center;
justify-content: center;
font-size: 2rem;
margin-right: 8px;
}
.card-title {
flex: 1;
}
.card-label {
font-family: 'Geist Mono', monospace;
font-size: 0.7rem;
letter-spacing: 0.05em;
text-transform: uppercase;
color: var(--text-secondary);
margin-bottom: 2px;
}
.card-value {
font-family: 'Geist', sans-serif;
font-size: 1.1rem;
font-weight: 700;
color: var(--text-primary);
line-height: 1.1;
}
.insight-list {
list-style: none;
padding: 0;
margin: 0;
}
.insight-list li {
margin-bottom: 8px;
}
.insight-item {
display: flex;
align-items: center;
gap: 8px;
padding: 8px 10px;
background: rgba(245, 246, 247, 0.03);
border-radius: 8px;
border: 1px solid var(--border-subtle);
transition: all 0.3s ease;
}
.insight-item:hover {
background: rgba(227, 84, 84, 0.1);
border-color: var(--accent-primary);
transform: translateX(4px);
}
.insight-icon {
font-size: 1rem;
flex-shrink: 0;
}
.insight-text {
flex: 1;
font-size: 0.85rem;
line-height: 1.3;
color: var(--text-secondary);
}
.highlight {
color: var(--text-primary);
font-weight: 600;
}
.badge-row {
display: flex;
gap: 6px;
margin-top: 10px;
flex-wrap: wrap;
}
.badge {
padding: 4px 10px;
background: rgba(245, 246, 247, 0.05);
border: 1px solid var(--border-subtle);
border-radius: 16px;
font-size: 0.75rem;
color: var(--text-secondary);
transition: all 0.2s ease;
display: flex;
align-items: center;
gap: 4px;
}
.badge:hover {
background: rgba(227, 84, 84, 0.15);
border-color: var(--accent-primary);
color: var(--text-primary);
transform: scale(1.05);
}
.badge-icon {
font-size: 0.85rem;
}
@keyframes float {
0%, 100% { transform: translateY(0); }
50% { transform: translateY(-5px); }
}
.floating-icon {
animation: float 3s ease-in-out infinite;
}
/* Tertiary color for special elements */
.tertiary-color {
color: var(--accent-tertiary);
}
</style>
<!-- First row: Five key insight cards -->
<div style="display: grid; grid-template-columns: repeat(5, 1fr); gap: 16px;">
<div class="insight-card">
<div class="card-header">
<div class="card-icon floating-icon" style="color: var(--accent-primary);">
🎯
</div>
</div>
<div class="card-value">Task Completion</div>
<div class="insight-text" style="margin-top: 16px; color: var(--text-secondary); font-size: 0.9rem; line-height: 1.5;">
Compare models based on their ability to complete real-world business tasks accurately and efficiently
</div>
</div>
<div class="insight-card">
<div class="card-header">
<div class="card-icon floating-icon" style="color: var(--accent-primary);">
πŸ’‘
</div>
</div>
<div class="card-value">Tool Selection</div>
<div class="insight-text" style="margin-top: 16px; color: var(--text-secondary); font-size: 0.9rem; line-height: 1.5;">
Analyze how precisely models choose the right tools for each task and make optimal decisions
</div>
</div>
<div class="insight-card">
<div class="card-header">
<div class="card-icon floating-icon" style="color: var(--accent-primary);">
πŸ’°
</div>
</div>
<div class="card-value">Cost Efficiency</div>
<div class="insight-text" style="margin-top: 16px; color: var(--text-secondary); font-size: 0.9rem; line-height: 1.5;">
Find models that deliver the best performance per dollar spent and optimize your ROI
</div>
</div>
<div class="insight-card">
<div class="card-header">
<div class="card-icon floating-icon" style="color: var(--accent-primary);">
πŸ›οΈ
</div>
</div>
<div class="card-value">Domain Coverage</div>
<div class="insight-text" style="margin-top: 16px; color: var(--text-secondary); font-size: 0.9rem; line-height: 1.5;">
Banking, Healthcare, Insurance, Investment, and Telecom industries analyzed for specialized performance
</div>
</div>
<div class="insight-card">
<div class="card-header">
<div class="card-icon floating-icon" style="color: var(--accent-primary);">
πŸš€
</div>
</div>
<div class="card-value">Speed vs Accuracy</div>
<div class="insight-text" style="margin-top: 16px; color: var(--text-secondary); font-size: 0.9rem; line-height: 1.5;">
Understand the trade-offs between response time and accuracy to find the right balance
</div>
</div>
</div>
<!-- Second row: Key features showcase -->
<div style="display: grid; grid-template-columns: repeat(3, 1fr); gap: 16px; margin-top: 16px;">
<div class="insight-card" style="background: linear-gradient(145deg, rgba(227, 84, 84, 0.05) 0%, rgba(245, 246, 247, 0.05) 100%);">
<div class="card-value">Model Capabilities</div>
<div class="badge-row" style="margin-top: 16px;">
<div class="badge">
<span class="badge-icon">πŸ”“</span>
<span>Open Source</span>
</div>
<div class="badge">
<span class="badge-icon">πŸ”’</span>
<span>Proprietary</span>
</div>
<div class="badge">
<span class="badge-icon">🧠</span>
<span>Reasoning</span>
</div>
</div>
</div>
<div class="insight-card" style="background: linear-gradient(145deg, rgba(227, 84, 84, 0.05) 0%, rgba(245, 246, 247, 0.05) 100%);">
<div class="card-value">Interactive Visualizations</div>
<div class="badge-row" style="margin-top: 16px;">
<div class="badge">
<span class="badge-icon">πŸ•ΈοΈ</span>
<span>Radar Charts</span>
</div>
<div class="badge">
<span class="badge-icon">πŸ“Š</span>
<span>Heatmaps</span>
</div>
<div class="badge">
<span class="badge-icon">πŸ“ˆ</span>
<span>Scatter Plots</span>
</div>
</div>
</div>
<div class="insight-card" style="background: linear-gradient(145deg, rgba(227, 84, 84, 0.05) 0%, rgba(245, 246, 247, 0.05) 100%);">
<div class="card-value">Real-World Performance</div>
<div class="badge-row" style="margin-top: 16px;">
<div class="badge">
<span class="badge-icon">πŸ’Ό</span>
<span>Business Tasks</span>
</div>
<div class="badge">
<span class="badge-icon">πŸ”„</span>
<span>Multi-Turn</span>
</div>
<div class="badge">
<span class="badge-icon">πŸ“‹</span>
<span>Benchmarks</span>
</div>
</div>
</div>
</div>
</div>
""")
# Domain filter section with enhanced styling
gr.HTML("""
<style>
/* Enhanced domain selector styling */
.domain-selector-container {
background: linear-gradient(145deg, rgba(227, 84, 84, 0.05) 0%, rgba(245, 246, 247, 0.05) 100%);
border-radius: 20px;
padding: 32px;
margin-bottom: 32px;
border: 1px solid var(--border-subtle);
position: relative;
overflow: hidden;
box-shadow:
0 8px 32px rgba(0, 0, 0, 0.3),
inset 0 1px 0 rgba(255, 255, 255, 0.05);
}
.domain-selector-container::before {
content: '';
position: absolute;
top: -50%;
left: -50%;
width: 200%;
height: 200%;
background: radial-gradient(circle at center, var(--glow-primary) 0%, transparent 70%);
opacity: 0.1;
animation: pulse 4s ease-in-out infinite;
}
.domain-header {
text-align: center;
margin-bottom: 28px;
position: relative;
z-index: 1;
}
.domain-title {
font-size: 2rem;
font-weight: 800;
background: linear-gradient(90deg, var(--accent-primary), var(--accent-secondary));
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
margin-bottom: 8px;
text-shadow: 0 0 30px var(--glow-primary);
}
.domain-subtitle {
color: var(--text-secondary);
font-size: 1.2rem;
font-family: 'Geist', sans-serif;
}
/* Custom radio button styling */
.domain-radio {
display: flex !important;
gap: 12px !important;
flex-wrap: wrap !important;
justify-content: center !important;
position: relative;
z-index: 1;
}
/* Gradio radio button wrapper */
.domain-radio .wrap {
display: flex !important;
gap: 12px !important;
flex-wrap: wrap !important;
justify-content: center !important;
width: 100% !important;
}
.domain-radio label,
.domain-radio .wrap > label {
flex: 1 !important;
min-width: 160px !important;
max-width: 200px !important;
padding: 16px 24px !important;
background: var(--bg-card) !important;
border: 2px solid var(--border-default) !important;
border-radius: 16px !important;
cursor: pointer !important;
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
text-align: center !important;
position: relative !important;
overflow: hidden !important;
}
.domain-radio label::before {
content: '';
position: absolute;
top: 0;
left: 0;
right: 0;
bottom: 0;
background: linear-gradient(145deg, transparent, var(--glow-primary));
opacity: 0;
transition: opacity 0.3s ease;
pointer-events: none;
}
.domain-radio label:hover {
transform: translateY(-2px) !important;
border-color: var(--accent-primary) !important;
box-shadow:
0 8px 24px rgba(227, 84, 84, 0.3),
inset 0 0 20px rgba(227, 84, 84, 0.1) !important;
}
.domain-radio label:hover::before {
opacity: 0.1;
}
.domain-radio input[type="radio"] {
display: none !important;
}
.domain-radio input[type="radio"]:checked + label,
.domain-radio .wrap > label:has(input[type="radio"]:checked),
.domain-radio label.selected {
background: transparent !important;
border-color: var(--accent-primary) !important;
color: var(--text-primary) !important;
font-weight: 600 !important;
transform: scale(1.05) !important;
box-shadow:
0 12px 32px rgba(227, 84, 84, 0.4),
0 0 60px rgba(227, 84, 84, 0.2) !important;
}
.domain-radio input[type="radio"]:checked + label::before {
opacity: 0.2;
}
/* Domain icons */
.domain-icon {
font-size: 1.5rem;
margin-bottom: 4px;
display: block;
filter: drop-shadow(0 0 10px currentColor);
}
.domain-name {
font-size: 0.95rem;
font-weight: 500;
margin-top: 4px;
}
/* Badge for domain counts */
.domain-count {
position: absolute;
top: 8px;
right: 8px;
background: var(--accent-primary);
color: white;
font-size: 0.75rem;
padding: 2px 8px;
border-radius: 12px;
font-weight: 600;
opacity: 0.8;
}
/* Filter radio buttons styling - smaller for better fit */
.filter-radio {
max-width: 100% !important;
}
.filter-radio .gr-row {
gap: 8px !important;
}
.filter-radio .gr-column {
min-width: 0 !important;
flex: 1 !important;
}
.filter-radio .gr-form {
min-width: 0 !important;
}
.filter-radio .gr-radio-group {
gap: 4px !important;
}
.filter-radio .domain-radio {
display: flex !important;
gap: 4px !important;
flex-wrap: nowrap !important;
justify-content: center !important;
}
.filter-radio .domain-radio label {
min-width: auto !important;
max-width: 120px !important;
padding: 8px 12px !important;
font-size: 0.8rem !important;
white-space: nowrap !important;
overflow: hidden !important;
text-overflow: ellipsis !important;
}
/* Additional targeting for the specific filter components */
.filter-radio .gr-box {
padding: 8px !important;
}
.filter-radio .gr-radio {
gap: 4px !important;
}
.filter-radio .gr-input-label {
font-size: 0.85rem !important;
margin-bottom: 4px !important;
}
/* Force compact layout for the filters */
@media (max-width: 1400px) {
.filter-radio .domain-radio label {
padding: 6px 10px !important;
font-size: 0.75rem !important;
}
}
/* Compact filter row styling */
.compact-filter-row {
margin-bottom: 20px !important;
}
.compact-filter-row .gr-column {
padding: 0 8px !important;
}
.compact-filter-row .gr-box {
padding: 0 !important;
}
/* Compact radio button styling */
.compact-radio {
width: 100% !important;
}
.compact-radio > label {
font-size: 0.85rem !important;
margin-bottom: 8px !important;
font-weight: 600 !important;
color: var(--text-primary) !important;
display: block !important;
}
.compact-radio .wrap {
display: flex !important;
flex-wrap: nowrap !important;
gap: 4px !important;
justify-content: center !important;
}
.compact-radio .wrap > label {
display: inline-flex !important;
align-items: center !important;
justify-content: center !important;
padding: 6px 10px !important;
margin: 0 !important;
background: var(--bg-card) !important;
border: 1px solid var(--border-default) !important;
border-radius: 8px !important;
cursor: pointer !important;
transition: all 0.2s ease !important;
font-size: 0.75rem !important;
white-space: nowrap !important;
flex: 1 !important;
min-width: 0 !important;
overflow: hidden !important;
text-overflow: ellipsis !important;
}
.compact-radio .wrap > label:has(input[type="radio"]:checked) {
background: transparent !important;
border-color: var(--accent-primary) !important;
color: var(--text-primary) !important;
font-weight: 600 !important;
}
.compact-radio .wrap > label:hover {
background: rgba(227, 84, 84, 0.1) !important;
border-color: var(--accent-primary) !important;
transform: scale(1.02) !important;
}
.compact-radio input[type="radio"] {
display: none !important;
}
/* Target Gradio's data attributes for selected state */
.compact-radio label[data-selected="true"],
.compact-radio label[aria-checked="true"],
.domain-radio label[data-selected="true"],
.domain-radio label[aria-checked="true"] {
background: transparent !important;
border-color: var(--accent-primary) !important;
color: var(--text-primary) !important;
font-weight: 600 !important;
}
/* Sort by radio buttons */
.sort-by-radio .domain-radio {
display: flex !important;
gap: 10px !important;
flex-wrap: wrap !important;
justify-content: flex-start !important;
}
.sort-by-radio .domain-radio .wrap {
display: flex !important;
gap: 10px !important;
flex-wrap: wrap !important;
justify-content: flex-start !important;
width: 100% !important;
}
.sort-by-radio .domain-radio label,
.sort-by-radio .domain-radio .wrap > label {
min-width: 180px !important;
max-width: 220px !important;
padding: 12px 20px !important;
font-size: 0.95rem !important;
}
</style>
<div class="domain-selector-container">
<div class="domain-header">
<h2 class="domain-title">πŸ›οΈ Select Business Domain</h2>
<p class="domain-subtitle">Choose a domain to see specialized agent performance</p>
</div>
""")
# Creating a custom radio with better visual design
domain_choices = [
("All", "🌐", "All Domains"),
("Banking", "🏦", "Banking"),
("Healthcare", "πŸ₯", "Healthcare"),
("Insurance", "πŸ›‘οΈ", "Insurance"),
("Investment", "πŸ’°", "Investment"),
("Telecom", "πŸ“±", "Telecom")
]
with gr.Row():
domain_filter = gr.Radio(
choices=["🌐 All", "🏦 Banking", "πŸ₯ Healthcare", "πŸ›‘οΈ Insurance", "πŸ’° Investment", "πŸ“± Telecom"],
value="🌐 All",
label="",
interactive=True,
elem_classes=["domain-radio"]
)
gr.HTML("""
</div>
""")
# Filter controls with enhanced styling
gr.HTML("""
<div class="dark-container" style="margin-bottom: 24px;">
<div class="section-header">
<span class="section-icon" style="color: var(--accent-secondary);">πŸ”</span>
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
Filters & Sorting
</h3>
</div>
""")
# First row: Model filters and sort order
with gr.Row(elem_classes=["compact-filter-row"]):
with gr.Column(scale=1):
model_type_filter = gr.Radio(
choices=["All", "Open Source", "Proprietary"],
value="All",
label="πŸ”“ Model Access",
elem_classes=["compact-radio"]
)
with gr.Column(scale=1):
reasoning_filter = gr.Radio(
choices=["All", "Reasoning", "Normal"],
value="All",
label="🧠 Output Type",
elem_classes=["compact-radio"]
)
with gr.Column(scale=1):
sort_order = gr.Radio(
choices=["Descending", "Ascending"],
value="Descending",
label="πŸ”„ Sort Order",
elem_classes=["compact-radio"]
)
# Second row: Sort by options
gr.HTML("""<div style="margin-top: 20px; margin-bottom: 10px;">
<h4 style="color: var(--text-primary); font-size: 1.1rem; font-weight: 600; margin: 0;">πŸ“Š Sort By</h4>
</div>""")
# Metric selector: the radio's choices are display names for the metric to
# sort by; "Avg Action Completion" is the initial selection.
# NOTE(review): the opening and closing <div class="sort-by-radio"> tags are
# emitted as two separate gr.HTML components. Each component presumably
# renders in its own container, so this div likely does NOT wrap the radio
# and the `.sort-by-radio .domain-radio` CSS rules may never match - confirm
# in the rendered DOM (adding "sort-by-radio" to elem_classes would be the
# usual alternative).
gr.HTML('<div class="sort-by-radio">')
sort_by = gr.Radio(
choices=["Avg Action Completion", "Avg Tool Selection Quality", "Avg Session Cost", "Avg Session Duration", "Avg Turns"],
value="Avg Action Completion",
label="",
# Reuses the domain selector's pill styling.
elem_classes=["domain-radio"]
)
gr.HTML('</div>')
# Intended to close the "dark-container" div opened by the
# "Filters & Sorting" header HTML above (same caveat as the wrapper div).
gr.HTML("</div>")
# Main leaderboard table with dynamic title
leaderboard_title = gr.HTML("""
<div class="dark-container pulse" style="margin-bottom: 24px;">
<div class="section-header">
<span class="section-icon" style="color: var(--accent-primary);">πŸ“ˆ</span>
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
Agent Leaderboard for All
</h3>
</div>
<div class="dataframe-container">
""")
leaderboard_table = gr.HTML(initial_table)
gr.HTML("""
</div>
</div>""")
# Column Info Section
gr.HTML("""
<div class="dark-container" style="margin-top: 24px; margin-bottom: 32px;">
<div class="section-header">
<span class="section-icon" style="color: var(--accent-secondary);">πŸ“‹</span>
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.3rem; font-family: 'Geist', sans-serif; font-weight: 700;">
Column Explanations
</h3>
</div>
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 20px; margin-top: 20px;">
<!-- Performance Metrics -->
<div class="info-box" style="background: linear-gradient(145deg, rgba(227, 84, 84, 0.05) 0%, rgba(245, 246, 247, 0.03) 100%);">
<h4 style="color: var(--accent-primary); margin-top: 0; margin-bottom: 16px; font-size: 1.1rem; font-family: 'Geist', sans-serif; font-weight: 600; display: flex; align-items: center; gap: 8px;">
<span style="font-size: 1.3rem;">🎯</span>
Performance Metrics
</h4>
<div style="space-y: 12px;">
<div style="margin-bottom: 12px;">
<div style="font-weight: 600; color: var(--text-primary); margin-bottom: 4px; font-family: 'Geist Mono', monospace; font-size: 0.9rem;">
πŸ“Š Action Completion
</div>
<div style="color: var(--text-secondary); font-size: 0.9rem; line-height: 1.4; margin-bottom: 6px;">
Measures how well the agent accomplishes user goals and completes tasks successfully.
</div>
<a href="https://v2docs.galileo.ai/concepts/metrics/agentic/action-completion" target="_blank"
style="color: var(--accent-primary); text-decoration: none; font-size: 0.85rem; display: inline-flex; align-items: center; gap: 4px;">
πŸ“– Learn more about Action Completion
<span style="font-size: 0.7rem;">β†—</span>
</a>
</div>
<div style="border-top: 1px solid var(--border-subtle); padding-top: 12px;">
<div style="font-weight: 600; color: var(--text-primary); margin-bottom: 4px; font-family: 'Geist Mono', monospace; font-size: 0.9rem;">
πŸ› οΈ Tool Selection Quality
</div>
<div style="color: var(--text-secondary); font-size: 0.9rem; line-height: 1.4; margin-bottom: 6px;">
Evaluates the accuracy of selecting the right tools and using them with correct parameters.
</div>
<a href="https://v2docs.galileo.ai/concepts/metrics/agentic/tool-selection-quality" target="_blank"
style="color: var(--accent-primary); text-decoration: none; font-size: 0.85rem; display: inline-flex; align-items: center; gap: 4px;">
πŸ“– Learn more about Tool Selection Quality
<span style="font-size: 0.7rem;">β†—</span>
</a>
</div>
</div>
</div>
<!-- Session-Level Metrics -->
<div class="info-box" style="background: linear-gradient(145deg, rgba(16, 152, 247, 0.05) 0%, rgba(245, 246, 247, 0.03) 100%);">
<h4 style="color: var(--accent-secondary); margin-top: 0; margin-bottom: 16px; font-size: 1.1rem; font-family: 'Geist', sans-serif; font-weight: 600; display: flex; align-items: center; gap: 8px;">
<span style="font-size: 1.3rem;">πŸ“ˆ</span>
Session-Level Metrics
</h4>
<div style="space-y: 10px;">
<div style="margin-bottom: 10px;">
<div style="font-weight: 600; color: var(--text-primary); margin-bottom: 4px; font-family: 'Geist Mono', monospace; font-size: 0.9rem;">
πŸ’° Avg Cost ($)
</div>
<div style="color: var(--text-secondary); font-size: 0.9rem; line-height: 1.4;">
Average cost per conversation session, including all API calls and processing.
</div>
</div>
<div style="margin-bottom: 10px; border-top: 1px solid var(--border-subtle); padding-top: 10px;">
<div style="font-weight: 600; color: var(--text-primary); margin-bottom: 4px; font-family: 'Geist Mono', monospace; font-size: 0.9rem;">
⚑ Avg Duration (s)
</div>
<div style="color: var(--text-secondary); font-size: 0.9rem; line-height: 1.4;">
Average time taken to complete a full conversation session from start to finish.
</div>
</div>
<div style="border-top: 1px solid var(--border-subtle); padding-top: 10px;">
<div style="font-weight: 600; color: var(--text-primary); margin-bottom: 4px; font-family: 'Geist Mono', monospace; font-size: 0.9rem;">
πŸ’¬ Avg Turns
</div>
<div style="color: var(--text-secondary); font-size: 0.9rem; line-height: 1.4;">
Average number of back-and-forth exchanges needed to complete a task.
</div>
</div>
</div>
</div>
</div>
<!-- Additional Notes -->
<div style="margin-top: 24px; padding: 16px; background: rgba(245, 246, 247, 0.03); border: 1px solid var(--border-subtle); border-radius: 12px;">
<div style="display: flex; align-items: center; gap: 8px; margin-bottom: 8px;">
<span style="font-size: 1.1rem;">πŸ’‘</span>
<span style="font-weight: 600; color: var(--text-primary); font-size: 0.95rem;">Default Sorting</span>
</div>
<div style="color: var(--text-secondary); font-size: 0.9rem; line-height: 1.4;">
The table is sorted by <strong style="color: var(--text-primary);">Action Completion</strong> in descending order by default, showing the best-performing models first. You can change the sorting using the filters above.
</div>
</div>
</div>
""")
# Radar Chart Section
gr.HTML("""
<div class="dark-container" style="margin-bottom: 24px;">
<div class="section-header">
<span class="section-icon" style="color: var(--accent-primary);">πŸ•ΈοΈ</span>
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
Domain Performance Analysis
</h3>
</div>
<p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">Compare model performance across different business domains</p>
""")
with gr.Row():
with gr.Column(scale=1):
model_selector = gr.Dropdown(
choices=initial_df['Model'].tolist()[:10],
value=initial_df['Model'].tolist()[:5],
multiselect=True,
label="🎯 Select Models for Comparison",
info="Choose up to 5 models to visualize",
elem_classes=["dropdown"]
)
# Radar chart plot, intended to sit inside a centered "chart-container" div.
# NOTE(review): the wrapper's opening and closing tags are separate gr.HTML
# components, so the div presumably does not actually enclose the plot -
# confirm in the rendered DOM.
gr.HTML('<div class="chart-container">')
radar_chart = gr.Plot(
label="",
# Initial figure: built from a fresh load_leaderboard_data() call, using the
# "Avg AC" metric and the first five entries of initial_df['Model'] (the same
# slice used as the model dropdown's default value).
value=create_domain_radar_chart(
load_leaderboard_data(),
"Avg AC",
initial_df['Model'].tolist()[:5]
),
elem_classes=["radar-chart", "plot-container"]
)
gr.HTML('</div>')
# Intended to close the "dark-container" div opened by the
# "Domain Performance Analysis" header HTML.
gr.HTML("</div>")
# Update functions
def get_optimal_sort_order(sort_by_value):
    """Pick the natural sort direction for a "Sort By" metric.

    Cost, duration and turn-count metrics are best when small, so they map
    to "Ascending". Every other value - the two quality metrics as well as
    anything unrecognised - maps to "Descending".
    """
    # Metrics where a lower number is the better result.
    lower_is_better = ("Avg Session Cost", "Avg Session Duration", "Avg Turns")
    return "Ascending" if sort_by_value in lower_is_better else "Descending"
def update_sort_order_automatically(sort_by_value):
    """Return the sort direction that suits the newly selected metric.

    Thin wrapper around get_optimal_sort_order; the returned string
    ("Ascending" or "Descending") is the new value for the sort-order radio.
    """
    return get_optimal_sort_order(sort_by_value)
def update_table(*args):
    """Rebuild the leaderboard heading and table for the current controls.

    Args:
        *args: Control values forwarded unchanged to filter_and_sort_data;
            the first one is the domain filter, which also drives the title.

    Returns:
        A ``(title_html, table_html)`` tuple.
    """
    selected_domain = args[0]  # domain_filter is always passed first
    return update_leaderboard_title(selected_domain), filter_and_sort_data(*args)
def update_radar_chart(domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order, selected_models):
    """Refresh the model dropdown and the radar chart for the current filters.

    Applies the domain / model-access / output-type filters and the requested
    sort to the leaderboard data, offers the first 15 resulting models as
    dropdown choices, keeps whichever of ``selected_models`` are still on
    offer (falling back to the first 5 choices when none are), and redraws
    the radar chart for that selection.

    Args:
        domain_filter: Value of the domain radio - a label ending in the
            domain name (e.g. an emoji followed by "Banking") or the bare
            name. ``None`` or an empty string is treated as "All".
        model_type_filter: "All", "Open Source" or "Proprietary".
        reasoning_filter: "All", "Reasoning" or "Normal".
        sort_by: Display name of the metric to sort by; also forwarded
            unchanged to create_domain_radar_chart.
        sort_order: "Ascending" sorts ascending; any other value descending.
        selected_models: Models currently chosen in the dropdown (may be
            empty or ``None``).

    Returns:
        A ``(gr.Dropdown, chart)`` tuple: the dropdown update carrying the
        new choices and selection, and the regenerated chart.
    """
    df = load_leaderboard_data()
    filtered_df = df.copy()

    # Resolve the domain from the trailing name of the radio label rather
    # than from its emoji prefix, so the lookup does not depend on emoji
    # literals (and variation selectors) matching byte-for-byte. Unknown
    # labels pass through unchanged.
    domain_names = ("Banking", "Healthcare", "Insurance", "Investment", "Telecom")
    domain_label = domain_filter or "All"
    domain_filter_clean = next(
        (name for name in ("All",) + domain_names if domain_label.endswith(name)),
        domain_label,
    )

    # Apply filters (same logic as filter_and_sort_data).
    if domain_filter_clean in domain_names:
        # Only keep models that have data for this domain.
        filtered_df = filtered_df[filtered_df[f"{domain_filter_clean} AC"] != '']

    # Radio label -> value stored in the 'Model Type' column.
    model_type_values = {"Open Source": "Open source", "Proprietary": "Proprietary"}
    if model_type_filter in model_type_values:
        filtered_df = filtered_df[filtered_df['Model Type'] == model_type_values[model_type_filter]]

    if reasoning_filter in ("Reasoning", "Normal"):
        filtered_df = filtered_df[filtered_df['Output Type'] == reasoning_filter]

    # Map display name to actual column name using the shared mapping.
    actual_sort_column = SORT_COLUMN_MAP.get(sort_by, sort_by)

    # With a specific domain selected, sort by that domain's own column
    # instead of the cross-domain average.
    domain_column_suffix = {
        "Avg AC": "AC",
        "Avg TSQ": "TSQ",
        "Avg Total Cost": "Cost",
        "Avg Session Duration": "Duration",
        "Avg Turns": "Turns",
    }
    if domain_filter_clean != "All" and actual_sort_column in domain_column_suffix:
        actual_sort_column = f"{domain_filter_clean} {domain_column_suffix[actual_sort_column]}"

    if actual_sort_column and actual_sort_column in filtered_df.columns:
        filtered_df = filtered_df.sort_values(
            by=actual_sort_column,
            ascending=(sort_order == "Ascending"),
            na_position='last',
        )

    # Offer the top 15 models from the filtered, sorted results.
    available_models = filtered_df['Model'].tolist()[:15]

    # Keep the still-available part of the selection; if nothing survives
    # (or nothing was selected), reset to the top 5.
    valid_selected = [m for m in (selected_models or []) if m in available_models]
    if not valid_selected:
        valid_selected = available_models[:5]

    # The chart is drawn from the full, unfiltered data; `df` was never
    # modified above (filtering worked on a copy), so it is reused here
    # instead of loading the data a second time.
    chart = create_domain_radar_chart(df, sort_by, valid_selected)
    return gr.Dropdown(choices=available_models, value=valid_selected), chart
def update_radar_only(domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order, selected_models):
    """Rebuild only the radar chart, e.g. after the model selection changes.

    The current filters are applied to decide which models are eligible, the
    eligible rows are ordered by the active sort column, and the user's
    selection is reduced to the models that survive the filters.  When nothing
    survives (or nothing was selected) the first five eligible models are
    charted instead.

    Args:
        domain_filter: Domain dropdown value (may carry an emoji prefix).
        model_type_filter: "All", "Open Source" or "Proprietary".
        reasoning_filter: "All", "Reasoning" or "Normal".
        sort_by: Display name of the sort metric; also used as the chart metric.
        sort_order: "Ascending" sorts ascending, anything else descending.
        selected_models: Model names currently picked in the selector (may be
            empty or None).

    Returns:
        The figure produced by ``create_domain_radar_chart``.
    """
    df = load_leaderboard_data()

    # apply_filters performs the same prefix stripping and domain / model-type /
    # output-type filtering that used to be duplicated inline here.  It works
    # on a copy, so ``df`` stays unfiltered for the chart call below.
    filtered_df = apply_filters(df, domain_filter, model_type_filter, reasoning_filter)

    # Map display name to actual column name using shared mapping
    actual_sort_column = SORT_COLUMN_MAP.get(sort_by, sort_by)
    if actual_sort_column and actual_sort_column in filtered_df.columns:
        filtered_df = filtered_df.sort_values(
            by=actual_sort_column,
            ascending=(sort_order == "Ascending"),
            na_position='last',
        )

    # Materialise the eligible model names once instead of once per candidate.
    ranked_models = filtered_df['Model'].tolist()
    eligible = set(ranked_models)

    valid_selected = [m for m in (selected_models or []) if m in eligible]
    if not valid_selected:
        valid_selected = ranked_models[:5]

    # The chart is drawn from the unfiltered data; filters only constrain
    # which models may be selected.
    return create_domain_radar_chart(df, sort_by, valid_selected)
# Update table when filters change
filter_inputs = [domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order]
for input_component in filter_inputs:
input_component.change(
fn=update_table,
inputs=filter_inputs,
outputs=[leaderboard_title, leaderboard_table]
)
# Also update radar chart when filters change
input_component.change(
fn=update_radar_chart,
inputs=filter_inputs + [model_selector],
outputs=[model_selector, radar_chart]
)
# Update radar chart when model selection changes
model_selector.change(
fn=update_radar_only,
inputs=filter_inputs + [model_selector],
outputs=[radar_chart]
)
# Automatically update sort order when sort_by changes
sort_by.change(
fn=update_sort_order_automatically,
inputs=[sort_by],
outputs=[sort_order]
)
# Performance insights section
gr.HTML("""
<div class="dark-container" style="margin-top: 32px;">
<div class="section-header">
<span class="section-icon" style="color: var(--accent-secondary);">πŸ“Š</span>
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
Key Insights
</h3>
</div>
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(280px, 1fr)); gap: 24px; margin-top: 24px;">
<div class="info-box">
<h4 style="color: var(--accent-primary); margin-top: 0; font-size: 1.2rem; font-family: 'Geist', sans-serif; font-weight: 600;">πŸ† Top Performers</h4>
<ul style="color: var(--text-secondary); margin: 0; padding-left: 20px; line-height: 1.6; font-family: 'Geist', sans-serif;">
<li>Highest AC scores indicate best action completion</li>
<li>Superior TSQ shows optimal tool selection</li>
<li>Balance cost-effectiveness with performance</li>
</ul>
</div>
<div class="info-box">
<h4 style="color: var(--accent-secondary); margin-top: 0; font-size: 1.2rem; font-family: 'Geist', sans-serif; font-weight: 600;">πŸ” Filter Features</h4>
<ul style="color: var(--text-secondary); margin: 0; padding-left: 20px; line-height: 1.6; font-family: 'Geist', sans-serif;">
<li>Domain-specific performance analysis</li>
<li>Compare open source vs private models</li>
<li>Reasoning vs standard model comparison</li>
</ul>
</div>
<div class="info-box">
<h4 style="color: var(--accent-primary); margin-top: 0; font-size: 1.2rem; font-family: 'Geist', sans-serif; font-weight: 600;">πŸ“ˆ Visualization</h4>
<ul style="color: var(--text-secondary); margin: 0; padding-left: 20px; line-height: 1.6; font-family: 'Geist', sans-serif;">
<li>Interactive radar charts for domain breakdown</li>
<li>Compare up to 5 models simultaneously</li>
<li>Hover for detailed performance metrics</li>
</ul>
</div>
</div>
</div>
""")
# NEW VISUALIZATIONS START HERE
# 1. Cost-Performance Efficiency Scatter Plot
gr.HTML("""
<div class="dark-container" style="margin-top: 32px;">
<div class="section-header">
<span class="section-icon" style="color: var(--accent-primary);">πŸ’‘</span>
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
Cost-Performance Efficiency Analysis
</h3>
</div>
<p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">
Identify models that deliver the best performance per dollar spent
</p>
""")
with gr.Row():
with gr.Column(scale=1):
efficiency_metric = gr.Dropdown(
choices=["Avg Action Completion", "Avg Tool Selection Quality"],
value="Avg Action Completion",
label="πŸ“Š Performance Metric",
info="Select which performance metric to analyze against cost",
elem_classes=["dropdown"]
)
gr.HTML('<div class="chart-container">')
cost_performance_plot = gr.Plot(
label="",
value=create_cost_performance_scatter(load_leaderboard_data(), "Avg AC"),
elem_classes=["efficiency-chart", "plot-container"]
)
gr.HTML('</div>')
gr.HTML("</div>")
# 2. Speed vs Accuracy Trade-off Chart
gr.HTML("""
<div class="dark-container" style="margin-top: 32px;">
<div class="section-header">
<span class="section-icon" style="color: var(--accent-secondary);">⚑</span>
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
Speed vs Accuracy Trade-off
</h3>
</div>
<p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">
Find the sweet spot between response time and accuracy
</p>
""")
gr.HTML('<div class="chart-container">')
speed_accuracy_plot = gr.Plot(
label="",
value=create_speed_accuracy_plot(load_leaderboard_data(), "Avg AC"),
elem_classes=["speed-accuracy-chart", "plot-container"]
)
gr.HTML('</div>')
gr.HTML("</div>")
# 3. Performance Heatmap
gr.HTML("""
<div class="dark-container" style="margin-top: 32px;">
<div class="section-header">
<span class="section-icon" style="color: var(--accent-primary);">πŸ”₯</span>
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
Comprehensive Performance Heatmap
</h3>
</div>
<p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">
All metrics at a glance - darker colors indicate better performance
</p>
""")
gr.HTML('<div class="chart-container">')
performance_heatmap = gr.Plot(
label="",
value=create_performance_heatmap(load_leaderboard_data()),
elem_classes=["heatmap-chart", "plot-container"]
)
gr.HTML('</div>')
gr.HTML("</div>")
# 4. Domain Specialization Matrix
gr.HTML("""
<div class="dark-container" style="margin-top: 32px;">
<div class="section-header">
<span class="section-icon" style="color: var(--accent-primary);">🎯</span>
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
Domain Specialization Matrix
</h3>
</div>
<p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">
Bubble size shows performance level, color intensity shows specialization strength
</p>
""")
with gr.Row():
with gr.Column(scale=1):
specialization_metric = gr.Dropdown(
choices=["AC (Action Completion)", "TSQ (Tool Selection Quality)"],
value="AC (Action Completion)",
label="πŸ“Š Metric Type",
info="Choose which metric to analyze for domain specialization",
elem_classes=["dropdown"]
)
gr.HTML('<div class="chart-container">')
domain_specialization_plot = gr.Plot(
label="",
value=create_domain_specialization_matrix(load_leaderboard_data(), "AC"),
elem_classes=["specialization-chart", "plot-container"]
)
gr.HTML('</div>')
gr.HTML("</div>")
# 5. Performance Gap Analysis
gr.HTML("""
<div class="dark-container" style="margin-top: 32px;">
<div class="section-header">
<span class="section-icon" style="color: var(--accent-secondary);">πŸ“ˆ</span>
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
Performance Gap Analysis by Domain
</h3>
</div>
<p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">
Visualize the performance range across models for each domain
</p>
""")
gr.HTML('<div class="chart-container">')
performance_gap_plot = gr.Plot(
label="",
value=create_performance_gap_analysis(load_leaderboard_data(), "AC"),
elem_classes=["gap-analysis-chart", "plot-container"]
)
gr.HTML('</div>')
gr.HTML("</div>")
# Update functions for new visualizations
def update_cost_performance(efficiency_metric):
    """Redraw the cost-performance scatter for the metric picked in the dropdown.

    The dropdown label is translated through ``SORT_COLUMN_MAP``; a label
    without an entry is used as the column name unchanged.
    """
    metric_column = SORT_COLUMN_MAP.get(efficiency_metric, efficiency_metric)
    leaderboard = load_leaderboard_data()
    return create_cost_performance_scatter(leaderboard, metric_column)
def update_speed_accuracy(efficiency_metric):
    """Redraw the speed-vs-accuracy plot for the metric picked in the dropdown.

    Labels known to ``SORT_COLUMN_MAP`` are replaced by their column name;
    any other label is forwarded as-is.
    """
    metric_column = SORT_COLUMN_MAP.get(efficiency_metric, efficiency_metric)
    leaderboard = load_leaderboard_data()
    figure = create_speed_accuracy_plot(leaderboard, metric_column)
    return figure
def update_domain_specialization(specialization_metric):
    """Redraw the domain specialization matrix for the chosen metric family.

    Any label containing the substring "AC" selects the AC metric; every
    other label selects TSQ.
    """
    if "AC" in specialization_metric:
        metric_type = "AC"
    else:
        metric_type = "TSQ"
    return create_domain_specialization_matrix(load_leaderboard_data(), metric_type)
def update_performance_gap(specialization_metric):
    """Redraw the performance gap analysis for the chosen metric family.

    Any label containing the substring "AC" selects the AC metric; every
    other label selects TSQ.
    """
    if "AC" in specialization_metric:
        metric_type = "AC"
    else:
        metric_type = "TSQ"
    leaderboard = load_leaderboard_data()
    return create_performance_gap_analysis(leaderboard, metric_type)
def update_all_visualizations(domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order):
    """Refresh the scatter, speed/accuracy and heatmap plots after a filter change.

    The plots are drawn from the rows that pass ``apply_filters``.  The two
    metric-driven plots follow the current sort metric when it is one of the
    two performance metrics and fall back to "Avg AC" otherwise.
    ``sort_order`` is accepted because the event passes all filter inputs,
    but it is not used here.

    Returns:
        Tuple ``(cost_performance_figure, speed_accuracy_figure, heatmap_figure)``.
    """
    visible_rows = apply_filters(
        load_leaderboard_data(), domain_filter, model_type_filter, reasoning_filter
    )

    if sort_by in ("Avg Action Completion", "Avg Tool Selection Quality"):
        metric_column = SORT_COLUMN_MAP.get(sort_by, sort_by)
    else:
        metric_column = "Avg AC"

    scatter_fig = create_cost_performance_scatter(visible_rows, metric_column)
    speed_fig = create_speed_accuracy_plot(visible_rows, metric_column)
    heatmap_fig = create_performance_heatmap(visible_rows)
    return scatter_fig, speed_fig, heatmap_fig
def apply_filters(df, domain_filter, model_type_filter, reasoning_filter):
    """Return a filtered copy of ``df`` for the given dropdown selections.

    Args:
        df: Leaderboard dataframe; never modified.
        domain_filter: Domain label, optionally starting with an emoji prefix.
            A recognised prefix is translated to the plain domain name; any
            other value is used as given.
        model_type_filter: "Open Source" or "Proprietary" restrict the
            'Model Type' column; any other value leaves it unrestricted.
        reasoning_filter: "Reasoning" or "Normal" restrict the 'Output Type'
            column; any other value leaves it unrestricted.

    Returns:
        The rows of ``df`` that satisfy every active filter.
    """
    result = df.copy()

    # Emoji prefixes are tested in this order; the first match wins.
    prefix_to_domain = (
        ('🌐', "All"),
        ('🏦', "Banking"),
        ('πŸ₯', "Healthcare"),
        ('πŸ›‘οΈ', "Insurance"),
        ('πŸ’°', "Investment"),
        ('πŸ“±', "Telecom"),
    )
    domain_name = domain_filter
    for prefix, plain_name in prefix_to_domain:
        if domain_filter.startswith(prefix):
            domain_name = plain_name
            break

    # A domain is kept only for rows that have a non-empty AC score for it.
    # "All" (and any unknown domain) has no entry and therefore no filter.
    domain_score_column = {
        "Banking": "Banking AC",
        "Healthcare": "Healthcare AC",
        "Insurance": "Insurance AC",
        "Investment": "Investment AC",
        "Telecom": "Telecom AC"
    }.get(domain_name)
    if domain_score_column is not None:
        result = result[result[domain_score_column] != '']

    # Dropdown label -> value stored in the 'Model Type' column.
    stored_model_type = {
        "Open Source": 'Open source',
        "Proprietary": 'Proprietary',
    }.get(model_type_filter)
    if stored_model_type is not None:
        result = result[result['Model Type'] == stored_model_type]

    # Dropdown label -> value stored in the 'Output Type' column.
    stored_output_type = {
        "Reasoning": 'Reasoning',
        "Normal": 'Normal',
    }.get(reasoning_filter)
    if stored_output_type is not None:
        result = result[result['Output Type'] == stored_output_type]

    return result
# Connect update functions to components
efficiency_metric.change(
fn=update_cost_performance,
inputs=[efficiency_metric],
outputs=[cost_performance_plot]
)
efficiency_metric.change(
fn=update_speed_accuracy,
inputs=[efficiency_metric],
outputs=[speed_accuracy_plot]
)
specialization_metric.change(
fn=update_domain_specialization,
inputs=[specialization_metric],
outputs=[domain_specialization_plot]
)
specialization_metric.change(
fn=update_performance_gap,
inputs=[specialization_metric],
outputs=[performance_gap_plot]
)
# Update new visualizations when main filters change
for input_component in filter_inputs:
input_component.change(
fn=update_all_visualizations,
inputs=filter_inputs,
outputs=[cost_performance_plot, speed_accuracy_plot, performance_heatmap]
)
# Define generate_performance_card function before using it
def generate_performance_card(model_name):
    """Generate the HTML for a model's performance card.

    Args:
        model_name: Value of the 'Model' column identifying the model.  A
            falsy value yields a "please select a model" placeholder.

    Returns:
        An HTML string.  When ``model_name`` is not present in the leaderboard
        data a "not found" placeholder is returned; otherwise the card shows
        the overall rank (position by descending 'Avg AC' among models that
        have an 'Avg AC' value, or 'N/A'), the headline metrics and the
        per-domain AC scores of the first matching row.
    """
    if not model_name:
        return """<div style="text-align: center; color: var(--text-secondary); padding: 40px;">
Please select a model to generate its performance card
</div>"""

    # Get model data
    df = load_leaderboard_data()
    model_data = df[df['Model'] == model_name]
    if model_data.empty:
        return """<div style="text-align: center; color: var(--text-secondary); padding: 40px;">
Model not found in the database
</div>"""
    row = model_data.iloc[0]

    # Overall rank: 1-based position after sorting by 'Avg AC', best first.
    df_with_ac = df[df['Avg AC'] != ''].copy()
    df_with_ac['Avg AC'] = pd.to_numeric(df_with_ac['Avg AC'], errors='coerce')
    df_sorted = df_with_ac.sort_values('Avg AC', ascending=False).reset_index(drop=True)
    rank_positions = df_sorted[df_sorted['Model'] == model_name].index
    try:
        rank = rank_positions[0] + 1
    except IndexError:
        # The model has no 'Avg AC' value, so it was excluded from the ranking.
        rank = 'N/A'

    def format_value(val, decimals=3, prefix='', suffix=''):
        """Format a numeric cell with fixed decimals, or 'N/A' when missing."""
        if pd.isna(val) or val == '':
            return 'N/A'
        return f"{prefix}{float(val):.{decimals}f}{suffix}"

    def get_performance_stars(value, max_val=1.0):
        """Translate a score into one to five stars (none when missing)."""
        if pd.isna(value) or value == '':
            return ''
        score = float(value) / max_val
        if score >= 0.9:
            return '⭐' * 5
        elif score >= 0.7:
            return '⭐' * 4
        elif score >= 0.5:
            return '⭐' * 3
        elif score >= 0.3:
            return '⭐' * 2
        else:
            return '⭐' * 1

    # Card header and headline metrics
    card_html = f"""
<div class="performance-card">
<div class="card-header">
<h1 class="card-model-name">{model_name}</h1>
<div class="card-stars">
{get_performance_stars(row['Avg AC'])}
</div>
</div>
<div class="metrics-grid" style="margin-bottom: 24px;">
<div class="metric-item">
<div class="metric-icon" style="color: var(--accent-primary);">πŸ†</div>
<div class="metric-label">Overall Rank</div>
<div class="metric-value">#{rank}</div>
</div>
<div class="metric-item">
<div class="metric-icon" style="color: var(--accent-primary);">🎯</div>
<div class="metric-label">Action Completion</div>
<div class="metric-value">{format_value(row['Avg AC'])}</div>
</div>
<div class="metric-item">
<div class="metric-icon" style="color: var(--accent-secondary);">πŸ› οΈ</div>
<div class="metric-label">Tool Selection</div>
<div class="metric-value">{format_value(row['Avg TSQ'])}</div>
</div>
<div class="metric-item">
<div class="metric-icon" style="color: #F5F6F7;">πŸ’°</div>
<div class="metric-label">Avg Cost</div>
<div class="metric-value">{format_value(row['Avg Total Cost'], 3, '$')}</div>
</div>
<div class="metric-item">
<div class="metric-icon" style="color: #F5F6F7;">⚑</div>
<div class="metric-label">Avg Duration</div>
<div class="metric-value">{format_value(row['Avg Session Duration'], 1, '', 's')}</div>
</div>
<div class="metric-item">
<div class="metric-icon" style="color: #F5F6F7;">πŸ’¬</div>
<div class="metric-label">Avg Turns</div>
<div class="metric-value">{format_value(row['Avg Turns'], 1)}</div>
</div>
</div>
<div class="domains-section" style="margin-top: 24px;">
<h3 class="domains-title">πŸ›οΈ Domain Performance</h3>
<div class="domains-grid">
"""

    # One tile per domain, showing the domain's AC score or "N/A".
    domains = [
        ('🏦', 'Banking'),
        ('πŸ₯', 'Healthcare'),
        ('πŸ›‘οΈ', 'Insurance'),
        ('πŸ’°', 'Investment'),
        ('πŸ“±', 'Telecom')
    ]
    domain_tiles = []
    for domain_icon, domain_name in domains:
        ac_value = row.get(f'{domain_name} AC', '')
        if ac_value != '' and not pd.isna(ac_value):
            score_display = f"{float(ac_value):.3f}"
            score_color = "var(--accent-primary)"
        else:
            score_display = "N/A"
            score_color = "var(--text-muted)"
        domain_tiles.append(f"""
<div class="domain-item">
<div class="domain-name">{domain_icon}</div>
<div style="font-size: 0.7rem; color: var(--text-secondary); margin-bottom: 2px;">{domain_name}</div>
<div class="domain-score" style="color: {score_color};">{score_display}</div>
</div>
""")
    card_html += "".join(domain_tiles)

    # Close the grid/section and append the footer.
    card_html += """
</div>
</div>
<div class="card-footer">
<div class="card-url">
<strong>https://galileo.ai/agent-leaderboard</strong>
</div>
</div>
</div>
"""
    return card_html
# MODEL PERFORMANCE CARD SECTION
gr.HTML("""
<div class="dark-container" style="margin-top: 32px;">
<div class="section-header">
<span class="section-icon" style="color: var(--accent-primary);">🎯</span>
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
Model Performance Card
</h3>
</div>
<p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">
Comprehensive performance card for any model - perfect for presentations and reports
</p>
<div style="display: flex; gap: 24px; align-items: flex-start;">
<!-- Controls Column -->
<div style="flex: 0 0 280px;">
<div style="background: rgba(245, 246, 247, 0.03); border: 1px solid var(--border-subtle);
border-radius: 16px; padding: 20px; position: sticky; top: 20px;">
""")
card_model_selector = gr.Dropdown(
choices=initial_df['Model'].tolist(),
value=initial_df['Model'].tolist()[0] if len(initial_df) > 0 else None,
label="πŸ€– Select Model",
info="Choose a model to view its performance card",
elem_classes=["dropdown"]
)
download_card_btn = gr.Button(
"πŸ“₯ Download Card as PNG",
variant="secondary",
elem_classes=["download-button"],
elem_id="download-card-btn"
)
gr.HTML("""
</div>
</div>
<!-- Card Display Column -->
<div style="flex: 1; min-width: 0;" id="card-display-container">
""")
# Card display area - generate initial card
initial_model = initial_df['Model'].tolist()[0] if len(initial_df) > 0 else None
initial_card_html = generate_performance_card(initial_model) if initial_model else ""
card_display = gr.HTML(value=initial_card_html, elem_id="performance-card-html")
gr.HTML("""
</div>
</div>
</div>""")
# Add custom CSS for the performance card
gr.HTML("""
<style>
/* Performance Card Styles */
.performance-card {
background: linear-gradient(145deg, rgba(1, 9, 26, 0.98) 0%, rgba(227, 84, 84, 0.05) 100%);
border: 2px solid var(--accent-primary);
border-radius: 24px;
padding: 32px;
max-width: 700px;
margin: 0 auto;
position: relative;
overflow: hidden;
box-shadow:
0 20px 40px rgba(0, 0, 0, 0.5),
0 0 80px rgba(227, 84, 84, 0.2),
inset 0 0 120px rgba(227, 84, 84, 0.05);
}
.performance-card::before {
content: '';
position: absolute;
top: -50%;
left: -50%;
width: 200%;
height: 200%;
background: radial-gradient(circle at center, var(--glow-primary) 0%, transparent 70%);
opacity: 0.1;
animation: pulse 4s ease-in-out infinite;
}
.card-header {
text-align: center;
margin-bottom: 24px;
position: relative;
z-index: 1;
}
.card-badges {
display: flex;
justify-content: center;
gap: 12px;
margin-bottom: 16px;
}
.card-model-name {
font-size: 2rem;
font-weight: 800;
background: linear-gradient(135deg, var(--accent-primary) 0%, var(--accent-secondary) 100%);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
margin-bottom: 8px;
text-shadow: 0 0 40px var(--glow-primary);
line-height: 1.2;
}
.card-stars {
font-size: 1.2rem;
margin: 8px 0;
display: flex;
justify-content: center;
align-items: center;
gap: 2px;
}
.card-vendor {
font-size: 1.2rem;
color: var(--text-secondary);
font-weight: 500;
margin-top: 4px;
}
.metrics-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
gap: 16px;
margin-bottom: 24px;
position: relative;
z-index: 1;
}
.metric-item {
background: rgba(245, 246, 247, 0.05);
border: 1px solid var(--border-subtle);
border-radius: 16px;
padding: 16px;
text-align: center;
transition: all 0.3s ease;
}
.metric-item:hover {
transform: translateY(-4px);
border-color: var(--accent-primary);
box-shadow: 0 8px 24px rgba(227, 84, 84, 0.2);
}
.metric-icon {
font-size: 1.5rem;
margin-bottom: 6px;
filter: drop-shadow(0 0 20px currentColor);
}
.metric-label {
font-size: 0.75rem;
color: var(--text-secondary);
text-transform: uppercase;
letter-spacing: 0.05em;
margin-bottom: 4px;
}
.metric-value {
font-size: 1.4rem;
font-weight: 700;
color: var(--text-primary);
font-family: 'Geist Mono', monospace;
}
.domains-section {
margin-top: 32px;
position: relative;
z-index: 1;
}
.domains-title {
font-size: 1.1rem;
font-weight: 600;
color: var(--text-primary);
margin-bottom: 16px;
text-align: center;
}
.domains-grid {
display: grid;
grid-template-columns: repeat(5, 1fr);
gap: 12px;
}
.domain-item {
background: rgba(245, 246, 247, 0.05);
border: 1px solid var(--border-subtle);
border-radius: 12px;
padding: 12px;
text-align: center;
}
.domain-name {
font-size: 1.4rem;
margin-bottom: 4px;
}
.domain-score {
font-size: 1rem;
font-weight: 600;
color: var(--accent-primary);
}
.card-footer {
text-align: center;
margin-top: 24px;
padding-top: 20px;
border-top: 1px solid var(--border-subtle);
position: relative;
z-index: 1;
}
.card-badge {
display: inline-flex;
align-items: center;
gap: 8px;
padding: 8px 16px;
background: rgba(245, 246, 247, 0.05);
border: 1px solid var(--border-subtle);
border-radius: 20px;
font-size: 0.9rem;
color: var(--text-secondary);
margin: 0 4px;
}
.card-url {
margin-top: 12px;
font-size: 0.75rem;
color: var(--text-muted);
font-family: 'Geist Mono', monospace;
}
.primary-button {
background: linear-gradient(135deg, var(--accent-primary) 0%, #B94545 100%) !important;
color: white !important;
border: none !important;
padding: 10px 20px !important;
font-weight: 600 !important;
transition: all 0.3s ease !important;
}
.primary-button:hover {
transform: translateY(-2px) !important;
box-shadow: 0 8px 24px rgba(227, 84, 84, 0.4) !important;
}
/* Download button styling */
.download-button {
background: linear-gradient(135deg, var(--accent-secondary) 0%, #0A6BC4 100%) !important;
color: white !important;
border: none !important;
padding: 10px 20px !important;
font-weight: 600 !important;
transition: all 0.3s ease !important;
}
.download-button:hover {
transform: translateY(-2px) !important;
box-shadow: 0 8px 24px rgba(16, 152, 247, 0.4) !important;
}
/* Responsive layout for performance card section */
@media (max-width: 1200px) {
.performance-card {
padding: 24px !important;
}
.card-model-name {
font-size: 1.7rem !important;
}
.metric-value {
font-size: 1.2rem !important;
}
}
@media (max-width: 900px) {
/* Stack the controls above the card on smaller screens */
#card-display-container {
margin-top: 20px;
}
.performance-card {
padding: 20px !important;
}
.card-model-name {
font-size: 1.5rem !important;
}
.metric-value {
font-size: 1.1rem !important;
}
.domains-grid {
grid-template-columns: repeat(3, 1fr) !important;
}
}
/* Button states */
.download-button:disabled {
opacity: 0.6 !important;
cursor: not-allowed !important;
}
</style>
<!-- Include html2canvas library -->
<script src="https://cdn.jsdelivr.net/npm/html2canvas@1.4.1/dist/html2canvas.min.js"></script>
""")
# Wire up the card generator to selection change
card_model_selector.change(
fn=generate_performance_card,
inputs=[card_model_selector],
outputs=[card_display]
)
# Wire up download button with improved functionality
download_card_btn.click(
fn=None,
js="""
() => {
// Wait a bit to ensure the card is fully rendered
setTimeout(() => {
const card = document.querySelector('.performance-card');
if (!card) {
alert('Performance card not found. Please select a model first.');
return;
}
// Check if html2canvas is loaded
if (typeof html2canvas === 'undefined') {
// Try to load html2canvas dynamically
const script = document.createElement('script');
script.src = 'https://cdn.jsdelivr.net/npm/html2canvas@1.4.1/dist/html2canvas.min.js';
script.onload = () => {
captureCard();
};
script.onerror = () => {
alert('Failed to load html2canvas library. Please try again.');
};
document.head.appendChild(script);
} else {
captureCard();
}
function captureCard() {
// Show loading indicator
const btn = document.getElementById('download-card-btn');
const originalText = btn.textContent;
btn.textContent = 'Generating...';
btn.disabled = true;
html2canvas(card, {
backgroundColor: '#01091A',
scale: 2,
logging: false,
useCORS: true,
allowTaint: true
}).then(canvas => {
// Create download link
const link = document.createElement('a');
const modelName = card.querySelector('.card-model-name')?.textContent || 'model';
const timestamp = new Date().toISOString().slice(0,10);
const fileName = `${modelName.replace(/[^a-z0-9]/gi, '-').toLowerCase()}-performance-${timestamp}.png`;
link.download = fileName;
link.href = canvas.toDataURL('image/png');
document.body.appendChild(link);
link.click();
document.body.removeChild(link);
// Restore button
btn.textContent = originalText;
btn.disabled = false;
}).catch(error => {
console.error('Error capturing card:', error);
alert('Failed to capture performance card. Please try again.');
btn.textContent = originalText;
btn.disabled = false;
});
}
}, 100);
}
"""
)
# Also update card when filters change to keep model selector in sync
def update_dropdown_and_card(*args):
    """Rebuild the card model dropdown from the rows that pass the filters.

    ``args`` carries the values of ``filter_inputs`` in order; only the first
    three (domain, model type, reasoning) are used.  The dropdown is reset to
    the first remaining model, or to no selection when nothing remains.
    """
    matching_rows = apply_filters(load_leaderboard_data(), args[0], args[1], args[2])
    model_names = matching_rows['Model'].tolist()
    if model_names:
        first_model = model_names[0]
    else:
        first_model = None
    return gr.Dropdown(choices=model_names, value=first_model)

# Every filter control triggers the same refresh of the card model dropdown.
for input_component in filter_inputs:
    input_component.change(
        fn=update_dropdown_and_card,
        inputs=filter_inputs,
        outputs=[card_model_selector]
    )
return leaderboard_table
def create_leaderboard_v2_interface():
    """Build the leaderboard v2 interface by delegating to ``create_leaderboard_v2_tab``.

    Returns:
        Whatever ``create_leaderboard_v2_tab`` returns.
    """
    interface = create_leaderboard_v2_tab()
    return interface
def create_domain_radar_chart(df, metric_type, selected_models=None, max_models=5):
    """Create a radar chart of per-domain values for the selected metric.

    Args:
        df: Leaderboard dataframe with a 'Model' column and per-domain metric
            columns named "<Domain> <suffix>" (e.g. "Banking AC").
        metric_type: Display name or column name of the metric.  It is mapped
            through ``SORT_COLUMN_MAP`` and must resolve to one of 'Avg AC',
            'Avg TSQ', 'Avg Total Cost', 'Avg Session Duration' or
            'Avg Turns'; otherwise an empty chart with a message is returned.
        selected_models: Model names to plot.  When None or empty, the top
            ``max_models`` models by the metric are used (or the first
            ``max_models`` rows if the metric column is absent).
        max_models: Maximum number of models drawn.

    Returns:
        A ``plotly.graph_objects.Figure``.
    """
    # Map the metric_type to actual column name using shared mapping
    actual_metric_type = SORT_COLUMN_MAP.get(metric_type, metric_type)

    # Metrics that have a per-domain breakdown, and the suffix of their
    # per-domain columns ("<Domain> <suffix>").
    metric_suffixes = {
        'Avg AC': 'AC',
        'Avg TSQ': 'TSQ',
        'Avg Total Cost': 'Cost',
        'Avg Session Duration': 'Duration',
        'Avg Turns': 'Turns',
    }
    if actual_metric_type not in metric_suffixes:
        return create_empty_radar_chart(f"Domain breakdown not available for {metric_type}")

    domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom']
    suffix = metric_suffixes[actual_metric_type]
    domain_columns = [f"{domain} {suffix}" for domain in domains]

    if selected_models is None or len(selected_models) == 0:
        if actual_metric_type in df.columns:
            # nlargest rejects object-dtype columns, which is what a column
            # holding '' placeholders for missing scores has, so rank on a
            # numeric view of the metric instead.
            numeric_metric = pd.to_numeric(df[actual_metric_type], errors='coerce')
            ranked = df.assign(**{actual_metric_type: numeric_metric})
            selected_models = ranked.nlargest(max_models, actual_metric_type)['Model'].tolist()
        else:
            selected_models = df.head(max_models)['Model'].tolist()

    # Limit to max_models for readability
    selected_models = selected_models[:max_models]

    fig = go.Figure()

    # Galileo dark theme color scheme
    galileo_dark_colors = [
        {'fill': 'rgba(227, 84, 84, 0.25)', 'line': '#E35454', 'name': 'Vanguard'},  # Vanguard Red
        {'fill': 'rgba(16, 152, 247, 0.15)', 'line': '#1098F7', 'name': 'Airglow'},  # Airglow Blue
        {'fill': 'rgba(245, 246, 247, 0.15)', 'line': '#F5F6F7', 'name': 'Mercury'},  # Light Mercury
        {'fill': 'rgba(227, 84, 84, 0.35)', 'line': '#B94545', 'name': 'Deep Red'},  # Darker Vanguard
        {'fill': 'rgba(16, 152, 247, 0.25)', 'line': '#0A6BC4', 'name': 'Deep Blue'}  # Darker Airglow
    ]

    # Non-missing values of all plotted models; used to scale the radial axis
    # for metrics that are not bounded by 1.0.
    observed_values = []

    for idx, model_name in enumerate(selected_models):
        model_data = df[df['Model'] == model_name]
        if model_data.empty:
            continue
        model_row = model_data.iloc[0]

        # Missing columns and missing/empty cells are plotted as 0.
        values = []
        for col in domain_columns:
            val = model_row[col] if col in df.columns else ''
            if pd.isna(val) or val == '':
                values.append(0)
            else:
                numeric_val = float(val)
                values.append(numeric_val)
                observed_values.append(numeric_val)

        # Close the radar chart by repeating first value
        values_plot = values + [values[0]]
        domains_plot = domains + [domains[0]]

        colors = galileo_dark_colors[idx % len(galileo_dark_colors)]
        fig.add_trace(
            go.Scatterpolar(
                r=values_plot,
                theta=domains_plot,
                fill='toself',
                fillcolor=colors['fill'],
                line=dict(
                    color=colors['line'],
                    width=3,
                    shape='spline',
                    smoothing=0.8
                ),
                marker=dict(
                    size=10,
                    color=colors['line'],
                    symbol='circle',
                    line=dict(width=2, color='#01091A')
                ),
                name=model_name,
                mode="lines+markers",
                hovertemplate="<b>%{fullData.name}</b><br>" +
                "<span style='color: #94A3B8'>%{theta}</span><br>" +
                "<b style='font-size: 14px; color: #F5F6F7'>%{r:.3f}</b><br>" +
                "<extra></extra>",
                hoverlabel=dict(
                    bgcolor="rgba(1, 9, 26, 0.95)",
                    bordercolor=colors['line'],
                    font=dict(color="#F5F6F7", size=12, family="'Geist', sans-serif")
                )
            )
        )

    # Determine appropriate range based on metric type
    if actual_metric_type in ('Avg AC', 'Avg TSQ'):
        max_range = 1.0
    else:
        # Cost, Duration, Turns: scale to the data with 10% headroom.
        max_range = max(observed_values) * 1.1 if observed_values else 1.0
        if max_range <= 0:
            # All values are zero; avoid a degenerate [0, 0] axis.
            max_range = 1.0

    # Create custom tick values for better readability
    tick_vals = [i * max_range / 5 for i in range(6)]
    tick_text = [f"{val:.2f}" for val in tick_vals]

    fig.update_layout(
        polar=dict(
            bgcolor='rgba(245, 246, 247, 0.03)',
            radialaxis=dict(
                visible=True,
                range=[0, max_range],
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.1)',
                gridwidth=1,
                tickvals=tick_vals,
                ticktext=tick_text,
                tickfont=dict(
                    size=11,
                    color='#94A3B8',
                    family="'Geist Mono', monospace"
                ),
                tickangle=0
            ),
            angularaxis=dict(
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.08)',
                tickfont=dict(
                    size=14,
                    family="'Geist', sans-serif",
                    color='#F5F6F7',
                    weight=600
                ),
                rotation=90,
                direction="clockwise",
            ),
        ),
        showlegend=True,
        legend=dict(
            orientation="v",
            yanchor="middle",
            y=0.5,
            xanchor="left",
            x=1.05,
            font=dict(
                size=12,
                family="'Geist', sans-serif",
                color='#F5F6F7'
            ),
            bgcolor='rgba(1, 9, 26, 0.8)',
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1,
            itemsizing='constant',
            itemwidth=30
        ),
        title=dict(
            text=f"<b>Domain Performance: {metric_type}</b>",
            x=0.5,
            y=0.97,
            font=dict(
                size=22,
                family="'Geist', sans-serif",
                color="#F5F6F7",
                weight=700
            ),
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=900,
        width=1450,
        margin=dict(t=100, b=80, l=80, r=200),
        annotations=[
            dict(
                text="Galileo Agent Leaderboard",
                xref="paper", yref="paper",
                x=0.98, y=0.02,
                xanchor='right', yanchor='bottom',
                font=dict(size=10, color='#64748B'),
                showarrow=False
            )
        ]
    )
    return fig
def create_empty_radar_chart(message):
    """Create an empty radar-chart placeholder that displays a message.

    Args:
        message: Text shown, prefixed with a chart emoji, in a bordered box
            centred on the figure.

    Returns:
        A ``go.Figure`` without traces that carries the chart title, the
        centred message and the "Galileo Agent Leaderboard" watermark in the
        bottom-right corner.
    """
    fig = go.Figure()

    fig.update_layout(
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=900,
        width=1450,
        margin=dict(t=100, b=80, l=80, r=200),
        title=dict(
            text="<b>Domain Performance Chart</b>",
            x=0.5,
            y=0.97,
            font=dict(
                size=22,
                family="'Geist', sans-serif",
                color="#F5F6F7",
                weight=700,
            ),
        ),
    )

    # Both annotations are appended with add_annotation. Passing
    # ``annotations=[...]`` to update_layout once an annotation exists merges
    # the new properties into the existing element, which replaced the
    # message text and position with the watermark's.
    fig.add_annotation(
        text=f"📊 {message}",
        xref="paper", yref="paper",
        x=0.5, y=0.5,
        xanchor='center', yanchor='middle',
        font=dict(
            size=18,
            color="#94A3B8",
            family="'Geist', sans-serif",
        ),
        showarrow=False,
        bgcolor="rgba(245, 246, 247, 0.05)",
        bordercolor="rgba(245, 246, 247, 0.2)",
        borderwidth=1,
        borderpad=20,
    )
    fig.add_annotation(
        text="Galileo Agent Leaderboard",
        xref="paper", yref="paper",
        x=0.98, y=0.02,
        xanchor='right', yanchor='bottom',
        font=dict(size=10, color='#64748B'),
        showarrow=False,
    )
    return fig
# NEW VISUALIZATION FUNCTIONS
def create_cost_performance_scatter(df, metric="Avg AC"):
    """Create a scatter plot of average session cost against a performance metric.

    One trace is drawn per distinct 'Model Type'. Marker size scales with
    'Avg Turns', dashed lines mark the median of each axis, and two corner
    labels name the best (bottom-right) and worst (top-left) quadrants.

    Args:
        df: DataFrame providing the 'Model', 'Model Type', 'Avg Total Cost'
            and 'Avg Turns' columns plus the column named by ``metric``.
            Empty strings are treated as missing values.
        metric: Name of the column plotted on the x-axis. "Avg AC" is titled
            "Action Completion"; any other value is titled
            "Tool Selection Quality".

    Returns:
        A ``go.Figure``, or the result of ``create_empty_chart`` when no row
        has both a numeric cost and a numeric metric value.
    """
    cost_col = 'Avg Total Cost'
    turns_col = 'Avg Turns'
    default_marker_size = 12  # used when a model has no numeric turn count

    # Keep only rows that report both a cost and the chosen metric.
    df_filtered = df[(df[cost_col] != '') & (df[metric] != '')].copy()
    for col in (cost_col, metric, turns_col):
        df_filtered[col] = pd.to_numeric(df_filtered[col], errors='coerce')
    # Values that failed numeric conversion cannot be placed on the axes.
    df_filtered = df_filtered.dropna(subset=[cost_col, metric])
    if df_filtered.empty:
        return create_empty_chart("No data available for cost-performance analysis")

    # get_chart_colors() in this file keys the blue by 'Private'; accept both
    # labels so proprietary models are coloured whichever one the data uses.
    color_map = {
        'Proprietary': '#1098F7',
        'Private': '#1098F7',
        'Open source': '#58BC82',
    }

    fig = go.Figure()

    for model_type in df_filtered['Model Type'].unique():
        df_type = df_filtered[df_filtered['Model Type'] == model_type]
        turns = df_type[turns_col]
        fig.add_trace(go.Scatter(
            x=df_type[metric],
            y=df_type[cost_col],
            mode='markers+text',
            name=model_type,
            text=df_type['Model'],
            # The raw turn count travels as customdata so the hover label
            # shows the real value rather than the scaled marker size.
            customdata=turns,
            textposition="top center",
            textfont=dict(size=10, color='#94A3B8'),
            marker=dict(
                size=(turns * 3).fillna(default_marker_size),
                color=color_map.get(model_type, '#F5F6F7'),
                opacity=0.8,
                line=dict(width=2, color='#01091A'),
            ),
            hovertemplate="<b>%{text}</b><br>" +
                          f"{metric}: %{{x:.3f}}<br>" +
                          "Cost: $%{y:.3f}<br>" +
                          "Turns: %{customdata:.1f}<br>" +
                          "<extra></extra>",
        ))

    # Median lines split the plot into four quadrants.
    median_x = df_filtered[metric].median()
    median_y = df_filtered[cost_col].median()
    fig.add_hline(y=median_y, line_dash="dash", line_color="#64748B", opacity=0.5)
    fig.add_vline(x=median_x, line_dash="dash", line_color="#64748B", opacity=0.5)

    # Quadrant labels are positioned in paper coordinates.
    fig.add_annotation(x=0.95, y=0.05, text="💎 High Performance<br>Low Cost",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="#F5F6F7"),
                       bgcolor="rgba(245, 246, 247, 0.1)")
    fig.add_annotation(x=0.05, y=0.95, text="⚠️ Low Performance<br>High Cost",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="#E35454"),
                       bgcolor="rgba(227, 84, 84, 0.1)")

    metric_display = "Action Completion" if metric == "Avg AC" else "Tool Selection Quality"

    fig.update_layout(
        title=dict(
            text=f"<b>Cost-Performance Efficiency: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="#F5F6F7", weight=700),
        ),
        xaxis=dict(
            title=dict(
                text=f"<b>{metric_display}</b>",
                font=dict(size=16, color="#F5F6F7"),
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)",
        ),
        yaxis=dict(
            title=dict(
                text="<b>Average Session Cost ($)</b>",
                font=dict(size=16, color="#F5F6F7"),
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)",
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=900,
        width=1450,
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
            font=dict(size=12, family="'Geist', sans-serif", color='#F5F6F7'),
            bgcolor='rgba(1, 9, 26, 0.8)',
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1,
        ),
        margin=dict(t=100, b=80, l=80, r=80),
    )
    return fig
def create_speed_accuracy_plot(df, metric="Avg AC"):
    """Create a scatter plot of average session duration against a performance metric.

    Every model is one marker coloured by 'Avg Total Cost'. Dashed lines mark
    the median of each axis, and two corner labels name the fast-and-accurate
    (bottom-right) and slow-and-inaccurate (top-left) quadrants.

    Args:
        df: DataFrame providing the 'Model', 'Avg Session Duration' and
            'Avg Total Cost' columns plus the column named by ``metric``.
            Empty strings are treated as missing values.
        metric: Name of the column plotted on the x-axis. "Avg AC" is titled
            "Action Completion"; any other value is titled
            "Tool Selection Quality".

    Returns:
        A ``go.Figure``, or the result of ``create_empty_chart`` when no row
        has both a numeric duration and a numeric metric value.
    """
    duration_col = 'Avg Session Duration'
    cost_col = 'Avg Total Cost'

    # Keep only rows that report both a duration and the chosen metric.
    df_filtered = df[(df[duration_col] != '') & (df[metric] != '')].copy()
    for col in (duration_col, metric, cost_col):
        df_filtered[col] = pd.to_numeric(df_filtered[col], errors='coerce')
    # Rows whose coordinates are not numeric cannot be plotted; dropping them
    # before the emptiness check avoids returning a blank set of axes.
    df_filtered = df_filtered.dropna(subset=[duration_col, metric])
    if df_filtered.empty:
        return create_empty_chart("No data available for speed-accuracy analysis")

    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=df_filtered[metric],
        y=df_filtered[duration_col],
        mode='markers+text',
        text=df_filtered['Model'],
        textposition="top center",
        textfont=dict(size=9, color='#94A3B8'),
        marker=dict(
            size=12,
            # Marker colour encodes the average session cost.
            color=df_filtered[cost_col],
            colorscale=[[0, '#01091A'], [0.5, '#1098F7'], [1, '#E35454']],
            showscale=True,
            colorbar=dict(
                title=dict(
                    text="Cost ($)",
                    font=dict(color="#F5F6F7"),
                ),
                tickfont=dict(color="#94A3B8"),
                bgcolor="rgba(1, 9, 26, 0.8)",
                bordercolor="rgba(245, 246, 247, 0.2)",
                borderwidth=1,
                x=1.02,
            ),
            line=dict(width=2, color='#01091A'),
        ),
        hovertemplate="<b>%{text}</b><br>" +
                      f"{metric}: %{{x:.3f}}<br>" +
                      "Duration: %{y:.1f}s<br>" +
                      "Cost: $%{marker.color:.3f}<br>" +
                      "<extra></extra>",
    ))

    # Median lines split the plot into four quadrants.
    median_x = df_filtered[metric].median()
    median_y = df_filtered[duration_col].median()
    fig.add_hline(y=median_y, line_dash="dash", line_color="#64748B", opacity=0.5)
    fig.add_vline(x=median_x, line_dash="dash", line_color="#64748B", opacity=0.5)

    # Quadrant labels are positioned in paper coordinates.
    fig.add_annotation(x=0.95, y=0.05, text="⚡ Fast & Accurate",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="#F5F6F7", weight=600))
    fig.add_annotation(x=0.05, y=0.95, text="🐌 Slow & Inaccurate",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="#E35454", weight=600))

    metric_display = "Action Completion" if metric == "Avg AC" else "Tool Selection Quality"

    fig.update_layout(
        title=dict(
            text=f"<b>Speed vs Accuracy Trade-off: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="#F5F6F7", weight=700),
        ),
        xaxis=dict(
            title=dict(
                text=f"<b>{metric_display}</b>",
                font=dict(size=16, color="#F5F6F7"),
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)",
        ),
        yaxis=dict(
            title=dict(
                text="<b>Average Session Duration (seconds)</b>",
                font=dict(size=16, color="#F5F6F7"),
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)",
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=900,
        width=1450,
        margin=dict(t=100, b=80, l=80, r=120),
    )
    return fig
def _normalize_metric(values, invert=False):
    """Min-max scale a numeric Series to the 0-1 range, optionally inverted."""
    lowest = values.min()
    spread = values.max() - lowest
    if pd.isna(spread) or spread == 0:
        # No spread (single model, ties, or nothing numeric): dividing would
        # yield NaN for every model, so known values get a neutral 0.5 and
        # missing values stay missing.
        return values.where(values.isna(), 0.5)
    scaled = (values - lowest) / spread
    return 1 - scaled if invert else scaled


def create_performance_heatmap(df):
    """Create a heatmap of normalised metrics (rows) for every model (columns).

    Each metric is min-max scaled to 0-1 across the models shown. Cost,
    duration and turns are inverted so that a higher score always means
    better. A metric with no spread across models is shown as 0.5, and cells
    without a numeric value are left blank.

    Args:
        df: DataFrame providing the 'Model', 'Avg AC', 'Avg TSQ',
            'Avg Total Cost', 'Avg Session Duration' and 'Avg Turns' columns.
            Rows whose 'Avg AC' is an empty string are excluded.

    Returns:
        A ``go.Figure``, or the result of ``create_empty_chart`` when no row
        has an 'Avg AC' value.
    """
    # Metric column -> row label, in display order.
    label_map = {
        'Avg AC': 'Action Completion',
        'Avg TSQ': 'Tool Selection',
        'Avg Total Cost': 'Cost Efficiency',
        'Avg Session Duration': 'Speed',
        'Avg Turns': 'Conversation Efficiency',
    }
    lower_is_better = {'Avg Total Cost', 'Avg Session Duration', 'Avg Turns'}

    df_filtered = df[df['Avg AC'] != ''].copy()
    if df_filtered.empty:
        return create_empty_chart("No data available for performance heatmap")

    normalized_data = []
    metric_labels = []
    for col, label in label_map.items():
        numeric = pd.to_numeric(df_filtered[col], errors='coerce')
        normalized = _normalize_metric(numeric, invert=col in lower_is_better)
        normalized_data.append(normalized.values)
        metric_labels.append(label)

    # Blank text for missing cells instead of the literal string "nan".
    cell_text = [
        ["" if pd.isna(val) else f"{val:.2f}" for val in row]
        for row in normalized_data
    ]

    fig = go.Figure(data=go.Heatmap(
        z=normalized_data,
        x=df_filtered['Model'].tolist(),
        y=metric_labels,
        colorscale=[[0, '#01091A'], [0.5, '#1098F7'], [1, '#E35454']],
        hovertemplate="<b>%{x}</b><br>" +
                      "%{y}: %{z:.2f}<br>" +
                      "<extra></extra>",
        text=cell_text,
        texttemplate="%{text}",
        textfont={"size": 10, "color": "#F5F6F7"},
        showscale=True,
        colorbar=dict(
            title=dict(
                text="Performance<br>Score",
                font=dict(color="#F5F6F7"),
            ),
            tickfont=dict(color="#94A3B8"),
            bgcolor="rgba(1, 9, 26, 0.8)",
            bordercolor="rgba(245, 246, 247, 0.2)",
            borderwidth=1,
        ),
    ))

    fig.update_layout(
        title=dict(
            text="<b>Comprehensive Performance Heatmap</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="#F5F6F7", weight=700),
        ),
        xaxis=dict(
            side="bottom",
            tickfont=dict(size=11, color="#94A3B8"),
            tickangle=-45,
        ),
        yaxis=dict(
            tickfont=dict(size=13, color="#F5F6F7", weight=600),
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=700,
        width=1550,
        margin=dict(t=100, b=120, l=170, r=120),
    )
    return fig
def create_domain_specialization_matrix(df, metric_type="AC"):
    """Create a bubble chart of per-domain performance for every model.

    Each (model, domain) pair with a numeric score becomes one bubble. Its
    size scales with the domain score and its colour encodes the
    specialization, i.e. the domain score minus the model's overall average
    for the same metric.

    Args:
        df: DataFrame providing the 'Model', 'Model Type' and
            'Avg <metric_type>' columns and, optionally, one
            '<Domain> <metric_type>' column per business domain. Empty
            strings are treated as missing values.
        metric_type: Metric suffix used to build the column names. "AC" is
            titled "Action Completion"; any other value is titled
            "Tool Selection Quality".

    Returns:
        A ``go.Figure``, or the result of ``create_empty_chart`` when no
        (model, domain) pair has a numeric score.
    """
    domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom']
    avg_col = f'Avg {metric_type}'

    records = []
    for _, model in df.iterrows():
        if model['Model'] == '':
            continue
        model_avg = pd.to_numeric(model[avg_col], errors='coerce')
        if pd.isna(model_avg):
            continue

        for domain in domains:
            domain_col = f'{domain} {metric_type}'
            if domain_col not in model or model[domain_col] == '':
                continue
            domain_val = pd.to_numeric(model[domain_col], errors='coerce')
            if pd.isna(domain_val):
                continue
            records.append({
                'Model': model['Model'],
                'Domain': domain,
                'Performance': domain_val,
                # Deviation of this domain from the model's own average.
                'Specialization': domain_val - model_avg,
                'Model Type': model['Model Type'],
            })

    if not records:
        return create_empty_chart("No domain specialization data available")

    df_plot = pd.DataFrame(records)
    hover_text = [
        f"Performance: {p:.3f}<br>Specialization: {s:+.3f}"
        for p, s in zip(df_plot['Performance'], df_plot['Specialization'])
    ]

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=df_plot['Domain'],
        y=df_plot['Model'],
        mode='markers',
        marker=dict(
            size=df_plot['Performance'] * 30,  # bubble size follows the absolute score
            color=df_plot['Specialization'],
            colorscale=[[0, '#1098F7'], [0.5, '#F5F6F7'], [1, '#E35454']],
            # Pin the scale's midpoint to zero so the neutral colour means
            # "on par with the model's average"; otherwise the midpoint
            # drifts with the data's min and max.
            cmid=0,
            showscale=True,
            colorbar=dict(
                title=dict(
                    text="Specialization<br>Strength",
                    font=dict(color="#F5F6F7"),
                ),
                tickfont=dict(color="#94A3B8"),
                bgcolor="rgba(1, 9, 26, 0.8)",
                bordercolor="rgba(245, 246, 247, 0.2)",
                borderwidth=1,
            ),
            line=dict(width=2, color='#01091A'),
            opacity=0.8,
        ),
        text=hover_text,
        hovertemplate="<b>%{y}</b><br>" +
                      "Domain: %{x}<br>" +
                      "%{text}<br>" +
                      "<extra></extra>",
    ))

    metric_display = "Action Completion" if metric_type == "AC" else "Tool Selection Quality"

    fig.update_layout(
        title=dict(
            text=f"<b>Domain Specialization Matrix: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="#F5F6F7", weight=700),
        ),
        xaxis=dict(
            title=dict(
                text="<b>Business Domains</b>",
                font=dict(size=16, color="#F5F6F7"),
            ),
            tickfont=dict(size=13, color="#F5F6F7"),
            gridcolor="rgba(245, 246, 247, 0.1)",
        ),
        yaxis=dict(
            title=dict(
                text="<b>Models</b>",
                font=dict(size=16, color="#F5F6F7"),
            ),
            tickfont=dict(size=11, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=1100,
        width=1450,
        margin=dict(t=100, b=80, l=220, r=120),
    )
    return fig
def create_performance_gap_analysis(df, metric_type="AC"):
    """Create a range plot of the score spread across models for each domain.

    For every domain column present in ``df`` the plot shows the full
    min-max range as a thin line with end markers, the interquartile range
    as a thick bar and the median as a diamond. Domains are ordered by their
    max-min gap, ascending.

    Args:
        df: DataFrame that may contain one '<Domain> <metric_type>' column
            per business domain. Values that are not numeric are ignored.
        metric_type: Metric suffix used to build the column names. "AC" is
            titled "Action Completion"; any other value is titled
            "Tool Selection Quality". For "AC" and "TSQ" the x-axis is fixed
            to the 0-1 range.

    Returns:
        A ``go.Figure``, or the result of ``create_empty_chart`` when no
        domain column holds a numeric value.
    """
    domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom']

    # Summary statistics per domain.
    gap_data = []
    for domain in domains:
        domain_col = f'{domain} {metric_type}'
        if domain_col not in df.columns:
            continue
        domain_values = pd.to_numeric(df[domain_col], errors='coerce').dropna()
        if domain_values.empty:
            continue
        lowest, highest = domain_values.min(), domain_values.max()
        gap_data.append({
            'Domain': domain,
            'Min': lowest,
            'Max': highest,
            'Median': domain_values.median(),
            'Q1': domain_values.quantile(0.25),
            'Q3': domain_values.quantile(0.75),
            'Gap': highest - lowest,
        })

    if not gap_data:
        return create_empty_chart("No data available for gap analysis")

    df_gap = pd.DataFrame(gap_data).sort_values('Gap', ascending=True)

    fig = go.Figure()

    for _, row in df_gap.iterrows():
        domain_pair = [row['Domain'], row['Domain']]

        # Full range: thin grey line from min to max.
        fig.add_trace(go.Scatter(
            x=[row['Min'], row['Max']],
            y=domain_pair,
            mode='lines',
            line=dict(color='#64748B', width=2),
            showlegend=False,
            hoverinfo='skip',
        ))

        # Interquartile range: thick bar from Q1 to Q3. On a categorical
        # y-axis a closed polygon has zero height, so a ``fill='toself'``
        # box paints nothing; a thick line keeps the IQR distinguishable
        # from the full-range line underneath it.
        fig.add_trace(go.Scatter(
            x=[row['Q1'], row['Q3']],
            y=domain_pair,
            mode='lines',
            line=dict(color='#E35454', width=6),
            showlegend=False,
            hoverinfo='skip',
        ))

        # Median: diamond marker that carries the hover summary.
        fig.add_trace(go.Scatter(
            x=[row['Median']],
            y=[row['Domain']],
            mode='markers',
            marker=dict(
                size=12,
                color='#E35454',
                symbol='diamond',
                line=dict(width=2, color='#01091A'),
            ),
            showlegend=False,
            hovertemplate=f"<b>{row['Domain']}</b><br>" +
                          f"Min: {row['Min']:.3f}<br>" +
                          f"Q1: {row['Q1']:.3f}<br>" +
                          f"Median: {row['Median']:.3f}<br>" +
                          f"Q3: {row['Q3']:.3f}<br>" +
                          f"Max: {row['Max']:.3f}<br>" +
                          f"Gap: {row['Gap']:.3f}<br>" +
                          "<extra></extra>",
        ))

    # Min/max end markers are added in a second pass so they are drawn on
    # top of every line trace.
    for _, row in df_gap.iterrows():
        fig.add_trace(go.Scatter(
            x=[row['Min'], row['Max']],
            y=[row['Domain'], row['Domain']],
            mode='markers',
            marker=dict(size=8, color='#F5F6F7', line=dict(width=2, color='#01091A')),
            showlegend=False,
            hoverinfo='skip',
        ))

    metric_display = "Action Completion" if metric_type == "AC" else "Tool Selection Quality"

    fig.update_layout(
        title=dict(
            text=f"<b>Performance Gap Analysis by Domain: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="#F5F6F7", weight=700),
        ),
        xaxis=dict(
            title=dict(
                text=f"<b>{metric_display} Score</b>",
                font=dict(size=16, color="#F5F6F7"),
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            range=[0, 1] if metric_type in ['AC', 'TSQ'] else None,
        ),
        yaxis=dict(
            title=dict(
                text="<b>Business Domain</b>",
                font=dict(size=16, color="#F5F6F7"),
            ),
            tickfont=dict(size=13, color="#F5F6F7"),
            gridcolor="rgba(245, 246, 247, 0.1)",
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=800,
        width=1450,
        margin=dict(t=100, b=80, l=140, r=80),
        showlegend=False,
    )

    # Hand-written legend; the per-domain traces are hidden from the real one.
    fig.add_annotation(
        text="◆ Median ━ IQR ─ Full Range",
        xref="paper", yref="paper",
        x=0.98, y=0.02,
        xanchor='right', yanchor='bottom',
        font=dict(size=12, color='#94A3B8'),
        showarrow=False,
    )
    return fig
def create_empty_chart(message):
"""Create an empty chart with a message"""
fig = go.Figure()
fig.add_annotation(
text=f"πŸ“Š {message}",
xref="paper", yref="paper",
x=0.5, y=0.5,
xanchor='center', yanchor='middle',
font=dict(
size=18,
color="#94A3B8",
family="'Geist', sans-serif"
),
showarrow=False,
bgcolor="rgba(245, 246, 247, 0.05)",
bordercolor="rgba(245, 246, 247, 0.2)",
borderwidth=1,
borderpad=20
)
fig.update_layout(
paper_bgcolor="#01091A",
plot_bgcolor="rgba(245, 246, 247, 0.02)",
height=700,
width=1450,
margin=dict(t=80, b=80, l=80, r=80)
)