Refactor app.py to use JSON for benchmark data, removing CSV and metadata dependencies. Update performance plotting to reflect new data structure and enhance visualization with cultural context. Introduce benchmark report JSON file for structured model evaluation results.
fd35185
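The benchmark_report.json file itself isn't part of this listing. Judging from the column references in the code, the loader expects a list of flat records carrying "Model Name", "Model Size" (a number, a numeric string like "72", or "-" when unknown), "Avg (object)", and "Avg (country)". A minimal sketch of that assumed shape, with made-up model names and scores:

import json

# Hypothetical records matching the fields app.py reads; the names and
# values here are illustrative only.
sample_report = [
    {"Model Name": "ExampleLab Example-7B", "Model Size": "7",
     "Avg (object)": 55.0, "Avg (country)": 61.2},
    {"Model Name": "ExampleLab Example-API", "Model Size": "-",  # size unknown
     "Avg (object)": 70.1, "Avg (country)": 68.4},
]

with open("benchmark_report.json", "w") as f:
    json.dump(sample_report, f, indent=2)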
import json
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


def create_performance_plot(json_path='benchmark_report.json'):
    # Whitelist of interesting models to always keep (matched as substrings)
    WHITELIST = [
        'Meta Llama 4 Maverick',
        'Anthropic Claude 3.7 Sonnet',
        'OpenAI GPT-4o'
    ]
    # Load the benchmark results from JSON
    with open(json_path, 'r') as f:
        json_data = json.load(f)

    # Create a DataFrame from the JSON records
    df = pd.DataFrame(json_data)

    # Rename columns for consistency with the rest of the plotting code
    df = df.rename(columns={
        "Model Name": "Model Path",
        "Model Size": "Model Size Raw"
    })

    # Overall benchmark score: mean of the object- and country-recognition averages
    df['Benchmark Score'] = (df['Avg (object)'] + df['Avg (country)']) / 2
    # Process model sizes: treat "-" as missing, then pull out the numeric part
    df['Model Size'] = df['Model Size Raw'].replace("-", np.nan)

    def extract_size(size_val):
        """Extract a numeric size from values like 72, "72", or "72 MB"."""
        if pd.isna(size_val):
            return np.nan
        if isinstance(size_val, (int, float)):
            return float(size_val)
        if isinstance(size_val, str):
            # Take the first number in the string (handles both "72" and "72 MB")
            match = re.search(r'(\d+(?:\.\d+)?)', size_val)
            if match:
                return float(match.group(1))
        return np.nan

    df['Model Size'] = df['Model Size'].apply(extract_size)
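    # Expected behavior (illustrative): extract_size("72 MB") -> 72.0,
    # extract_size(7.5) -> 7.5, extract_size(np.nan) -> nan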
    # Report models without size information (they cannot be placed on the plot)
    models_without_size = df[df['Model Size'].isna()]
    print("\nModels without size assigned:")
    for idx, row in models_without_size.iterrows():
        print(f"- {row['Model Path']}")

    # Keep only models with a known size for plotting
    df_with_size = df[df['Model Size'].notna()].copy()
    # Remove extreme outliers (scores outside 1.5 * IQR are treated as errors)
    if len(df_with_size) > 0:
        q1 = df_with_size['Benchmark Score'].quantile(0.25)
        q3 = df_with_size['Benchmark Score'].quantile(0.75)
        iqr = q3 - q1
        df_with_size = df_with_size[
            (df_with_size['Benchmark Score'] >= q1 - 1.5 * iqr) &
            (df_with_size['Benchmark Score'] <= q3 + 1.5 * iqr)
        ]
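    # Illustration with hypothetical quartiles: q1 = 50 and q3 = 70 give
    # iqr = 20, so any score outside [20, 100] would be discarded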
    # Find models on the Pareto frontier: a model is on the frontier if no
    # model of equal or smaller size achieves a higher score
    sizes = sorted(df_with_size['Model Size'].unique())
    frontier_points = []
    frontier_models = set()
    max_score = float('-inf')
    for size in sizes:
        # Best score among models of this size or smaller
        subset = df_with_size[df_with_size['Model Size'] <= size]
        if len(subset) > 0:
            max_score_idx = subset['Benchmark Score'].idxmax()
            current_max = subset.loc[max_score_idx, 'Benchmark Score']
            if current_max > max_score:
                max_score = current_max
                frontier_points.append((size, max_score))
                frontier_models.add(subset.loc[max_score_idx, 'Model Path'])
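    # Illustration (hypothetical numbers): sizes [7, 12, 70] with best scores
    # [60, 58, 72] yield frontier_points [(7, 60), (70, 72)]; the dominated
    # 12B model is dropped later unless it matches the whitelist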
    # Keep models that are on the Pareto frontier or match the whitelist
    df_with_size['Keep'] = False
    for idx, row in df_with_size.iterrows():
        if row['Model Path'] in frontier_models:
            df_with_size.loc[idx, 'Keep'] = True
        else:
            for pattern in WHITELIST:
                if pattern in row['Model Path']:
                    df_with_size.loc[idx, 'Keep'] = True
                    break
    # Whitelisted models without size information cannot be placed on the size
    # axis, so report them instead of silently dropping them
    df_no_size = df[df['Model Size'].isna()]
    for idx, row in df_no_size.iterrows():
        if any(pattern in row['Model Path'] for pattern in WHITELIST):
            print(f"Whitelisted model excluded (no size): {row['Model Path']}")

    # Data actually used for plotting
    plot_df = df_with_size[df_with_size['Keep']].copy()
    # Create the plot
    fig = plt.figure(figsize=(12, 8))

    if len(plot_df) > 0:
        # Scatter plot of size vs score
        plt.scatter(plot_df['Model Size'],
                    plot_df['Benchmark Score'],
                    alpha=0.6, s=60)

        # Label each point with the full model name
        for idx, row in plot_df.iterrows():
            plt.annotate(row['Model Path'],
                         (row['Model Size'], row['Benchmark Score']),
                         xytext=(5, 5), textcoords='offset points',
                         fontsize=8,
                         bbox=dict(facecolor='white', alpha=0.7,
                                   edgecolor='none', pad=0.5))
    # Plot the Pareto frontier line
    if frontier_points:
        frontier_x, frontier_y = zip(*frontier_points)
        plt.plot(frontier_x, frontier_y, 'r--', label='Pareto frontier', linewidth=2)
    # Fix axis limits before placing the below-axis label, so that its
    # position (computed from ylim) is final
    if len(plot_df) > 0:
        plt.xlim(left=0)
        plt.ylim(bottom=0)

    # Vertical line for the consumer GPU budget: a 24 GB card fits roughly a
    # 12B-parameter model in half precision (24 GB / 2 bytes per parameter)
    plt.axvline(x=12, color='gray', linestyle=':',
                label='Consumer-budget GPU limit', ymin=-0.15, clip_on=False)
    y_min, y_max = plt.ylim()
    plt.text(12, y_min - (y_max - y_min) * 0.1,
             'Consumer-budget\nGPU (24GB) limit\nin half precision',
             horizontalalignment='center', verticalalignment='top')
    # Customize the plot
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.xlabel('Model Size (billions of parameters)')
    plt.ylabel('Benchmark Score (Average of Object & Country Recognition)')
    plt.title('Polish Photo Recognition: Model Performance vs Size')
    plt.legend()
    # Adjust layout to prevent label cutoff
    plt.tight_layout()

    return fig


if __name__ == "__main__":
    # When run as a script, save the plot to a file
    fig = create_performance_plot()
    fig.savefig('model_performance.png', dpi=300, bbox_inches='tight')