import json
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
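
# The report is assumed (judging by the columns referenced below) to be a
# list of records shaped like:
#   {"Model Name": ..., "Model Size": "12B" or "-",
#    "Avg (object)": ..., "Avg (country)": ...}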


def create_performance_plot(json_path='benchmark_report.json'):
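    """Plot benchmark score against model size and return the figure.

    Models on the Pareto frontier (best score at or below each size) are
    plotted, along with any whitelisted model that has a known size.
    """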
    # Models kept in the plot even when they are not on the Pareto frontier
    # (matched as substrings of the model name).
    WHITELIST = [
        'Meta Llama 4 Maverick',
        'Anthropic Claude 3.7 Sonnet',
        'OpenAI GPT-4o',
    ]

    with open(json_path, 'r') as f:
        json_data = json.load(f)

    df = pd.DataFrame(json_data)

    df = df.rename(columns={
        "Model Name": "Model Path",
        "Model Size": "Model Size Raw",
    })

    # Overall score: mean of the object- and country-recognition averages.
    df['Benchmark Score'] = (df['Avg (object)'] + df['Avg (country)']) / 2

    # "-" marks an unknown parameter count in the report.
    df['Model Size'] = df['Model Size Raw'].replace("-", np.nan)

    def extract_size(size_val):
        # Pull the numeric part out of size values such as "7B" or "70.6B".
        if pd.isna(size_val):
            return np.nan
        if isinstance(size_val, (int, float)):
            return float(size_val)
        if isinstance(size_val, str):
            match = re.search(r'(\d+(?:\.\d+)?)', size_val)
            if match:
                return float(match.group(1))
        return np.nan

    df['Model Size'] = df['Model Size'].apply(extract_size)

    df_with_size = df[df['Model Size'].notna()].copy()

    # Report models whose size could not be determined.
    print("\nModels without size assigned:")
    models_without_size = df[df['Model Size'].isna()]
    for _, row in models_without_size.iterrows():
        print(f"- {row['Model Path']}")
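
    # Drop score outliers outside 1.5 * IQR so a single anomalous result
    # does not distort the frontier.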
    if len(df_with_size) > 0:
        q1 = df_with_size['Benchmark Score'].quantile(0.25)
        q3 = df_with_size['Benchmark Score'].quantile(0.75)
        iqr = q3 - q1
        df_with_size = df_with_size[
            (df_with_size['Benchmark Score'] >= q1 - 1.5 * iqr) &
            (df_with_size['Benchmark Score'] <= q3 + 1.5 * iqr)
        ]
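
    # Build the Pareto frontier: for each size, the best score achievable at
    # that size or below. A model joins the frontier only if it beats every
    # model of equal or smaller size.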
    sizes = sorted(df_with_size['Model Size'].unique())
    frontier_points = []
    max_score = float('-inf')
    frontier_models = set()

    for size in sizes:
        subset = df_with_size[df_with_size['Model Size'] <= size]
        if len(subset) > 0:
            max_score_idx = subset['Benchmark Score'].idxmax()
            current_max = subset.loc[max_score_idx, 'Benchmark Score']
            if current_max > max_score:
                max_score = current_max
                frontier_points.append((size, max_score))
                frontier_models.add(subset.loc[max_score_idx, 'Model Path'])
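
    # Keep a model if it is on the frontier or matches a whitelist entry.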
    df_with_size['Keep'] = False
    for idx, row in df_with_size.iterrows():
        if row['Model Path'] in frontier_models:
            df_with_size.loc[idx, 'Keep'] = True
        else:
            for pattern in WHITELIST:
                if pattern in row['Model Path']:
                    df_with_size.loc[idx, 'Keep'] = True
                    break

    # Whitelisted models without a parsed size cannot be placed on the size
    # axis; this subset is flagged for reference only and is not plotted.
    df_no_size = df[df['Model Size'].isna()].copy()
    df_no_size['Keep'] = False
    for idx, row in df_no_size.iterrows():
        for pattern in WHITELIST:
            if pattern in row['Model Path']:
                df_no_size.loc[idx, 'Keep'] = True
                break

    plot_df = df_with_size[df_with_size['Keep']].copy()

    fig = plt.figure(figsize=(12, 8))

    if len(plot_df) > 0:
        plt.scatter(plot_df['Model Size'],
                    plot_df['Benchmark Score'],
                    alpha=0.6, s=60)

        # Label each point with its model name.
        for _, row in plot_df.iterrows():
            model_name = row['Model Path']
            plt.annotate(model_name,
                         (row['Model Size'], row['Benchmark Score']),
                         xytext=(5, 5), textcoords='offset points',
                         fontsize=8,
                         bbox=dict(facecolor='white', alpha=0.7,
                                   edgecolor='none', pad=0.5))

    if frontier_points:
        frontier_x, frontier_y = zip(*frontier_points)
        plt.plot(frontier_x, frontier_y, 'r--', label='Pareto frontier',
                 linewidth=2)
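
    # At half precision (2 bytes per parameter), a ~12B-parameter model takes
    # roughly 24 GB of VRAM, the capacity of a consumer-budget GPU.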
    plt.axvline(x=12, color='gray', linestyle=':',
                label='Consumer-budget GPU limit', ymin=-0.15, clip_on=False)

    # Fix the axis limits before computing the label position below, so the
    # label lands 10% under the final axis rather than the autoscaled one.
    if len(plot_df) > 0:
        plt.xlim(left=0)
        plt.ylim(bottom=0)

    y_min, y_max = plt.ylim()
    plt.text(12, y_min - (y_max - y_min) * 0.1,
             'Consumer-budget\nGPU (24GB) limit\nin half precision',
             horizontalalignment='center', verticalalignment='top')

    plt.grid(True, linestyle='--', alpha=0.7)
    plt.xlabel('Model Size (billions of parameters)')
    plt.ylabel('Benchmark Score (Average of Object & Country Recognition)')
    plt.title('Polish Photo Recognition: Model Performance vs Size')
    plt.legend()

    plt.tight_layout()

    return fig


if __name__ == "__main__":
    fig = create_performance_plot()
    fig.savefig('model_performance.png', dpi=300, bbox_inches='tight')