import json
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


def create_performance_plot(json_path='benchmark_report.json'):
    # Whitelist of interesting models (matched as substrings of the model name)
    WHITELIST = [
        'Meta Llama 4 Maverick',
        'Anthropic Claude 3.7 Sonnet',
        'OpenAI GPT-4o',
    ]

    # Load the benchmark results from JSON
    with open(json_path, 'r') as f:
        json_data = json.load(f)

    # Create DataFrame from JSON data
    df = pd.DataFrame(json_data)

    # Rename columns for consistency
    df = df.rename(columns={
        "Model Name": "Model Path",
        "Model Size": "Model Size Raw",
    })

    # Overall benchmark score: average of Avg (object) and Avg (country)
    df['Benchmark Score'] = (df['Avg (object)'] + df['Avg (country)']) / 2

    # Process model sizes: convert to numeric, treat "-" as missing
    df['Model Size'] = df['Model Size Raw'].replace("-", np.nan)

    # Extract numeric values from size strings like "72 MB" -> 72.0 or plain "72" -> 72.0
    def extract_size(size_val):
        if pd.isna(size_val):
            return np.nan
        if isinstance(size_val, (int, float)):
            return float(size_val)
        if isinstance(size_val, str):
            match = re.search(r'(\d+(?:\.\d+)?)', size_val)
            if match:
                return float(match.group(1))
        return np.nan

    df['Model Size'] = df['Model Size'].apply(extract_size)

    # Keep only models with size information for plotting
    df_with_size = df[df['Model Size'].notna()].copy()

    # Report models without size information
    print("\nModels without size assigned:")
    models_without_size = df[df['Model Size'].isna()]
    for idx, row in models_without_size.iterrows():
        print(f"- {row['Model Path']}")

    # Remove extreme outliers (scores that are clearly errors) using the IQR rule
    if len(df_with_size) > 0:
        q1 = df_with_size['Benchmark Score'].quantile(0.25)
        q3 = df_with_size['Benchmark Score'].quantile(0.75)
        iqr = q3 - q1
        df_with_size = df_with_size[
            (df_with_size['Benchmark Score'] >= q1 - 1.5 * iqr)
            & (df_with_size['Benchmark Score'] <= q3 + 1.5 * iqr)
        ]

    # Find models on the Pareto frontier (best score achievable at or below each size)
    sizes = sorted(df_with_size['Model Size'].unique())
    frontier_points = []
    max_score = float('-inf')
    frontier_models = set()
    for size in sizes:
        # Scores for models of this size or smaller
        subset = df_with_size[df_with_size['Model Size'] <= size]
        if len(subset) > 0:
            max_score_idx = subset['Benchmark Score'].idxmax()
            current_max = subset.loc[max_score_idx, 'Benchmark Score']
            if current_max > max_score:
                max_score = current_max
                frontier_points.append((size, max_score))
                frontier_models.add(subset.loc[max_score_idx, 'Model Path'])

    # Keep models that are on the Pareto frontier or match the whitelist
    df_with_size['Keep'] = False
    for idx, row in df_with_size.iterrows():
        if row['Model Path'] in frontier_models:
            df_with_size.loc[idx, 'Keep'] = True
        else:
            for pattern in WHITELIST:
                if pattern in row['Model Path']:
                    df_with_size.loc[idx, 'Keep'] = True
                    break

    # Flag whitelisted models without size information; they cannot be placed on
    # the size axis, so they are reported above but not plotted
    df_no_size = df[df['Model Size'].isna()].copy()
    df_no_size['Keep'] = False
    for idx, row in df_no_size.iterrows():
        for pattern in WHITELIST:
            if pattern in row['Model Path']:
                df_no_size.loc[idx, 'Keep'] = True
                break

    # Only models with a known size are plotted
    plot_df = df_with_size[df_with_size['Keep']].copy()

    # Create the plot
    fig = plt.figure(figsize=(12, 8))

    if len(plot_df) > 0:
        # Scatter plot of model size vs benchmark score
        plt.scatter(plot_df['Model Size'], plot_df['Benchmark Score'],
                    alpha=0.6, s=60)

        # Label each point with the full model name
        for idx, row in plot_df.iterrows():
            model_name = row['Model Path']
            plt.annotate(model_name,
                         (row['Model Size'], row['Benchmark Score']),
                         xytext=(5, 5), textcoords='offset points',
                         fontsize=8,
                         bbox=dict(facecolor='white', alpha=0.7,
                                   edgecolor='none', pad=0.5))

    # Plot the Pareto frontier line
    if frontier_points:
        frontier_x, frontier_y = zip(*frontier_points)
        plt.plot(frontier_x, frontier_y, 'r--', label='Pareto frontier', linewidth=2)

    # Vertical line for the consumer GPU budget (24 GB fits roughly a 12B-parameter
    # model in half precision)
    plt.axvline(x=12, color='gray', linestyle=':',
                label='Consumer-budget GPU limit', ymin=-0.15, clip_on=False)
    plt.text(12, plt.ylim()[0] - (plt.ylim()[1] - plt.ylim()[0]) * 0.1,
             'Consumer-budget\nGPU (24GB) limit\nin half precision',
             horizontalalignment='center', verticalalignment='top')

    # Customize the plot
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.xlabel('Model Size (billions of parameters)')
    plt.ylabel('Benchmark Score (Average of Object & Country Recognition)')
    plt.title('Polish Photo Recognition: Model Performance vs Size')

    # Add legend
    plt.legend()

    # Set reasonable axis limits
    if len(plot_df) > 0:
        plt.xlim(left=0)
        plt.ylim(bottom=0)

    # Adjust layout to prevent label cutoff
    plt.tight_layout()

    return fig


if __name__ == "__main__":
    # When run as a script, save the plot to a file
    fig = create_performance_plot()
    fig.savefig('model_performance.png', dpi=300, bbox_inches='tight')
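
# A minimal sketch of the benchmark_report.json structure this script expects.
# The keys ("Model Name", "Model Size", "Avg (object)", "Avg (country)") are taken
# from the code above; the model names, sizes, and scores below are illustrative
# placeholders, not real benchmark results. "Model Size" may be a bare number,
# a string with a unit, or "-" when the size is unknown (e.g. hosted API models).
#
# [
#   {
#     "Model Name": "ExampleLab Vision 7B",
#     "Model Size": "7",
#     "Avg (object)": 0.62,
#     "Avg (country)": 0.55
#   },
#   {
#     "Model Name": "Hosted API Model",
#     "Model Size": "-",
#     "Avg (object)": 0.71,
#     "Avg (country)": 0.68
#   }
# ]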