import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
import re

def create_performance_plot(json_path='benchmark_report.json'):
    # Define whitelist of interesting models (partial matches)
    WHITELIST = [
        'Meta Llama 4 Maverick',
        'Anthropic Claude 3.7 Sonnet',
        'OpenAI GPT-4o'
    ]
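    # Matching is substring-based: a model is whitelisted if any entry above appears in its 'Model Path'.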

    # Load the benchmark results from JSON
    with open(json_path, 'r') as f:
        json_data = json.load(f)

    # Create DataFrame from JSON data
    df = pd.DataFrame(json_data)
    
    # Rename columns for consistency
    df = df.rename(columns={
        "Model Name": "Model Path",
        "Model Size": "Model Size Raw"
    })
    
    # Calculate overall benchmark score as average of Avg (object) and Avg (country)
    df['Benchmark Score'] = (df['Avg (object)'] + df['Avg (country)']) / 2
    
    # Process model sizes - convert to numeric, handle "-" and extract numbers
    df['Model Size'] = df['Model Size Raw'].replace("-", np.nan)
    
    # Extract the numeric part of a size string, e.g. "72" or "72 B" -> 72.0
    def extract_size(size_val):
        if pd.isna(size_val):
            return np.nan
        if isinstance(size_val, (int, float)):
            return float(size_val)
        if isinstance(size_val, str):
            match = re.search(r'(\d+(?:\.\d+)?)', size_val)
            if match:
                return float(match.group(1))
        return np.nan
    
    df['Model Size'] = df['Model Size'].apply(extract_size)
    
    # Remove models without size information for plotting
    df_with_size = df[df['Model Size'].notna()].copy()
    
    # Print models without size before filtering
    print("\nModels without size assigned:")
    models_without_size = df[df['Model Size'].isna()]
    for idx, row in models_without_size.iterrows():
        print(f"- {row['Model Path']}")

    # Remove extreme outliers (scores that are clearly errors)
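    # (standard 1.5*IQR Tukey fences applied to the combined benchmark score)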
    if len(df_with_size) > 0:
        q1 = df_with_size['Benchmark Score'].quantile(0.25)
        q3 = df_with_size['Benchmark Score'].quantile(0.75)
        iqr = q3 - q1
        df_with_size = df_with_size[
            (df_with_size['Benchmark Score'] >= q1 - 1.5 * iqr) & 
            (df_with_size['Benchmark Score'] <= q3 + 1.5 * iqr)
        ]

    # Find models on Pareto frontier
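    # Sizes are scanned in increasing order with a running best score; a model joins
    # the frontier only if it beats every model of equal or smaller size.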
    sizes = sorted(df_with_size['Model Size'].unique())
    frontier_points = []
    max_score = float('-inf')
    frontier_models = set()

    for size in sizes:
        # Get scores for models of this size or smaller
        subset = df_with_size[df_with_size['Model Size'] <= size]
        if len(subset) > 0:
            max_score_idx = subset['Benchmark Score'].idxmax()
            current_max = subset.loc[max_score_idx, 'Benchmark Score']
            if current_max > max_score:
                max_score = current_max
                frontier_points.append((size, max_score))
                frontier_models.add(subset.loc[max_score_idx, 'Model Path'])

    # Filter models - keep those on Pareto frontier or matching whitelist
    df_with_size['Keep'] = False
    for idx, row in df_with_size.iterrows():
        if row['Model Path'] in frontier_models:
            df_with_size.loc[idx, 'Keep'] = True
        else:
            for pattern in WHITELIST:
                if pattern in row['Model Path']:
                    df_with_size.loc[idx, 'Keep'] = True
                    break

    # Flag whitelisted models that have no size. They cannot be placed on the
    # size axis, so they are tracked here but not added to the scatter plot.
    df_no_size = df[df['Model Size'].isna()].copy()
    df_no_size['Keep'] = False
    for idx, row in df_no_size.iterrows():
        for pattern in WHITELIST:
            if pattern in row['Model Path']:
                df_no_size.loc[idx, 'Keep'] = True
                break

    # Only models with a known size and the Keep flag make it into the plot
    plot_df = df_with_size[df_with_size['Keep']].copy()

    # Create the plot
    fig = plt.figure(figsize=(12, 8))

    if len(plot_df) > 0:
        # Create scatter plot
        plt.scatter(plot_df['Model Size'], 
                   plot_df['Benchmark Score'],
                   alpha=0.6, s=60)

        # Add labels for points
        for idx, row in plot_df.iterrows():
            # Use the full model name for labeling
            model_name = row['Model Path']
            
            plt.annotate(model_name,
                        (row['Model Size'], row['Benchmark Score']),
                        xytext=(5, 5), textcoords='offset points',
                        fontsize=8,
                        bbox=dict(facecolor='white', alpha=0.7, edgecolor='none', pad=0.5))

        # Plot the Pareto frontier line
        if frontier_points:
            frontier_x, frontier_y = zip(*frontier_points)
            plt.plot(frontier_x, frontier_y, 'r--', label='Pareto frontier', linewidth=2)

        # Add vertical line for consumer GPU budget (assuming 24GB can handle ~12B parameters)
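        # ymin < 0 together with clip_on=False lets the line extend below the axis toward the caption text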
        plt.axvline(x=12, color='gray', linestyle=':', label='Consumer-budget GPU limit', ymin=-0.15, clip_on=False)
        plt.text(12, plt.ylim()[0] - (plt.ylim()[1] - plt.ylim()[0]) * 0.1, 
                 'Consumer-budget\nGPU (24GB) limit\nin half precision', 
                 horizontalalignment='center', verticalalignment='top')

    # Customize the plot
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.xlabel('Model Size (billions of parameters)')
    plt.ylabel('Benchmark Score (Average of Object & Country Recognition)')
    plt.title('Polish Photo Recognition: Model Performance vs Size')

    # Add legend
    plt.legend()

    # Set reasonable axis limits
    if len(plot_df) > 0:
        plt.xlim(left=0)
        plt.ylim(bottom=0)

    # Adjust layout to prevent label cutoff
    plt.tight_layout()
    
    return fig

if __name__ == "__main__":
    # When run as a script, save the plot to a file
    fig = create_performance_plot()
    fig.savefig('model_performance.png', dpi=300, bbox_inches='tight')
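
# Note: the structure of benchmark_report.json is not included here. Based on the
# columns the script reads, it is assumed to be a JSON array of records with at
# least the keys "Model Name", "Model Size", "Avg (object)" and "Avg (country)".
# A minimal synthetic file (made-up values, for illustration only) could look like:
#
#     [
#       {"Model Name": "OpenAI GPT-4o", "Model Size": "-",
#        "Avg (object)": 81.0, "Avg (country)": 74.0},
#       {"Model Name": "Example-7B-Instruct", "Model Size": "7",
#        "Avg (object)": 55.0, "Avg (country)": 48.0}
#     ]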