import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
import re

def create_performance_plot(json_path='benchmark_report.json'):
    # Define whitelist of interesting models (partial matches)
    WHITELIST = [
        'Meta Llama 4 Maverick',
        'Anthropic Claude 3.7 Sonnet',
        'OpenAI GPT-4o'
    ]
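    # Matching is substring-based: a model is whitelisted if any entry above appears in its 'Model Path'.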

    # Load the benchmark results from JSON
    with open(json_path, 'r') as f:
        json_data = json.load(f)

    # Create DataFrame from JSON data
    df = pd.DataFrame(json_data)
    
    # Rename columns for consistency
    df = df.rename(columns={
        "Model Name": "Model Path",
        "Model Size": "Model Size Raw"
    })
    
    # Calculate overall benchmark score as average of Avg (object) and Avg (country)
    df['Benchmark Score'] = (df['Avg (object)'] + df['Avg (country)']) / 2
    
    # Process model sizes - convert to numeric, handle "-" and extract numbers
    df['Model Size'] = df['Model Size Raw'].replace("-", np.nan)
    
    # Extract the numeric part of a size string, e.g. "72" or "72 B" -> 72.0
    def extract_size(size_val):
        if pd.isna(size_val):
            return np.nan
        if isinstance(size_val, (int, float)):
            return float(size_val)
        if isinstance(size_val, str):
            match = re.search(r'(\d+(?:\.\d+)?)', size_val)
            if match:
                return float(match.group(1))
        return np.nan
    
    df['Model Size'] = df['Model Size'].apply(extract_size)
    
    # Remove models without size information for plotting
    df_with_size = df[df['Model Size'].notna()].copy()
    
    # Print models without size before filtering
    print("\nModels without size assigned:")
    models_without_size = df[df['Model Size'].isna()]
    for idx, row in models_without_size.iterrows():
        print(f"- {row['Model Path']}")

    # Remove extreme outliers (scores that are clearly errors)
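    # (standard 1.5*IQR Tukey fences applied to the combined benchmark score)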
    if len(df_with_size) > 0:
        q1 = df_with_size['Benchmark Score'].quantile(0.25)
        q3 = df_with_size['Benchmark Score'].quantile(0.75)
        iqr = q3 - q1
        df_with_size = df_with_size[
            (df_with_size['Benchmark Score'] >= q1 - 1.5 * iqr) & 
            (df_with_size['Benchmark Score'] <= q3 + 1.5 * iqr)
        ]

    # Find models on Pareto frontier
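    # Sizes are scanned in increasing order with a running best score; a model joins
    # the frontier only if it beats every model of equal or smaller size.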
    sizes = sorted(df_with_size['Model Size'].unique())
    frontier_points = []
    max_score = float('-inf')
    frontier_models = set()

    for size in sizes:
        # Get scores for models of this size or smaller
        subset = df_with_size[df_with_size['Model Size'] <= size]
        if len(subset) > 0:
            max_score_idx = subset['Benchmark Score'].idxmax()
            current_max = subset.loc[max_score_idx, 'Benchmark Score']
            if current_max > max_score:
                max_score = current_max
                frontier_points.append((size, max_score))
                frontier_models.add(subset.loc[max_score_idx, 'Model Path'])

    # Filter models - keep those on Pareto frontier or matching whitelist
    df_with_size['Keep'] = False
    for idx, row in df_with_size.iterrows():
        if row['Model Path'] in frontier_models:
            df_with_size.loc[idx, 'Keep'] = True
        else:
            for pattern in WHITELIST:
                if pattern in row['Model Path']:
                    df_with_size.loc[idx, 'Keep'] = True
                    break

    # Flag whitelisted models that have no size. They cannot be placed on the
    # size axis, so they are tracked here but not added to the scatter plot.
    df_no_size = df[df['Model Size'].isna()].copy()
    df_no_size['Keep'] = False
    for idx, row in df_no_size.iterrows():
        for pattern in WHITELIST:
            if pattern in row['Model Path']:
                df_no_size.loc[idx, 'Keep'] = True
                break

    # Only models with a known size and the Keep flag make it into the plot
    plot_df = df_with_size[df_with_size['Keep']].copy()

    # Create the plot
    fig = plt.figure(figsize=(12, 8))

    if len(plot_df) > 0:
        # Create scatter plot
        plt.scatter(plot_df['Model Size'], 
                   plot_df['Benchmark Score'],
                   alpha=0.6, s=60)

        # Add labels for points
        for idx, row in plot_df.iterrows():
            # Use the full model name for labeling
            model_name = row['Model Path']
            
            plt.annotate(model_name,
                        (row['Model Size'], row['Benchmark Score']),
                        xytext=(5, 5), textcoords='offset points',
                        fontsize=8,
                        bbox=dict(facecolor='white', alpha=0.7, edgecolor='none', pad=0.5))

        # Plot the Pareto frontier line
        if frontier_points:
            frontier_x, frontier_y = zip(*frontier_points)
            plt.plot(frontier_x, frontier_y, 'r--', label='Pareto frontier', linewidth=2)

        # Add vertical line for consumer GPU budget (assuming 24GB can handle ~12B parameters)
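        # ymin < 0 together with clip_on=False lets the line extend below the axis toward the caption text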
        plt.axvline(x=12, color='gray', linestyle=':', label='Consumer-budget GPU limit', ymin=-0.15, clip_on=False)
        plt.text(12, plt.ylim()[0] - (plt.ylim()[1] - plt.ylim()[0]) * 0.1, 
                 'Consumer-budget\nGPU (24GB) limit\nin half precision', 
                 horizontalalignment='center', verticalalignment='top')

    # Customize the plot
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.xlabel('Model Size (billions of parameters)')
    plt.ylabel('Benchmark Score (Average of Object & Country Recognition)')
    plt.title('Polish Photo Recognition: Model Performance vs Size')

    # Add legend
    plt.legend()

    # Set reasonable axis limits
    if len(plot_df) > 0:
        plt.xlim(left=0)
        plt.ylim(bottom=0)

    # Adjust layout to prevent label cutoff
    plt.tight_layout()
    
    return fig

if __name__ == "__main__":
    # When run as a script, save the plot to a file
    fig = create_performance_plot()
    fig.savefig('model_performance.png', dpi=300, bbox_inches='tight')
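
# Note: the structure of benchmark_report.json is not included here. Based on the
# columns the script reads, it is assumed to be a JSON array of records with at
# least the keys "Model Name", "Model Size", "Avg (object)" and "Avg (country)".
# A minimal synthetic file (made-up values, for illustration only) could look like:
#
#     [
#       {"Model Name": "OpenAI GPT-4o", "Model Size": "-",
#        "Avg (object)": 81.0, "Avg (country)": 74.0},
#       {"Model Name": "Example-7B-Instruct", "Model Size": "7",
#        "Avg (object)": 55.0, "Avg (country)": 48.0}
#     ]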