Refactor app.py to use JSON for benchmark data, removing CSV and metadata dependencies. Update performance plotting to reflect new data structure and enhance visualization with cultural context. Introduce benchmark report JSON file for structured model evaluation results.
fd35185
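The benchmark_report.json file itself isn't part of this listing. Judging from the column references in the code, the loader expects a list of flat records carrying "Model Name", "Model Size" (a number, a numeric string like "72", or "-" when unknown), "Avg (object)", and "Avg (country)". A minimal sketch of that assumed shape, with made-up model names and scores:

import json

# Hypothetical records matching the fields app.py reads; the names and
# values here are illustrative only.
sample_report = [
    {"Model Name": "ExampleLab Example-7B", "Model Size": "7",
     "Avg (object)": 55.0, "Avg (country)": 61.2},
    {"Model Name": "ExampleLab Example-API", "Model Size": "-",  # size unknown
     "Avg (object)": 70.1, "Avg (country)": 68.4},
]

with open("benchmark_report.json", "w") as f:
    json.dump(sample_report, f, indent=2)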
import json
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


def create_performance_plot(json_path='benchmark_report.json'):
    # Whitelist of interesting models to always keep (matched as substrings)
    WHITELIST = [
        'Meta Llama 4 Maverick',
        'Anthropic Claude 3.7 Sonnet',
        'OpenAI GPT-4o'
    ]
    # Load the benchmark results from JSON
    with open(json_path, 'r') as f:
        json_data = json.load(f)

    # Create a DataFrame from the JSON records
    df = pd.DataFrame(json_data)

    # Rename columns for consistency with the rest of the plotting code
    df = df.rename(columns={
        "Model Name": "Model Path",
        "Model Size": "Model Size Raw"
    })

    # Overall benchmark score: mean of the object- and country-recognition averages
    df['Benchmark Score'] = (df['Avg (object)'] + df['Avg (country)']) / 2
    # Process model sizes: treat "-" as missing, then pull out the numeric part
    df['Model Size'] = df['Model Size Raw'].replace("-", np.nan)

    def extract_size(size_val):
        """Extract a numeric size from values like 72, "72", or "72 MB"."""
        if pd.isna(size_val):
            return np.nan
        if isinstance(size_val, (int, float)):
            return float(size_val)
        if isinstance(size_val, str):
            # Take the first number in the string (handles both "72" and "72 MB")
            match = re.search(r'(\d+(?:\.\d+)?)', size_val)
            if match:
                return float(match.group(1))
        return np.nan

    df['Model Size'] = df['Model Size'].apply(extract_size)
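    # Expected behavior (illustrative): extract_size("72 MB") -> 72.0,
    # extract_size(7.5) -> 7.5, extract_size(np.nan) -> nan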
    # Report models without size information (they cannot be placed on the plot)
    models_without_size = df[df['Model Size'].isna()]
    print("\nModels without size assigned:")
    for idx, row in models_without_size.iterrows():
        print(f"- {row['Model Path']}")

    # Keep only models with a known size for plotting
    df_with_size = df[df['Model Size'].notna()].copy()
    # Remove extreme outliers (scores outside 1.5 * IQR are treated as errors)
    if len(df_with_size) > 0:
        q1 = df_with_size['Benchmark Score'].quantile(0.25)
        q3 = df_with_size['Benchmark Score'].quantile(0.75)
        iqr = q3 - q1
        df_with_size = df_with_size[
            (df_with_size['Benchmark Score'] >= q1 - 1.5 * iqr) &
            (df_with_size['Benchmark Score'] <= q3 + 1.5 * iqr)
        ]
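    # Illustration with hypothetical quartiles: q1 = 50 and q3 = 70 give
    # iqr = 20, so any score outside [20, 100] would be discarded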
    # Find models on the Pareto frontier: a model is on the frontier if no
    # model of equal or smaller size achieves a higher score
    sizes = sorted(df_with_size['Model Size'].unique())
    frontier_points = []
    frontier_models = set()
    max_score = float('-inf')
    for size in sizes:
        # Best score among models of this size or smaller
        subset = df_with_size[df_with_size['Model Size'] <= size]
        if len(subset) > 0:
            max_score_idx = subset['Benchmark Score'].idxmax()
            current_max = subset.loc[max_score_idx, 'Benchmark Score']
            if current_max > max_score:
                max_score = current_max
                frontier_points.append((size, max_score))
                frontier_models.add(subset.loc[max_score_idx, 'Model Path'])
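    # Illustration (hypothetical numbers): sizes [7, 12, 70] with best scores
    # [60, 58, 72] yield frontier_points [(7, 60), (70, 72)]; the dominated
    # 12B model is dropped later unless it matches the whitelist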
    # Keep models that are on the Pareto frontier or match the whitelist
    df_with_size['Keep'] = False
    for idx, row in df_with_size.iterrows():
        if row['Model Path'] in frontier_models:
            df_with_size.loc[idx, 'Keep'] = True
        else:
            for pattern in WHITELIST:
                if pattern in row['Model Path']:
                    df_with_size.loc[idx, 'Keep'] = True
                    break
    # Whitelisted models without size information cannot be placed on the size
    # axis, so report them instead of silently dropping them
    df_no_size = df[df['Model Size'].isna()]
    for idx, row in df_no_size.iterrows():
        if any(pattern in row['Model Path'] for pattern in WHITELIST):
            print(f"Whitelisted model excluded (no size): {row['Model Path']}")

    # Data actually used for plotting
    plot_df = df_with_size[df_with_size['Keep']].copy()
    # Create the plot
    fig = plt.figure(figsize=(12, 8))

    if len(plot_df) > 0:
        # Scatter plot of size vs score
        plt.scatter(plot_df['Model Size'],
                    plot_df['Benchmark Score'],
                    alpha=0.6, s=60)

        # Label each point with the full model name
        for idx, row in plot_df.iterrows():
            plt.annotate(row['Model Path'],
                         (row['Model Size'], row['Benchmark Score']),
                         xytext=(5, 5), textcoords='offset points',
                         fontsize=8,
                         bbox=dict(facecolor='white', alpha=0.7,
                                   edgecolor='none', pad=0.5))
    # Plot the Pareto frontier line
    if frontier_points:
        frontier_x, frontier_y = zip(*frontier_points)
        plt.plot(frontier_x, frontier_y, 'r--', label='Pareto frontier', linewidth=2)
    # Fix axis limits before placing the below-axis label, so that its
    # position (computed from ylim) is final
    if len(plot_df) > 0:
        plt.xlim(left=0)
        plt.ylim(bottom=0)

    # Vertical line for the consumer GPU budget: a 24 GB card fits roughly a
    # 12B-parameter model in half precision (24 GB / 2 bytes per parameter)
    plt.axvline(x=12, color='gray', linestyle=':',
                label='Consumer-budget GPU limit', ymin=-0.15, clip_on=False)
    y_min, y_max = plt.ylim()
    plt.text(12, y_min - (y_max - y_min) * 0.1,
             'Consumer-budget\nGPU (24GB) limit\nin half precision',
             horizontalalignment='center', verticalalignment='top')
    # Customize the plot
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.xlabel('Model Size (billions of parameters)')
    plt.ylabel('Benchmark Score (Average of Object & Country Recognition)')
    plt.title('Polish Photo Recognition: Model Performance vs Size')
    plt.legend()
    # Adjust layout to prevent label cutoff
    plt.tight_layout()

    return fig


if __name__ == "__main__":
    # When run as a script, save the plot to a file
    fig = create_performance_plot()
    fig.savefig('model_performance.png', dpi=300, bbox_inches='tight')