"""Main script to run AI model evaluation benchmarks""" | |
import argparse | |
import asyncio | |
import json | |
import os | |
import yaml | |
from datetime import datetime | |
from typing import List, Dict, Any | |
from dotenv import load_dotenv | |
import pandas as pd | |
from apis.api_factory import APIFactory | |
from benchmarks import get_benchmark, BenchmarkResult | |
# Load environment variables | |
load_dotenv() | |
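
# The ${VAR} placeholders in the config are resolved from the environment, so a
# local .env file is expected to hold the API keys. Illustrative sketch only;
# the exact variable names depend on your config:
#   OPENAI_API_KEY=sk-...
#   ANTHROPIC_API_KEY=sk-ant-...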

def load_config(config_path: str = 'config.yaml') -> dict:
    """Load configuration from YAML file"""
    with open(config_path, 'r') as f:
        config = yaml.safe_load(f)

    # Replace ${VAR} placeholders with environment variable values
    def replace_env_vars(obj):
        if isinstance(obj, str) and obj.startswith('${') and obj.endswith('}'):
            env_var = obj[2:-1]
            return os.getenv(env_var, obj)
        elif isinstance(obj, dict):
            return {k: replace_env_vars(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [replace_env_vars(item) for item in obj]
        return obj

    return replace_env_vars(config)
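
# For reference, a minimal config.yaml this loader and main() can consume might
# look like the sketch below. The key names under models/benchmarks are
# assumptions inferred from how the config is read in this script, not a
# definitive schema:
#
#   models:
#     openai:
#       api_key: ${OPENAI_API_KEY}
#       models: [gpt-4o]
#   benchmarks:
#     mmlu:
#       enabled: true
#       sample_size: 100
#   evaluation:
#     concurrent_requests: 5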

def save_results(results: List[BenchmarkResult], output_dir: str):
    """Save evaluation results"""
    os.makedirs(output_dir, exist_ok=True)

    # Create timestamp for this run
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    # Save detailed results as JSON
    detailed_results = []
    for result in results:
        detailed_results.append({
            'benchmark': result.benchmark_name,
            'model': result.model_name,
            'total_questions': result.total_questions,
            'correct': result.correct,
            'accuracy': result.accuracy,
            'avg_response_time': result.avg_response_time,
            'timestamp': timestamp
        })

    json_path = os.path.join(output_dir, f'results_{timestamp}.json')
    with open(json_path, 'w') as f:
        json.dump(detailed_results, f, indent=2)

    # Save summary as CSV
    df = pd.DataFrame(detailed_results)
    csv_path = os.path.join(output_dir, f'summary_{timestamp}.csv')
    df.to_csv(csv_path, index=False)

    # Save raw results for debugging
    for result in results:
        raw_path = os.path.join(output_dir, f'{result.model_name}_{result.benchmark_name}_{timestamp}_raw.json')
        with open(raw_path, 'w') as f:
            json.dump(result.raw_results, f, indent=2)

    return json_path, csv_path
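
# One entry in results_<timestamp>.json looks roughly like this (values are
# illustrative, not real measurements):
#   {"benchmark": "mmlu", "model": "gpt-4o", "total_questions": 100,
#    "correct": 87, "accuracy": 0.87, "avg_response_time": 1.42,
#    "timestamp": "20240101_120000"}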

def print_results_table(results: List[BenchmarkResult]):
    """Print results in a nice table format"""
    if not results:
        return

    # Group by model
    model_results = {}
    for result in results:
        if result.model_name not in model_results:
            model_results[result.model_name] = {}
        model_results[result.model_name][result.benchmark_name] = result

    # Print header
    benchmarks = sorted(set(r.benchmark_name for r in results))
    print("\n" + "="*80)
    print("EVALUATION RESULTS")
    print("="*80)

    # Column header row (one 15-wide column per benchmark, plus an average column)
    print(f"{'Model':<20}", end="")
    for bench in benchmarks:
        print(f"{bench:<15}", end="")
    print(f"{'Average':<10}")
    print("-"*80)

    # Print results for each model
    for model, bench_results in model_results.items():
        print(f"{model:<20}", end="")
        scores = []
        for bench in benchmarks:
            if bench in bench_results:
                score = bench_results[bench].accuracy * 100
                scores.append(score)
                cell = f"{score:.1f}%"
                print(f"{cell:<15}", end="")
            else:
                print(f"{'N/A':<15}", end="")

        # Calculate average over the benchmarks this model completed
        if scores:
            avg = sum(scores) / len(scores)
            print(f"{avg:.1f}%")
        else:
            print("N/A")

    print("="*80)

async def run_single_evaluation(api, benchmark_name: str, config: dict) -> BenchmarkResult:
    """Run a single benchmark evaluation"""
    benchmark = get_benchmark(benchmark_name)

    # Get benchmark-specific config
    bench_config = config['benchmarks'].get(benchmark_name, {})
    eval_config = config['evaluation']

    # Merge configs
    kwargs = {
        **eval_config,
        'concurrent_requests': eval_config.get('concurrent_requests', 5)
    }

    # Add benchmark-specific configs but exclude sample_size
    for key, value in bench_config.items():
        if key != 'sample_size':
            kwargs[key] = value

    # Run benchmark
    result = await benchmark.run_benchmark(
        api,
        sample_size=bench_config.get('sample_size'),
        **kwargs
    )
    return result
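
# Sketch of how the merge plays out for one benchmark, assuming the config
# fragment below (num_shots is a hypothetical benchmark-specific option):
#
#   evaluation:
#     concurrent_requests: 5
#   benchmarks:
#     mmlu:
#       sample_size: 100
#       num_shots: 5
#
# run_single_evaluation(api, 'mmlu', config) would then call
#   benchmark.run_benchmark(api, sample_size=100, concurrent_requests=5, num_shots=5)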

async def main():
    parser = argparse.ArgumentParser(description='Run AI benchmark evaluation')
    parser.add_argument('--models', nargs='+', help='Models to evaluate (e.g., gpt-4o claude-3-opus)')
    parser.add_argument('--benchmarks', nargs='+', help='Benchmarks to run (e.g., mmlu gsm8k)')
    parser.add_argument('--config', default='config.yaml', help='Config file path')
    parser.add_argument('--output-dir', default='results', help='Output directory for results')
    parser.add_argument('--no-save', action='store_true', help='Do not save results to files')
    args = parser.parse_args()

    # Load configuration
    config = load_config(args.config)

    # Determine which models to evaluate
    if args.models:
        models_to_eval = args.models
    else:
        # Get all models from config
        models_to_eval = []
        for provider, provider_config in config['models'].items():
            for model in provider_config.get('models', []):
                models_to_eval.append(model)

    # Determine which benchmarks to run
    if args.benchmarks:
        benchmarks_to_run = args.benchmarks
    else:
        # Get enabled benchmarks from config
        benchmarks_to_run = [
            name for name, bench_config in config['benchmarks'].items()
            if bench_config.get('enabled', True)
        ]

    print(f"Models to evaluate: {models_to_eval}")
    print(f"Benchmarks to run: {benchmarks_to_run}")

    # Run evaluations
    all_results = []
    for model_name in models_to_eval:
        print(f"\n{'='*60}")
        print(f"Evaluating model: {model_name}")
        print(f"{'='*60}")
        try:
            # Create API instance
            api = APIFactory.create_api(model_name, config)

            # Run each benchmark
            for benchmark_name in benchmarks_to_run:
                print(f"\nRunning {benchmark_name} benchmark...")
                try:
                    result = await run_single_evaluation(api, benchmark_name, config)
                    all_results.append(result)
                    print(f"[OK] {benchmark_name}: {result.accuracy*100:.1f}% accuracy")
                except Exception as e:
                    print(f"[ERROR] {benchmark_name}: {e}")
        except Exception as e:
            print(f"Failed to create API for {model_name}: {e}")
            continue

    # Print results table
    print_results_table(all_results)

    # Save results
    if not args.no_save and all_results:
        json_path, csv_path = save_results(all_results, args.output_dir)
        print("\nResults saved to:")
        print(f"  - {json_path}")
        print(f"  - {csv_path}")


if __name__ == "__main__":
    asyncio.run(main())
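
# Example invocations, assuming this file is saved as run_benchmarks.py (model
# and benchmark names are illustrative; use the ones defined in your config):
#   python run_benchmarks.py --models gpt-4o --benchmarks mmlu gsm8k
#   python run_benchmarks.py --config config.yaml --output-dir results --no-save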