"""Main script to run AI model evaluation benchmarks""" import argparse import asyncio import json import os import yaml from datetime import datetime from typing import List, Dict, Any from dotenv import load_dotenv import pandas as pd from apis.api_factory import APIFactory from benchmarks import get_benchmark, BenchmarkResult # Load environment variables load_dotenv() def load_config(config_path: str = 'official_config.yaml') -> dict: """Load configuration from YAML file""" with open(config_path, 'r') as f: config = yaml.safe_load(f) # Replace environment variables def replace_env_vars(obj): if isinstance(obj, str) and obj.startswith('${') and obj.endswith('}'): env_var = obj[2:-1] return os.getenv(env_var, obj) elif isinstance(obj, dict): return {k: replace_env_vars(v) for k, v in obj.items()} elif isinstance(obj, list): return [replace_env_vars(item) for item in obj] return obj return replace_env_vars(config) def save_results(results: List[BenchmarkResult], output_dir: str): """Save evaluation results""" os.makedirs(output_dir, exist_ok=True) # Create timestamp for this run timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') # Save detailed results as JSON detailed_results = [] for result in results: detailed_results.append({ 'benchmark': result.benchmark_name, 'model': result.model_name, 'total_questions': result.total_questions, 'correct': result.correct, 'accuracy': result.accuracy, 'avg_response_time': result.avg_response_time, 'timestamp': timestamp }) json_path = os.path.join(output_dir, f'results_{timestamp}.json') with open(json_path, 'w') as f: json.dump(detailed_results, f, indent=2) # Save summary as CSV df = pd.DataFrame(detailed_results) csv_path = os.path.join(output_dir, f'summary_{timestamp}.csv') df.to_csv(csv_path, index=False) # Save raw results for debugging for result in results: raw_path = os.path.join(output_dir, f'{result.model_name}_{result.benchmark_name}_{timestamp}_raw.json') with open(raw_path, 'w') as f: json.dump(result.raw_results, f, indent=2) return json_path, csv_path def print_results_table(results: List[BenchmarkResult]): """Print results in a nice table format""" if not results: return # Group by model model_results = {} for result in results: if result.model_name not in model_results: model_results[result.model_name] = {} model_results[result.model_name][result.benchmark_name] = result # Print header benchmarks = list(set(r.benchmark_name for r in results)) benchmarks.sort() print("\n" + "="*80) print("EVALUATION RESULTS") print("="*80) # Create table header = ["Model"] + benchmarks + ["Average"] print(f"{'Model':<20}", end="") for bench in benchmarks: print(f"{bench:<15}", end="") print(f"{'Average':<10}") print("-"*80) # Print results for each model for model, bench_results in model_results.items(): print(f"{model:<20}", end="") scores = [] for bench in benchmarks: if bench in bench_results: score = bench_results[bench].accuracy * 100 scores.append(score) print(f"{score:>6.1f}% ", end="") else: print(f"{'N/A':<15}", end="") # Calculate average if scores: avg = sum(scores) / len(scores) print(f"{avg:>6.1f}%") else: print("N/A") print("="*80) async def run_single_evaluation(api, benchmark_name: str, config: dict) -> BenchmarkResult: """Run a single benchmark evaluation""" benchmark = get_benchmark(benchmark_name) # Get benchmark-specific config bench_config = config['benchmarks'].get(benchmark_name, {}) eval_config = config['evaluation'] # Merge configs kwargs = { **eval_config, 'concurrent_requests': eval_config.get('concurrent_requests', 5) } 

    # Add benchmark-specific configs but exclude sample_size
    for key, value in bench_config.items():
        if key != 'sample_size':
            kwargs[key] = value

    # Run benchmark
    result = await benchmark.run_benchmark(
        api,
        sample_size=bench_config.get('sample_size'),
        **kwargs
    )

    return result


async def main():
    parser = argparse.ArgumentParser(description='Run AI benchmark evaluation')
    parser.add_argument('--models', nargs='+',
                        help='Models to evaluate (e.g., gpt-4o claude-3-opus)')
    parser.add_argument('--benchmarks', nargs='+',
                        help='Benchmarks to run (e.g., mmlu gsm8k)')
    parser.add_argument('--config', default='config.yaml',
                        help='Config file path')
    parser.add_argument('--output-dir', default='results',
                        help='Output directory for results')
    parser.add_argument('--no-save', action='store_true',
                        help='Do not save results to files')

    args = parser.parse_args()

    # Load configuration
    config = load_config(args.config)

    # Determine which models to evaluate
    if args.models:
        models_to_eval = args.models
    else:
        # Get all models from config
        models_to_eval = []
        for provider, provider_config in config['models'].items():
            for model in provider_config.get('models', []):
                models_to_eval.append(model)

    # Determine which benchmarks to run
    if args.benchmarks:
        benchmarks_to_run = args.benchmarks
    else:
        # Get enabled benchmarks from config
        benchmarks_to_run = [
            name for name, bench_config in config['benchmarks'].items()
            if bench_config.get('enabled', True)
        ]

    print(f"Models to evaluate: {models_to_eval}")
    print(f"Benchmarks to run: {benchmarks_to_run}")

    # Run evaluations
    all_results = []

    for model_name in models_to_eval:
        print(f"\n{'=' * 60}")
        print(f"Evaluating model: {model_name}")
        print(f"{'=' * 60}")

        try:
            # Create API instance
            api = APIFactory.create_api(model_name, config)

            # Run each benchmark
            for benchmark_name in benchmarks_to_run:
                print(f"\nRunning {benchmark_name} benchmark...")

                try:
                    result = await run_single_evaluation(api, benchmark_name, config)
                    all_results.append(result)
                    print(f"[OK] {benchmark_name}: {result.accuracy * 100:.1f}% accuracy")
                except Exception as e:
                    print(f"[ERROR] {benchmark_name}: {e}")

        except Exception as e:
            print(f"Failed to create API for {model_name}: {e}")
            continue

    # Print results table
    print_results_table(all_results)

    # Save results
    if not args.no_save and all_results:
        json_path, csv_path = save_results(all_results, args.output_dir)
        print("\nResults saved to:")
        print(f"  - {json_path}")
        print(f"  - {csv_path}")


if __name__ == "__main__":
    asyncio.run(main())
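
# ---------------------------------------------------------------------------
# Usage and config sketch (illustrative only, not executed).
# The YAML layout below is inferred from how load_config(), main(), and
# run_single_evaluation() read the config; the script filename, provider
# name, api_key field, model names, and benchmark options are assumptions
# and should be adapted to the actual config.yaml and APIFactory in this
# repository.
#
#   python run_evaluation.py --models gpt-4o --benchmarks mmlu gsm8k
#
#   # config.yaml (sketch)
#   models:
#     openai:                        # provider block consumed by APIFactory
#       api_key: ${OPENAI_API_KEY}   # resolved via replace_env_vars()
#       models:
#         - gpt-4o
#   benchmarks:
#     mmlu:
#       enabled: true                # treated as true when omitted
#       sample_size: 100             # passed separately to run_benchmark()
#   evaluation:
#     concurrent_requests: 5         # default used in run_single_evaluation()
# ---------------------------------------------------------------------------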