# Official benchmark configuration matching lm-eval settings

models:
  openai:
    api_key: "${OPENAI_API_KEY}"
    models:
      - "gpt-4o"
      - "gpt-4-turbo"
      - "gpt-3.5-turbo"

  anthropic:
    api_key: "${ANTHROPIC_API_KEY}"
    models:
      - "claude-3-5-sonnet-20241022"
      - "claude-3-opus-20240229"
      - "claude-3-haiku-20240307"

  grok:
    api_key: "${GROK_API_KEY}"
    base_url: "https://api.x.ai/v1"
    models:
      - "grok-4-0709"
      - "grok-beta"
      - "grok-2-latest"

benchmarks:
  mmlu:
    enabled: true
    sample_size: null        # Use full dataset
    subjects: ["all"]
    # Official settings
    num_fewshot: 5
    doc_to_choice: ["A", "B", "C", "D"]

  gsm8k:
    enabled: true
    sample_size: null        # Full test set (1319 samples)
    # Official settings
    num_fewshot: 8           # 8-shot CoT
    use_cot: true

  humaneval:
    enabled: true
    sample_size: null        # Full test set (164 samples)
    # Official settings
    pass_at_k: [1]           # Calculate Pass@1
    do_sample: false         # Deterministic generation

  gpqa:
    enabled: true
    sample_size: null
    subset: "gpqa_main"      # or "gpqa_diamond" for harder subset

  math:
    enabled: true
    sample_size: null        # Full test set (5000 samples)
    # Official settings
    use_sympy: true          # Use SymPy for equivalence checking

evaluation:
  # Generation settings matching lm-eval
  temperature: 0.0           # Deterministic for evaluation
  max_tokens: 2048
  top_p: 1.0

  # For HumanEval code generation
  humaneval_max_tokens: 1024

  # System settings
  timeout: 60                # Increased for complex problems
  max_retries: 3
  concurrent_requests: 5
  rate_limit_delay: 0.5

output:
  save_results: true
  results_dir: "results"
  generate_report: true
  plot_graphs: true
  save_raw_outputs: true     # Save all model outputs for debugging
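
# ---------------------------------------------------------------------------
# Usage sketch (assumptions: the runner script name "run_benchmarks.py" and
# its --config flag are illustrative, not a documented CLI for this repo):
#
#   export OPENAI_API_KEY=sk-...
#   export ANTHROPIC_API_KEY=sk-ant-...
#   export GROK_API_KEY=xai-...
#   python run_benchmarks.py --config config.yaml
#
# Note: the "${VAR}" values above are environment-variable placeholders.
# A plain yaml.safe_load() will return them as literal strings, so the
# loader is assumed to expand them (e.g. via Python's os.path.expandvars)
# before issuing API calls.
# ---------------------------------------------------------------------------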