# Official benchmark configuration matching lm-eval settings
models:
  openai:
    api_key: "${OPENAI_API_KEY}"
    models:
      - "gpt-4o"
      - "gpt-4-turbo"
      - "gpt-3.5-turbo"
  anthropic:
    api_key: "${ANTHROPIC_API_KEY}"
    models:
      - "claude-3-5-sonnet-20241022"
      - "claude-3-opus-20240229"
      - "claude-3-haiku-20240307"
  grok:
    api_key: "${GROK_API_KEY}"
    base_url: "https://api.x.ai/v1"
    models:
      - "grok-4-0709"
      - "grok-beta"
      - "grok-2-latest"
benchmarks:
  mmlu:
    enabled: true
    sample_size: null  # null = use the full dataset
    subjects: ["all"]
    # Official settings
    num_fewshot: 5
    doc_to_choice: ["A", "B", "C", "D"]
  gsm8k:
    enabled: true
    sample_size: null  # full test set (1319 samples)
    # Official settings
    num_fewshot: 8  # 8-shot CoT
    use_cot: true
  humaneval:
    enabled: true
    sample_size: null  # full test set (164 samples)
    # Official settings
    pass_at_k: [1]  # calculate pass@1
    do_sample: false  # deterministic generation
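    # With do_sample: false a single greedy sample is scored per problem, so
    # pass@1 here is simply the fraction of the 164 problems whose completion
    # passes all unit tests (the unbiased pass@k estimator reduces to this
    # when n = k = 1).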
  gpqa:
    enabled: true
    sample_size: null
    subset: "gpqa_main"  # or "gpqa_diamond" for the harder subset
  math:
    enabled: true
    sample_size: null  # full test set (5000 samples)
    # Official settings
    use_sympy: true  # use SymPy for equivalence checking
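    # Assumed behavior: answers are compared by symbolic equivalence
    # (e.g. sympy.simplify(pred - ref) == 0) rather than exact string match,
    # so "0.5" and "1/2" both count as correct.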
evaluation:
  # Generation settings matching lm-eval
  temperature: 0.0  # deterministic decoding for evaluation
  max_tokens: 2048
  top_p: 1.0
  # For HumanEval code generation
  humaneval_max_tokens: 1024
  # System settings
  timeout: 60  # seconds; increased for complex problems
  max_retries: 3
  concurrent_requests: 5
  rate_limit_delay: 0.5  # seconds between requests (assumed unit)
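  # Rough throughput, assuming the delay applies per worker: 5 concurrent
  # workers / 0.5 s delay each = at most ~10 requests per second.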
output:
  save_results: true
  results_dir: "results"
  generate_report: true
  plot_graphs: true
  save_raw_outputs: true  # save all model outputs for debugging
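
# ---------------------------------------------------------------------------
# Loading sketch (illustrative, not part of the schema). A minimal Python
# loader, assuming PyYAML is available; the ${VAR} placeholders are expanded
# from the environment with os.path.expandvars, which leaves unset variables
# untouched. The file name benchmark_config.yaml and the load_config helper
# are assumptions for this example.
#
#   import os
#   import yaml
#
#   def load_config(path="benchmark_config.yaml"):
#       with open(path) as f:
#           raw = f.read()
#       # Expand ${OPENAI_API_KEY} etc. before parsing the YAML.
#       config = yaml.safe_load(os.path.expandvars(raw))
#       # Warn about placeholders that were not resolved from the environment.
#       for provider, spec in config["models"].items():
#           if spec["api_key"].startswith("${"):
#               print(f"warning: no API key in environment for {provider}")
#       return config
#
#   config = load_config()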