# grok4-gpqa-eval / official_config.yaml
# TeddyYao's picture
# Upload 38 files
# 8474f02 verified
# Official benchmark configuration matching lm-eval settings
# Model providers to evaluate. Each provider maps to its credentials and
# the list of model identifiers to run.
# NOTE(review): api_key values are "${VAR}" placeholders; presumably the
# loader substitutes environment variables - confirm against the consumer.
models:
  openai:
    api_key: "${OPENAI_API_KEY}"
    models:
      - "gpt-4o"
      - "gpt-4-turbo"
      - "gpt-3.5-turbo"
  anthropic:
    api_key: "${ANTHROPIC_API_KEY}"
    models:
      - "claude-3-5-sonnet-20241022"
      - "claude-3-opus-20240229"
      - "claude-3-haiku-20240307"
  grok:
    api_key: "${GROK_API_KEY}"
    # Only this provider declares an explicit endpoint.
    base_url: "https://api.x.ai/v1"
    models:
      - "grok-4-0709"
      - "grok-beta"
      - "grok-2-latest"
# Benchmarks to run. Every entry has an `enabled` switch and a
# `sample_size`; per the inline notes, a null sample_size selects the
# full dataset / test set.
benchmarks:
  mmlu:
    enabled: true
    sample_size: null  # Use full dataset
    subjects: ["all"]
    # Official settings
    num_fewshot: 5
    doc_to_choice: ["A", "B", "C", "D"]
  gsm8k:
    enabled: true
    sample_size: null  # Full test set (1319 samples)
    # Official settings
    num_fewshot: 8  # 8-shot CoT
    use_cot: true
  humaneval:
    enabled: true
    sample_size: null  # Full test set (164 samples)
    # Official settings
    pass_at_k: [1]  # Calculate Pass@1
    do_sample: false  # Deterministic generation
  gpqa:
    enabled: true
    sample_size: null
    subset: "gpqa_main"  # or "gpqa_diamond" for harder subset
  math:
    enabled: true
    sample_size: null  # Full test set (5000 samples)
    # Official settings
    use_sympy: true  # Use SymPy for equivalence checking
# Settings shared by all benchmark runs: generation parameters first,
# then request/system parameters.
evaluation:
  # Generation settings matching lm-eval
  temperature: 0.0  # Deterministic for evaluation
  max_tokens: 2048
  top_p: 1.0
  # For HumanEval code generation
  # NOTE(review): presumably used instead of max_tokens for HumanEval -
  # confirm against the consumer.
  humaneval_max_tokens: 1024
  # System settings
  # NOTE(review): units for timeout and rate_limit_delay are not stated
  # here (presumably seconds) - confirm against the consumer.
  timeout: 60  # Increased for complex problems
  max_retries: 3
  concurrent_requests: 5
  rate_limit_delay: 0.5
# Result persistence and reporting options.
output:
  save_results: true
  # NOTE(review): relative path; what it is resolved against is not
  # visible here - confirm against the consumer.
  results_dir: "results"
  generate_report: true
  plot_graphs: true
  save_raw_outputs: true  # Save all model outputs for debugging