# Official benchmark configuration matching lm-eval settings

models:
  openai:
    api_key: "${OPENAI_API_KEY}"
    models:
      - "gpt-4o"
      - "gpt-4-turbo"
      - "gpt-3.5-turbo"

  anthropic:
    api_key: "${ANTHROPIC_API_KEY}"
    models:
      - "claude-3-5-sonnet-20241022"
      - "claude-3-opus-20240229"
      - "claude-3-haiku-20240307"

  grok:
    api_key: "${GROK_API_KEY}"
    base_url: "https://api.x.ai/v1"
    models:
      - "grok-4-0709"
      - "grok-beta"
      - "grok-2-latest"

benchmarks:
  mmlu:
    enabled: true
    sample_size: null  # Use full dataset
    subjects: ["all"]
    # Official settings
    num_fewshot: 5
    doc_to_choice: ["A", "B", "C", "D"]

  gsm8k:
    enabled: true
    sample_size: null  # Full test set (1319 samples)
    # Official settings
    num_fewshot: 8  # 8-shot CoT
    use_cot: true

  humaneval:
    enabled: true
    sample_size: null  # Full test set (164 samples)
    # Official settings
    pass_at_k: [1]  # Calculate pass@1
    do_sample: false  # Deterministic generation
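    # With do_sample: false a single greedy sample is drawn per task,
    # so pass@1 here is plain accuracy; the general pass@k metric uses
    # the unbiased estimator 1 - C(n-c, k) / C(n, k) over n samples
    # with c passes.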

  gpqa:
    enabled: true
    sample_size: null
    subset: "gpqa_main"  # or "gpqa_diamond" for the harder subset

  math:
    enabled: true
    sample_size: null  # Full test set (5000 samples)
    # Official settings
    use_sympy: true  # Use SymPy for equivalence checking
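    # e.g. "1/2" and "0.5" grade as equivalent even though the
    # answer strings differ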

evaluation:
  # Generation settings matching lm-eval
  temperature: 0.0  # Deterministic for evaluation
  max_tokens: 2048
  top_p: 1.0

  # For HumanEval code generation
  humaneval_max_tokens: 1024

  # System settings
  timeout: 60  # Increased for complex problems
  max_retries: 3
  concurrent_requests: 5
  rate_limit_delay: 0.5

output:
  save_results: true
  results_dir: "results"
  generate_report: true
  plot_graphs: true
  save_raw_outputs: true  # Save all model outputs for debugging
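
# The environment-variable substitution assumed above can be handled by
# a small loader. A minimal sketch (Python with PyYAML; load_config and
# the expansion approach are illustrative, not this project's actual
# code):
#
#     import os
#     import yaml
#
#     def load_config(path: str) -> dict:
#         """Read the config, expand ${VAR} placeholders from the
#         environment, then parse the YAML."""
#         with open(path) as f:
#             raw = f.read()
#         # os.path.expandvars substitutes $VAR and ${VAR} from the
#         # environment; unset variables are left unchanged.
#         return yaml.safe_load(os.path.expandvars(raw))
#
#     config = load_config("config.yaml")
#     config["models"]["openai"]["models"][0]  # -> "gpt-4o"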