"""Registry of available benchmark classes and a by-name factory."""

from .base_benchmark import BaseBenchmark, BenchmarkResult
from .mmlu_benchmark import MMLUBenchmark
from .gsm8k_benchmark import GSM8KBenchmark
from .humaneval_benchmark import HumanEvalBenchmark
from .gpqa_benchmark import GPQABenchmark
from .math_benchmark import MATHBenchmark

# Maps the lowercase benchmark identifier to the class that implements it.
BENCHMARK_REGISTRY = {
    'mmlu': MMLUBenchmark,
    'gsm8k': GSM8KBenchmark,
    'humaneval': HumanEvalBenchmark,
    'gpqa': GPQABenchmark,
    'math': MATHBenchmark,
}


def get_benchmark(name: str) -> BaseBenchmark:
    """Instantiate the benchmark registered under ``name``.

    The lookup is case-insensitive: ``name`` is lowercased before it is
    matched against the keys of ``BENCHMARK_REGISTRY``.

    Args:
        name: Identifier of the benchmark to create.

    Returns:
        A new instance of the registered class, constructed with no
        arguments.

    Raises:
        ValueError: If the lowercased ``name`` is not a registry key. The
            message echoes ``name`` as given and lists the known keys.
    """
    key = name.lower()
    if key in BENCHMARK_REGISTRY:
        benchmark_cls = BENCHMARK_REGISTRY[key]
        return benchmark_cls()

    # Report the caller's original spelling, not the normalized key.
    available = list(BENCHMARK_REGISTRY)
    raise ValueError(f"Unknown benchmark: {name}. Available: {available}")