# grok4-gpqa-eval/benchmarks/math_benchmark.py
import random
import re
from typing import Any, Dict, Optional, Tuple

from datasets import load_dataset

from .base_benchmark import BaseBenchmark
from .evaluation_utils import is_math_equiv, normalize_math_answer

class MATHBenchmark(BaseBenchmark):
    """MATH (Mathematics) benchmark for competition-level problems"""

    LEVELS = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
    TYPES = ['Algebra', 'Counting & Probability', 'Geometry', 'Intermediate Algebra',
             'Number Theory', 'Prealgebra', 'Precalculus']

    def __init__(self):
        super().__init__(name="MATH", dataset_name="hendrycks/competition_math")

    async def load_dataset(self, sample_size: Optional[int] = None, **kwargs):
        """Load the MATH test split"""
        dataset = load_dataset(self.dataset_name, split='test')

        # Filter by difficulty level if specified
        difficulty_levels = kwargs.get('difficulty', ['all'])
        if 'all' not in difficulty_levels:
            dataset = dataset.filter(lambda x: x['level'] in difficulty_levels)

        self.dataset = []
        for sample in dataset:
            self.dataset.append({
                'problem': sample['problem'],
                'solution': sample['solution'],
                'level': sample['level'],
                'type': sample['type'],
                'raw_sample': sample
            })

        # Shuffle so any sub-sample mixes levels and problem types
        random.shuffle(self.dataset)

        if sample_size and len(self.dataset) > sample_size:
            self.dataset = self.dataset[:sample_size]
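
    # Example (illustrative) of the options handled above: load 200 Level-5
    # problems only.
    #   bench = MATHBenchmark()
    #   await bench.load_dataset(sample_size=200, difficulty=['Level 5'])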

    def extract_answer(self, solution: str) -> Optional[str]:
        """Extract the final answer (last boxed/fbox expression) from a MATH solution"""
        # Walk each \boxed{...} / \fbox{...} occurrence and match braces by hand
        # so nested arguments such as \boxed{\frac{1}{2}} are captured in full.
        last_answer = None
        for match in re.finditer(r'\\(?:boxed|fbox)\{', solution):
            depth = 1
            for idx in range(match.end(), len(solution)):
                char = solution[idx]
                if char == '{':
                    depth += 1
                elif char == '}':
                    depth -= 1
                    if depth == 0:
                        # Keep the last fully-matched answer in the solution
                        last_answer = solution[match.end():idx]
                        break

        if last_answer is None:
            return None
        return last_answer.strip()
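
    # Examples (illustrative) of what the brace-matching extraction returns:
    #   extract_answer(r"... so we get $\boxed{\frac{1}{2}}$.")  -> r"\frac{1}{2}"
    #   extract_answer("no boxed expression at all")             -> None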

    def extract_model_answer(self, response: str) -> Optional[str]:
        """Extract answer from model response"""
        # Try to find a boxed answer first
        answer = self.extract_answer(response)
        if answer:
            return answer

        # If no boxed answer, look for common patterns
        # "The answer is X"
        match = re.search(r'answer is[\s:]*([^.\n]+)', response, re.IGNORECASE)
        if match:
            return match.group(1).strip()

        # "Therefore, X"
        match = re.search(r'therefore[,\s]+([^.\n]+)', response, re.IGNORECASE)
        if match:
            return match.group(1).strip()

        return None
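
    # Examples (illustrative) of the fallback patterns:
    #   extract_model_answer("The answer is 42.")  -> "42"
    #   extract_model_answer("Therefore, x = 3")   -> "x = 3"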

    def format_prompt(self, sample: Dict[str, Any]) -> str:
        """Format MATH problem as prompt"""
        prompt = f"""Solve the following mathematics problem step by step. Show all your work and put your final answer in the format \\boxed{{answer}}.
Problem: {sample['problem']}
Solution:"""
        return prompt
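
    # Rendered prompt shape (illustrative):
    #   Solve the following mathematics problem step by step. Show all your
    #   work and put your final answer in the format \boxed{answer}.
    #   Problem: <problem text>
    #   Solution: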

    async def evaluate_sample(self, api, sample: Dict[str, Any], **kwargs) -> Tuple[bool, Dict[str, Any]]:
        """Evaluate a single MATH sample"""
        prompt = self.format_prompt(sample)

        try:
            response = await api.generate_with_retry(prompt, **kwargs)

            # Extract correct answer
            correct_answer = self.extract_answer(sample['solution'])

            # Extract model's answer
            model_answer = self.extract_model_answer(response)

            # Compare answers using mathematical equivalence
            is_correct = False
            if correct_answer and model_answer:
                # Use the official equivalence checking
                is_correct = is_math_equiv(model_answer, correct_answer)

            result = {
                'problem': sample['problem'],
                'level': sample['level'],
                'type': sample['type'],
                'correct_answer': correct_answer,
                'model_answer': model_answer,
                'model_response': response,
                'is_correct': is_correct
            }
            return is_correct, result

        except Exception as e:
            result = {
                'problem': sample['problem'],
                'level': sample['level'],
                'type': sample['type'],
                'error': str(e),
                'is_correct': False
            }
            return False, result
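

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the benchmark itself).
# `_EchoAPI` is a hypothetical stand-in for the real API client used elsewhere
# in this repo; it only needs to expose `generate_with_retry`, the single
# method `evaluate_sample` calls, and here it returns a canned boxed answer so
# the evaluation path can be exercised offline. Run as a module so the
# relative imports resolve:  python -m benchmarks.math_benchmark
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import asyncio

    class _EchoAPI:
        """Fake client that always returns a canned boxed answer."""

        async def generate_with_retry(self, prompt: str, **kwargs) -> str:
            return r"Adding the two numbers gives $\boxed{6}$."

    async def _demo():
        bench = MATHBenchmark()
        sample = {
            'problem': 'What is 2 + 4?',
            'solution': r'We compute $2 + 4 = \boxed{6}$.',
            'level': 'Level 1',
            'type': 'Prealgebra',
        }
        # Should print "True 6", assuming is_math_equiv treats identical
        # answer strings as equivalent.
        is_correct, result = await bench.evaluate_sample(_EchoAPI(), sample)
        print(is_correct, result['model_answer'])

    asyncio.run(_demo())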