# grok4-gpqa-eval/benchmarks/math_benchmark.py
import random
import re
from typing import Any, Dict, Optional, Tuple

from datasets import load_dataset

from .base_benchmark import BaseBenchmark
from .evaluation_utils import is_math_equiv, normalize_math_answer

class MATHBenchmark(BaseBenchmark):
    """MATH (Mathematics) benchmark for competition-level problems"""

    LEVELS = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
    TYPES = ['Algebra', 'Counting & Probability', 'Geometry', 'Intermediate Algebra',
             'Number Theory', 'Prealgebra', 'Precalculus']

    def __init__(self):
        super().__init__(name="MATH", dataset_name="hendrycks/competition_math")

    async def load_dataset(self, sample_size: Optional[int] = None, **kwargs):
        """Load the MATH test split"""
        dataset = load_dataset(self.dataset_name, split='test')

        # Filter by difficulty level if specified
        difficulty_levels = kwargs.get('difficulty', ['all'])
        if 'all' not in difficulty_levels:
            dataset = dataset.filter(lambda x: x['level'] in difficulty_levels)

        self.dataset = []
        for sample in dataset:
            self.dataset.append({
                'problem': sample['problem'],
                'solution': sample['solution'],
                'level': sample['level'],
                'type': sample['type'],
                'raw_sample': sample
            })

        # Shuffle so any sub-sample mixes levels and problem types
        random.shuffle(self.dataset)

        if sample_size and len(self.dataset) > sample_size:
            self.dataset = self.dataset[:sample_size]
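
    # Example (illustrative) of the options handled above: load 200 Level-5
    # problems only.
    #   bench = MATHBenchmark()
    #   await bench.load_dataset(sample_size=200, difficulty=['Level 5'])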

    def extract_answer(self, solution: str) -> Optional[str]:
        """Extract the final answer (last boxed/fbox expression) from a MATH solution"""
        # Walk each \boxed{...} / \fbox{...} occurrence and match braces by hand
        # so nested arguments such as \boxed{\frac{1}{2}} are captured in full.
        last_answer = None
        for match in re.finditer(r'\\(?:boxed|fbox)\{', solution):
            depth = 1
            for idx in range(match.end(), len(solution)):
                char = solution[idx]
                if char == '{':
                    depth += 1
                elif char == '}':
                    depth -= 1
                    if depth == 0:
                        # Keep the last fully-matched answer in the solution
                        last_answer = solution[match.end():idx]
                        break

        if last_answer is None:
            return None
        return last_answer.strip()
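
    # Examples (illustrative) of what the brace-matching extraction returns:
    #   extract_answer(r"... so we get $\boxed{\frac{1}{2}}$.")  -> r"\frac{1}{2}"
    #   extract_answer("no boxed expression at all")             -> None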

    def extract_model_answer(self, response: str) -> Optional[str]:
        """Extract answer from model response"""
        # Try to find a boxed answer first
        answer = self.extract_answer(response)
        if answer:
            return answer

        # If no boxed answer, look for common patterns
        # "The answer is X"
        match = re.search(r'answer is[\s:]*([^.\n]+)', response, re.IGNORECASE)
        if match:
            return match.group(1).strip()

        # "Therefore, X"
        match = re.search(r'therefore[,\s]+([^.\n]+)', response, re.IGNORECASE)
        if match:
            return match.group(1).strip()

        return None
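
    # Examples (illustrative) of the fallback patterns:
    #   extract_model_answer("The answer is 42.")  -> "42"
    #   extract_model_answer("Therefore, x = 3")   -> "x = 3"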

    def format_prompt(self, sample: Dict[str, Any]) -> str:
        """Format MATH problem as prompt"""
        prompt = f"""Solve the following mathematics problem step by step. Show all your work and put your final answer in the format \\boxed{{answer}}.
Problem: {sample['problem']}
Solution:"""
        return prompt
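
    # Rendered prompt shape (illustrative):
    #   Solve the following mathematics problem step by step. Show all your
    #   work and put your final answer in the format \boxed{answer}.
    #   Problem: <problem text>
    #   Solution: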

    async def evaluate_sample(self, api, sample: Dict[str, Any], **kwargs) -> Tuple[bool, Dict[str, Any]]:
        """Evaluate a single MATH sample"""
        prompt = self.format_prompt(sample)

        try:
            response = await api.generate_with_retry(prompt, **kwargs)

            # Extract correct answer
            correct_answer = self.extract_answer(sample['solution'])

            # Extract model's answer
            model_answer = self.extract_model_answer(response)

            # Compare answers using mathematical equivalence
            is_correct = False
            if correct_answer and model_answer:
                # Use the official equivalence checking
                is_correct = is_math_equiv(model_answer, correct_answer)

            result = {
                'problem': sample['problem'],
                'level': sample['level'],
                'type': sample['type'],
                'correct_answer': correct_answer,
                'model_answer': model_answer,
                'model_response': response,
                'is_correct': is_correct
            }
            return is_correct, result

        except Exception as e:
            result = {
                'problem': sample['problem'],
                'level': sample['level'],
                'type': sample['type'],
                'error': str(e),
                'is_correct': False
            }
            return False, result
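

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the benchmark itself).
# `_EchoAPI` is a hypothetical stand-in for the real API client used elsewhere
# in this repo; it only needs to expose `generate_with_retry`, the single
# method `evaluate_sample` calls, and here it returns a canned boxed answer so
# the evaluation path can be exercised offline. Run as a module so the
# relative imports resolve:  python -m benchmarks.math_benchmark
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import asyncio

    class _EchoAPI:
        """Fake client that always returns a canned boxed answer."""

        async def generate_with_retry(self, prompt: str, **kwargs) -> str:
            return r"Adding the two numbers gives $\boxed{6}$."

    async def _demo():
        bench = MATHBenchmark()
        sample = {
            'problem': 'What is 2 + 4?',
            'solution': r'We compute $2 + 4 = \boxed{6}$.',
            'level': 'Level 1',
            'type': 'Prealgebra',
        }
        # Should print "True 6", assuming is_math_equiv treats identical
        # answer strings as equivalent.
        is_correct, result = await bench.evaluate_sample(_EchoAPI(), sample)
        print(is_correct, result['model_answer'])

    asyncio.run(_demo())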