# benchmarks/humaneval_benchmark.py
import os
import subprocess
import sys
import tempfile
from typing import Any, Dict, Optional, Tuple

from datasets import load_dataset

from .base_benchmark import BaseBenchmark


class HumanEvalBenchmark(BaseBenchmark):
    """HumanEval code generation benchmark"""

    def __init__(self):
        super().__init__(name="HumanEval", dataset_name="openai_humaneval")

    async def load_dataset(self, sample_size: Optional[int] = None, **kwargs):
        """Load the HumanEval dataset"""
        dataset = load_dataset(self.dataset_name, split='test')
        self.dataset = []
        for sample in dataset:
            self.dataset.append({
                'task_id': sample['task_id'],
                'prompt': sample['prompt'],
                'canonical_solution': sample['canonical_solution'],
                'test': sample['test'],
                'entry_point': sample['entry_point'],
                'raw_sample': sample
            })
        if sample_size and len(self.dataset) > sample_size:
            self.dataset = self.dataset[:sample_size]

    def format_prompt(self, sample: Dict[str, Any]) -> str:
        """Format a HumanEval problem as a prompt"""
        # Like lm-eval, use the raw prompt without additional instructions
        return sample['prompt']
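
    # For reference, each HumanEval prompt is a function signature plus a docstring
    # with worked examples, and the model is expected to supply the body. Roughly
    # (illustrative shape only, not an actual dataset entry):
    #
    #     def add(a: int, b: int) -> int:
    #         """Return the sum of a and b.
    #         >>> add(1, 2)
    #         3
    #         """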

    def extract_code(self, response: str, entry_point: str, prompt: str) -> str:
        """Extract code from model response"""
        # Clean the response - strip markdown code block markers
        code = response.strip()
        if code.startswith('```python'):
            code = code[9:]  # Remove ```python
        elif code.startswith('```'):
            code = code[3:]  # Remove ```
        if code.endswith('```'):
            code = code[:-3]  # Remove trailing ```
        code = code.strip()
        # If the response already contains the complete function, use it directly
        if f"def {entry_point}" in code:
            return code
        # Otherwise treat the response as a completion to be appended to the prompt,
        # truncating at common stop sequences
        stop_sequences = ['\nclass', '\ndef', '\n#', '\nif __name__']
        for stop in stop_sequences:
            pos = code.find(stop)
            if pos > 0:
                code = code[:pos]
                break
        return prompt + code
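
    # Illustrative examples of the two extraction paths above (hypothetical responses):
    #   "```python\ndef add(a, b):\n    return a + b\n```"  -> contains "def add", so
    #       the cleaned block is returned as a complete function.
    #   "    return a + b\n\n# Explanation: ..."  -> no "def add", so the text is
    #       truncated at the "\n#" stop sequence and appended to the original prompt.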

    def run_test(self, code: str, test_code: str, entry_point: str) -> Tuple[bool, str]:
        """Run the test code and return success status and output"""
        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
            # Write the complete test file: candidate code, the dataset's check()
            # definition, and an explicit call to check() -- HumanEval's test field
            # defines check(candidate) but never invokes it.
            f.write(code + '\n\n' + test_code + '\n\n' + f'check({entry_point})\n')
        try:
            # Run the test file in a separate interpreter with a timeout
            result = subprocess.run(
                [sys.executable, f.name],
                capture_output=True,
                text=True,
                timeout=10
            )
            if result.returncode == 0:
                return True, result.stdout
            else:
                return False, result.stderr
        except subprocess.TimeoutExpired:
            return False, "Timeout: Code execution took too long"
        except Exception as e:
            return False, f"Error running test: {str(e)}"
        finally:
            # Clean up the temporary file
            try:
                os.unlink(f.name)
            except OSError:
                pass
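
    # The temporary test file assembled above has this overall shape (illustrative;
    # HumanEval's `test` field typically contains a METADATA dict and a check()
    # function, but no call to check(), hence the appended call):
    #
    #     <extracted candidate code, e.g. the full def add(...) implementation>
    #
    #     METADATA = {...}
    #
    #     def check(candidate):
    #         assert candidate(1, 2) == 3
    #
    #     check(add)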

    async def evaluate_sample(self, api, sample: Dict[str, Any], **kwargs) -> Tuple[bool, Dict[str, Any]]:
        """Evaluate a single HumanEval sample"""
        prompt = self.format_prompt(sample)
        try:
            response = await api.generate_with_retry(prompt, **kwargs)
            # Extract code from the response
            code = self.extract_code(response, sample['entry_point'], sample['prompt'])
            # Run the unit tests against the extracted code
            is_correct, test_output = self.run_test(code, sample['test'], sample['entry_point'])
            result = {
                'task_id': sample['task_id'],
                'prompt': sample['prompt'],
                'model_response': response,
                'extracted_code': code,
                'is_correct': is_correct,
                'test_output': test_output,
                'entry_point': sample['entry_point']
            }
            return is_correct, result
        except Exception as e:
            result = {
                'task_id': sample['task_id'],
                'prompt': sample['prompt'],
                'error': str(e),
                'is_correct': False
            }
            return False, result
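

# Minimal smoke-test sketch (assumptions: this module is run directly, network access
# is available for datasets.load_dataset, and BaseBenchmark's constructor accepts the
# name/dataset_name keywords used above). It drives evaluate_sample() with a stub
# "api" object whose generate_with_retry() simply returns the prompt plus the
# dataset's canonical solution, so the harness can be exercised end to end without
# calling a real model.
if __name__ == "__main__":
    import asyncio

    class _CanonicalSolutionAPI:
        """Stub client that echoes the canonical solution as the model response."""

        def __init__(self, sample):
            self._sample = sample

        async def generate_with_retry(self, prompt, **kwargs):
            return prompt + self._sample['canonical_solution']

    async def _smoke_test():
        bench = HumanEvalBenchmark()
        await bench.load_dataset(sample_size=1)
        sample = bench.dataset[0]
        passed, result = await bench.evaluate_sample(_CanonicalSolutionAPI(sample), sample)
        print(f"{sample['task_id']}: {'passed' if passed else 'failed'}")

    asyncio.run(_smoke_test())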