from .base_benchmark import BaseBenchmark
from typing import Dict, Any, Optional, Tuple
from datasets import load_dataset
import subprocess
import tempfile
import os
import sys
import re


class HumanEvalBenchmark(BaseBenchmark):
    """HumanEval code generation benchmark."""

    def __init__(self):
        super().__init__(name="HumanEval", dataset_name="openai_humaneval")

    async def load_dataset(self, sample_size: Optional[int] = None, **kwargs):
        """Load the HumanEval dataset."""
        dataset = load_dataset(self.dataset_name, split='test')
        self.dataset = []
        for sample in dataset:
            self.dataset.append({
                'task_id': sample['task_id'],
                'prompt': sample['prompt'],
                'canonical_solution': sample['canonical_solution'],
                'test': sample['test'],
                'entry_point': sample['entry_point'],
                'raw_sample': sample
            })
        if sample_size and len(self.dataset) > sample_size:
            self.dataset = self.dataset[:sample_size]

    def format_prompt(self, sample: Dict[str, Any]) -> str:
        """Format a HumanEval problem as a prompt."""
        # lm-eval uses just the raw prompt without additional instructions
        return sample['prompt']

    def extract_code(self, response: str, entry_point: str, prompt: str) -> str:
        """Extract code from the model response."""
        # Clean the response and strip markdown code-block markers
        code = response.strip()
        if code.startswith('```python'):
            code = code[9:]   # Remove ```python
        elif code.startswith('```'):
            code = code[3:]   # Remove ```
        if code.endswith('```'):
            code = code[:-3]  # Remove trailing ```
        code = code.strip()

        # If the response contains the complete function, use it directly
        if f"def {entry_point}" in code:
            return code

        # Fallback: treat the response as a completion to be appended to the prompt,
        # truncated at common stop sequences
        stop_sequences = ['\nclass', '\ndef', '\n#', '\nif __name__']
        for stop in stop_sequences:
            pos = code.find(stop)
            if pos > 0:
                code = code[:pos]
                break
        return prompt + code

    def run_test(self, code: str, test_code: str, entry_point: str) -> Tuple[bool, str]:
        """Run the test code and return (passed, output)."""
        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
            # Write the complete test file. The HumanEval `test` field defines
            # check(candidate) but does not call it, so append the call explicitly.
            f.write(code + '\n\n' + test_code + '\n\n' + f'check({entry_point})\n')
            f.flush()

        try:
            # Run the test in a subprocess with a timeout
            result = subprocess.run(
                [sys.executable, f.name],
                capture_output=True,
                text=True,
                timeout=10
            )
            if result.returncode == 0:
                return True, result.stdout
            else:
                return False, result.stderr
        except subprocess.TimeoutExpired:
            return False, "Timeout: Code execution took too long"
        except Exception as e:
            return False, f"Error running test: {str(e)}"
        finally:
            # Clean up the temporary file
            try:
                os.unlink(f.name)
            except OSError:
                pass

    async def evaluate_sample(self, api, sample: Dict[str, Any], **kwargs) -> Tuple[bool, Dict[str, Any]]:
        """Evaluate a single HumanEval sample."""
        prompt = self.format_prompt(sample)
        try:
            response = await api.generate_with_retry(prompt, **kwargs)

            # Extract code from the response and run the unit tests
            code = self.extract_code(response, sample['entry_point'], sample['prompt'])
            is_correct, test_output = self.run_test(code, sample['test'], sample['entry_point'])

            result = {
                'task_id': sample['task_id'],
                'prompt': sample['prompt'],
                'model_response': response,
                'extracted_code': code,
                'is_correct': is_correct,
                'test_output': test_output,
                'entry_point': sample['entry_point']
            }
            return is_correct, result
        except Exception as e:
            result = {
                'task_id': sample['task_id'],
                'prompt': sample['prompt'],
                'error': str(e),
                'is_correct': False
            }
            return False, result
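

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the benchmark harness): feed each
# problem's canonical solution back through the evaluation pipeline, which should
# pass for every sampled task. `_CanonicalAPI` is a hypothetical stand-in for the
# real model client; it only mirrors the async `generate_with_retry(prompt, **kwargs)`
# interface that `evaluate_sample` expects. Run as a module (`python -m ...`) so the
# relative import of BaseBenchmark resolves.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import asyncio

    class _CanonicalAPI:
        """Hypothetical client that answers every prompt with its canonical solution."""

        def __init__(self, dataset):
            # Map prompt -> prompt + canonical solution so extract_code() sees a full function.
            self._answers = {
                s['prompt']: s['prompt'] + s['canonical_solution'] for s in dataset
            }

        async def generate_with_retry(self, prompt: str, **kwargs) -> str:
            return self._answers[prompt]

    async def _smoke_test():
        benchmark = HumanEvalBenchmark()
        await benchmark.load_dataset(sample_size=3)
        api = _CanonicalAPI(benchmark.dataset)
        for sample in benchmark.dataset:
            is_correct, result = await benchmark.evaluate_sample(api, sample)
            print(f"{result['task_id']}: {'pass' if is_correct else 'FAIL'}")

    asyncio.run(_smoke_test())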