import os
import re
import subprocess
import sys
import tempfile
from typing import Any, Dict, Optional, Tuple

from datasets import load_dataset

from .base_benchmark import BaseBenchmark


class HumanEvalBenchmark(BaseBenchmark):
    """HumanEval code generation benchmark"""

    def __init__(self):
        super().__init__(name="HumanEval", dataset_name="openai_humaneval")

    async def load_dataset(self, sample_size: Optional[int] = None, **kwargs):
        """Load the HumanEval dataset"""
        dataset = load_dataset(self.dataset_name, split='test')
        self.dataset = []
        for sample in dataset:
            self.dataset.append({
                'task_id': sample['task_id'],
                'prompt': sample['prompt'],
                'canonical_solution': sample['canonical_solution'],
                'test': sample['test'],
                'entry_point': sample['entry_point'],
                'raw_sample': sample
            })
        if sample_size and len(self.dataset) > sample_size:
            self.dataset = self.dataset[:sample_size]

    def format_prompt(self, sample: Dict[str, Any]) -> str:
        """Format HumanEval problem as prompt"""
        # lm-eval uses just the raw prompt without additional instructions
        return sample['prompt']
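
    # Illustration (not taken from the dataset): a HumanEval prompt is a
    # function signature plus a docstring with doctest-style examples, e.g.
    #
    #     def add(a: int, b: int) -> int:
    #         """Add two integers.
    #         >>> add(1, 2)
    #         3
    #         """
    #
    # The model is expected to continue from this point with the function body.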

    def extract_code(self, response: str, entry_point: str, prompt: str) -> str:
        """Extract code from model response"""
        code = response
        # Remove markdown code block markers while keeping the inner
        # indentation intact
        stripped = code.strip()
        if stripped.startswith('```'):
            code = re.sub(r'^```[a-zA-Z]*\n?', '', stripped)
            if code.endswith('```'):
                code = code[:-3]  # Remove trailing ```
        # If the response contains the complete function, use it directly
        if f"def {entry_point}" in code:
            return code
        # Fallback: assume it's a completion to be appended after the prompt,
        # truncated at the usual completion stop sequences
        stop_sequences = ['\nclass', '\ndef', '\n#', '\nif __name__']
        for stop in stop_sequences:
            pos = code.find(stop)
            if pos > 0:
                code = code[:pos]
                break
        # Preserve the first line's indentation so the body stays inside the
        # generated function
        return prompt + code.lstrip('\n').rstrip()
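
    # Illustration of the two extraction paths (made-up responses):
    #   * "```python\ndef has_close_elements(...):\n    ...\n```" already
    #     contains "def <entry_point>", so it is returned as-is once the
    #     fences are removed.
    #   * "    return sorted(numbers)\n\ndef helper(): ..." is treated as a
    #     continuation: it is cut at the "\ndef" stop sequence and appended
    #     to the original prompt to form a complete function.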

    def run_test(self, code: str, test_code: str) -> Tuple[bool, str]:
        """Run the test code and return success status and output"""
        # Write the candidate solution and its tests to a temporary script
        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
            f.write(code + '\n\n' + test_code)
            f.flush()
        try:
            # Run the script in a fresh interpreter with a timeout
            result = subprocess.run(
                [sys.executable, f.name],
                capture_output=True,
                text=True,
                timeout=10
            )
            if result.returncode == 0:
                return True, result.stdout
            else:
                return False, result.stderr
        except subprocess.TimeoutExpired:
            return False, "Timeout: Code execution took too long"
        except Exception as e:
            return False, f"Error running test: {str(e)}"
        finally:
            # Clean up the temporary file
            try:
                os.unlink(f.name)
            except OSError:
                pass
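
    # Note: this executes model-generated code directly on the host, with only
    # a timeout as protection. The upstream human-eval harness warns that such
    # code should ideally be run inside a sandbox or container.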

    async def evaluate_sample(self, api, sample: Dict[str, Any], **kwargs) -> Tuple[bool, Dict[str, Any]]:
        """Evaluate a single HumanEval sample"""
        prompt = self.format_prompt(sample)
        try:
            response = await api.generate_with_retry(prompt, **kwargs)
            # Extract code from the response
            code = self.extract_code(response, sample['entry_point'], sample['prompt'])
            # HumanEval's test field defines check(candidate) but does not call
            # it, so append the call before running
            test_code = sample['test'] + f"\ncheck({sample['entry_point']})\n"
            is_correct, test_output = self.run_test(code, test_code)
            result = {
                'task_id': sample['task_id'],
                'prompt': sample['prompt'],
                'model_response': response,
                'extracted_code': code,
                'is_correct': is_correct,
                'test_output': test_output,
                'entry_point': sample['entry_point']
            }
            return is_correct, result
        except Exception as e:
            result = {
                'task_id': sample['task_id'],
                'prompt': sample['prompt'],
                'error': str(e),
                'is_correct': False
            }
            return False, result
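

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the benchmark). It assumes
# only the interfaces visible in this file: `load_dataset`, `evaluate_sample`,
# and an API object exposing `async generate_with_retry(prompt, **kwargs)`.
# `_CanonicalAPI` is a hypothetical stub that answers every prompt with the
# task's reference solution, so it should pass every sample and is handy for
# smoke-testing the harness itself. Because of the relative import above, run
# this as a module from within its package rather than as a standalone script.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import asyncio

    class _CanonicalAPI:
        """Hypothetical stub: returns the canonical solution for each prompt."""

        def __init__(self, benchmark: "HumanEvalBenchmark"):
            # Map each prompt to its full reference implementation
            self._solutions = {
                s['prompt']: s['prompt'] + s['canonical_solution']
                for s in benchmark.dataset
            }

        async def generate_with_retry(self, prompt: str, **kwargs) -> str:
            return self._solutions[prompt]

    async def _smoke_test() -> None:
        bench = HumanEvalBenchmark()
        await bench.load_dataset(sample_size=5)
        api = _CanonicalAPI(bench)
        passed = 0
        for sample in bench.dataset:
            ok, result = await bench.evaluate_sample(api, sample)
            passed += int(ok)
            print(result['task_id'], 'pass' if ok else 'fail')
        print(f"{passed}/{len(bench.dataset)} passed")

    asyncio.run(_smoke_test())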