|
import fire
|
|
import sys
|
|
|
|
from .data import HUMAN_EVAL
|
|
from .evaluation import evaluate_functional_correctness
|
|
|
|
|
|
def entry_point(
    sample_file: str,
    k: str = "1,10,100",
    n_workers: int = 4,
    timeout: float = 3.0,
    problem_file: str = "",
    is_mbpp: bool = False,
):
    """
    Evaluates the functional correctness of generated samples, and writes
    results to f"{sample_file}_results.jsonl.gz"

    Args:
        sample_file: Forwarded unchanged to evaluate_functional_correctness.
        k: The k values to evaluate. Accepts a comma-separated string such
            as "1,10,100", a list/tuple of values, or a single number; it
            is always converted to a list of ints before being forwarded.
        n_workers: Forwarded unchanged to evaluate_functional_correctness.
        timeout: Forwarded unchanged to evaluate_functional_correctness.
        problem_file: Forwarded unchanged to evaluate_functional_correctness.
        is_mbpp: Forwarded unchanged to evaluate_functional_correctness.

    Raises:
        ValueError: If an element of ``k`` cannot be converted to an int.
    """
    # Python Fire converts command-line values to Python literals, so
    # `--k=1,10,100` arrives as the tuple (1, 10, 100) and `--k=1` as the
    # int 1. Only the untouched default is guaranteed to be a str, so calling
    # `.split` unconditionally would raise AttributeError for those inputs.
    if isinstance(k, str):
        k = [int(part) for part in k.split(",")]
    elif isinstance(k, (list, tuple)):
        k = [int(part) for part in k]
    else:
        k = [int(k)]

    results = evaluate_functional_correctness(sample_file, k, n_workers, timeout, problem_file, is_mbpp)
    print(results)
|
|
|
|
|
|
def main() -> None:
    """Hand ``entry_point`` to Python Fire so it is driven by command-line arguments."""
    fire.Fire(component=entry_point)
|
|
|
|
|
|
# Runs the CLI whenever this module is executed or imported: there is no
# `if __name__ == "__main__":` guard around this call. main() has no return
# statement, so a run that finishes without an exception calls sys.exit(None),
# i.e. exit status 0.
# NOTE(review): the missing guard may be deliberate (e.g. a launcher that only
# imports this module) -- confirm against the packaging config before adding one.
sys.exit(main())
|
|
|