Spaces:

WNJXYK
/

RPC

Running

WNJXYK committed 8 days ago

Commit

22c93a7

verified ·

1 Parent(s): fba7b7c

Upload 16 files

Browse files

Files changed (16) hide show

__init__.py +0 -0
all_exps.sh +10 -0
app.py +174 -0
build_cache.py +46 -0
compute_perp.py +144 -0
compute_rpc.py +136 -0
compute_sc.py +108 -0
data_processing/answer_extraction.py +362 -0
data_processing/process_utils.py +191 -0
eval/eval_script.py +190 -0
eval/eval_utils.py +400 -0
eval/ocwcourses_eval_utils.py +266 -0
eval/python_executor.py +193 -0
main.py +55 -0
metrics.py +115 -0
requirements.txt +10 -0

__init__.py ADDED Viewed

File without changes

all_exps.sh ADDED Viewed

	@@ -0,0 +1,10 @@

+for model in InternLM2-Math-Plus-7B Deepseek-Math-RL-7B InternLM2-Math-Plus-1.8B; do
+    for method in PPL SC RPC; do
+        python main.py --dataset MATH --model $model --method $method --K 64
+    done
+    for dataset in MathOdyssey AIME OlympiadBench; do
+        for method in PPL SC RPC; do
+            python main.py --dataset $dataset --model $model --method $method --K 128
+        done
+    done
+done

app.py ADDED Viewed

	@@ -0,0 +1,174 @@

+import gradio as gr
+import json, os
+from huggingface_hub import hf_hub_download
+from compute_perp import prep_evaluator, numberic_compare, check_equal
+from compute_sc import sc_evaluator
+from compute_rpc import wpc_evaluator
+import numpy as np
+def greet(name):
+    return "Hello " + name + "!!"
+# Module-level store for the currently loaded reasoning-path data.
+# load() rebinds it to the parsed JSON file; the callbacks below read from it.
+json_file = {"predict": [], "answer": [], "completion": [], "mean_logprob": [], "prompt": []}
+# Blocks container that the UI below is built inside.
+demo = gr.Blocks()
+with demo:
+    paper_title = gr.HTML("""<div align='center'><h1>[NeurIPS 2025] A Theoretical Study on Bridging Internal Probability and Self-Consistency for LLM Reasoning</h1></div>""")
+    paper_info  = gr.HTML("""<div align="center"><h3><a href="https://arxiv.org/pdf/2502.00511">📄 [Paper]</a> • <a href="https://wnjxyk.github.io/RPC">🌐 [Project]</a> • <a href="#" onclick="document.getElementById('bibtex-popup').style.display='block';">📚 [BibTeX]</a><h3>
+    <div id="bibtex-popup" style="display:none; position:fixed; top:50%; left:50%; transform:translate(-50%, -50%); background:white; padding:20px; border:1px solid #ccc; box-shadow:0 0 10px rgba(0,0,0,0.2); z-index:1000; max-width:80%; overflow:auto;">
+        <pre style="white-space:pre-wrap; font-size:12px; text-align:left;">@inproceedings{zhou24theoretical,
+    author    = {Zhou, Zhi and Tan, Yuhao and Li, Zenan and Yao, Yuan and Guo, Lan-Zhe and Li, Yu-Feng and Ma, Xiaoxing},
+    title     = {A Theorecial Study on Bridging Internal Probability and Self-Consistency for LLM Reasoning},
+    booktitle = {Advances in Neural Information Processing Systems},
+    year      = {2025},
+}</pre>
+        <button onclick="document.getElementById('bibtex-popup').style.display='none';" style="margin-top:10px; padding:5px 10px;">Close</button>
+    </div></div>""")
+    # Section 1: controls that choose which set of sampled reasoning paths to load.
+    with gr.Column():
+        gr.Markdown("## 1. Experimental Settings")
+        with gr.Row():
+            # Benchmark selector; load() maps the choice to a Hugging Face dataset repo.
+            dataset = gr.Dropdown(
+                choices=["MATH", "MathOdyssey", "AIME", "OlympiadBench"],
+                value="MathOdyssey",
+                label="Dataset",
+                interactive=True
+            )
+            # Model selector; load() downloads "<model>.json" from the chosen repo.
+            model = gr.Dropdown(
+                choices=["Deepseek-Math-RL-7B", "InternLM2-Math-Plus-1.8B", "InternLM2-Math-Plus-7B"],
+                value="InternLM2-Math-Plus-7B",
+                label="Model",
+                interactive=True
+            )
+            # Number of sampled reasoning paths; the choices are narrowed when MATH is selected.
+            k_value = gr.Dropdown(
+                choices=[8, 16, 32, 64, 128],
+                value=128,
+                label="K (Number of Sampled Reasoning Paths)",
+                interactive=True
+            )
+            # Seed shown in the experiment info line produced by load().
+            seed = gr.Number(
+                label="Random Seed",
+                value=998244353,
+                step=1,
+                interactive=True
+            )
+            def update_k_value(dataset_choice):
+                if dataset_choice == "MATH":
+                    return gr.update(choices=[8, 16, 32, 64], value=min(64, k_value.value))
+                else:
+                    return gr.update(choices=[8, 16, 32, 64, 128], value=k_value.value)
+            dataset.change(fn=update_k_value, inputs=dataset, outputs=k_value)
+        # Triggers load(); its label is also used to show download progress and errors.
+        load_btn = gr.Button("Load All Problems")
+    # Section 2: hidden until load() has finished and makes it visible.
+    with gr.Column(visible=False) as content_column:
+        gr.Markdown("## 2. Problem Selection")
+        with gr.Group():
+            # Filled by load() with a summary of the chosen dataset/model/K/seed.
+            data_info = gr.Textbox(label="Experiment Info", value="")
+            # Placeholder choices; load() replaces them with the available 1-based problem ids.
+            problem_id = gr.Dropdown(
+                choices=[1],
+                value=1,
+                label="Problem ID (We removed (1) problems that were unlikely to be answered correctly using any of the methods; (2) easy problems)",
+                interactive=True
+            )
+            with gr.Row():
+                # Prompt and reference answer of the selected problem.
+                problem_prompt = gr.Textbox(label="Problem Prompt", value="", scale=3)
+                problem_answer = gr.Textbox(label="Problem Answer", value="", scale=1)
+            def update_problem_info(problem_id):
+                return gr.update(value=json_file['prompt'][problem_id-1], label=f"Problem#{problem_id} Prompt"), gr.update(value=json_file['answer'][problem_id-1], label=f"Problem#{problem_id} Answer")
+            # Refresh the prompt/answer boxes whenever another problem is picked.
+            problem_id.change(fn=update_problem_info, inputs=problem_id, outputs=[problem_prompt, problem_answer])
+        # Triggers evaluate() for the selected problem.
+        run_btn = gr.Button("Run Evaluation")
+    # Section 3: hidden until evaluate() makes it visible; one Markdown table per method.
+    with gr.Column(visible=False) as result_column:
+        gr.Markdown("## 3. Experiment Result")
+        with gr.Row():
+            with gr.Column():
+                gr.Markdown("### PPL (Internal Probability)")
+                ppl_result = gr.Markdown()
+            with gr.Column():
+                gr.Markdown("### SC (Self-Consistency)")
+                sc_result = gr.Markdown(value="")
+            with gr.Column():
+                gr.Markdown("### RPC (Ours)")
+                rpc_result = gr.Markdown(value="")
+    def get_available_problems():
+        global json_file
+        answer = np.array(json_file["accuracy"]).mean(axis=0)
+        # print(answer.shape)
+        # Select indices where the answer is greater than 0.3
+        available_indices = np.where((answer > 0.3) & (answer < 0.5))[0]
+        available_indices = available_indices + 1
+        # print(available_indices)
+        return available_indices.tolist()
+    def load(dataset, model, k_value, seed):
+        try:
+            repo_id = {
+                "MATH": "WNJXYK/MATH-Reasoning-Paths",
+                "MathOdyssey": "WNJXYK/MathOdyssey-Reasoning-Paths",
+                "AIME": "WNJXYK/AIME_1983_2024-Reasoning-Paths",
+                "OlympiadBench": "WNJXYK/OlympiadBench-Reasoning-Paths"
+            }[dataset]
+            filename = f"{model}.json"
+            yield f"Downloading sampled reasoning paths from Hugging Face {repo_id}...", gr.update(visible=False), gr.update(), gr.update()
+            file_path = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="dataset")
+            global json_file
+            with open(file_path, 'r', encoding='utf-8') as f:
+                json_file = json.load(f)
+            clist = get_available_problems()
+            # yield "Removing downloaded file..."
+            # print(file_path)
+            os.remove(file_path)
+            yield "Loading complete! You can now select a problem ID.", gr.update(visible=True), gr.update(value=f"Dataset: {dataset}\tModel: {model}\tK: {k_value}\tSeed: {seed}"), gr.update(choices=clist, value=clist[0])
+        except Exception as e:
+            yield f"Error: {str(e)}"
+    def res_to_str(correct, answers, topk=10):
+        answers = sorted(answers, key=lambda x: x[1], reverse=True)
+        response = "| # | Answer | Probability | Correct |\n|---|--------|------------|--------|\n"
+        for i in range(min(len(answers), topk)):
+            correct_mark = "✅" if answers[i][2] else "❌"
+            wrapped_answer = answers[i][0] if len(answers[i][0]) <= 10 else answers[i][0][:10] + "..."
+            response += f"| Top-{i+1} | {wrapped_answer} | {answers[i][1]:.2f} | {correct_mark} |\n"
+        return response
+    def evaluate(problem_id):
+        ppl_correct, ppl_answers = prep_evaluator(
+            json_file["predict"][problem_id-1],
+            json_file["completion"][problem_id-1],
+            json_file["mean_logprob"][problem_id-1],
+            json_file["answer"][problem_id-1],
+            numberic_compare,
+            check_equal
+        )
+        sc_correct, sc_answers = sc_evaluator(
+            json_file["predict"][problem_id-1],
+            json_file["completion"][problem_id-1],
+            json_file["mean_logprob"][problem_id-1],
+            json_file["answer"][problem_id-1],
+            numberic_compare,
+            check_equal
+        )
+        rpc_correct, rpc_answers = wpc_evaluator(
+            json_file["predict"][problem_id-1],
+            json_file["completion"][problem_id-1],
+            json_file["mean_logprob"][problem_id-1],
+            json_file["answer"][problem_id-1],
+            numberic_compare,
+            check_equal
+        )
+        return gr.update(visible=True), gr.update(value=res_to_str(ppl_correct, ppl_answers)), gr.update(value=res_to_str(sc_correct, sc_answers)), gr.update(value=res_to_str(rpc_correct, rpc_answers))
+    # load() reports progress through the button label and fills section 2.
+    load_btn.click(fn=load, inputs=[dataset, model, k_value, seed],outputs=[load_btn, content_column, data_info, problem_id], show_progress="inside")
+    # evaluate() reveals section 3 and fills the three result tables.
+    run_btn.click(fn=evaluate, inputs=problem_id, outputs=[result_column, ppl_result, sc_result, rpc_result])
+# Start the Gradio server only when the file is run as a script.
+if __name__ == "__main__":
+    demo.launch()

build_cache.py ADDED Viewed

	@@ -0,0 +1,46 @@

+from compute_perp import check_equal
+import multiprocessing, json, os, time
+def solve(predict, answer):
+    cache_dict = {}
+    m = len(predict)
+    for i in range(m):
+        key = str(predict[i]) + "<##>" + str(answer)
+        rev_key = str(answer) + "<##>" + str(predict[i])
+        if key in cache_dict or rev_key in cache_dict:
+            continue
+        val = check_equal(predict[i], answer)
+        cache_dict[key] = val
+        cache_dict[rev_key] = val
+    for i in range(m):
+        for j in range(m):
+            key = str(predict[i]) + "<##>" + str(predict[j])
+            rev_key = str(predict[j]) + "<##>" + str(predict[i])
+            if key in cache_dict or rev_key in cache_dict:
+                continue
+            val = check_equal(predict[i], predict[j])
+            cache_dict[key] = val
+            cache_dict[rev_key] = val
+    return cache_dict
+def cache(data, cache_path):
+    if os.path.exists(cache_path):
+        print(f"Cache file {cache_path} exists, skip!")
+        return
+    start_time = time.time()
+    predicts = data["predict"]
+    answers = data["answer"]
+    n = len(predicts)
+    cache_dict = {}
+    with multiprocessing.Pool() as pool:
+        results = pool.starmap(
+            solve, [(predicts[i], answers[i]) for i in range(n)]
+        )
+    for result in results:
+        cache_dict.update(result)
+    with open(cache_path, "w") as fw:
+        json.dump(cache_dict, fw)
+    print(f"Cache file {cache_path} built in {time.time() - start_time:.2f}S")

compute_perp.py ADDED Viewed

	@@ -0,0 +1,144 @@

+import json
+import metrics
+import argparse
+import numpy as np
+import multiprocessing
+from tqdm import trange
+import signal, functools
+import re, os, sys, random, time
+from fraction import Fraction
+from data_processing.answer_extraction import *
+from functools import lru_cache
+from eval.eval_script import *
+# Alias for sys.maxsize.
+MAX_INT = sys.maxsize
+# Marker string for an invalid answer (no use is visible in this module).
+INVALID_ANS = "[Invalid]"
+# Finite stand-in for infinity; prep_evaluator starts its running maximum at -INF.
+INF = 1e9
+# Names exported by ``from compute_perp import *``.
+__all__ = [
+    "check_equal",
+    "check_equal_without_timeout",
+    "numberic_compare",
+    "Evaluator",
+]
+@lru_cache(maxsize=1000000)
+def check_equal_without_timeout(ans_1, ans_2):
+    """Return ``math_equal(ans_1, ans_2)``, memoised per argument pair.
+    Because of ``lru_cache`` both arguments must be hashable.
+    """
+    return math_equal(ans_1, ans_2)
+def check_equal(ans_1, ans_2, cache_dict=None):
+    try:
+        if cache_dict is not None:
+            key = str(ans_1) + "<##>" + str(ans_2)
+            if key in cache_dict: return cache_dict[key]
+            print("Miss")
+        return check_equal_without_timeout(ans_1, ans_2)
+    except TimeoutError as e:
+        return False
+def numberic_compare(ai, aj, ci, cj, cache_dict=None):
+    """Compare two answers with ``check_equal``.
+    ``ci`` and ``cj`` (the completions that produced the answers) are accepted
+    so the signature matches what the evaluators call, but they are not used.
+    """
+    return check_equal(ai, aj, cache_dict)
+def prep_evaluator(
+    predicts, completions, perplexities, answer, equal_func, check_equal
+):
+    m = len(predicts)
+    # Compute maximum probability
+    max_perplexity = -INF
+    max_perplexity_count = 0.0
+    for i in range(m):
+        if perplexities[i] > max_perplexity:
+            max_perplexity = perplexities[i]
+            max_perplexity_count = 0.0
+        if perplexities[i] >= max_perplexity:
+            max_perplexity_count += 1.0
+    # Compute accuracy
+    correct, answers = 0, []
+    for i in range(m):
+        ans_i = predicts[i]
+        answers.append([ans_i, np.exp(perplexities[i]), check_equal(ans_i, answer)])
+        if perplexities[i] < max_perplexity: continue
+        if check_equal(ans_i, answer):
+            correct += 1.0 / max_perplexity_count
+    return correct, answers
+class Evaluator:
+    def __init__(self):
+        self.name = "Perplexity"
+    def process(self, json_file, cache_file, equal_func, evaluator, K, seed=0):
+        # with open(file_path, 'r', encoding='utf-8') as f:
+        #     results = json.load(f)
+        results = json_file
+        n = len(results["predict"])
+        m = len(results["predict"][0])
+        indices = list(range(m))
+        random.seed(seed)
+        random.shuffle(indices)
+        indices = indices[: K]
+        if cache_file is not None:
+            def cache_equal_func(ai, aj, ci, cj):
+                return equal_func(ai, aj, ci, cj, cache_file)
+            def cache_check_equal(ai, aj):
+                return check_equal(ai, aj, cache_file)
+        else:
+            cache_equal_func = equal_func
+            cache_check_equal = check_equal
+        predicts, completions, perplexities, answers = [], [], [], []
+        for i in range(0, n):
+            predicts.append([results["predict"][i][j] for j in indices])
+            completions.append([results["completion"][i][j] for j in indices])
+            perplexities.append([results["mean_logprob"][i][j] for j in indices])
+            answers.append(results["answer"][i])
+        n = len(predicts)
+        start_time = time.time()
+        outputs = []
+        for idx in trange(n):
+            res = evaluator(
+                predicts[idx],
+                completions[idx],
+                perplexities[idx],
+                answers[idx],
+                cache_equal_func,
+                cache_check_equal,
+            )
+            outputs.append(res)
+        print(f"Running Time with Single Process Mode with Seed #{seed}: {time.time() - start_time:.2f}S")
+        for i in trange(n):
+            m = len(outputs[i][1])
+            for j in range(m):
+                ans, prob, flag = outputs[i][1][j]
+        maximum, max_bins = metrics.compute_maximum_metrics([x[1] for x in outputs])
+        average, avg_bins = metrics.compute_average_metrics([x[1] for x in outputs])
+        accs = np.mean([x[0] for x in outputs])
+        return accs * 100.0, maximum, average, max_bins, avg_bins
+    def worker(self, args):
+        json_file, cache_file, K, seed = args
+        acc, maximum, average, max_bins, avg_bins = self.process(
+            json_file=json_file,
+            cache_file=cache_file,
+            equal_func=numberic_compare,
+            evaluator=prep_evaluator,
+            K=K,
+            seed=seed
+        )
+        return acc, maximum, average
+    def solve(self, json_file, cache_file=None, repeats=10, K=128):
+        accs, maxs, avgs = [], [], []
+        with multiprocessing.Pool() as pool:
+            results = pool.map(self.worker, [(json_file, cache_file, K, seed) for seed in range(repeats)])
+        accs, maxs, _ = zip(*results)
+        accs, maxs = np.array(accs), np.array(maxs)
+        return {
+            "Accuracy": f"{accs.mean():.2f} ± {accs.std():.2f}",
+            "ECE": f"{maxs[:, 0].mean() * 100.0:.2f} ± {maxs[:, 0].std() * 100.0:.2f}",
+        }

compute_rpc.py ADDED Viewed

	@@ -0,0 +1,136 @@

+import json
+import metrics
+import argparse
+import numpy as np
+import multiprocessing
+from tqdm import trange
+import signal, functools
+from scipy.special import gamma
+import re, os, sys, random, time
+from scipy.stats import weibull_min
+from scipy.optimize import minimize
+from fraction import Fraction
+from data_processing.answer_extraction import *
+from eval.eval_script import *
+from compute_perp import Evaluator, numberic_compare
+from compute_sc import DSU
+MAX_INT = sys.maxsize
+INVALID_ANS = "[Invalid]"
+#### Reasoning Pruning Module: Model probability with Weibull distribution ####
+def weibull_pdf(x, k, lam):
+    return (k / lam) * (x / lam) ** (k - 1) * np.exp(-((x / lam) ** k))
+def weibull_mean(k, lam):
+    return lam * gamma(1 + 1 / k)
+def mixture_pdf(x, w1, k1, lam1, k2, lam2):
+    return w1 * weibull_pdf(x, k1, lam1) + (1 - w1) * weibull_pdf(x, k2, lam2)
+def neg_log_likelihood(params, data):
+    w1, k1, lam1, k2, lam2 = params
+    pdf_vals = mixture_pdf(data, w1, k1, lam1, k2, lam2)
+    return -np.sum(np.log(pdf_vals))
+def calculate_membership_probabilities(data, w1, k1, lam1, k2, lam2):
+    pdf1 = weibull_pdf(data, k1, lam1)
+    pdf2 = weibull_pdf(data, k2, lam2)
+    prob1 = w1 * pdf1 / (w1 * pdf1 + (1 - w1) * pdf2)
+    prob2 = 1 - prob1
+    return prob1, prob2
+### Perplexity Consistency Module: Bridging the probability with self-consistency ####
+def wpc_evaluator(predicts, completions, perplexities, answer, equal_func, check_equal):
+    m = len(predicts)
+    dsu = DSU(m)
+    probas = [np.exp(perplexities[i]) for i in range(m)]
+    mean_proba = np.mean(probas)
+    # Model probability with Weibull distribution
+    initial_guess = [0.5, 1.0, 1.0, 1.5, 2.0]
+    result = minimize(
+        neg_log_likelihood,
+        initial_guess,
+        args=(probas,),
+        bounds=[(0.2, 0.8), (0.01, None), (0.01, None), (0.01, None), (0.01, None)],
+    )
+    w1, k1, lam1, k2, lam2 = result.x
+    if weibull_mean(k1, lam1) < weibull_mean(k2, lam2):
+        k1, lam1, k2, lam2 = k2, lam2, k1, lam1
+        w1 = 1 - w1
+    # Pruning reasoning paths with low probabilities
+    remove = 0
+    for i in range(m):
+        completion_i = completions[i]
+        logprob_i = perplexities[i]
+        proba_i = np.exp(logprob_i)
+        p1, p2 = calculate_membership_probabilities(proba_i, w1, k1, lam1, k2, lam2)
+        if p1 < p2 and proba_i < mean_proba:
+            proba_i = 0
+            remove += 1
+        else:
+            dsu.attr[i][completion_i] = set([proba_i])
+    # Combining internal probabilities and self-consistency
+    for i in range(m):
+        if dsu.get_father(i) != i:
+            continue
+        for j in range(i):
+            ans_i = predicts[i]
+            ans_j = predicts[j]
+            completion_i = completions[i]
+            completion_j = completions[j]
+            if equal_func(ans_i, ans_j, completion_i, completion_j):
+                dsu.merge(i, j)
+    # Compute majority votes with probabilities
+    max_prob, max_prob_count = 0, 0
+    for i in range(m):
+        if dsu.get_father(i) != i:
+            continue
+        prob_i = np.sum([np.sum(list(dsu.attr[i][k])) for k in dsu.attr[i].keys()])
+        if prob_i > max_prob:
+            max_prob = prob_i
+            max_prob_count = 0
+        if prob_i >= max_prob:
+            max_prob_count += 1
+    # Compute accuracy
+    correct, answers = 0, []
+    for i in range(m):
+        if dsu.get_father(i) != i:
+            continue
+        ans_i = predicts[i]
+        prob_i = np.sum([np.sum(list(dsu.attr[i][k])) for k in dsu.attr[i].keys()])
+        answers.append([ans_i, prob_i, check_equal(ans_i, answer)])
+        if prob_i < max_prob:
+            continue
+        if check_equal(ans_i, answer):
+            correct += 1.0 / max_prob_count
+    # Normalize probabilities
+    sum_proba = np.sum([x[1] for x in answers])
+    for i in range(len(answers)):
+        answers[i][1] /= sum_proba
+    return correct, answers
+class RPCEvaluator(Evaluator):
+    def __init__(self,):
+        self.name = "RPC"
+    def worker(self, args):
+        json_file, cache_file, K, seed = args
+        acc, maximum, average, max_bins, avg_bins = self.process(
+            json_file=json_file,
+            cache_file=cache_file,
+            equal_func=numberic_compare,
+            evaluator=wpc_evaluator,
+            K=K,
+            seed=seed
+        )
+        return acc, maximum, average

compute_sc.py ADDED Viewed

	@@ -0,0 +1,108 @@

+import json
+import metrics
+import argparse
+import numpy as np
+import multiprocessing
+from tqdm import trange
+import signal, functools
+import re, os, sys, random, time
+from fraction import Fraction
+from data_processing.answer_extraction import *
+from eval.eval_script import *
+from compute_perp import Evaluator, numberic_compare
+MAX_INT = sys.maxsize
+INVALID_ANS = "[Invalid]"
+__all__ = ["DSU"]
+class DSU:
+    def __init__(self, n):
+        self.n = n
+        self.father = [i for i in range(n)]
+        self.size = [1 for i in range(n)]
+        self.attr = [{} for i in range(n)]
+    def get_father(self, x):
+        if self.father[x] == x:
+            return x
+        self.father[x] = self.get_father(self.father[x])
+        return self.father[x]
+    def merge(self, x, y):
+        fx = self.get_father(x)
+        fy = self.get_father(y)
+        if fx == fy:
+            return
+        self.father[fy] = fx
+        self.size[fx] += self.size[fy]
+        self.size[fy] = 0
+        for key in self.attr[fy].keys():
+            if key not in self.attr[fx]:
+                self.attr[fx][key] = self.attr[fy][key]
+            else:
+                self.attr[fx][key] |= self.attr[fy][key]
+        self.attr[fy] = {}
+def sc_evaluator(predicts, completions, perplexities, answer, equal_func, check_equal):
+    m = len(predicts)
+    dsu = DSU(m)
+    # Merge answer for self-consistency
+    for i in range(m):
+        if dsu.get_father(i) != i:
+            continue
+        for j in range(i):
+            ans_i = predicts[i]
+            ans_j = predicts[j]
+            completion_i = completions[i]
+            completion_j = completions[j]
+            if equal_func(ans_i, ans_j, completion_i, completion_j):
+                dsu.merge(i, j)
+    # Compute majority votes
+    max_size, max_size_count = 0, 0
+    for i in range(m):
+        if dsu.get_father(i) != i:
+            continue
+        if dsu.size[i] > max_size:
+            max_size = dsu.size[i]
+            max_size_count = 0
+        if dsu.size[i] == max_size:
+            max_size_count += 1
+    # Compute accuracy
+    correct, answers = 0, []
+    for i in range(m):
+        if dsu.get_father(i) != i:
+            continue
+        ans_i = predicts[i]
+        answers.append([ans_i, dsu.size[i] / m, check_equal(ans_i, answer)])
+        if dsu.size[i] < max_size:
+            continue
+        if check_equal(ans_i, answer):
+            correct += 1.0 / max_size_count
+    # Normalize probabilities
+    sum_proba = np.sum([x[1] for x in answers])
+    for i in range(len(answers)):
+        answers[i][1] /= sum_proba
+    return correct, answers
+class SCEvaluator(Evaluator):
+    def __init__(self):
+        self.name = "Self-Consistency"
+    def worker(self, args):
+        json_file, cache_file, K, seed = args
+        acc, maximum, average, max_bins, avg_bins = self.process(
+            json_file=json_file,
+            cache_file=cache_file,
+            equal_func=numberic_compare,
+            evaluator=sc_evaluator,
+            K=K,
+            seed=seed
+        )
+        return acc, maximum, average

data_processing/answer_extraction.py ADDED Viewed

	@@ -0,0 +1,362 @@

+import re
+import regex
+def _fix_fracs(string):
+    substrs = string.split("\\frac")
+    new_str = substrs[0]
+    if len(substrs) > 1:
+        substrs = substrs[1:]
+        for substr in substrs:
+            new_str += "\\frac"
+            if len(substr) > 0 and substr[0] == "{":
+                new_str += substr
+            else:
+                try:
+                    assert len(substr) >= 2
+                except:
+                    return string
+                a = substr[0]
+                b = substr[1]
+                if b != "{":
+                    if len(substr) > 2:
+                        post_substr = substr[2:]
+                        new_str += "{" + a + "}{" + b + "}" + post_substr
+                    else:
+                        new_str += "{" + a + "}{" + b + "}"
+                else:
+                    if len(substr) > 2:
+                        post_substr = substr[2:]
+                        new_str += "{" + a + "}" + b + post_substr
+                    else:
+                        new_str += "{" + a + "}" + b
+    string = new_str
+    return string
+def _fix_a_slash_b(string):
+    if len(string.split("/")) != 2:
+        return string
+    a = string.split("/")[0]
+    b = string.split("/")[1]
+    try:
+        if "sqrt" not in a:
+            a = int(a)
+        if "sqrt" not in b:
+            b = int(b)
+        assert string == "{}/{}".format(a, b)
+        new_string = "\\frac{" + str(a) + "}{" + str(b) + "}"
+        return new_string
+    except:
+        return string
+def _fix_sqrt(string):
+    _string = re.sub(r"\\sqrt(-?[0-9.a-zA-Z]+)", r"\\sqrt{\1}", string)
+    _string = re.sub(r"\\sqrt\s+(\w+)$", r"\\sqrt{\1}", _string)
+    return _string
+def _fix_tan(string):
+    _string = re.sub(r"\\tan(-?[0-9.a-zA-Z]+)", r"\\tan{\1}", string)
+    _string = re.sub(r"\\tan\s+(\w+)$", r"\\tan{\1}", _string)
+    return _string
+def strip_string(string):
+    string = str(string).strip()
+    # linebreaks
+    string = string.replace("\n", "")
+    # right "."
+    string = string.rstrip(".")
+    # remove inverse spaces
+    string = string.replace("\\!", "")
+    # string = string.replace("\\ ", "")
+    # replace \\ with \
+    # string = string.replace("\\\\", "\\")
+    # string = string.replace("\\\\", "\\")
+    if string.startswith("\\text{") and string.endswith("}"):
+        string = string.split("{", 1)[1][:-1]
+    # replace tfrac and dfrac with frac
+    string = string.replace("tfrac", "frac")
+    string = string.replace("dfrac", "frac")
+    string = string.replace("cfrac", "frac")
+    # remove \left and \right
+    string = string.replace("\\left", "")
+    string = string.replace("\\right", "")
+    # Remove unit: miles, dollars if after is not none
+    _string = re.sub(r"\\text{.*?}$", "", string).strip()
+    if _string != "" and _string != string:
+        # print("Warning: unit not removed: '{}' -> '{}'".format(string, _string))
+        string = _string
+    # Remove circ (degrees)
+    string = string.replace("^{\\circ}", "").strip()
+    string = string.replace("^\\circ", "").strip()
+    string = regex.sub(r"\{(c|m)?m\}(\^(2|3))?", "", string).strip()
+    string = regex.sub(r"p\.m\.$", "", string).strip()
+    string = regex.sub(r"(\d)\s*t$", r"\1", string).strip()
+    # remove dollar signs
+    string = string.replace("\\$", "")
+    string = string.replace("$", "")
+    # string = string.replace("\\text", "")
+    string = string.replace("x\\in", "")
+    # remove percentage
+    string = string.replace("\\%", "%")
+    string = string.replace("\%", "%")
+    # string = string.replace("%", "")
+    # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
+    string = string.replace(" .", " 0.")
+    string = string.replace("{.", "{0.")
+    # cdot
+    string = string.replace("\\cdot", "")
+    # inf
+    string = string.replace("infinity", "\\infty")
+    if "\\infty" not in string:
+        string = string.replace("inf", "\\infty")
+    string = string.replace("+\\inity", "\\infty")
+    # and
+    # string = string.replace("and", "")
+    string = string.replace("\\mathbf", "")
+    string = string.replace("\\mathrm", "")
+    # use regex to remove \mbox{...}
+    string = re.sub(r"\\mbox{.*?}", "", string)
+    # quote
+    string.replace("'", "")
+    string.replace('"', "")
+    # i, j
+    if "j" in string and "i" not in string:
+        string = string.replace("j", "i")
+    # replace a.000b where b is not number or b is end, with ab, use regex
+    string = re.sub(r"(\d+)\.0+([^\d])", r"\1\2", string)
+    string = re.sub(r"(\d+)\.0+$", r"\1", string)
+    # if empty, return empty string
+    if len(string) == 0:
+        return string
+    if string[0] == ".":
+        string = "0" + string
+    # to consider: get rid of e.g. "k = " or "q = " at beginning
+    # if len(string.split("=")) == 2:
+    #     if len(string.split("=")[0]) <= 2:
+    #         string = string.split("=")[1]
+    string = _fix_sqrt(string)
+    string = _fix_tan(string)
+    string = string.replace(" ", "")
+    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
+    string = _fix_fracs(string)
+    # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
+    string = _fix_a_slash_b(string)
+    string = regex.sub(r"(\\|,|\.)+$", "", string)
+    return string
+def extract_boxed_answers(text):
+    answers = []
+    for piece in text.split("boxed{")[1:]:
+        n = 0
+        for i in range(len(piece)):
+            if piece[i] == "{":
+                n += 1
+            elif piece[i] == "}":
+                n -= 1
+                if n < 0:
+                    if i + 1 < len(piece) and piece[i + 1] == "%":
+                        answers.append(piece[: i + 1])
+                    else:
+                        answers.append(piece[:i])
+                    break
+    return answers
+def extract_program_output(pred_str):
+    """
+    extract output between the last ```output\n...\n```
+    """
+    if "```output" not in pred_str:
+        return ""
+    if "```output" in pred_str:
+        pred_str = pred_str.split("```output")[-1]
+    if "```" in pred_str:
+        pred_str = pred_str.split("```")[0]
+    output = pred_str.strip()
+    return output
+def extract_answer(pred_str, exhaust=False):
+    pred = []
+    if "final answer is $" in pred_str and "$. I hope" in pred_str:
+        tmp = pred_str.split("final answer is $", 1)[1]
+        pred = [tmp.split("$. I hope", 1)[0].strip()]
+    elif "boxed" in pred_str:
+        pred = extract_boxed_answers(pred_str)
+    elif "he answer is" in pred_str:
+        pred = [pred_str.split("he answer is")[-1].strip()]
+    else:
+        program_output = extract_program_output(pred_str)
+        if program_output != "":
+            # fall back to program
+            pred.append(program_output)
+        else:  # use the last number
+            pattern = "-?\d*\.?\d+"
+            ans = re.findall(pattern, pred_str.replace(",", ""))
+            if len(ans) >= 1:
+                ans = ans[-1]
+            else:
+                ans = ""
+            if ans:
+                pred.append(ans)
+    # multiple line
+    _pred = []
+    for ans in pred:
+        ans = ans.strip().split("\n")[0]
+        ans = ans.lstrip(":")
+        ans = ans.rstrip(".")
+        ans = ans.rstrip("/")
+        ans = strip_string(ans)
+        _pred.append(ans)
+    if exhaust:
+        return _pred
+    else:
+        return _pred[-1] if _pred else ""
+def extract_math_answer(question, reasoning, task):
+    answer = []
+    for ans in extract_answer(reasoning, exhaust=True):
+        if "separated by commas" in question and all(ch not in ans for ch in "()[]"):
+            answer.extend([a.strip() for a in ans.split(",")])
+        elif regex.search(r"\\text\{\s*and\s*\}", ans):
+            answer.extend(
+                [
+                    a.strip()
+                    for a in regex.sub(r"\\text\{\s*and\s*\}", "[SEP]", ans).split(
+                        "[SEP]"
+                    )
+                ]
+            )
+        else:
+            answer.append(ans.strip())
+    return answer
+def extract_math_few_shot_cot_answer(question, reasoning, task):
+    if "Problem:" in reasoning:
+        reasoning = reasoning.split("Problem:", 1)[0]
+    return extract_math_answer(question, reasoning, task)
+def extract_last_single_answer(question, reasoning, task):
+    return extract_answer(reasoning, exhaust=False)
+def extract_gsm_few_shot_cot_answer(question, reasoning, task):
+    if "Q: " in reasoning:
+        reasoning = reasoning.split("Q: ", 1)[0]
+    pred = [s for s in regex.findall(r"-?\d+\.?\d*", reasoning)]
+    if pred:
+        return pred[-1]
+    else:
+        return "[invalid]"
+def extract_agieval_gaokao_mathcloze_few_shot_cot_test(question, reasoning, task):
+    if "问题 " in reasoning:
+        reasoning = reasoning.split("问题 ", 1)[0]
+    if "答案是" in reasoning:
+        ans = reasoning.split("答案是", 1)[1].strip()
+        ans = ans.split("\n")[0].strip()
+        ans = [ans.strip("$")]
+    else:
+        ans = ["placeholder"]
+    return ans
+def extract_agieval_gaokao_mathqa_few_shot_cot_test(question, reasoning, task):
+    if "问题 " in reasoning:
+        reasoning = reasoning.split("问题 ", 1)[0]
+    if "答案是" in reasoning:
+        ans = reasoning.split("答案是", 1)[1].strip()
+        ans = ans.split("\n")[0].strip()
+    else:
+        ans = "placeholder"
+    return ans
+def extract_sat_few_shot_answer(question, reasoning, task):
+    if "Problem:" in reasoning:
+        reasoning = reasoning.split("Problem:", 1)[0]
+    patt = regex.search(r"the final answer is \(?(?P<ans>[abcd])\)?", reasoning.lower())
+    if patt is not None:
+        return patt.group("ans").upper()
+    return "placeholder"
+def extract_ocwcourses_few_shot_answer(question, reasoning, task):
+    if "Problem:" in reasoning:
+        reasoning = reasoning.split("Problem:", 1)[0]
+    patt = regex.search(
+        r"final answer is (?P<ans>.*)\. I hope it is correct.", reasoning
+    )
+    if patt is None:
+        pred = "[invalid]"
+        print(f"DEBUG >>>\n{reasoning}", flush=True)
+    else:
+        pred = patt.group("ans")
+    return pred
+def extract_mmlu_stem(question, reasoning, task):
+    if "Problem:" in reasoning:
+        reasoning = reasoning.split("Problem:", 1)[0]
+    return extract_sat_few_shot_answer(question, reasoning, task)
+def extract_minif2f_isabelle(question, reasoning, task):
+    if "Informal:" in reasoning:
+        reasoning = reasoning.split("Informal:", 1)[0]
+    return reasoning.strip()
+def extract_cmath_few_shot_test(question, reasoning, task):
+    if "问题：" in reasoning:
+        reasoning = reasoning.split("问题：", 1)[0]
+    if "答案是" in reasoning:
+        ans = reasoning.split("答案是", 1)[1].strip()
+        ans = ans.split("\n")[0]
+        ans = ans.strip("：")
+        ans = ans.strip("。")
+        try:
+            ans = [s for s in regex.findall(r"-?\d+\.?\d*", ans)][-1]
+        except:
+            print(f"DEBUG CMATH: {reasoning}", flush=True)
+            ans = "[invalid]"
+    else:
+        ans = extract_last_single_answer(question, reasoning, task)
+    return ans

data_processing/process_utils.py ADDED Viewed

	@@ -0,0 +1,191 @@

+import regex
+from data_processing.answer_extraction import extract_math_answer, strip_string
+def process_gsm8k_test(item):
+    sample = {
+        "dataset": "gsm8k-cot",
+        "id": item["id"],
+        "messages": [
+            {"role": "user", "content": item["question"]},
+            {
+                "role": "assistant",
+                "content": regex.sub(r"<<[^<>]*>>", "", item["cot"])
+                + "\nSo the answer is $\\boxed{"
+                + item["answer"].strip()
+                + "}$.",
+            },
+        ],
+        "answer": item["answer"].replace(",", ""),
+    }
+    yield sample
+def process_math_test(item):
+    question = item["problem"]
+    try:
+        answer = extract_math_answer(question, item["solution"], task="cot")
+    except:
+        return
+    sample = {
+        "dataset": "math-cot",
+        "id": item["id"],
+        "level": item["level"],
+        "type": item["type"],
+        "category": item["category"],
+        "messages": [
+            {"role": "user", "content": question},
+            {
+                "role": "assistant",
+                "content": "\n".join(
+                    regex.split(r"(?<=\.) (?=[A-Z])", item["solution"])
+                ),
+            },
+        ],
+        "answer": answer,
+    }
+    yield sample
+def process_math_sat(item):
+    options = item["options"].strip()
+    assert "A" == options[0]
+    options = "(" + options
+    for ch in "BCDEFG":
+        if f" {ch}) " in options:
+            options = regex.sub(f" {ch}\) ", f" ({ch}) ", options)
+    question = f"{item['question'].strip()}\nWhat of the following is the right choice? Explain your answer.\n{options.strip()}"
+    messages = [
+        {"role": "user", "content": question},
+        {"role": "assistant", "content": item["Answer"]},
+    ]
+    item = {
+        "dataset": "math_sat",
+        "id": item["id"],
+        "language": "en",
+        "messages": messages,
+        "answer": item["Answer"],
+    }
+    yield item
+def process_ocwcourses(item):
+    messages = [
+        {"role": "user", "content": item["problem"].strip()},
+        {"role": "assistant", "content": item["solution"].strip()},
+    ]
+    item = {
+        "dataset": "OCWCourses",
+        "id": item["id"],
+        "language": "en",
+        "messages": messages,
+        "answer": item["answer"],
+    }
+    yield item
+def process_mmlu_stem(item):
+    options = item["options"]
+    for i, (label, option) in enumerate(zip("ABCD", options)):
+        options[i] = f"({label}) {str(option).strip()}"
+    options = ", ".join(options)
+    question = f"{item['question'].strip()}\nWhat of the following is the right choice? Explain your answer.\n{options}"
+    messages = [
+        {"role": "user", "content": question},
+        {"role": "assistant", "content": item["answer"]},
+    ]
+    item = {
+        "dataset": "MMLU-STEM",
+        "id": item["id"],
+        "language": "en",
+        "messages": messages,
+        "answer": item["answer"],
+    }
+    yield item
+def process_mgsm_zh(item):
+    item["answer"] = item["answer"].replace(",", "")
+    yield item
+def process_cmath(item):
+    item = {
+        "dataset": "cmath",
+        "id": item["id"],
+        "grade": item["grade"],
+        "reasoning_step": item["reasoning_step"],
+        "messages": [
+            {"role": "user", "content": item["question"].strip()},
+            {"role": "assistant", "content": ""},
+        ],
+        "answer": item["golden"].strip().replace(",", ""),
+    }
+    yield item
+def process_agieval_gaokao_math_cloze(item):
+    item = {
+        "dataset": "agieval-gaokao-math-cloze",
+        "id": item["id"],
+        "messages": [
+            {"role": "user", "content": item["question"].strip()},
+            {"role": "assistant", "content": ""},
+        ],
+        "answer": [strip_string(ans) for ans in item["answer"].strip().split(";")],
+    }
+    yield item
+def process_agieval_gaokao_mathqa(item):
+    question = item["question"].strip()
+    options = []
+    for option in item["options"]:
+        option = option.strip()
+        assert option[0] == "("
+        assert option[2] == ")"
+        assert option[1] in "ABCD"
+        option = f"{option[1]}: {option[3:].strip()}"
+        options.append(option.strip())
+    question = f"{question}\n{options}"
+    item = {
+        "dataset": "agieval-gaokao-mathqa",
+        "id": item["id"],
+        "messages": [
+            {"role": "user", "content": question},
+            {"role": "assistant", "content": ""},
+        ],
+        "answer": item["label"],
+    }
+    yield item
+def process_agieval_gaokao_mathqa_few_shot_cot_test(item):
+    question = item["question"].strip().rstrip("\\")
+    options = " ".join([opt.strip() for opt in item["options"]])
+    question = f"{question}\n从以下选项中选择:    {options}"
+    item = {
+        "dataset": "agieval-gaokao-mathqa",
+        "id": item["id"],
+        "messages": [
+            {"role": "user", "content": question},
+            {"role": "assistant", "content": ""},
+        ],
+        "answer": item["label"],
+    }
+    yield item
+def process_minif2f_isabelle(item):
+    question = f"(*### Problem\n\n{item['informal_statement'].strip()}\n\n### Solution\n\n{item['informal_proof'].strip()} *)\n\nFormal:\n{item['formal_statement'].strip()}"
+    item = {
+        "dataset": "minif2f-isabelle",
+        "id": item["id"],
+        "messages": [
+            {"role": "user", "content": question},
+            {"role": "assistant", "content": ""},
+        ],
+        "answer": "placeholder",
+    }
+    yield item

eval/eval_script.py ADDED Viewed

	@@ -0,0 +1,190 @@

+import regex
+from copy import deepcopy
+from eval.eval_utils import math_equal
+from eval.ocwcourses_eval_utils import (
+    normalize_numeric,
+    numeric_equality,
+    normalize_symbolic_equation,
+    SymbolicMathMixin,
+)
+def is_correct(item, pred_key="prediction", prec=1e-3):
+    pred = item[pred_key]
+    ans = item["answer"]
+    if isinstance(pred, list) and isinstance(ans, list):
+        pred_matched = set()
+        ans_matched = set()
+        for i in range(len(pred)):
+            for j in range(len(ans)):
+                item_cpy = deepcopy(item)
+                item_cpy.update({pred_key: pred[i], "answer": ans[j]})
+                if is_correct(item_cpy, pred_key=pred_key, prec=prec):
+                    pred_matched.add(i)
+                    ans_matched.add(j)
+                    if item_cpy[pred_key] == "2,3,4":
+                        print(item, flush=True)
+                        print("wtf", flush=True)
+        return len(pred_matched) == len(pred) and len(ans_matched) == len(ans)
+    elif isinstance(pred, str) and isinstance(ans, str):
+        if "\\cup" in pred and "\\cup" in ans:
+            item = deepcopy(item)
+            item.update(
+                {
+                    pred_key: pred.split("\\cup"),
+                    "answer": ans.split("\\cup"),
+                }
+            )
+            return is_correct(item, pred_key=pred_key, prec=prec)
+        else:
+            label = False
+            try:
+                label = (
+                    abs(
+                        float(regex.sub(r",", "", str(pred)))
+                        - float(regex.sub(r",", "", str(ans)))
+                    )
+                    < prec
+                )
+            except:
+                pass
+            label = label or (ans and pred == ans) or math_equal(pred, ans)
+            return label
+    else:
+        print(item, flush=True)
+        raise NotImplementedError()
+def eval_math(item, pred_key="prediction", prec=1e-3):
+    pred = item[pred_key]
+    if pred_key == "program_output" and isinstance(pred, str):
+        pred = [pred]
+    ans = item["answer"]
+    if isinstance(pred, list) and isinstance(ans, list):
+        # for some questions in MATH, `reference` repeats answers
+        _ans = []
+        for a in ans:
+            if a not in _ans:
+                _ans.append(a)
+        ans = _ans
+        # some predictions for MATH questions also repeats answers
+        _pred = []
+        for a in pred:
+            if a not in _pred:
+                _pred.append(a)
+        # some predictions mistakenly box non-answer strings
+        pred = _pred[-len(ans) :]
+    item.update({pred_key: pred, "answer": ans})
+    return is_correct(item, pred_key=pred_key, prec=prec)
+def eval_last_single_answer(item, pred_key="prediction", prec=1e-3):
+    for key in [pred_key, "answer"]:
+        assert isinstance(item[key], str), f"{key} = `{item[key]}` is not a str"
+    return is_correct(item, pred_key=pred_key, prec=prec)
+def eval_agieval_gaokao_math_cloze(item, pred_key="prediction", prec=1e-3):
+    """
+    Score a Gaokao cloze prediction: every reference blank must be matched,
+    in order, by the corresponding trailing predicted value.
+    Both `item[pred_key]` and `item["answer"]` must be lists (a string
+    "program_output" is wrapped first); AssertionError otherwise.
+    Note that `item` is modified in place while scoring.
+    """
+    if pred_key == "program_output" and isinstance(item[pred_key], str):
+        item[pred_key] = [item[pred_key]]
+    for key in [pred_key, "answer"]:
+        assert isinstance(item[key], list), f"{key} = `{item[key]}` is not a list"
+    pred = item[pred_key]
+    ans = item["answer"]
+    # Split every prediction into distinct values: ";" always separates,
+    # "," separates only outside brackets.
+    _pred = []
+    for p in pred:
+        # Sentinel ";" guarantees the scan below always finds a separator.
+        p = p + ";"
+        while p:
+            left_brackets = 0
+            for i in range(len(p)):
+                if p[i] == ";" or (p[i] == "," and left_brackets == 0):
+                    # Cut off the value before the separator; keep scanning the rest.
+                    _p, p = p[:i].strip(), p[i + 1 :].strip()
+                    if _p not in _pred:
+                        _pred.append(_p)
+                    break
+                elif p[i] in "([{":
+                    left_brackets += 1
+                elif p[i] in ")]}":
+                    left_brackets -= 1
+    # Only the last len(ans) distinct values are compared with the reference.
+    pred = _pred[-len(ans) :]
+    if len(pred) == len(ans):
+        for p, a in zip(pred, ans):
+            # is_correct reads from `item`, so each pair is written into it.
+            item.update(
+                {
+                    pred_key: p,
+                    "answer": a,
+                }
+            )
+            if not is_correct(item, pred_key=pred_key, prec=prec):
+                return False
+        return True
+    else:
+        return False
+def eval_agieval_gaokao_mathqa(item, pred_key="prediction", prec=1e-3):
+    if pred_key == "program_output" and isinstance(item[pred_key], str):
+        item[pred_key] = [item[pred_key]]
+    pred_str = " ".join(item[pred_key])
+    ans = item["answer"]
+    tag = None
+    idx = -1
+    for t in "ABCD":
+        if t in pred_str and pred_str.index(t) > idx:
+            tag = t
+            idx = pred_str.index(t)
+    return tag == ans
+def eval_math_sat(item, pred_key="prediction", prec=1e-3):
+    for key in [pred_key, "answer"]:
+        assert isinstance(item[key], str), f"{key} = `{item[key]}` is not a str"
+    return item[pred_key].lower() == item["answer"].lower()
+def eval_mmlu_stem(item, pred_key="prediction", prec=1e-3):
+    return eval_math_sat(item, pred_key=pred_key, prec=prec)
+def eval_ocwcourses(item, pred_key="prediction", prec=1e-3):
+    INVALID_ANSWER = "[invalidanswer]"
+    for key in [pred_key, "answer"]:
+        assert isinstance(item[key], str), f"{key} = `{item[key]}` is not a str"
+    pred = item[pred_key]
+    ans = item["answer"]
+    try:
+        float(ans)
+        normalize_fn = normalize_numeric
+        is_equiv = numeric_equality
+        answer_type = "numeric"
+    except ValueError:
+        if "=" in ans:
+            normalize_fn = normalize_symbolic_equation
+            is_equiv = lambda x, y: x == y
+            answer_type = "equation"
+        else:
+            normalize_fn = SymbolicMathMixin().normalize_tex
+            is_equiv = SymbolicMathMixin().is_tex_equiv
+            answer_type = "expression"
+    correct_answer = normalize_fn(ans)
+    unnormalized_answer = pred if pred else INVALID_ANSWER
+    model_answer = normalize_fn(unnormalized_answer)
+    if unnormalized_answer == INVALID_ANSWER:
+        acc = 0
+    elif model_answer == INVALID_ANSWER:
+        acc = 0
+    elif is_equiv(model_answer, correct_answer):
+        acc = 1
+    else:
+        acc = 0
+    return acc
+def eval_minif2f_isabelle(item, pred_key="prediction", prec=1e-3):
+    return True

eval/eval_utils.py ADDED Viewed

	@@ -0,0 +1,400 @@

+import multiprocessing
+from math import isclose
+import numpy as np
+from typing import Union, Any, Dict
+from sympy import simplify, N
+from sympy.parsing.sympy_parser import parse_expr
+from sympy.parsing.latex import parse_latex
+import re
+import regex
+from data_processing.answer_extraction import (
+    extract_answer,
+    extract_program_output,
+    strip_string,
+)
+def extract_program(result: str, last_only=True):
+    """
+    extract the program after "```python", and before "```"
+    """
+    program = ""
+    start = False
+    for line in result.split("\n"):
+        if line.startswith("```python"):
+            if last_only:
+                program = ""  # only extract the last program
+            else:
+                program += "\n# ========\n"
+            start = True
+        elif line.startswith("```"):
+            start = False
+        elif start:
+            program += line + "\n"
+    return program
+def parse_ground_truth(example: Dict[str, Any], data_name):
+    if "gt_cot" in example:
+        return example["gt_cot"], strip_string(example["gt"])
+    # parse ground truth
+    if data_name in ["math", "ocw"]:
+        gt_cot = example["solution"]
+        gt_ans = extract_answer(gt_cot)
+    elif data_name == "gsm8k":
+        gt_cot, gt_ans = example["answer"].split("####")
+    elif data_name == "gsm-hard":
+        gt_cot, gt_ans = example["code"], example["target"]
+    elif data_name == "svamp":
+        gt_cot, gt_ans = example["Equation"], example["Answer"]
+    elif data_name == "asdiv":
+        gt_cot = example["formula"]
+        gt_ans = re.sub(r"\(.*?\)", "", example["answer"])
+    elif data_name == "mawps":
+        gt_cot, gt_ans = None, example["target"]
+    elif data_name == "tabmwp":
+        gt_cot = example["solution"]
+        gt_ans = example["answer"]
+        if example["ans_type"] in ["integer_number", "decimal_number"]:
+            if "/" in gt_ans:
+                gt_ans = int(gt_ans.split("/")[0]) / int(gt_ans.split("/")[1])
+            elif "," in gt_ans:
+                gt_ans = float(gt_ans.replace(",", ""))
+            elif "%" in gt_ans:
+                gt_ans = float(gt_ans.split("%")[0]) / 100
+            else:
+                gt_ans = float(gt_ans)
+    elif data_name == "bbh":
+        gt_cot, gt_ans = None, example["target"]
+    else:
+        raise NotImplementedError(data_name)
+    # post process
+    gt_cot = str(gt_cot).strip()
+    gt_ans = strip_string(gt_ans)
+    return gt_cot, gt_ans
+def parse_question(example, data_name):
+    question = ""
+    if data_name == "asdiv":
+        question = f"{example['body'].strip()} {example['question'].strip()}"
+    elif data_name == "svamp":
+        body = example["Body"].strip()
+        if not body.endswith("."):
+            body = body + "."
+        question = f'{body} {example["Question"].strip()}'
+    elif data_name == "tabmwp":
+        title_str = (
+            f'regarding "{example["table_title"]}" ' if example["table_title"] else ""
+        )
+        question = f"Read the following table {title_str}and answer a question:\n"
+        question += f'{example["table"]}\n{example["question"]}'
+        if example["choices"]:
+            question += (
+                f' Please select from the following options: {example["choices"]}'
+            )
+    else:
+        for key in ["question", "problem", "Question", "input"]:
+            if key in example:
+                question = example[key]
+                break
+    assert question != ""
+    return question.strip()
+def run_execute(executor, result, prompt_type, execute=False):
+    if not result or result == "error":
+        return None, None
+    report = None
+    if "program_only" in prompt_type:
+        prediction = extract_program_output(result)
+    elif prompt_type in ["pot", "pal"] and execute:
+        code = extract_program(result)
+        prediction, report = executor.apply(code)
+    else:
+        prediction = extract_answer(result)
+    prediction = strip_string(prediction)
+    return prediction, report
+def parse_digits(num):
+    # format: 234.23 || 23%
+    num = regex.sub(",", "", str(num))
+    try:
+        return float(num)
+    except:
+        if num.endswith("%"):
+            num = num[:-1]
+            if num.endswith("\\"):
+                num = num[:-1]
+            try:
+                return float(num) / 100
+            except:
+                pass
+    return None
+def is_digit(num):
+    # paired with parse_digits
+    return parse_digits(num) is not None
+def normalize_prediction(prediction):
+    """
+    Normalise a prediction for comparison.
+    Values accepted by `is_digit` are rounded to 6 decimals; the string form
+    of the (possibly rounded) value is then returned.
+    NOTE(review): the `return str(prediction)` below sits at `try` level, not
+    inside the `if`, so the function returns there for every input unless
+    the `try` body raises. The symbolic section is only reached after such
+    an exception -- confirm whether that is intended.
+    """
+    try:  # 1. numerical equal
+        if is_digit(prediction):
+            prediction = np.round(float(str(prediction).replace(",", "")), 6)
+        return str(prediction)
+    except:
+        pass
+    # 2. symbolic equal
+    prediction = str(prediction).strip()
+    ## deal with [], (), {}
+    # NOTE(review): `brackets` is never appended to (`bracket` is assigned but
+    # unused), so both `if brackets` branches below can never run.
+    brackets = []
+    while (
+        prediction.startswith("[")
+        and prediction.endswith("]")
+        or (prediction.startswith("(") and prediction.endswith(")"))
+    ):
+        bracket = prediction[0]
+        prediction = prediction[1:-1]
+    if brackets and "," in prediction:
+        pred_parts = [normalize_prediction(part) for part in prediction.split(",")]
+        prediction = ",".join(pred_parts)
+    if brackets:
+        for b in reversed(brackets):
+            if b == "[":
+                prediction = "[" + prediction + "]"
+            else:
+                assert b == "("
+                prediction = "(" + prediction + ")"
+    def _parse(s):
+        # Try the LaTeX parser first, then the plain sympy parser; give the
+        # input back unchanged when both fail.
+        for f in [parse_latex, parse_expr]:
+            try:
+                return f(s)
+            except:
+                pass
+        return s
+    prediction = _parse(prediction)
+    # NOTE(review): when parsing succeeds `prediction` is no longer a str, so
+    # this is presumably not a plain string replace -- verify.
+    for s in ["{", "}", "(", ")"]:
+        prediction = prediction.replace(s, "")
+    return prediction
+def math_equal(
+    prediction: Union[bool, float, str],
+    reference: Union[float, str],
+    include_percentage: bool = True,
+    is_close: bool = True,
+    timeout: bool = False,
+) -> bool:
+    """
+    Exact match of math if and only if:
+    1. numerical equal: both can convert to float and are equal
+    2. symbolic equal: both can convert to sympy expression and are equal
+
+    Checks run in this order, returning True at the first success:
+    identical string forms; numeric comparison; element-wise comparison of
+    bracketed tuples/intervals; entry-wise comparison of pmatrix/bmatrix
+    matrices; equation handling; and finally `symbolic_equal`.
+
+    include_percentage: also accept the reference divided or multiplied by 100.
+    is_close: compare numbers with an absolute tolerance of 1e-3 instead of
+        exact equality.
+    timeout: run the final symbolic check through `call_with_timeout`.
+        The recursive calls below do not forward this flag.
+    """
+    if str(prediction) == str(reference):
+        return True
+    try:  # 1. numerical equal
+        if is_digit(prediction) and is_digit(reference):
+            prediction = parse_digits(prediction)
+            reference = parse_digits(reference)
+            # number questions
+            if include_percentage:
+                gt_result = [reference / 100, reference, reference * 100]
+            else:
+                gt_result = [reference]
+            for item in gt_result:
+                try:
+                    if is_close:
+                        if isclose(item, prediction, abs_tol=1e-3):
+                            return True
+                    else:
+                        if item == prediction:
+                            return True
+                except Exception:
+                    continue
+            # Both sides are numbers, so the numeric verdict is final.
+            return False
+    except:
+        pass
+    # Empty or None predictions can never match (0 and False are kept).
+    if not prediction and prediction not in [0, False]:
+        return False
+    # 2. symbolic equal
+    reference = str(reference).strip()
+    prediction = str(prediction).strip()
+    # Tuples / intervals: both start with "(" or "[" and contain a closing
+    # bracket; compare the comma-separated parts pairwise.
+    if (
+        regex.match(r"(\(|\[).+(\)|\])", prediction) is not None
+        and regex.match(r"(\(|\[).+(\)|\])", reference) is not None
+    ):
+        pred_parts = prediction[1:-1].split(",")
+        ref_parts = reference[1:-1].split(",")
+        if len(pred_parts) == len(ref_parts):
+            if all(
+                [
+                    math_equal(
+                        pred_parts[i], ref_parts[i], include_percentage, is_close
+                    )
+                    for i in range(len(pred_parts))
+                ]
+            ):
+                return True
+    # Matrices: both are wrapped in a pmatrix or bmatrix environment.
+    if (
+        (
+            prediction.startswith("\\begin{pmatrix}")
+            or prediction.startswith("\\begin{bmatrix}")
+        )
+        and (
+            prediction.endswith("\\end{pmatrix}")
+            or prediction.endswith("\\end{bmatrix}")
+        )
+        and (
+            reference.startswith("\\begin{pmatrix}")
+            or reference.startswith("\\begin{bmatrix}")
+        )
+        and (
+            reference.endswith("\\end{pmatrix}") or reference.endswith("\\end{bmatrix}")
+        )
+    ):
+        # The pmatrix and bmatrix delimiters have the same length, so one
+        # slice strips either kind. Rows are separated by "\\\\".
+        pred_lines = [
+            line.strip()
+            for line in prediction[
+                len("\\begin{pmatrix}") : -len("\\end{pmatrix}")
+            ].split("\\\\")
+            if line.strip()
+        ]
+        ref_lines = [
+            line.strip()
+            for line in reference[
+                len("\\begin{pmatrix}") : -len("\\end{pmatrix}")
+            ].split("\\\\")
+            if line.strip()
+        ]
+        matched = True
+        if len(pred_lines) == len(ref_lines):
+            for pred_line, ref_line in zip(pred_lines, ref_lines):
+                # Cells within a row are separated by "&".
+                pred_parts = pred_line.split("&")
+                ref_parts = ref_line.split("&")
+                if len(pred_parts) == len(ref_parts):
+                    if not all(
+                        [
+                            math_equal(
+                                pred_parts[i],
+                                ref_parts[i],
+                                include_percentage,
+                                is_close,
+                            )
+                            for i in range(len(pred_parts))
+                        ]
+                    ):
+                        matched = False
+                        break
+                else:
+                    matched = False
+                if not matched:
+                    break
+        else:
+            matched = False
+        if matched:
+            return True
+    # Equations on both sides: compare "lhs - (rhs)", accepting either sign.
+    if prediction.count("=") == 1 and reference.count("=") == 1:
+        pred = prediction.split("=")
+        pred = f"{pred[0].strip()} - ({pred[1].strip()})"
+        ref = reference.split("=")
+        ref = f"{ref[0].strip()} - ({ref[1].strip()})"
+        if symbolic_equal(pred, ref) or symbolic_equal(f"-({pred})", ref):
+            return True
+    # Only the prediction is an equation with a short left side (such as
+    # "x = 5"): compare its right side with the reference.
+    elif (
+        prediction.count("=") == 1
+        and len(prediction.split("=")[0].strip()) <= 2
+        and "=" not in reference
+    ):
+        if math_equal(
+            prediction.split("=")[1], reference, include_percentage, is_close
+        ):
+            return True
+    # Mirror case: only the reference is a short-left-side equation.
+    elif (
+        reference.count("=") == 1
+        and len(reference.split("=")[0].strip()) <= 2
+        and "=" not in prediction
+    ):
+        if math_equal(
+            prediction, reference.split("=")[1], include_percentage, is_close
+        ):
+            return True
+    # symbolic equal with sympy
+    if timeout:
+        if call_with_timeout(symbolic_equal_process, prediction, reference):
+            return True
+    else:
+        if symbolic_equal(prediction, reference):
+            return True
+    return False
+def math_equal_process(param):
+    return math_equal(param[-2], param[-1])
+def symbolic_equal(a, b):
+    def _parse(s):
+        for f in [parse_latex, parse_expr]:
+            try:
+                return f(s)
+            except:
+                pass
+        return s
+    a = _parse(a)
+    b = _parse(b)
+    try:
+        if simplify(a - b) == 0:
+            return True
+    except:
+        pass
+    try:
+        if isclose(N(a), N(b), abs_tol=1e-3):
+            return True
+    except:
+        pass
+    return False
+def symbolic_equal_process(a, b, output_queue):
+    result = symbolic_equal(a, b)
+    output_queue.put(result)
+def call_with_timeout(func, *args, timeout=1, **kwargs):
+    output_queue = multiprocessing.Queue()
+    process_args = args + (output_queue,)
+    process = multiprocessing.Process(target=func, args=process_args, kwargs=kwargs)
+    process.start()
+    process.join(timeout)
+    if process.is_alive():
+        process.terminate()
+        process.join()
+        return False
+    return output_queue.get()

eval/ocwcourses_eval_utils.py ADDED Viewed

	@@ -0,0 +1,266 @@

+import re
+import numpy as np
+import sympy
+from sympy.core.sympify import SympifyError
+from sympy.parsing.latex import parse_latex
+import signal
+INVALID_ANSWER = "[invalidanswer]"
+class timeout:
+    def __init__(self, seconds=1, error_message="Timeout"):
+        self.seconds = seconds
+        self.error_message = error_message
+    def handle_timeout(self, signum, frame):
+        raise TimeoutError(self.error_message)
+    def __enter__(self):
+        signal.signal(signal.SIGALRM, self.handle_timeout)
+        signal.alarm(self.seconds)
+    def __exit__(self, type, value, traceback):
+        signal.alarm(0)
+def normalize_numeric(s):
+    """
+    Convert an answer string to a float after removing known unit suffixes.
+    Returns None for a None input, the float value on success, and
+    INVALID_ANSWER when the text is neither a Python-evaluable number nor a
+    LaTeX expression that sympy reports as a number.
+    """
+    if s is None:
+        return None
+    # Units are removed by plain substring replacement, in list order.
+    for unit in [
+        "eV",
+        " \\mathrm{~kg} \\cdot \\mathrm{m} / \\mathrm{s}",
+        " kg m/s",
+        "kg*m/s",
+        "kg",
+        "m/s",
+        "m / s",
+        "m s^{-1}",
+        "\\text{ m/s}",
+        " \\mathrm{m/s}",
+        " \\text{ m/s}",
+        "g/mole",
+        "g/mol",
+        "\\mathrm{~g}",
+        "\\mathrm{~g} / \\mathrm{mol}",
+        "W",
+        "erg/s",
+        "years",
+        "year",
+        "cm",
+    ]:
+        s = s.replace(unit, "")
+        s = s.strip()
+    # Short units are only removed when wrapped in \mathrm{...}.
+    for maybe_unit in ["m", "s", "cm"]:
+        s = s.replace("\\mathrm{" + maybe_unit + "}", "")
+        s = s.replace("\\mathrm{~" + maybe_unit + "}", "")
+        s = s.strip()
+    s = s.strip("$")
+    try:
+        # NOTE(review): SECURITY -- eval() runs the answer text as Python.
+        # If `s` can come from model output this executes arbitrary code;
+        # consider a restricted numeric parser. The bare except below also
+        # catches non-Exception errors raised by the evaluated text.
+        return float(eval(s))
+    except:
+        try:
+            # Fall back to LaTeX parsing; only purely numeric results count.
+            expr = parse_latex(s)
+            if expr.is_number:
+                return float(expr)
+            return INVALID_ANSWER
+        except:
+            return INVALID_ANSWER
+def numeric_equality(n1, n2, threshold=0.01):
+    if n1 is None or n2 is None:
+        return False
+    if np.isclose(n1, 0) or np.isclose(n2, 0) or np.isclose(n1 - n2, 0):
+        return np.abs(n1 - n2) < threshold * (n1 + n2) / 2
+    else:
+        return np.isclose(n1, n2)
+def normalize_symbolic_equation(s):
+    if not isinstance(s, str):
+        return INVALID_ANSWER
+    if s.startswith("\\["):
+        s = s[2:]
+    if s.endswith("\\]"):
+        s = s[:-2]
+    s = s.replace("\\left(", "(")
+    s = s.replace("\\right)", ")")
+    s = s.replace("\\\\", "\\")
+    if s.startswith("$") or s.endswith("$"):
+        s = s.strip("$")
+    try:
+        maybe_expression = parse_latex(s)
+        if not isinstance(maybe_expression, sympy.core.relational.Equality):
+            # we have equation, not expression
+            return INVALID_ANSWER
+        else:
+            return maybe_expression
+    except:
+        return INVALID_ANSWER
+class SymbolicMathMixin:
+    """
+    Methods useful for parsing mathematical expressions from text and determining equivalence of expressions.
+    """
+    SUBSTITUTIONS = [  # (before, after) pairs applied in order by `normalize_tex`
+        ("an ", ""),
+        ("a ", ""),
+        (".$", "$"),
+        ("\\$", ""),
+        (r"\ ", ""),
+        (" ", ""),
+        ("mbox", "text"),
+        (",\\text{and}", ","),
+        ("\\text{and}", ","),
+        ("\\text{m}", "\\text{}"),
+    ]
+    REMOVED_EXPRESSIONS = [  # substrings deleted by `normalize_tex` after the substitutions
+        "square",
+        "ways",
+        "integers",
+        "dollars",
+        "mph",
+        "inches",
+        "ft",
+        "hours",
+        "km",
+        "units",
+        "\\ldots",
+        "sue",
+        "points",
+        "feet",
+        "minutes",
+        "digits",
+        "cents",
+        "degrees",
+        "cm",
+        "gm",
+        "pounds",
+        "meters",
+        "meals",
+        "edges",
+        "students",
+        "childrentickets",
+        "multiples",
+        "\\text{s}",
+        "\\text{.}",
+        "\\text{\ns}",
+        "\\text{}^2",
+        "\\text{}^3",
+        "\\text{\n}",
+        "\\text{}",
+        r"\mathrm{th}",
+        r"^\circ",
+        r"^{\circ}",
+        r"\;",
+        r",\!",
+        "{,}",
+        '"',
+        "\\dots",
+    ]
+    def normalize_tex(self, final_answer: str) -> str:
+        """
+        Normalizes a string representing a mathematical expression.
+        Used as a preprocessing step before parsing methods.
+        Copied character for character from appendix D of Lewkowycz et al. (2022)
+        """
+        # Keep only what follows the last "=" (e.g. "x = 5" -> " 5")
+        final_answer = final_answer.split("=")[-1]
+        for before, after in self.SUBSTITUTIONS:
+            final_answer = final_answer.replace(before, after)
+        for expr in self.REMOVED_EXPRESSIONS:
+            final_answer = final_answer.replace(expr, "")
+        # Extract answer that is in LaTeX math, is bold,
+        # is surrounded by a box, etc.
+        final_answer = re.sub(r"(.*?)(\$)(.*?)(\$)(.*)", "$\\3$", final_answer)
+        final_answer = re.sub(r"(\\text\{)(.*?)(\})", "\\2", final_answer)
+        final_answer = re.sub(r"(\\textbf\{)(.*?)(\})", "\\2", final_answer)
+        final_answer = re.sub(r"(\\overline\{)(.*?)(\})", "\\2", final_answer)
+        final_answer = re.sub(r"(\\boxed\{)(.*)(\})", "\\2", final_answer)
+        # Normalize shorthand TeX:
+        #  \fracab -> \frac{a}{b}
+        #  \frac{abc}{bef} -> \frac{abc}{bef}
+        #  \fracabc -> \frac{a}{b}c
+        #  \sqrta -> \sqrt{a}
+        #  \sqrtab -> sqrt{a}b
+        final_answer = re.sub(r"(frac)([^{])(.)", "frac{\\2}{\\3}", final_answer)
+        final_answer = re.sub(r"(sqrt)([^{])", "sqrt{\\2}", final_answer)
+        final_answer = final_answer.replace("$", "")
+        # Normalize 100,000 -> 100000
+        if final_answer.replace(",", "").isdigit():
+            final_answer = final_answer.replace(",", "")
+        return final_answer
+    def parse_tex(self, text: str, time_limit: int = 5) -> sympy.Basic:
+        """
+        Wrapper around `parse_latex` that outputs a SymPy expression.
+        The parse runs inside `timeout(seconds=time_limit)`; any exception raised
+        there is printed and turned into a `None` return value.
+        Typically, you want to apply `normalize_tex` as a preprocessing step.
+        """
+        try:
+            with timeout(seconds=time_limit):
+                parsed = parse_latex(text)
+        except (
+            # general error handling: there is a long tail of possible sympy/other
+            # errors we would like to catch
+            Exception
+        ) as e:
+            print(f"failed to parse {text} with exception {e}")
+            return None
+        return parsed
+    def is_exp_equiv(self, x1: sympy.Basic, x2: sympy.Basic, time_limit=5) -> bool:
+        """
+        Determines whether two sympy expressions are equal, i.e. whether their
+        difference simplifies to 0. Every failure (subtraction error, simplify
+        error, timeout, anything unexpected) is printed and reported as False.
+        """
+        try:
+            with timeout(seconds=time_limit):
+                try:
+                    diff = x1 - x2
+                except (SympifyError, ValueError, TypeError) as e:
+                    print(f"Couldn't subtract {x1} and {x2} with exception {e}")
+                    return False
+                try:
+                    return sympy.simplify(diff) == 0
+                except (SympifyError, ValueError, TypeError) as e:
+                    print(f"Failed to simplify {x1}-{x2} with {e}")
+                    return False
+        except TimeoutError:
+            print(f"Timed out comparing {x1} and {x2}")
+            return False
+        except Exception as e:
+            print(f"failed on unrecognized exception {e}")
+            return False
+    def is_tex_equiv(self, x1: str, x2: str, time_limit=5, *, use_sympy: bool = False) -> bool:
+        """
+        Determines whether two (ideally normalized using `normalize_tex`) TeX expressions are equal.
+        By default only string exact-match is checked; this is what the method has
+        always done in practice, because the sympy fallback used to sit behind an
+        unconditional `return False` and was never reached.
+        Pass `use_sympy=True` to additionally fall back on sympy-equivalence,
+        following the (Lewkowycz et al. 2022) methodology.
+        """
+        if x1 == x2:
+            # don't resort to sympy if we have full string match, post-normalization
+            return True
+        if not use_sympy:
+            return False
+        parsed_x2 = self.parse_tex(x2, time_limit=time_limit)
+        if parsed_x2 is None:
+            # if our reference fails to parse into a Sympy object,
+            # we forgo parsing + checking our generated answer.
+            # (`is None` rather than truthiness: a parsed zero is falsy but valid.)
+            return False
+        parsed_x1 = self.parse_tex(x1, time_limit=time_limit)
+        if parsed_x1 is None:
+            return False
+        return self.is_exp_equiv(parsed_x1, parsed_x2, time_limit=time_limit)

eval/python_executor.py ADDED Viewed

	@@ -0,0 +1,193 @@

+import os
+import io
+from contextlib import redirect_stdout
+import pickle
+import regex
+import copy
+from typing import Any, Dict, Optional
+import multiprocess
+from pebble import ProcessPool
+from concurrent.futures import TimeoutError
+from functools import partial
+import traceback
+from timeout_decorator import timeout
+class GenericRuntime:
+    """
+    Holds the globals dictionary in which generated code is executed and evaluated.
+    Subclasses may override GLOBAL_DICT / LOCAL_DICT / HEADERS; every entry of
+    HEADERS is executed once when an instance is created.
+    """
+    GLOBAL_DICT = {}
+    LOCAL_DICT = None
+    HEADERS = []
+    def __init__(self):
+        # shallow copies, so writes made by an instance do not land in the class-level dicts
+        self._global_vars = copy.copy(self.GLOBAL_DICT)
+        if self.LOCAL_DICT:
+            self._local_vars = copy.copy(self.LOCAL_DICT)
+        else:
+            self._local_vars = None
+        for header in self.HEADERS:
+            self.exec_code(header)
+    def exec_code(self, code_piece: str) -> None:
+        """
+        Executes `code_piece` in this runtime's globals.
+        Raises RuntimeError when the text matches one of the blocked call patterns.
+        NOTE(review): this runs arbitrary code through `exec`; the pattern check is
+        not a sandbox.
+        """
+        blocked_patterns = (r"(\s|^)?input\(", r"(\s|^)?os.system\(")
+        if any(regex.search(pattern, code_piece) for pattern in blocked_patterns):
+            raise RuntimeError()
+        exec(code_piece, self._global_vars)
+    def eval_code(self, expr: str) -> Any:
+        """Evaluates the expression `expr` in this runtime's globals and returns its value."""
+        return eval(expr, self._global_vars)
+    def inject(self, var_dict: Dict[str, Any]) -> None:
+        """Copies every key/value pair of `var_dict` into this runtime's globals."""
+        self._global_vars.update(var_dict.items())
+    @property
+    def answer(self):
+        """Value currently bound to the global name `answer` (KeyError if unset)."""
+        return self._global_vars["answer"]
+class PythonExecutor:
+    """
+    Runs batches of generated Python programs in a process pool and collects, for
+    each program, its result plus execution diagnostics.
+    The result is taken from (first match wins): captured stdout, a named global
+    (`get_answer_symbol`), an evaluated expression (`get_answer_expr`), or the value
+    of the program's last line.
+    """
+    def __init__(
+        self,
+        runtime: Optional[Any] = None,
+        get_answer_symbol: Optional[str] = None,
+        get_answer_expr: Optional[str] = None,
+        get_answer_from_stdout: bool = False,
+    ) -> None:
+        self.runtime = runtime if runtime else GenericRuntime()
+        self.answer_symbol = get_answer_symbol
+        self.answer_expr = get_answer_expr
+        self.get_answer_from_stdout = get_answer_from_stdout
+    def process_generation_to_code(self, gens):
+        """
+        Splits every generation in the batch `gens` (an iterable of strings) into a
+        list of code lines: `#` comment lines and one-line triple-quoted strings are
+        replaced by placeholders, and multi-line triple-quoted blocks are dropped
+        (their closing line is kept as an empty line).
+        """
+        batch_code = []
+        for g in gens:
+            multiline_comments = False
+            code = []
+            for line in g.split("\n"):
+                strip_line = line.strip()
+                if strip_line.startswith("#"):
+                    # keep the indentation, replace the comment text
+                    line = line.split("#", 1)[0] + "# comments"
+                elif (
+                    not multiline_comments
+                    and strip_line.startswith('"""')
+                    and strip_line.endswith('"""')
+                    and len(strip_line) >= 6
+                ):
+                    # opening and closing quotes on the same line
+                    line = line.split('"""', 1)[0] + '"""comments"""'
+                elif not multiline_comments and strip_line.startswith('"""'):
+                    multiline_comments = True
+                elif multiline_comments and strip_line.endswith('"""'):
+                    multiline_comments = False
+                    line = ""
+                if not multiline_comments:
+                    code.append(line)
+            batch_code.append(code)
+        return batch_code
+    @staticmethod
+    def execute(
+        code,
+        get_answer_from_stdout=None,
+        runtime=None,
+        answer_symbol=None,
+        answer_expr=None,
+        timeout_length=10,
+    ):
+        """
+        Executes one program (`code`, a list of lines) in `runtime` and returns
+        `(result, concise_exec_info, exec_info)`. On success both info strings are
+        empty; on any failure `result` is "" and the info strings carry the last
+        traceback line and the (path-scrubbed, for stdout mode) traceback.
+        """
+        try:
+            if get_answer_from_stdout:
+                program_io = io.StringIO()
+                with redirect_stdout(program_io):
+                    timeout(timeout_length)(runtime.exec_code)("\n".join(code))
+                program_io.seek(0)
+                result = "".join(program_io.readlines())  # [-1]
+            elif answer_symbol:
+                timeout(timeout_length)(runtime.exec_code)("\n".join(code))
+                result = runtime._global_vars[answer_symbol]
+            elif answer_expr:
+                timeout(timeout_length)(runtime.exec_code)("\n".join(code))
+                result = timeout(timeout_length)(runtime.eval_code)(answer_expr)
+            else:
+                # no explicit answer source: run all but the last line, evaluate the last one
+                timeout(timeout_length)(runtime.exec_code)("\n".join(code[:-1]))
+                result = timeout(timeout_length)(runtime.eval_code)(code[-1])
+            concise_exec_info = ""
+            exec_info = ""
+            str(result)
+            pickle.dumps(result)  # serialization check
+        except BaseException:
+            # Deliberately as broad as a bare `except:`: the executed program is
+            # arbitrary, so even SystemExit raised by it is reported as a failed run.
+            result = ""
+            concise_exec_info = traceback.format_exc().split("\n")[-2]
+            exec_info = traceback.format_exc()
+            if (
+                get_answer_from_stdout
+                and "exec(code_piece, self._global_vars)" in exec_info
+            ):
+                # keep only the part of the traceback that comes from the executed program
+                exec_info = exec_info.split("exec(code_piece, self._global_vars)")[
+                    -1
+                ].strip()
+                msg = []
+                for line in exec_info.split("\n"):
+                    patt = regex.search(
+                        r'(?P<start>.*)File "(?P<file>.*)", line (?P<lno>\d+), (?P<end>.*)',
+                        line,
+                    )
+                    if patt is not None:
+                        if "<module>" in patt.group("end"):
+                            continue
+                        fname = patt.group("file")
+                        if "site-packages" in fname:
+                            # shorten absolute library paths to start at site-packages
+                            fname = f"site-packages{fname.split('site-packages', 1)[1]}"
+                            line = f'{patt.group("start")}File "{fname}", {patt.group("end")}'
+                        else:
+                            line = f'{patt.group("start")}{patt.group("end")}'
+                    else:
+                        patt = regex.search(
+                            r"(?P<start>.*)(?P<file>/.*site-packages/.*\.py)(?P<end>.*)",
+                            line,
+                        )
+                        if patt is not None:
+                            line = f'{patt.group("start")}site-packages{patt.group("file").split("site-packages", 1)[1]}{patt.group("end")}'
+                    msg.append(line)
+                exec_info = "\n".join(msg)
+        return result, concise_exec_info, exec_info
+    def apply(self, code):
+        """Runs a single generation; shorthand for `batch_apply([code])[0]`."""
+        return self.batch_apply([code])[0]
+    def batch_apply(self, batch_code):
+        """
+        Runs every generation of `batch_code` through `execute` in a process pool.
+        Returns one `(result, metadata)` pair per generation, where `metadata` holds
+        the processed code lines and the execution diagnostics. A program that hits
+        the pool timeout is reported as ("", "Timeout Error", "Timeout Error").
+        Any other error raised while collecting results is printed and re-raised.
+        """
+        all_code_snippets = self.process_generation_to_code(batch_code)
+        all_exec_results = []
+        executor = partial(
+            self.execute,
+            get_answer_from_stdout=self.get_answer_from_stdout,
+            runtime=self.runtime,
+            answer_symbol=self.answer_symbol,
+            answer_expr=self.answer_expr,
+            timeout_length=10,
+        )
+        with ProcessPool(max_workers=multiprocess.cpu_count()) as pool:
+            iterator = pool.map(executor, all_code_snippets, timeout=10).result()
+            while True:
+                try:
+                    all_exec_results.append(next(iterator))
+                except StopIteration:
+                    break
+                except TimeoutError:
+                    all_exec_results.append(("", "Timeout Error", "Timeout Error"))
+                except Exception as error:
+                    # Previously `exit()`, which ended the whole process with status 0
+                    # and hid the failure from callers; surface the real error instead.
+                    print(error)
+                    raise
+        batch_results = []
+        for code, (result, concise_exec_info, exec_info) in zip(
+            all_code_snippets, all_exec_results
+        ):
+            metadata = {
+                "code": code,
+                "exec_result": result,
+                "concise_exec_info": concise_exec_info,
+                "exec_info": exec_info,
+            }
+            batch_results.append((result, metadata))
+        return batch_results

main.py ADDED Viewed

	@@ -0,0 +1,55 @@

+import os
+import argparse
+from huggingface_hub import hf_hub_download
+import json
+from build_cache import cache
+from compute_perp import Evaluator as PPLEvaluator
+from compute_sc import SCEvaluator
+from compute_rpc import RPCEvaluator
+# Hugging Face dataset repositories that hold the sampled reasoning paths
+REPOID = {
+    "MATH": "WNJXYK/MATH-Reasoning-Paths",
+    "MathOdyssey": "WNJXYK/MathOdyssey-Reasoning-Paths",
+    "AIME": "WNJXYK/AIME_1983_2024-Reasoning-Paths",
+    "OlympiadBench": "WNJXYK/OlympiadBench-Reasoning-Paths"
+}
+# Evaluator class used for each --method value
+EVALUATOR_MAP = {
+    "PPL": PPLEvaluator,
+    "SC": SCEvaluator,
+    "RPC": RPCEvaluator
+}
+parser = argparse.ArgumentParser()
+parser.add_argument("--dataset", type=str, choices=["MATH", "MathOdyssey", "AIME", "OlympiadBench"], default="MathOdyssey")
+parser.add_argument("--model", type=str, choices=["Deepseek-Math-RL-7B", "InternLM2-Math-Plus-1.8B", "InternLM2-Math-Plus-7B"], default="InternLM2-Math-Plus-7B")
+parser.add_argument("--K", type=int, default=128)
+parser.add_argument("--method", type=str, default="PPL", choices=["PPL", "SC", "RPC"])
+args = parser.parse_args()
+repo_id = REPOID[args.dataset]
+filename = args.model + ".json"
+# Download sampled reasoning paths from Hugging Face
+try:
+    file_path = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="dataset")
+    with open(file_path, 'r', encoding='utf-8') as f:
+        json_file = json.load(f)
+    print(f"Load sampled reasoning paths {filename} from {repo_id} successfully!")
+except Exception as e:
+    print(f"Failed to load sampled reasoning paths {filename} from {repo_id}: {e}")
+    # Nothing below can run without the data; stop here with a non-zero status
+    # instead of failing later with a NameError on `file_path` / `json_file`.
+    raise SystemExit(1)
+# Build cache for checking equality
+cache_path = file_path.replace(".json", ".cache.json")
+cache(json_file, cache_path)
+with open(cache_path, 'r', encoding='utf-8') as f:
+    cache_file = json.load(f)
+# Run!
+results = EVALUATOR_MAP[args.method]().solve(json_file=json_file, cache_file=cache_file, K=args.K)
+# Report results (appended, so repeated runs accumulate in results.txt)
+result_str = f"{args.method} {args.dataset} {args.model} {args.K} {results}"
+with open("results.txt", "a") as f:
+    f.write(result_str + "\n")
+print(result_str)

metrics.py ADDED Viewed

	@@ -0,0 +1,115 @@

+import numpy as np
+# Tolerance used by compute_maximum_metrics when counting answers tied with the maximum probability
+EPS = 1e-10
+# Clipping margin that keeps the values passed to np.log away from 0 and 1 in the negative log-likelihood terms
+NLLEPS = 1e-6
+def compute_maximum_metrics(predicts, n_bins=10):
+    """
+    Calibration metrics of the highest-probability answer of every problem.
+    `predicts[idx]` is the list of `(answer, probability, is_correct)` triples of
+    problem `idx`. Answers whose probability lies within EPS of the problem's
+    maximum are treated as tied and share that problem's vote equally.
+    Returns `(ece, brier_score, nll), (acc, cnf, siz)`; the second tuple holds the
+    per-bin mean accuracy, mean confidence and accumulated weight over `n_bins`
+    equal-width bins `(lower, upper]`.
+    """
+    acc, cnf, siz = np.zeros(n_bins), np.zeros(n_bins), np.zeros(n_bins)
+    brier_score = []
+    negative_ll = []
+    for candidates in predicts:
+        # Highest probability within this problem. NaN never compares greater,
+        # so NaN probabilities are ignored here and in the tie selection below.
+        max_prob = -1e6
+        for _, prob, _ in candidates:
+            if prob > max_prob:
+                max_prob = prob
+        # Select the tied answers with the SAME tolerance that determines their
+        # count. (Counting with `>= max_prob - EPS` but selecting with
+        # `>= max_prob` dropped near-ties after they had been counted, which
+        # under-weighted the vote depending on the order of the answers.)
+        tied = [(prob, flag) for _, prob, flag in candidates if prob >= max_prob - EPS]
+        # Compute the maximum accuracy for each problem as well as the ECE metric
+        vote_acc = 0
+        for prob, flag in tied:
+            if flag:
+                vote_acc += 1.0 / len(tied)
+            # Compute Expected Calibration Error
+            for cur in range(n_bins):
+                lower, upper = cur / n_bins, (cur + 1) / n_bins
+                if lower < max_prob <= upper:
+                    if flag:
+                        acc[cur] += 1.0 / len(tied)
+                    cnf[cur] += prob / len(tied)
+                    siz[cur] += 1.0 / len(tied)
+        # Compute Brier Score
+        brier_score.append((vote_acc - max_prob) ** 2)
+        # Compute Negative Log-Likelihood (both terms clipped away from 0 and 1)
+        clipped_max_prob = max(min(max_prob, 1 - NLLEPS), NLLEPS)
+        clipped_vote_acc = max(min(vote_acc, 1 - NLLEPS), NLLEPS)
+        negative_ll.append(
+            -np.log(clipped_max_prob) * clipped_vote_acc
+            - np.log(1 - clipped_max_prob) * (1 - clipped_vote_acc)
+        )
+    # Turn each metrics into values
+    ece = 0
+    for cur in range(n_bins):
+        if siz[cur] > 0:
+            acc[cur] = acc[cur] / siz[cur]
+            cnf[cur] = cnf[cur] / siz[cur]
+        ece += siz[cur] * np.abs(acc[cur] - cnf[cur])
+    ece = ece / sum(siz)
+    bs = np.mean(brier_score)
+    nll = np.mean(negative_ll)
+    return (ece, bs, nll), (acc, cnf, siz)
+def compute_average_metrics(predicts, n_bins=10):
+    """
+    Calibration metrics averaged over all answers of every problem.
+    `predicts[idx]` is the list of `(answer, probability, is_correct)` triples of
+    problem `idx`; each answer contributes with weight 1/m, m being the number of
+    answers of its problem.
+    Returns `(ece, brier_score, nll), (acc, cnf, siz)`; the second tuple holds the
+    per-bin mean accuracy, mean confidence and accumulated weight over `n_bins`
+    equal-width bins `(lower, upper]`.
+    """
+    acc = np.zeros(n_bins)
+    cnf = np.zeros(n_bins)
+    siz = np.zeros(n_bins)
+    brier_score, negative_ll = [], []
+    for candidates in predicts:
+        m = len(candidates)
+        problem_brier_score, problem_negative_ll = [], []
+        for _, prob, flag in candidates:
+            label = 1 if flag else 0
+            # Expected Calibration Error: add this answer to the bin containing `prob`
+            for cur in range(n_bins):
+                if not (cur / n_bins < prob <= (cur + 1) / n_bins):
+                    continue
+                if flag:
+                    acc[cur] += 1.0 / m
+                cnf[cur] += prob / m
+                siz[cur] += 1.0 / m
+            # Brier score of this answer
+            problem_brier_score.append((label - prob) ** 2)
+            # Negative log-likelihood of this answer (both terms clipped away from 0 and 1)
+            clipped_prob = max(min(prob, 1 - NLLEPS), NLLEPS)
+            clipped_label = max(min(label, 1 - NLLEPS), NLLEPS)
+            problem_negative_ll.append(
+                -np.log(clipped_prob) * clipped_label
+                - np.log(1 - clipped_prob) * (1 - clipped_label)
+            )
+        brier_score.append(np.mean(problem_brier_score))
+        negative_ll.append(np.mean(problem_negative_ll))
+    # Normalise the non-empty bins and accumulate the weighted |accuracy - confidence| gap
+    ece = 0
+    for cur in range(n_bins):
+        if siz[cur] > 0:
+            acc[cur] /= siz[cur]
+            cnf[cur] /= siz[cur]
+        ece += siz[cur] * np.abs(acc[cur] - cnf[cur])
+    ece = ece / sum(siz)
+    return (ece, np.mean(brier_score), np.mean(negative_ll)), (acc, cnf, siz)

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+antlr4-python3-runtime==4.13.2
+datasets==4.1.1
+Fraction==2.2.0
+huggingface-hub==0.35.3
+multiprocess==0.70.16
+numpy==2.0.2
+regex==2025.9.18
+scipy==1.13.1
+sympy==1.14.0
+tqdm==4.67.1