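"""Gradio demo for "A Theoretical Study on Bridging Internal Probability and
Self-Consistency for LLM Reasoning" (NeurIPS 2025).

Downloads pre-sampled reasoning paths from the Hugging Face Hub and compares
three answer-aggregation strategies on a selected problem: internal
probability (PPL), self-consistency (SC), and the paper's RPC method.
"""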
import gradio as gr
import json, os
from huggingface_hub import hf_hub_download
from compute_perp import prep_evaluator, numberic_compare, check_equal
from compute_sc import sc_evaluator
from compute_rpc import wpc_evaluator
import numpy as np
# Unused placeholder; never wired to the UI.
def greet(name):
    return "Hello " + name + "!!"

# Holds the reasoning paths for the currently loaded dataset/model;
# replaced wholesale by `load` below.
json_file = {"predict": [], "answer": [], "completion": [], "mean_logprob": [], "prompt": [], "accuracy": []}
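# Assumed layout of the downloaded JSON that replaces this dict (inferred
# from how the fields are indexed below, not from the dataset card):
#   prompt[i]        - problem statement for problem i
#   answer[i]        - ground-truth answer for problem i
#   predict[i]       - K extracted predictions, one per sampled path
#   completion[i]    - K sampled reasoning paths
#   mean_logprob[i]  - K per-path mean log-probabilities
#   accuracy         - per-path correctness matrix, averaged over axis 0
#                      in get_available_problems() to score each problem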
demo = gr.Blocks()
with demo:
    paper_title = gr.HTML("""<div align='center'><h1>[NeurIPS 2025] A Theoretical Study on Bridging Internal Probability and Self-Consistency for LLM Reasoning</h1></div>""")
    paper_info = gr.HTML("""<div align="center"><h3><a href="https://arxiv.org/pdf/2502.00511">📄 [Paper]</a> • <a href="https://wnjxyk.github.io/RPC">🌐 [Project]</a> • <a href="#" onclick="document.getElementById('bibtex-popup').style.display='block';">📚 [BibTeX]</a></h3>
<div id="bibtex-popup" style="display:none; position:fixed; top:50%; left:50%; transform:translate(-50%, -50%); background:white; padding:20px; border:1px solid #ccc; box-shadow:0 0 10px rgba(0,0,0,0.2); z-index:1000; max-width:80%; overflow:auto;">
<pre style="white-space:pre-wrap; font-size:12px; text-align:left;">@inproceedings{zhou24theoretical,
author = {Zhou, Zhi and Tan, Yuhao and Li, Zenan and Yao, Yuan and Guo, Lan-Zhe and Li, Yu-Feng and Ma, Xiaoxing},
title = {A Theoretical Study on Bridging Internal Probability and Self-Consistency for LLM Reasoning},
booktitle = {Advances in Neural Information Processing Systems},
year = {2025},
}</pre>
<button onclick="document.getElementById('bibtex-popup').style.display='none';" style="margin-top:10px; padding:5px 10px;">Close</button>
</div></div>""")
    with gr.Column():
        gr.Markdown("## 1. Experimental Settings")
        with gr.Row():
            dataset = gr.Dropdown(
                choices=["MATH", "MathOdyssey", "AIME", "OlympiadBench"],
                value="MathOdyssey",
                label="Dataset",
                interactive=True
            )
            model = gr.Dropdown(
                choices=["Deepseek-Math-RL-7B", "InternLM2-Math-Plus-1.8B", "InternLM2-Math-Plus-7B"],
                value="InternLM2-Math-Plus-7B",
                label="Model",
                interactive=True
            )
            k_value = gr.Dropdown(
                choices=[8, 16, 32, 64, 128],
                value=128,
                label="K (Number of Sampled Reasoning Paths)",
                interactive=True
            )
            seed = gr.Number(
                label="Random Seed",
                value=998244353,
                step=1,
                interactive=True
            )
        def update_k_value(dataset_choice, current_k):
            # MATH offers at most 64 sampled paths per problem, so cap K there.
            # The current selection is passed in as an event input; reading
            # `k_value.value` inside the callback would only see the static
            # default, not the live dropdown state.
            if dataset_choice == "MATH":
                return gr.update(choices=[8, 16, 32, 64], value=min(64, current_k))
            return gr.update(choices=[8, 16, 32, 64, 128], value=current_k)

        dataset.change(fn=update_k_value, inputs=[dataset, k_value], outputs=k_value)
        load_btn = gr.Button("Load All Problems")
    with gr.Column(visible=False) as content_column:
        gr.Markdown("## 2. Problem Selection")
        with gr.Group():
            data_info = gr.Textbox(label="Experiment Info", value="")
            problem_id = gr.Dropdown(
                choices=[1],
                value=1,
                label="Problem ID (We removed (1) problems that none of the methods is likely to answer correctly and (2) problems that are too easy)",
                interactive=True
            )
            with gr.Row():
                problem_prompt = gr.Textbox(label="Problem Prompt", value="", scale=3)
                problem_answer = gr.Textbox(label="Problem Answer", value="", scale=1)

        def update_problem_info(problem_id):
            # Problem IDs are 1-based; the JSON lists are 0-based.
            return (
                gr.update(value=json_file['prompt'][problem_id - 1], label=f"Problem#{problem_id} Prompt"),
                gr.update(value=json_file['answer'][problem_id - 1], label=f"Problem#{problem_id} Answer"),
            )

        problem_id.change(fn=update_problem_info, inputs=problem_id, outputs=[problem_prompt, problem_answer])
        run_btn = gr.Button("Run Evaluation")
    with gr.Column(visible=False) as result_column:
        gr.Markdown("## 3. Experiment Result")
        with gr.Row():
            with gr.Column():
                gr.Markdown("### PPL (Internal Probability)")
                ppl_result = gr.Markdown(value="")
            with gr.Column():
                gr.Markdown("### SC (Self-Consistency)")
                sc_result = gr.Markdown(value="")
            with gr.Column():
                gr.Markdown("### RPC (Ours)")
                rpc_result = gr.Markdown(value="")
    def get_available_problems():
        global json_file
        # Mean accuracy per problem, averaged over the sampled reasoning paths.
        answer = np.array(json_file["accuracy"]).mean(axis=0)
        # Keep problems of medium difficulty: accuracy strictly between 0.3 and 0.5.
        available_indices = np.where((answer > 0.3) & (answer < 0.5))[0]
        # Convert 0-based indices to 1-based problem IDs.
        available_indices = available_indices + 1
        return available_indices.tolist()
    def load(dataset, model, k_value, seed):
        global json_file
        try:
            repo_id = {
                "MATH": "WNJXYK/MATH-Reasoning-Paths",
                "MathOdyssey": "WNJXYK/MathOdyssey-Reasoning-Paths",
                "AIME": "WNJXYK/AIME_1983_2024-Reasoning-Paths",
                "OlympiadBench": "WNJXYK/OlympiadBench-Reasoning-Paths"
            }[dataset]
            filename = f"{model}.json"
            # Each yield updates, in order:
            # [load_btn, content_column, data_info, problem_id, result_column].
            yield f"Downloading sampled reasoning paths from Hugging Face {repo_id}...", gr.update(visible=False), gr.update(), gr.update(), gr.update(visible=False)
            file_path = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="dataset")
            with open(file_path, 'r', encoding='utf-8') as f:
                json_file = json.load(f)
            clist = get_available_problems()
            os.remove(file_path)
            yield "Loading complete! You can now select a problem ID.", gr.update(visible=True), gr.update(value=f"Dataset: {dataset}\tModel: {model}\tK: {k_value}\tSeed: {seed}"), gr.update(choices=clist, value=clist[0]), gr.update(visible=False)
        except Exception as e:
            # Yield a full tuple so the value count matches the event's outputs.
            yield f"Error: {str(e)}", gr.update(), gr.update(), gr.update(), gr.update()
    def res_to_str(correct, answers, topk=10):
        # Render the top-k candidate answers as a Markdown table.
        # `answers` is a list of (answer_text, probability, is_correct) triples;
        # `correct` is currently unused, since correctness is shown per answer.
        answers = sorted(answers, key=lambda x: x[1], reverse=True)
        response = "| # | Answer | Probability | Correct |\n|---|--------|------------|--------|\n"
        for i in range(min(len(answers), topk)):
            correct_mark = "✅" if answers[i][2] else "❌"
            # Truncate long answers so the table stays readable.
            wrapped_answer = answers[i][0] if len(answers[i][0]) <= 10 else answers[i][0][:10] + "..."
            response += f"| Top-{i+1} | {wrapped_answer} | {answers[i][1]:.2f} | {correct_mark} |\n"
        return response
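    # For example (hypothetical values),
    # res_to_str(True, [("42", 0.61, True), ("7", 0.22, False)]) renders:
    #   | Top-1 | 42 | 0.61 | ✅ |
    #   | Top-2 | 7 | 0.22 | ❌ |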
    def evaluate(problem_id):
        # Run all three aggregation strategies on the same K sampled paths.
        args = (
            json_file["predict"][problem_id - 1],
            json_file["completion"][problem_id - 1],
            json_file["mean_logprob"][problem_id - 1],
            json_file["answer"][problem_id - 1],
            numberic_compare,
            check_equal,
        )
        ppl_correct, ppl_answers = prep_evaluator(*args)  # internal probability (PPL)
        sc_correct, sc_answers = sc_evaluator(*args)      # self-consistency (SC)
        rpc_correct, rpc_answers = wpc_evaluator(*args)   # RPC (ours)
        return gr.update(visible=True), gr.update(value=res_to_str(ppl_correct, ppl_answers)), gr.update(value=res_to_str(sc_correct, sc_answers)), gr.update(value=res_to_str(rpc_correct, rpc_answers))
    load_btn.click(fn=load, inputs=[dataset, model, k_value, seed], outputs=[load_btn, content_column, data_info, problem_id, result_column], show_progress="inside")
    run_btn.click(fn=evaluate, inputs=problem_id, outputs=[result_column, ppl_result, sc_result, rpc_result])
if __name__ == "__main__":
    demo.launch()