import gradio as gr
import json
import os

from huggingface_hub import hf_hub_download
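# Evaluators from this repo: PPL (internal probability), SC (self-consistency),
# and RPC (the paper's method bridging the two); see the result panels below.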
from compute_perp import prep_evaluator, numberic_compare, check_equal
from compute_sc import sc_evaluator
from compute_rpc import wpc_evaluator
import numpy as np


# Simple hello-world handler; not wired to any component below.
def greet(name):
    return "Hello " + name + "!!"
json_file = {"predict": [], "answer": [], "completion": [], "mean_logprob": [], "prompt": [], "accuracy": []}

demo = gr.Blocks()

with demo:
    paper_title = gr.HTML("""<div align='center'><h1>[NeurIPS 2025] A Theoretical Study on Bridging Internal Probability and Self-Consistency for LLM Reasoning</h1></div>""")
    paper_info = gr.HTML("""<div align="center"><h3><a href="https://arxiv.org/pdf/2502.00511">📄 [Paper]</a> • <a href="https://wnjxyk.github.io/RPC">🌐 [Project]</a> • <a href="#" onclick="document.getElementById('bibtex-popup').style.display='block';">📚 [BibTeX]</a></h3>
    <div id="bibtex-popup" style="display:none; position:fixed; top:50%; left:50%; transform:translate(-50%, -50%); background:white; padding:20px; border:1px solid #ccc; box-shadow:0 0 10px rgba(0,0,0,0.2); z-index:1000; max-width:80%; overflow:auto;">
    <pre style="white-space:pre-wrap; font-size:12px; text-align:left;">@inproceedings{zhou24theoretical,
    author = {Zhou, Zhi and Tan, Yuhao and Li, Zenan and Yao, Yuan and Guo, Lan-Zhe and Li, Yu-Feng and Ma, Xiaoxing},
    title = {A Theoretical Study on Bridging Internal Probability and Self-Consistency for LLM Reasoning},
    booktitle = {Advances in Neural Information Processing Systems},
    year = {2025},
    }</pre>
    <button onclick="document.getElementById('bibtex-popup').style.display='none';" style="margin-top:10px; padding:5px 10px;">Close</button>
    </div></div>""")
    with gr.Column():
        gr.Markdown("## 1. Experimental Settings")
        with gr.Row():
            dataset = gr.Dropdown(
                choices=["MATH", "MathOdyssey", "AIME", "OlympiadBench"],
                value="MathOdyssey",
                label="Dataset",
                interactive=True
            )
            model = gr.Dropdown(
                choices=["Deepseek-Math-RL-7B", "InternLM2-Math-Plus-1.8B", "InternLM2-Math-Plus-7B"],
                value="InternLM2-Math-Plus-7B",
                label="Model",
                interactive=True
            )
            k_value = gr.Dropdown(
                choices=[8, 16, 32, 64, 128],
                value=128,
                label="K (Number of Sampled Reasoning Paths)",
                interactive=True
            )
            seed = gr.Number(
                label="Random Seed",
                value=998244353,
                step=1,
                interactive=True
            )
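
        # Keep the K dropdown consistent with the dataset: MATH only exposes
        # K up to 64 here (presumably fewer paths were released for it), and the
        # current selection is preserved whenever it remains a valid choice.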
        def update_k_value(dataset_choice, current_k):
            if dataset_choice == "MATH":
                return gr.update(choices=[8, 16, 32, 64], value=min(64, current_k))
            else:
                return gr.update(choices=[8, 16, 32, 64, 128], value=current_k)

        dataset.change(fn=update_k_value, inputs=[dataset, k_value], outputs=k_value)

        load_btn = gr.Button("Load All Problems")
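
    # Section 2: problem selection. Hidden until reasoning paths have been loaded.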
    with gr.Column(visible=False) as content_column:
        gr.Markdown("## 2. Problem Selection")
        with gr.Group():
            data_info = gr.Textbox(label="Experiment Info", value="")
            problem_id = gr.Dropdown(
                choices=[1],
                value=1,
                label="Problem ID (problems that no method was likely to solve, as well as overly easy ones, were removed)",
                interactive=True
            )
            with gr.Row():
                problem_prompt = gr.Textbox(label="Problem Prompt", value="", scale=3)
                problem_answer = gr.Textbox(label="Problem Answer", value="", scale=1)

        def update_problem_info(problem_id):
            return gr.update(value=json_file['prompt'][problem_id-1], label=f"Problem#{problem_id} Prompt"), gr.update(value=json_file['answer'][problem_id-1], label=f"Problem#{problem_id} Answer")

        problem_id.change(fn=update_problem_info, inputs=problem_id, outputs=[problem_prompt, problem_answer])
        run_btn = gr.Button("Run Evaluation")
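
    # Section 3: results. One column per method, filled in by evaluate() below.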
    with gr.Column(visible=False) as result_column:
        gr.Markdown("## 3. Experiment Result")
        with gr.Row():
            with gr.Column():
                gr.Markdown("### PPL (Internal Probability)")
                ppl_result = gr.Markdown()
            with gr.Column():
                gr.Markdown("### SC (Self-Consistency)")
                sc_result = gr.Markdown(value="")
            with gr.Column():
                gr.Markdown("### RPC (Ours)")
                rpc_result = gr.Markdown(value="")
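
    # A problem is "available" when the mean accuracy of its sampled paths falls
    # in (0.3, 0.5): hard enough to be interesting, but not hopeless. Problem IDs
    # shown in the UI are 1-based, hence the +1 shift.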
    def get_available_problems():
        global json_file
        answer = np.array(json_file["accuracy"]).mean(axis=0)
        available_indices = np.where((answer > 0.3) & (answer < 0.5))[0]
        available_indices = available_indices + 1
        return available_indices.tolist()
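
    # Download the precomputed reasoning paths for the chosen dataset/model from
    # the Hugging Face Hub, cache them in the global json_file, and reveal the
    # problem-selection panel. Implemented as a generator so that intermediate
    # status messages can be shown on the load button's label.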
    def load(dataset, model, k_value, seed):
        try:
            repo_id = {
                "MATH": "WNJXYK/MATH-Reasoning-Paths",
                "MathOdyssey": "WNJXYK/MathOdyssey-Reasoning-Paths",
                "AIME": "WNJXYK/AIME_1983_2024-Reasoning-Paths",
                "OlympiadBench": "WNJXYK/OlympiadBench-Reasoning-Paths"
            }[dataset]
            filename = f"{model}.json"

            yield f"Downloading sampled reasoning paths from Hugging Face {repo_id}...", gr.update(visible=False), gr.update(), gr.update(), gr.update(visible=False)
            file_path = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="dataset")

            global json_file
            with open(file_path, 'r', encoding='utf-8') as f:
                json_file = json.load(f)
            clist = get_available_problems()
            os.remove(file_path)

            yield "Loading complete! You can now select a problem ID.", gr.update(visible=True), gr.update(value=f"Dataset: {dataset}\tModel: {model}\tK: {k_value}\tSeed: {seed}"), gr.update(choices=clist, value=clist[0]), gr.update(visible=False)

        except Exception as e:
            # Yield one value per output component so Gradio can render the error.
            yield f"Error: {str(e)}", gr.update(), gr.update(), gr.update(), gr.update()
    def res_to_str(correct, answers, topk=10):
        answers = sorted(answers, key=lambda x: x[1], reverse=True)
        response = "| # | Answer | Probability | Correct |\n|---|--------|------------|--------|\n"
        for i in range(min(len(answers), topk)):
            correct_mark = "✅" if answers[i][2] else "❌"
            wrapped_answer = answers[i][0] if len(answers[i][0]) <= 10 else answers[i][0][:10] + "..."
            response += f"| Top-{i+1} | {wrapped_answer} | {answers[i][1]:.2f} | {correct_mark} |\n"
        return response
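
    # Run all three evaluators (PPL, SC, RPC) on the selected problem's sampled
    # paths and fill the corresponding result panels.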
    def evaluate(problem_id):
        ppl_correct, ppl_answers = prep_evaluator(
            json_file["predict"][problem_id-1],
            json_file["completion"][problem_id-1],
            json_file["mean_logprob"][problem_id-1],
            json_file["answer"][problem_id-1],
            numberic_compare,
            check_equal
        )

        sc_correct, sc_answers = sc_evaluator(
            json_file["predict"][problem_id-1],
            json_file["completion"][problem_id-1],
            json_file["mean_logprob"][problem_id-1],
            json_file["answer"][problem_id-1],
            numberic_compare,
            check_equal
        )

        rpc_correct, rpc_answers = wpc_evaluator(
            json_file["predict"][problem_id-1],
            json_file["completion"][problem_id-1],
            json_file["mean_logprob"][problem_id-1],
            json_file["answer"][problem_id-1],
            numberic_compare,
            check_equal
        )

        return gr.update(visible=True), gr.update(value=res_to_str(ppl_correct, ppl_answers)), gr.update(value=res_to_str(sc_correct, sc_answers)), gr.update(value=res_to_str(rpc_correct, rpc_answers))

    load_btn.click(fn=load, inputs=[dataset, model, k_value, seed], outputs=[load_btn, content_column, data_info, problem_id, result_column], show_progress="inside")
    run_btn.click(fn=evaluate, inputs=problem_id, outputs=[result_column, ppl_result, sc_result, rpc_result])


if __name__ == "__main__":
    demo.launch()