import gradio as gr import json, os from huggingface_hub import hf_hub_download from compute_perp import prep_evaluator, numberic_compare, check_equal from compute_sc import sc_evaluator from compute_rpc import wpc_evaluator import numpy as np def greet(name): return "Hello " + name + "!!" json_file = {"predict": [], "answer": [], "completion": [], "mean_logprob": [], "prompt": []} demo = gr.Blocks() with demo: paper_title = gr.HTML("""

[NeurIPS 2025] A Theoretical Study on Bridging Internal Probability and Self-Consistency for LLM Reasoning

""") paper_info = gr.HTML("""

📄 [Paper]🌐 [Project]📚 [BibTeX]

""") with gr.Column(): gr.Markdown("## 1. Experimental Settings") with gr.Row(): dataset = gr.Dropdown( choices=["MATH", "MathOdyssey", "AIME", "OlympiadBench"], value="MathOdyssey", label="Dataset", interactive=True ) model = gr.Dropdown( choices=["Deepseek-Math-RL-7B", "InternLM2-Math-Plus-1.8B", "InternLM2-Math-Plus-7B"], value="InternLM2-Math-Plus-7B", label="Model", interactive=True ) k_value = gr.Dropdown( choices=[8, 16, 32, 64, 128], value=128, label="K (Number of Sampled Reasoning Paths)", interactive=True ) seed = gr.Number( label="Random Seed", value=998244353, step=1, interactive=True ) def update_k_value(dataset_choice): if dataset_choice == "MATH": return gr.update(choices=[8, 16, 32, 64], value=min(64, k_value.value)) else: return gr.update(choices=[8, 16, 32, 64, 128], value=k_value.value) dataset.change(fn=update_k_value, inputs=dataset, outputs=k_value) load_btn = gr.Button("Load All Problems") with gr.Column(visible=False) as content_column: gr.Markdown("## 2. Problem Selection") with gr.Group(): data_info = gr.Textbox(label="Experiment Info", value="") problem_id = gr.Dropdown( choices=[1], value=1, label="Problem ID (We removed (1) problems that were unlikely to be answered correctly using any of the methods; (2) easy problems)", interactive=True ) with gr.Row(): problem_prompt = gr.Textbox(label="Problem Prompt", value="", scale=3) problem_answer = gr.Textbox(label="Problem Answer", value="", scale=1) def update_problem_info(problem_id): return gr.update(value=json_file['prompt'][problem_id-1], label=f"Problem#{problem_id} Prompt"), gr.update(value=json_file['answer'][problem_id-1], label=f"Problem#{problem_id} Answer") problem_id.change(fn=update_problem_info, inputs=problem_id, outputs=[problem_prompt, problem_answer]) run_btn = gr.Button("Run Evaluation") with gr.Column(visible=False) as result_column: gr.Markdown("## 3. Experiment Result") with gr.Row(): with gr.Column(): gr.Markdown("### PPL (Internal Probability)") ppl_result = gr.Markdown() with gr.Column(): gr.Markdown("### SC (Self-Consistency)") sc_result = gr.Markdown(value="") with gr.Column(): gr.Markdown("### RPC (Ours)") rpc_result = gr.Markdown(value="") def get_available_problems(): global json_file answer = np.array(json_file["accuracy"]).mean(axis=0) # print(answer.shape) # Select indices where the answer is greater than 0.3 available_indices = np.where((answer > 0.3) & (answer < 0.5))[0] available_indices = available_indices + 1 # print(available_indices) return available_indices.tolist() def load(dataset, model, k_value, seed): try: repo_id = { "MATH": "WNJXYK/MATH-Reasoning-Paths", "MathOdyssey": "WNJXYK/MathOdyssey-Reasoning-Paths", "AIME": "WNJXYK/AIME_1983_2024-Reasoning-Paths", "OlympiadBench": "WNJXYK/OlympiadBench-Reasoning-Paths" }[dataset] filename = f"{model}.json" yield f"Downloading sampled reasoning paths from Hugging Face {repo_id}...", gr.update(visible=False), gr.update(), gr.update(), gr.update(visible=False) file_path = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="dataset") global json_file with open(file_path, 'r', encoding='utf-8') as f: json_file = json.load(f) clist = get_available_problems() os.remove(file_path) yield "Loading complete! You can now select a problem ID.", gr.update(visible=True), gr.update(value=f"Dataset: {dataset}\tModel: {model}\tK: {k_value}\tSeed: {seed}"), gr.update(choices=clist, value=clist[0]), gr.update(visible=False) except Exception as e: yield f"Error: {str(e)}" def res_to_str(correct, answers, topk=10): answers = sorted(answers, key=lambda x: x[1], reverse=True) response = "| # | Answer | Probability | Correct |\n|---|--------|------------|--------|\n" for i in range(min(len(answers), topk)): correct_mark = "✅" if answers[i][2] else "❌" wrapped_answer = answers[i][0] if len(answers[i][0]) <= 10 else answers[i][0][:10] + "..." response += f"| Top-{i+1} | {wrapped_answer} | {answers[i][1]:.2f} | {correct_mark} |\n" return response def evaluate(problem_id): ppl_correct, ppl_answers = prep_evaluator( json_file["predict"][problem_id-1], json_file["completion"][problem_id-1], json_file["mean_logprob"][problem_id-1], json_file["answer"][problem_id-1], numberic_compare, check_equal ) sc_correct, sc_answers = sc_evaluator( json_file["predict"][problem_id-1], json_file["completion"][problem_id-1], json_file["mean_logprob"][problem_id-1], json_file["answer"][problem_id-1], numberic_compare, check_equal ) rpc_correct, rpc_answers = wpc_evaluator( json_file["predict"][problem_id-1], json_file["completion"][problem_id-1], json_file["mean_logprob"][problem_id-1], json_file["answer"][problem_id-1], numberic_compare, check_equal ) return gr.update(visible=True), gr.update(value=res_to_str(ppl_correct, ppl_answers)), gr.update(value=res_to_str(sc_correct, sc_answers)), gr.update(value=res_to_str(rpc_correct, rpc_answers)) load_btn.click(fn=load, inputs=[dataset, model, k_value, seed],outputs=[load_btn, content_column, data_info, problem_id, result_column], show_progress="inside") run_btn.click(fn=evaluate, inputs=problem_id, outputs=[result_column, ppl_result, sc_result, rpc_result]) if __name__ == "__main__": demo.launch()