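"""Gradio demo for "A Theoretical Study on Bridging Internal Probability and
Self-Consistency for LLM Reasoning" (NeurIPS 2025).

The app downloads pre-sampled reasoning paths from the Hugging Face Hub for a chosen
dataset and model, lets the user pick a problem, and compares three answer-aggregation
strategies side by side: PPL (internal probability), SC (self-consistency), and RPC
(the paper's method).

Running it locally requires gradio, huggingface_hub and numpy, plus the companion
modules compute_perp.py, compute_sc.py and compute_rpc.py from the project repository
on the import path (then simply `python app.py`).
"""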
import gradio as gr
import json, os
from huggingface_hub import hf_hub_download
from compute_perp import prep_evaluator, numberic_compare, check_equal
from compute_sc import sc_evaluator
from compute_rpc import wpc_evaluator
import numpy as np

def greet(name):
    return "Hello " + name + "!!"

json_file = {"predict": [], "answer": [], "completion": [], "mean_logprob": [], "prompt": []}

demo = gr.Blocks()
with demo:
    paper_title = gr.HTML("""<div align='center'><h1>[NeurIPS 2025] A Theoretical Study on Bridging Internal Probability and Self-Consistency for LLM Reasoning</h1></div>""")
    paper_info = gr.HTML("""<div align="center"><h3><a href="https://arxiv.org/pdf/2502.00511">📄 [Paper]</a> • <a href="https://wnjxyk.github.io/RPC">🌐 [Project]</a> • <a href="#" onclick="document.getElementById('bibtex-popup').style.display='block';">📚 [BibTeX]</a></h3>
    <div id="bibtex-popup" style="display:none; position:fixed; top:50%; left:50%; transform:translate(-50%, -50%); background:white; padding:20px; border:1px solid #ccc; box-shadow:0 0 10px rgba(0,0,0,0.2); z-index:1000; max-width:80%; overflow:auto;">
        <pre style="white-space:pre-wrap; font-size:12px; text-align:left;">@inproceedings{zhou24theoretical,
    author    = {Zhou, Zhi and Tan, Yuhao and Li, Zenan and Yao, Yuan and Guo, Lan-Zhe and Li, Yu-Feng and Ma, Xiaoxing},
    title     = {A Theoretical Study on Bridging Internal Probability and Self-Consistency for LLM Reasoning},
    booktitle = {Advances in Neural Information Processing Systems},
    year      = {2025},
}</pre>
        <button onclick="document.getElementById('bibtex-popup').style.display='none';" style="margin-top:10px; padding:5px 10px;">Close</button>
    </div></div>""")
    
    with gr.Column():
        gr.Markdown("## 1. Experimental Settings")
        with gr.Row():    
            dataset = gr.Dropdown(
                choices=["MATH", "MathOdyssey", "AIME", "OlympiadBench"],
                value="MathOdyssey",
                label="Dataset",
                interactive=True
            )
            model = gr.Dropdown(
                choices=["Deepseek-Math-RL-7B", "InternLM2-Math-Plus-1.8B", "InternLM2-Math-Plus-7B"],
                value="InternLM2-Math-Plus-7B",
                label="Model",
                interactive=True
            )
            k_value = gr.Dropdown(
                choices=[8, 16, 32, 64, 128],
                value=128,
                label="K (Number of Sampled Reasoning Paths)",
                interactive=True
            )
            seed = gr.Number(
                label="Random Seed",
                value=998244353,
                step=1,
                interactive=True
            )
            def update_k_value(dataset_choice, current_k):
                # In this demo only K up to 64 is offered for MATH; the other datasets go up to 128.
                if dataset_choice == "MATH":
                    return gr.update(choices=[8, 16, 32, 64], value=min(64, current_k))
                else:
                    return gr.update(choices=[8, 16, 32, 64, 128], value=current_k)
            # Pass the current K selection as an input; reading k_value.value inside the
            # callback would only return the component's initial default.
            dataset.change(fn=update_k_value, inputs=[dataset, k_value], outputs=k_value)
        load_btn = gr.Button("Load All Problems")

    with gr.Column(visible=False) as content_column:
        gr.Markdown("## 2. Problem Selection")
        with gr.Group():
            data_info = gr.Textbox(label="Experiment Info", value="")
            problem_id = gr.Dropdown(
                choices=[1],
                value=1,
                label="Problem ID (We removed (1) problems that were unlikely to be answered correctly using any of the methods; (2) easy problems)",
                interactive=True
            )
            with gr.Row():
                problem_prompt = gr.Textbox(label="Problem Prompt", value="", scale=3)
                problem_answer = gr.Textbox(label="Problem Answer", value="", scale=1)
            def update_problem_info(problem_id):
                return gr.update(value=json_file['prompt'][problem_id-1], label=f"Problem#{problem_id} Prompt"), gr.update(value=json_file['answer'][problem_id-1], label=f"Problem#{problem_id} Answer")
            problem_id.change(fn=update_problem_info, inputs=problem_id, outputs=[problem_prompt, problem_answer])
        run_btn = gr.Button("Run Evaluation")

    with gr.Column(visible=False) as result_column:
        gr.Markdown("## 3. Experiment Result")
        with gr.Row():
            with gr.Column():
                gr.Markdown("### PPL (Internal Probability)")
                ppl_result = gr.Markdown()
            with gr.Column():
                gr.Markdown("### SC (Self-Consistency)")
                sc_result = gr.Markdown(value="")
            with gr.Column():
                gr.Markdown("### RPC (Ours)")
                rpc_result = gr.Markdown(value="")

    def get_available_problems():
        global json_file
        # Per-problem accuracy averaged over the sampled reasoning paths.
        accuracy = np.array(json_file["accuracy"]).mean(axis=0)
        # Keep problems whose average accuracy lies in (0.3, 0.5): hard enough that the
        # aggregation strategy matters, but not hopeless for every method.
        available_indices = np.where((accuracy > 0.3) & (accuracy < 0.5))[0]
        # Convert 0-based indices to the 1-based problem IDs used in the UI.
        return (available_indices + 1).tolist()


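    # Download the pre-sampled reasoning paths for the chosen dataset/model from the
    # Hugging Face Hub, cache them in json_file, and reveal the problem-selection UI.
    # Written as a generator so that each yield streams a status update onto the
    # "Load All Problems" button label.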
    def load(dataset, model, k_value, seed):
        try:
            repo_id = {
                "MATH": "WNJXYK/MATH-Reasoning-Paths",
                "MathOdyssey": "WNJXYK/MathOdyssey-Reasoning-Paths",
                "AIME": "WNJXYK/AIME_1983_2024-Reasoning-Paths",
                "OlympiadBench": "WNJXYK/OlympiadBench-Reasoning-Paths"
            }[dataset]
            filename = f"{model}.json"
            
            yield f"Downloading sampled reasoning paths from Hugging Face {repo_id}...", gr.update(visible=False), gr.update(), gr.update(), gr.update(visible=False)
            file_path = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="dataset")
            
            global json_file
            with open(file_path, 'r', encoding='utf-8') as f:
                json_file = json.load(f)
            clist = get_available_problems()
            os.remove(file_path)

            yield "Loading complete! You can now select a problem ID.", gr.update(visible=True), gr.update(value=f"Dataset: {dataset}\tModel: {model}\tK: {k_value}\tSeed: {seed}"), gr.update(choices=clist, value=clist[0]), gr.update(visible=False)
            
        except Exception as e:
            # Report the failure on the button label and leave the other outputs unchanged.
            yield f"Error: {str(e)}", gr.update(visible=False), gr.update(), gr.update(), gr.update(visible=False)
    
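    # Render one method's aggregated answers as a Markdown table, sorted by probability
    # and truncated to the top-k rows; long answer strings are shortened for display.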
    def res_to_str(correct, answers, topk=10):
        answers = sorted(answers, key=lambda x: x[1], reverse=True)
        response = "| # | Answer | Probability | Correct |\n|---|--------|------------|--------|\n"
        for i in range(min(len(answers), topk)):
            correct_mark = "✅" if answers[i][2] else "❌"
            wrapped_answer = answers[i][0] if len(answers[i][0]) <= 10 else answers[i][0][:10] + "..."
            response += f"| Top-{i+1} | {wrapped_answer} | {answers[i][1]:.2f} | {correct_mark} |\n"
        return response

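    # Run the three aggregation strategies (PPL, SC, RPC) on the sampled reasoning paths
    # of the selected problem and render each result as a Markdown table.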
    def evaluate(problem_id):
        ppl_correct, ppl_answers = prep_evaluator(
            json_file["predict"][problem_id-1],
            json_file["completion"][problem_id-1],
            json_file["mean_logprob"][problem_id-1],
            json_file["answer"][problem_id-1],
            numberic_compare,
            check_equal
        )

        sc_correct, sc_answers = sc_evaluator(
            json_file["predict"][problem_id-1],
            json_file["completion"][problem_id-1],
            json_file["mean_logprob"][problem_id-1],
            json_file["answer"][problem_id-1],
            numberic_compare,
            check_equal
        )

        rpc_correct, rpc_answers = wpc_evaluator(
            json_file["predict"][problem_id-1],
            json_file["completion"][problem_id-1],
            json_file["mean_logprob"][problem_id-1],
            json_file["answer"][problem_id-1],
            numberic_compare,
            check_equal
        )

        return gr.update(visible=True), gr.update(value=res_to_str(ppl_correct, ppl_answers)), gr.update(value=res_to_str(sc_correct, sc_answers)), gr.update(value=res_to_str(rpc_correct, rpc_answers))

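    # Wire up the buttons: "Load All Problems" streams progress onto its own label and then
    # reveals the problem-selection column; "Run Evaluation" fills in the result column.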
    load_btn.click(fn=load, inputs=[dataset, model, k_value, seed], outputs=[load_btn, content_column, data_info, problem_id, result_column], show_progress="inside")
    run_btn.click(fn=evaluate, inputs=problem_id, outputs=[result_column, ppl_result, sc_result, rpc_result])

if __name__ == "__main__":
    demo.launch()