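"""Gradio demo for "A Theoretical Study on Bridging Internal Probability and Self-Consistency
for LLM Reasoning" (NeurIPS 2025).

The app downloads pre-sampled reasoning paths from the Hugging Face Hub and, for a selected
problem, compares three answer-selection methods: PPL (internal probability), SC
(self-consistency), and RPC (the method proposed in the paper).
"""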
import gradio as gr
import json, os
from huggingface_hub import hf_hub_download
from compute_perp import prep_evaluator, numberic_compare, check_equal
from compute_sc import sc_evaluator
from compute_rpc import wpc_evaluator
import numpy as np
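# compute_perp, compute_sc, and compute_rpc are local modules bundled with this Space; they
# provide the PPL, SC, and RPC answer-selection evaluators used below.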
# Placeholder structure; replaced by the per-model reasoning-path JSON downloaded in load().
json_file = {"predict": [], "answer": [], "completion": [], "mean_logprob": [], "prompt": [], "accuracy": []}
demo = gr.Blocks()
with demo:
    paper_title = gr.HTML("""<div align='center'><h1>[NeurIPS 2025] A Theoretical Study on Bridging Internal Probability and Self-Consistency for LLM Reasoning</h1></div>""")
    paper_info = gr.HTML("""<div align="center"><h3><a href="https://arxiv.org/pdf/2502.00511">📄 [Paper]</a> • <a href="https://wnjxyk.github.io/RPC">🌐 [Project]</a> • <a href="#" onclick="document.getElementById('bibtex-popup').style.display='block';">📚 [BibTeX]</a></h3>
<div id="bibtex-popup" style="display:none; position:fixed; top:50%; left:50%; transform:translate(-50%, -50%); background:white; padding:20px; border:1px solid #ccc; box-shadow:0 0 10px rgba(0,0,0,0.2); z-index:1000; max-width:80%; overflow:auto;">
<pre style="white-space:pre-wrap; font-size:12px; text-align:left;">@inproceedings{zhou24theoretical,
author = {Zhou, Zhi and Tan, Yuhao and Li, Zenan and Yao, Yuan and Guo, Lan-Zhe and Li, Yu-Feng and Ma, Xiaoxing},
title = {A Theoretical Study on Bridging Internal Probability and Self-Consistency for LLM Reasoning},
booktitle = {Advances in Neural Information Processing Systems},
year = {2025},
}</pre>
<button onclick="document.getElementById('bibtex-popup').style.display='none';" style="margin-top:10px; padding:5px 10px;">Close</button>
</div></div>""")
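    # Section 1: experimental settings (dataset, model, number of sampled paths K, and random seed).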
    with gr.Column():
        gr.Markdown("## 1. Experimental Settings")
        with gr.Row():
            dataset = gr.Dropdown(
                choices=["MATH", "MathOdyssey", "AIME", "OlympiadBench"],
                value="MathOdyssey",
                label="Dataset",
                interactive=True
            )
            model = gr.Dropdown(
                choices=["Deepseek-Math-RL-7B", "InternLM2-Math-Plus-1.8B", "InternLM2-Math-Plus-7B"],
                value="InternLM2-Math-Plus-7B",
                label="Model",
                interactive=True
            )
            k_value = gr.Dropdown(
                choices=[8, 16, 32, 64, 128],
                value=128,
                label="K (Number of Sampled Reasoning Paths)",
                interactive=True
            )
            seed = gr.Number(
                label="Random Seed",
                value=998244353,
                step=1,
                interactive=True
            )
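        # The MATH reasoning-path release appears to include at most 64 sampled paths per problem,
        # so the K choices are clamped when MATH is selected.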
        def update_k_value(dataset_choice, current_k):
            # Read the current K from the component input rather than k_value.value,
            # which only reflects the initial default.
            if dataset_choice == "MATH":
                return gr.update(choices=[8, 16, 32, 64], value=min(64, current_k))
            return gr.update(choices=[8, 16, 32, 64, 128], value=current_k)
        dataset.change(fn=update_k_value, inputs=[dataset, k_value], outputs=k_value)
        load_btn = gr.Button("Load All Problems")
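    # Section 2: problem selection; hidden until the reasoning paths have been loaded.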
    with gr.Column(visible=False) as content_column:
        gr.Markdown("## 2. Problem Selection")
        with gr.Group():
            data_info = gr.Textbox(label="Experiment Info", value="")
            problem_id = gr.Dropdown(
                choices=[1],
                value=1,
                label="Problem ID (problems unlikely to be answered correctly by any method, as well as overly easy problems, have been removed)",
                interactive=True
            )
        with gr.Row():
            problem_prompt = gr.Textbox(label="Problem Prompt", value="", scale=3)
            problem_answer = gr.Textbox(label="Problem Answer", value="", scale=1)
        def update_problem_info(problem_id):
            return (
                gr.update(value=json_file["prompt"][problem_id - 1], label=f"Problem#{problem_id} Prompt"),
                gr.update(value=json_file["answer"][problem_id - 1], label=f"Problem#{problem_id} Answer"),
            )
        problem_id.change(fn=update_problem_info, inputs=problem_id, outputs=[problem_prompt, problem_answer])
        run_btn = gr.Button("Run Evaluation")
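    # Section 3: result panels for the three answer-selection methods; shown after an evaluation runs.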
    with gr.Column(visible=False) as result_column:
        gr.Markdown("## 3. Experiment Result")
        with gr.Row():
            with gr.Column():
                gr.Markdown("### PPL (Internal Probability)")
                ppl_result = gr.Markdown(value="")
            with gr.Column():
                gr.Markdown("### SC (Self-Consistency)")
                sc_result = gr.Markdown(value="")
            with gr.Column():
                gr.Markdown("### RPC (Ours)")
                rpc_result = gr.Markdown(value="")
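    # The dataset JSON's "accuracy" field presumably stores per-path correctness indexed as
    # (paths, problems); its per-problem mean is used to filter the selectable problems.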
    def get_available_problems():
        global json_file
        # Mean accuracy of the sampled paths for each problem.
        mean_acc = np.array(json_file["accuracy"]).mean(axis=0)
        # Keep problems whose mean accuracy is strictly between 0.3 and 0.5,
        # converted to 1-based problem IDs.
        available_indices = np.where((mean_acc > 0.3) & (mean_acc < 0.5))[0] + 1
        return available_indices.tolist()
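    # Download the pre-sampled reasoning paths for the chosen dataset/model from the Hugging Face
    # Hub. Implemented as a generator so that status messages stream into the load button's label.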
    def load(dataset, model, k_value, seed):
        global json_file
        try:
            repo_id = {
                "MATH": "WNJXYK/MATH-Reasoning-Paths",
                "MathOdyssey": "WNJXYK/MathOdyssey-Reasoning-Paths",
                "AIME": "WNJXYK/AIME_1983_2024-Reasoning-Paths",
                "OlympiadBench": "WNJXYK/OlympiadBench-Reasoning-Paths"
            }[dataset]
            filename = f"{model}.json"
            yield f"Downloading sampled reasoning paths from Hugging Face {repo_id}...", gr.update(visible=False), gr.update(), gr.update(), gr.update(visible=False)
            file_path = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="dataset")
            with open(file_path, 'r', encoding='utf-8') as f:
                json_file = json.load(f)
            clist = get_available_problems()
            os.remove(file_path)
            yield "Loading complete! You can now select a problem ID.", gr.update(visible=True), gr.update(value=f"Dataset: {dataset}\tModel: {model}\tK: {k_value}\tSeed: {seed}"), gr.update(choices=clist, value=clist[0]), gr.update(visible=False)
        except Exception as e:
            # Yield a full set of outputs so the error message is shown without breaking the UI.
            yield f"Error: {str(e)}", gr.update(visible=False), gr.update(), gr.update(), gr.update(visible=False)
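    # Render an evaluator's (answer, probability, is_correct) triples as a Markdown table of the
    # top-`topk` candidate answers; long answers are truncated for display.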
    def res_to_str(correct, answers, topk=10):
        answers = sorted(answers, key=lambda x: x[1], reverse=True)
        response = "| # | Answer | Probability | Correct |\n|---|--------|------------|--------|\n"
        for i in range(min(len(answers), topk)):
            correct_mark = "✅" if answers[i][2] else "❌"
            wrapped_answer = answers[i][0] if len(answers[i][0]) <= 10 else answers[i][0][:10] + "..."
            response += f"| Top-{i+1} | {wrapped_answer} | {answers[i][1]:.2f} | {correct_mark} |\n"
        return response
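    # Run the PPL, SC, and RPC evaluators on the selected problem's sampled reasoning paths and
    # return updates for the three result panels.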
    def evaluate(problem_id):
        idx = problem_id - 1
        args = (
            json_file["predict"][idx],
            json_file["completion"][idx],
            json_file["mean_logprob"][idx],
            json_file["answer"][idx],
            numberic_compare,
            check_equal,
        )
        ppl_correct, ppl_answers = prep_evaluator(*args)
        sc_correct, sc_answers = sc_evaluator(*args)
        rpc_correct, rpc_answers = wpc_evaluator(*args)
        return (
            gr.update(visible=True),
            gr.update(value=res_to_str(ppl_correct, ppl_answers)),
            gr.update(value=res_to_str(sc_correct, sc_answers)),
            gr.update(value=res_to_str(rpc_correct, rpc_answers)),
        )
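    # Wire the buttons: loading reveals the problem-selection column, and running an evaluation
    # reveals and fills the result column.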
    load_btn.click(fn=load, inputs=[dataset, model, k_value, seed], outputs=[load_btn, content_column, data_info, problem_id, result_column], show_progress="inside")
    run_btn.click(fn=evaluate, inputs=problem_id, outputs=[result_column, ppl_result, sc_result, rpc_result])
if __name__ == "__main__":
    demo.launch()