arjunguha committed (verified)
Commit 6f224f3 · Parent(s): 66ca398

Create app.py

Files changed (1): app.py (+422, -0)
app.py ADDED
@@ -0,0 +1,422 @@
+ """
+ You do not need to run this program yourself. It is hosted on Hugging Face
+ Spaces at:
+
+ https://huggingface.co/spaces/nuprl/BigCodeBench-MultiPL-Stdio-Problem-Inspector
+
+ We use this program to help inspect our synthesized problems. If you want to
+ run it yourself, these are the steps to run it end-to-end:
+
+ 1. Create a jsonl file that joins synthesized problems with their execution
+    results.
+
+     uv run python3 -m bigcodebench_multipl.stdio_problem_inspector upload \
+         --problems-path unfiltered_stdio.jsonl \
+         --results-path unfiltered_stdio.results.jsonl \
+         --output-path unfiltered_stdio.joined.jsonl
+
+ 2. Upload the dataset to the Hugging Face Hub for the next steps.
+
+     mkdir python_stdio
+     mv unfiltered_stdio.joined.jsonl python_stdio/test.jsonl
+
+    Now, drag and drop the *folder* above to a Hugging Face dataset.
+
+ 3. Run the inspector:
+
+     uv run python3 -m bigcodebench_multipl.stdio_problem_inspector dataset-inspector
+
+ """
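+ # For orientation, each record in the joined test.jsonl is expected to carry
+ # the columns checked by the assert in `upload` below. A sketch of one record
+ # (field values are illustrative, not taken from the real dataset):
+ #   {"task_id": "...", "reasoning": "...", "prompt": "...", "program": "...",
+ #    "test_suite": "...", "timeout": false, "exit_code": 0, "stdout": "", "stderr": ""}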
+ import argparse
+ import pandas as pd
+ import gradio as gr
+ import datasets
+ from pathlib import Path
+ import ast
+ from typing import TypedDict, Generator
+
+ ################################################################################
+ # Copy-pasted from bcb_reader.py.                                              #
+ ################################################################################
+
+ # This is the format of BigCodeBench problems. However, BigCodeBench-Hard has
+ # a few extra columns.
+ class _OriginalBigCodeBenchProblem(TypedDict):
+     task_id: str
+     complete_prompt: str
+     instruct_prompt: str
+     canonical_solution: str
+     code_prompt: str
+     test: str
+     entry_point: str
+     doc_struct: str
+     libs: str
+
+
+ class BigCodeBenchProblem(TypedDict):
+     task_id: str
+     problem: str
+     solution: str
+     tests: str
+
+
+ _PROMPT_BOILERPLATE = "\nYou should write self-contained code starting with:\n```\n"
+ _PROMPT_SUFFIX = "```"
+
+
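+ # For example (hypothetical prompt text), an instruct_prompt such as
+ #   "Count the words on stdin.\nYou should write self-contained code starting with:\n```\nimport sys\ndef task_func():\n```"
+ # splits at _PROMPT_BOILERPLATE into the problem statement and the solution
+ # prefix "import sys\ndef task_func():\n", to which the canonical solution is
+ # appended.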
+ def _prepare_bcb_problem(item: _OriginalBigCodeBenchProblem) -> BigCodeBenchProblem:
+     """
+     Every BCB problem has a canonical solution, which is a completion expected
+     from a base model. This function splits the prompt to get a complete
+     solution."""
+     instruct_prompt = item["instruct_prompt"]
+     problem, solution_prefix = instruct_prompt.split(_PROMPT_BOILERPLATE, maxsplit=1)
+
+     assert solution_prefix.endswith(
+         _PROMPT_SUFFIX
+     ), f"Prompt ends with {solution_prefix[-20:].__repr__()}"
+     solution_prefix = solution_prefix[: -len(_PROMPT_SUFFIX)]
+     solution = solution_prefix + item["canonical_solution"]
+
+     tests = item["test"]
+
+     # As a sanity check, parse. We get syntax warnings on standard error.
+     ast.parse(solution, filename=item["task_id"])
+     ast.parse(tests, filename="test_" + item["task_id"])
+
+     return BigCodeBenchProblem(
+         task_id=item["task_id"],
+         problem=problem,
+         solution=solution,
+         tests=tests,
+     )
+
+
+ def load_bigcodebench() -> Generator[BigCodeBenchProblem, None, None]:
+     """
+     Loads the BigCodeBench dataset in a format appropriate for translation.
+     """
+     bcb = datasets.load_dataset("bigcode/bigcodebench", split="v0.1.4")
+     for item in bcb:
+         yield _prepare_bcb_problem(item)
+
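+ # Minimal usage sketch for load_bigcodebench (not executed here):
+ #   first = next(load_bigcodebench())
+ #   print(first["task_id"], first["problem"][:80])
+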
+ ################################################################################
+
+ def upload(problems_path: Path, results_path: Path, output_path: Path):
+     problems = pd.read_json(problems_path, lines=True)
+     results = pd.read_json(results_path, lines=True)
+
+     joined = problems.merge(results, on="task_id", how="left")
+
+     assert list(joined.columns) == [
+         "reasoning",
+         "prompt",
+         "program",
+         "test_suite",
+         "task_id",
+         "timeout",
+         "exit_code",
+         "stdout",
+         "stderr",
+     ], "Unexpected columns after the join. Are you sure you are merging the right files?"
+
+     joined.to_json(output_path, orient="records", lines=True)
+
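+ # Equivalent direct call to upload, using the paths from the module docstring
+ # (illustrative only):
+ #   upload(Path("unfiltered_stdio.jsonl"),
+ #          Path("unfiltered_stdio.results.jsonl"),
+ #          Path("unfiltered_stdio.joined.jsonl"))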
+
+ def dataset_inspector(dataset_name: str, data_dir: str):
+     dataset = datasets.load_dataset(dataset_name, data_dir=data_dir, split="test")
+
+     original_dataset = pd.DataFrame(load_bigcodebench())
+     original_dataset = original_dataset.rename(columns={
+         "problem": "original_prompt",
+         "solution": "original_program",
+         "tests": "original_test_suite",
+     })
+
+     # Convert to pandas DataFrame for easier manipulation
+     df = dataset.to_pandas()
+     df = df.merge(original_dataset, on="task_id", how="left")
+
+     def get_filtered_data(predicate):
+         """Filter the dataset based on predicate"""
+         filtered_df = df.copy()
+
+         selector = False
+         if predicate.get('filter_timeout', False):
+             selector = selector | (filtered_df['timeout'] == True)
+
+         if predicate.get('filter_successes', False):
+             selector = selector | (filtered_df['exit_code'] == 0)
+
+         if predicate.get('filter_errors', False):
+             # We use exit_code < 0 for timeout.
+             selector = selector | (filtered_df['exit_code'] > 0)
+
+         return filtered_df[selector]
+
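+     # get_filtered_data example: a predicate of {'filter_successes': True,
+     # 'filter_errors': True} keeps rows with exit_code == 0 or exit_code > 0;
+     # timed-out runs (negative exit codes) appear only when 'filter_timeout'
+     # is set.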
+     def format_problem_display(row, predicate):
+         """Format a single problem for display - returns (generated_content, original_content)"""
+         generated_content = []
+         original_content = []
+
+         # Add reasoning to generated side if checkbox is checked
+         if predicate.get('show_reasoning', False):
+             generated_content.append("## Reasoning")
+             generated_content.append(str(row['reasoning']))
+             generated_content.append("")
+
+         # Generated content
+         generated_content.append("# Generated")
+         generated_content.append("")
+         generated_content.append("## Prompt")
+         generated_content.append(str(row['prompt']))
+         generated_content.append("")
+
+         generated_content.append("## Program")
+         generated_content.append("```python")
+         generated_content.append(str(row['program']))
+         generated_content.append("```")
+         generated_content.append("")
+
+         generated_content.append("## Test Suite")
+         generated_content.append("```python")
+         generated_content.append(str(row['test_suite']))
+         generated_content.append("```")
+         generated_content.append("")
+
+         # Add execution results to generated side
+         if str(row['stdout']).strip():
+             generated_content.append("## Standard Output")
+             generated_content.append("```")
+             generated_content.append(str(row['stdout']))
+             generated_content.append("```")
+             generated_content.append("")
+
+         if str(row['stderr']).strip():
+             generated_content.append("## Standard Error")
+             generated_content.append("```")
+             generated_content.append(str(row['stderr']))
+             generated_content.append("```")
+             generated_content.append("")
+
+         generated_content.append("## Metadata")
+         generated_content.append(f"**Task ID:** {row['task_id']}")
+         generated_content.append(f"**Timeout:** {row['timeout']}")
+         generated_content.append(f"**Exit Code:** {row['exit_code']}")
+
+         # Original content
+         original_content.append("# Original")
+         original_content.append("")
+         original_content.append("## Prompt")
+         original_content.append(str(row['original_prompt']))
+         original_content.append("")
+
+         original_content.append("## Program")
+         original_content.append("```python")
+         original_content.append(str(row['original_program']))
+         original_content.append("```")
+         original_content.append("")
+
+         original_content.append("## Test Suite")
+         original_content.append("```python")
+         original_content.append(str(row['original_test_suite']))
+         original_content.append("```")
+
+         return "\n".join(generated_content), "\n".join(original_content)
+
+     def update_display(current_index, predicate):
+         """Update the display based on current predicate and index"""
+         filtered_df = get_filtered_data(predicate)
+
+         if len(filtered_df) == 0:
+             return "No problems match the current filters.", "No problems match the current filters.", "0 / 0", gr.update(interactive=False), gr.update(interactive=False)
+
+         # Ensure index is within bounds
+         current_index = max(0, min(current_index, len(filtered_df) - 1))
+
+         row = filtered_df.iloc[current_index]
+         generated_content, original_content = format_problem_display(row, predicate)
+         status = f"{current_index + 1} / {len(filtered_df)}"
+
+         # Update button states
+         prev_enabled = current_index > 0
+         next_enabled = current_index < len(filtered_df) - 1
+
+         return generated_content, original_content, status, gr.update(interactive=prev_enabled), gr.update(interactive=next_enabled)
+
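+     # update_display returns five values in the order the Gradio `outputs`
+     # lists below expect: generated markdown, original markdown, the status
+     # string, and gr.update(...) objects for the Previous/Next buttons.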
+     def go_prev(current_index, predicate):
+         """Go to previous problem"""
+         new_index = max(0, current_index - 1)
+         generated_content, original_content, status, prev_btn, next_btn = update_display(new_index, predicate)
+         return generated_content, original_content, status, new_index, prev_btn, next_btn
+
+     def go_next(current_index, predicate):
+         """Go to next problem"""
+         filtered_df = get_filtered_data(predicate)
+         new_index = min(len(filtered_df) - 1, current_index + 1)
+         generated_content, original_content, status, prev_btn, next_btn = update_display(new_index, predicate)
+         return generated_content, original_content, status, new_index, prev_btn, next_btn
+
+     def on_filter_change(current_index, predicate):
+         """Handle filter changes - reset to first item"""
+         generated_content, original_content, status, prev_btn, next_btn = update_display(0, predicate)
+         return generated_content, original_content, status, 0, prev_btn, next_btn
+
+     def update_predicate(predicate, key, value):
+         """Update a single key in the predicate"""
+         new_predicate = predicate.copy()
+         new_predicate[key] = value
+         return new_predicate
+
+     # Create Gradio interface
+     with gr.Blocks(title="BigCodeBench Problem Inspector") as demo:
+         gr.Markdown("# BigCodeBench-MultiPL Problem Inspector")
+
+         # State to track current index and predicate
+         current_index = gr.State(0)
+         predicate = gr.State({
+             'filter_timeout': False,
+             'filter_successes': True,
+             'filter_errors': False,
+             'show_reasoning': False
+         })
+
+         # Top controls row
+         with gr.Row():
+             prev_btn = gr.Button("← Previous", size="sm")
+             status_text = gr.Textbox(value="1 / 1", interactive=False, container=False, show_label=False)
+             next_btn = gr.Button("Next →", size="sm")
+
+         # Filter controls
+         with gr.Row():
+             filter_timeout = gr.Checkbox(label="Show timeouts (timeout == True)", value=False)
+             filter_successes = gr.Checkbox(label="Show successes (exit_code == 0)", value=True)
+             filter_errors = gr.Checkbox(label="Show errors (exit_code > 0)", value=False)
+             show_reasoning = gr.Checkbox(label="Show reasoning", value=False)
+
+         # Main content area - two columns
+         with gr.Row():
+             with gr.Column():
+                 generated_display = gr.Markdown(value="Loading generated content...", height=600)
+             with gr.Column():
+                 original_display = gr.Markdown(value="Loading original content...", height=600)
+
+         # Initialize display
+         demo.load(
+             fn=lambda: update_display(0, {'filter_timeout': False, 'filter_successes': True, 'filter_errors': False, 'show_reasoning': False}),
+             outputs=[generated_display, original_display, status_text, prev_btn, next_btn]
+         )
+
+         # Event handlers
+         prev_btn.click(
+             fn=go_prev,
+             inputs=[current_index, predicate],
+             outputs=[generated_display, original_display, status_text, current_index, prev_btn, next_btn]
+         )
+
+         next_btn.click(
+             fn=go_next,
+             inputs=[current_index, predicate],
+             outputs=[generated_display, original_display, status_text, current_index, prev_btn, next_btn]
+         )
+
+         # Filter change handlers
+         filter_timeout.change(
+             fn=lambda current_idx, pred, value: (
+                 *on_filter_change(current_idx, update_predicate(pred, 'filter_timeout', value)),
+                 update_predicate(pred, 'filter_timeout', value)
+             ),
+             inputs=[current_index, predicate, filter_timeout],
+             outputs=[generated_display, original_display, status_text, current_index, prev_btn, next_btn, predicate]
+         )
+
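+         # Each checkbox handler computes the new predicate twice: once to
+         # drive the display update, and once more as the final output so the
+         # `predicate` gr.State stays in sync with what was just rendered.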
+         filter_errors.change(
+             fn=lambda current_idx, pred, value: (
+                 *on_filter_change(current_idx, update_predicate(pred, 'filter_errors', value)),
+                 update_predicate(pred, 'filter_errors', value)
+             ),
+             inputs=[current_index, predicate, filter_errors],
+             outputs=[generated_display, original_display, status_text, current_index, prev_btn, next_btn, predicate]
+         )
+
+         filter_successes.change(
+             fn=lambda current_idx, pred, value: (
+                 *on_filter_change(current_idx, update_predicate(pred, 'filter_successes', value)),
+                 update_predicate(pred, 'filter_successes', value)
+             ),
+             inputs=[current_index, predicate, filter_successes],
+             outputs=[generated_display, original_display, status_text, current_index, prev_btn, next_btn, predicate]
+         )
+
+         show_reasoning.change(
+             fn=lambda current_idx, pred, value: (
+                 *update_display(current_idx, update_predicate(pred, 'show_reasoning', value)),
+                 update_predicate(pred, 'show_reasoning', value)
+             ),
+             inputs=[current_index, predicate, show_reasoning],
+             outputs=[generated_display, original_display, status_text, prev_btn, next_btn, predicate]
+         )
+
+     demo.launch(share=True)
+
+
+ def main():
+     parser = argparse.ArgumentParser()
+
+     subparsers = parser.add_subparsers(dest="subcommand")
+
+     upload_command = subparsers.add_parser("upload", help="Prepare the dataset")
+     upload_command.add_argument(
+         "--problems-path",
+         type=Path,
+         required=True,
+         help="Output from make_stdio_problem.py",
+     )
+     upload_command.add_argument(
+         "--results-path",
+         type=Path,
+         required=True,
+         help="Execution results from --problems-path",
+     )
+     upload_command.add_argument(
+         "--output-path",
+         type=Path,
+         required=True,
+         help="Output path to save the joined dataset",
+     )
+
+     dataset_inspector_command = subparsers.add_parser("dataset-inspector", help="Inspect a dataset")
+     dataset_inspector_command.add_argument(
+         "--dataset-name",
+         type=str,
+         default="nuprl/BigCodeBench-MultiPL-Results",
+         help="Name of the dataset on the Hugging Face Hub",
+     )
+     dataset_inspector_command.add_argument(
+         "--data-dir",
+         type=str,
+         default="python_stdio",
+         help="Name of the data directory within the dataset on the Hugging Face Hub",
+     )
+
+     args = parser.parse_args()
+
+     args_dict = dict(vars(args))
+     del args_dict["subcommand"]
+
+     if args.subcommand == "upload":
+         upload(**args_dict)
+     elif args.subcommand == "dataset-inspector":
+         dataset_inspector(**args_dict)
+     elif args.subcommand is None:
+         dataset_inspector(dataset_name="nuprl/BigCodeBench-MultiPL-Results", data_dir="python_stdio")
+     else:
+         raise ValueError(f"Unknown subcommand: {args.subcommand}")
+
+
+ if __name__ == "__main__":
+     main()