""" Script for joining .csv candidate data into a .duckdb results. Launches a gradio app to review candidates """ import argparse from pathlib import Path import pandas as pd from metrics import load_results from utils import query_format_models, sha256_hash, get_completions, print_info, regex_compare import numpy as np import json import ast import gradio as gr import re from typing import List SQL_QUERY = """ WITH AllResults AS ( SELECT results.parent_dir AS model, * FROM results.completions results JOIN challenges challenges ON results.prompt_id = challenges.ID ) SELECT prompt_id, model, completion, answer as solution, prompt FROM AllResults WHERE AllResults.model IN {models} """.format(models=query_format_models(['r1','gemini2'])) def _parse(x): if isinstance(x, str): if len(x.strip()) == 0 or x.strip() in ["]","["]: return [] # bad gen else: try: return ast.literal_eval(x) except: raise ValueError(f"Bad gen: {x}") elif np.isnan(x): return [] else: raise ValueError(f"Found unexpected type {type(x)}: {x}") def _concat(series: pd.Series) -> np.array: items = list(filter(lambda x: len(x) > 0, map(_parse, series))) if len(items) > 0: return np.unique(np.concatenate(items)) else: return items def check_candidates(candidates: pd.DataFrame, merged_df: pd.DataFrame): """ Perform a variety of sanity checks ie: - all attempted answers are in the completion """ for _,row in merged_df.iterrows(): candidates = json.loads(row["candidates"]) comp = row["completion"].lower() for c in candidates: assert c.lower() in comp or regex_compare(c.lower(), comp), \ json.dumps({"candidate":c, "completion":row["completion"], "hash": row["_original_completion_hash"]}, indent=4) def launch_app(df: pd.DataFrame, share_demo: bool = False): # Define function to display table and toggle completion def show_table(show_completion, example_idx): # Extract the row based on the slider index example = df.iloc[example_idx] # Function to highlight words from the candidates list def highlight_words(text, candidates, color="yellow"): for word in candidates: # Use word boundaries to ensure we only match whole words text = re.sub(rf'\b({re.escape(word)})\b', r'<@>\1@>', text, flags=re.IGNORECASE) text = re.sub("<@>",f'', text) text = re.sub("@>",''.format(color=color), text) return text # Highlight words in the 'completion' column candidates = json.loads(example['candidates']) regex_candidates = json.loads(example['regex_candidates']) highlighted_completion = highlight_words(example['completion'], candidates) highlighted_regex_completion = highlight_words(example['completion'], regex_candidates, color="green") # Create a table with the core columns table_html = f"""
Completion hash | {example['_original_completion_hash']} |
Model | {example['model']} |
Prompt ID | {example['prompt_id']} |
Solution | {example['solution']} |
Prompt | {example['prompt']} |
Candidates | {candidates} |
Regex Candidates | {regex_candidates} |
{completion}
""" if "highlight_candidates" in show_completion: completion = highlighted_completion table_html += f"""{completion}
""" return table_html # Create the Gradio interface with gr.Blocks() as demo: # Slider to navigate through examples example_slider = gr.Slider(minimum=0, maximum=len(df)-1, step=1, label="Example", value=0) # Toggle button for showing/hiding completion toggle_button = gr.CheckboxGroup(["highlight_candidates", "highlight_regex"]) with gr.Row(): gr.HTML('