"""Label It! Mini-NER - a small Gradio app for hand-labelling tokens.

Workflow: upload a CSV, edit the ``label`` column of the token table, save,
then download either the raw token table or an IOB-tagged version of it.
"""
from pathlib import Path

import gradio as gr
import pandas as pd

LABELS = {"PER", "ORG", "LOC", "EV", "O"}  # allowed tags
TOKEN_COLUMNS = ["sentence_id", "token", "label"]

# NOTE: module-level state; it is shared by everything running in this process.
token_df = pd.DataFrame(columns=TOKEN_COLUMNS)  # global


# ───────────────── tokenization ────────────────────────────────
def _as_text(value) -> str:
    """Return *value* as a string, mapping missing cells (NaN/None) to ''."""
    return "" if pd.isna(value) else str(value)


def tokenize(df: pd.DataFrame) -> pd.DataFrame:
    """Split every row of *df* into whitespace-separated tokens.

    If *df* has a ``text`` column it is used directly; otherwise the ``user``
    and ``assistant`` columns are combined into one line per row.

    Returns a frame with columns ``sentence_id`` (row position in *df*),
    ``token`` and ``label`` (always initialised to ``"O"``).  The three
    columns are present even when no token was produced.
    """
    if "text" in df.columns:
        lines = [_as_text(value) for value in df["text"]]
    else:
        lines = [
            f"User: {_as_text(user)} Assistant: {_as_text(assistant)}"
            for user, assistant in zip(df["user"], df["assistant"])
        ]

    rows = [
        {"sentence_id": sid, "token": tok, "label": "O"}
        for sid, line in enumerate(lines)
        for tok in line.split()
    ]
    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(rows, columns=TOKEN_COLUMNS)


# ───────────────── callbacks ───────────────────────────────────
def load_csv(file):
    """Read the uploaded CSV, tokenize it and store the result in ``token_df``.

    *file* is whatever the ``gr.File`` input delivers: ``None`` when nothing
    was uploaded, an object exposing the path as ``.name``, or a plain path.

    Returns a 4-tuple of values for (token table, status text, button row,
    tokens download file).
    """
    global token_df
    hidden = gr.update(visible=False)

    if file is None:
        return None, "❌ Please upload a CSV file first.", hidden, hidden

    path = getattr(file, "name", file)
    try:
        # keep_default_na=False: blank cells stay "" and words such as "NA"
        # or "null" are kept as text instead of being parsed as missing.
        df = pd.read_csv(path, keep_default_na=False)
    except (OSError, ValueError) as err:
        # pandas' EmptyDataError / ParserError and decode errors are ValueErrors.
        return None, f"❌ Could not read CSV: {err}", hidden, hidden

    if "text" not in df.columns and not {"user", "assistant"}.issubset(df.columns):
        return (
            None,
            "❌ CSV must have `text` OR `user`+`assistant` columns.",
            hidden,
            hidden,
        )

    token_df = tokenize(df)
    return (
        gr.update(value=token_df, visible=True),
        f"✅ Loaded {len(df)} rows – {len(token_df)} tokens.",
        gr.update(visible=True),
        hidden,
    )


def save_table(tbl):
    """Validate the edited table and, if valid, store it in ``token_df``.

    Labels are stripped and upper-cased before being checked against
    ``LABELS``.  When an unknown label is present nothing is stored, so a
    later download cannot contain it.  Returns the status message.
    """
    global token_df
    edited = pd.DataFrame(tbl, columns=TOKEN_COLUMNS)
    edited["label"] = edited["label"].astype(str).str.strip().str.upper()

    # simple validation - done before committing to the global
    if not edited["label"].isin(LABELS).all():
        return (
            "⚠️ Unknown labels found. Allowed: PER / ORG / LOC / EV / O"
            " – nothing was saved."
        )

    token_df = edited
    return "💾 Saved."


def to_tokens_csv():
    """Write ``token_df`` to ``raw_tokens.csv`` and return that path."""
    path = "raw_tokens.csv"
    token_df.to_csv(path, index=False)
    return Path(path)


def to_iob_csv():
    """Write ``token_df`` plus an ``iob`` column to ``ner_iob.csv``.

    A labelled token gets ``I-<label>`` when the previous token seen for the
    same ``sentence_id`` carried the same label, otherwise ``B-<label>``.
    Tokens labelled ``"O"`` stay ``"O"``.  Returns the written path.
    """
    # build IOB tags
    iob = []
    prev = {}  # sentence_id -> label of the last token seen (None after "O")
    for sid, lbl in zip(token_df["sentence_id"], token_df["label"]):
        if lbl == "O":
            iob.append("O")
            prev[sid] = None
        else:
            prefix = "I-" if prev.get(sid) == lbl else "B-"
            iob.append(prefix + lbl)
            prev[sid] = lbl

    out = token_df.copy()
    out["iob"] = iob
    path = "ner_iob.csv"
    out.to_csv(path, index=False)
    return Path(path)


def _reveal_download(path):
    """Build an update that loads *path* into a file component and shows it."""
    return gr.update(value=str(path), visible=True)


# ───────────────── UI ──────────────────────────────────────────
with gr.Blocks() as demo:
    gr.Markdown("# 🏷️ Label It! Mini-NER")
    gr.Markdown(
        "**Step 1** – upload a CSV containing a `text` column *or* "
        "`user`+`assistant` columns."
    )

    with gr.Row():
        csv_file = gr.File(file_types=[".csv"])
        load_btn = gr.Button("Load")

    status = gr.Textbox(label="Status", interactive=False)

    tok_table = gr.Dataframe(
        headers=["sentence_id", "token", "label"],
        datatype=["number", "str", "str"],
        row_count=0,
        col_count=3,
        visible=False,
    )

    with gr.Row(visible=False) as btn_row:
        save_btn = gr.Button("💾 Save")
        dl_tok = gr.Button("⬇︎ Tokens CSV")
        dl_iob = gr.Button("⬇︎ IOB CSV")

    # Both start hidden; they are revealed once a download has been generated.
    file_tok = gr.File(visible=False)
    file_iob = gr.File(visible=False)

    # bind
    load_btn.click(
        load_csv,
        inputs=csv_file,
        outputs=[tok_table, status, btn_row, file_tok],
    )
    save_btn.click(save_table, inputs=tok_table, outputs=status)
    dl_tok.click(lambda: _reveal_download(to_tokens_csv()), outputs=file_tok)
    dl_iob.click(lambda: _reveal_download(to_iob_csv()), outputs=file_iob)

    gr.Markdown(
        "**Step 2** – type `PER`, `ORG`, `LOC`, `EV`, or `O` in the **label** column → Save → Download."
    )

if __name__ == "__main__":
    demo.launch()