from pathlib import Path

import gradio as gr
import pandas as pd
from huggingface_hub import HfApi, Repository

# Allowed tags
LABELS = {"PER", "ORG", "LOC", "EV", "O"}

# Column order of the token table (also used when the table is empty)
TOKEN_COLUMNS = ["sentence_id", "token", "label"]

# Global token DataFrame (module-level state, rebound by the callbacks below)
token_df = pd.DataFrame()


# ───────────────────────── helpers ──────────────────────────────
def explode_to_tokens(df: pd.DataFrame) -> pd.DataFrame:
    """Return DataFrame(sentence_id, token, label[=O]).

    Each input row becomes one "sentence" that is split on whitespace; every
    resulting token gets its own output row with the default label ``"O"``.

    Args:
        df: Frame with either a ``text`` column, or a ``user`` and an
            ``assistant`` column (rendered as
            ``"User: … Assistant: …"`` before splitting).

    Returns:
        A frame with the columns ``sentence_id``, ``token`` and ``label``.
        ``sentence_id`` is the positional index of the source row. The
        columns are present even when no token was produced.
    """
    if "text" in df.columns:
        # Missing cells become empty sentences instead of the token "nan".
        lines = df["text"].fillna("").astype(str).tolist()
    else:
        # Dialog pair. zip() is used rather than DataFrame.apply(axis=1):
        # on an empty frame apply() returns a DataFrame, and iterating that
        # would yield column names as if they were sentences.
        users = df["user"].fillna("").astype(str)
        assistants = df["assistant"].fillna("").astype(str)
        lines = [
            f"User: {user} Assistant: {assistant}"
            for user, assistant in zip(users, assistants)
        ]

    rows = [
        {"sentence_id": sid, "token": tok, "label": "O"}
        for sid, line in enumerate(lines)
        for tok in line.split()
    ]
    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(rows, columns=TOKEN_COLUMNS)


def _load_failure(message: str):
    """Build the five-output tuple of ``load_csv`` for a failed load."""
    return (
        None,                       # clear the table value
        message,                    # status text
        gr.update(visible=False),   # hide action row
        gr.update(visible=False),   # hide token file
        gr.update(visible=False),   # hide iob file
    )


# ───────────────────────── callbacks ────────────────────────────
def load_csv(file):
    """Read the uploaded CSV and fill the global token table.

    Args:
        file: The value of the upload component. Either an object with a
            ``.name`` attribute holding the path, a plain path string, or
            ``None`` when nothing was uploaded.

    Returns:
        A 5-tuple matching the wired outputs: token table update, status
        message, action-row update, token-file update, IOB-file update.
    """
    global token_df

    if file is None:
        return _load_failure("❌ Please upload a CSV file first.")

    # Accept both a tempfile-like object (``.name``) and a bare path string.
    csv_path = getattr(file, "name", file)
    try:
        df = pd.read_csv(csv_path)
    except (OSError, ValueError) as exc:
        # pandas' EmptyDataError / ParserError and UnicodeDecodeError are all
        # ValueError subclasses; OSError covers unreadable paths.
        return _load_failure(f"❌ Could not read CSV: {exc}")

    has_text = "text" in df.columns
    has_dialog = {"user", "assistant"}.issubset(df.columns)
    if not (has_text or has_dialog):
        return _load_failure(
            "❌ CSV must contain `text` OR `user`+`assistant` columns."
        )

    token_df = explode_to_tokens(df)
    return (
        gr.update(value=token_df, visible=True,     # show table
                  row_count=len(token_df)),
        f"✅ Loaded {len(df)} rows – {len(token_df)} tokens.",
        gr.update(visible=True),                    # show action row
        gr.update(visible=False),                   # hide token file
        gr.update(visible=False),                   # hide iob file
    )


def save_table(table_data):
    """Validate the edited table and store it as the global token table.

    Labels are stripped of surrounding whitespace and upper-cased before they
    are checked against ``LABELS``. The global table is only replaced when
    every label is allowed, so a rejected edit never reaches the exports.

    Args:
        table_data: Table contents as delivered by the Dataframe component
            (anything ``pd.DataFrame`` accepts with three columns).

    Returns:
        A status message for the UI.
    """
    global token_df

    edited = pd.DataFrame(table_data, columns=TOKEN_COLUMNS).copy()
    edited["label"] = edited["label"].astype(str).str.strip().str.upper()

    if not set(edited["label"]).issubset(LABELS):
        return "⚠️ Unknown label detected. Allowed: PER / ORG / LOC / EV / O"

    token_df = edited
    return "💾 Saved."

def export_tokens():
    """Write the global token table to ``raw_tokens.csv`` and reveal it.

    Returns:
        A component update that points the download widget at the written
        file and makes it visible.
    """
    path = "raw_tokens.csv"
    token_df.to_csv(path, index=False)
    return gr.update(value=path, visible=True)


def _iob_tags(sentence_ids, labels):
    """Convert parallel sentence-id / label sequences into IOB tags.

    A token tagged ``O`` stays ``O``. Any other label becomes ``I-<label>``
    when the previous token seen for the same sentence id carried the same
    label, and ``B-<label>`` otherwise.
    """
    tags = []
    previous = {}  # sentence id -> label of the last token seen for it
    for sid, label in zip(sentence_ids, labels):
        if label == "O":
            tags.append("O")
            previous[sid] = None
        else:
            prefix = "I-" if previous.get(sid) == label else "B-"
            tags.append(prefix + label)
            previous[sid] = label
    return tags


def export_iob():
    """Write the token table plus an ``iob`` column to ``ner_iob.csv``.

    Returns:
        A component update that points the download widget at the written
        file and makes it visible.
    """
    if token_df.empty:
        # Nothing loaded yet: the frame may not even have the label columns.
        tags = []
    else:
        tags = _iob_tags(token_df["sentence_id"], token_df["label"])

    out = token_df.copy()
    out["iob"] = tags
    path = "ner_iob.csv"
    out.to_csv(path, index=False)
    return gr.update(value=path, visible=True)


def push_to_hub(repo_id, token):
    """Upload the global token table as ``data.csv`` to a Hub dataset repo.

    The repo is created if it does not exist. The CSV is uploaded straight
    from memory through ``HfApi.upload_file``, so no local git clone has to
    be created or cleaned up between pushes.

    Args:
        repo_id: Target dataset repo in ``username/name`` form.
        token: Hugging Face access token; an empty value is passed on as
            ``None``.

    Returns:
        A status message for the UI; failures are reported, not raised.
    """
    repo_id = (repo_id or "").strip()
    if not repo_id:
        return "❌ Push failed: please enter a dataset repo (username/name)."
    if token_df.empty:
        return "❌ Push failed: no data loaded yet."

    auth_token = token or None
    try:
        api = HfApi()
        api.create_repo(repo_id, token=auth_token, repo_type="dataset",
                        exist_ok=True)
        api.upload_file(
            path_or_fileobj=token_df.to_csv(index=False).encode("utf-8"),
            path_in_repo="data.csv",
            repo_id=repo_id,
            repo_type="dataset",
            token=auth_token,
            commit_message="Add annotated NER data",
        )
    except Exception as e:  # UI boundary: surface any failure as a message
        return f"❌ Push failed: {e}"
    return f"🚀 Pushed to https://huggingface.co/datasets/{repo_id}"


# ───────────────────────── UI ───────────────────────────────────
with gr.Blocks() as demo:
    gr.Markdown("# 🏷️ Label It! Mini-NER")
    gr.Markdown(
        "**Step 1** – upload a CSV with a `text` column **or** a "
        "`user`+`assistant` pair."
    )

    with gr.Row():
        csv_file = gr.File(file_types=[".csv"], label="📁 Upload CSV")
        load_btn = gr.Button("Load")

    status = gr.Textbox(label="Status", interactive=False)

    tok_table = gr.Dataframe(
        headers=["sentence_id", "token", "label"],
        datatype=["number", "str", "str"],
        row_count=0,
        col_count=3,
        visible=False,
    )

    with gr.Row(visible=False) as action_row:
        save_btn = gr.Button("💾 Save")
        dl_tok_btn = gr.Button("⬇︎ Tokens CSV")
        dl_iob_btn = gr.Button("⬇︎ IOB CSV")

    file_tok = gr.File(visible=False)
    file_iob = gr.File(visible=False)

    # Push accordion, hidden until data load
    with gr.Accordion("📦 Push to Hugging Face Hub", open=False,
                      visible=False) as push_acc:
        repo_in = gr.Textbox(label="dataset repo (username/name)")
        token_in = gr.Textbox(label="HF Token", type="password")
        push_btn = gr.Button("Push")
        push_out = gr.Textbox(label="Push Status", interactive=False)

    # ── wiring
    load_btn.click(
        load_csv,
        inputs=csv_file,
        outputs=[tok_table, status, action_row, file_tok, file_iob],
    )
    # Show the accordion after Load. NOTE: this handler is unconditional, so
    # it also fires when the CSV was rejected.
    load_btn.click(lambda: gr.update(visible=True), None, push_acc)

    save_btn.click(save_table, inputs=tok_table, outputs=status)
    dl_tok_btn.click(export_tokens, outputs=file_tok)
    dl_iob_btn.click(export_iob, outputs=file_iob)
    push_btn.click(push_to_hub, inputs=[repo_in, token_in], outputs=push_out)

    gr.Markdown(
        "**Step 2** – edit the `label` column (`PER`, `ORG`, `LOC`, `EV`, `O`) "
        "➜ Save ➜ Download / Push."
    )

demo.launch()