Suzana's picture
Update app.py
4455f2c verified
raw
history blame
5.58 kB
import gradio as gr
import pandas as pd
from pathlib import Path
from huggingface_hub import HfApi, Repository
# Allowed tags for the `label` column. "O" is the default given to every
# token on load and is the only tag exported without a B-/I- prefix.
LABELS = {"PER", "ORG", "LOC", "EV", "O"}
# Global token DataFrame: module-level state shared by the callbacks below.
# Starts empty; replaced by load_csv() and save_table().
token_df = pd.DataFrame()
# ───────────────────────── helpers ──────────────────────────────
def explode_to_tokens(df: pd.DataFrame) -> pd.DataFrame:
    """Split every input row into whitespace-separated tokens.

    Args:
        df: Frame with either a ``text`` column, or a ``user`` and an
            ``assistant`` column. When ``text`` is present it wins; otherwise
            each row is rendered as ``"User: <user> Assistant: <assistant>"``
            before tokenising.

    Returns:
        A frame with the columns ``sentence_id`` (0-based position of the
        source row), ``token`` and ``label`` (always ``"O"``). The three
        columns are present even when the input produces no tokens, so
        callers can index ``["label"]`` unconditionally.
    """
    columns = ["sentence_id", "token", "label"]
    if "text" in df.columns:
        lines = df["text"].astype(str)
    else:  # dialog pair
        # Zip the two columns instead of a row-wise apply: same strings,
        # no per-row Series construction, and well-defined on empty input.
        lines = [
            f"User: {user} Assistant: {assistant}"
            for user, assistant in zip(df["user"], df["assistant"])
        ]
    rows = [
        (sid, tok, "O")
        for sid, line in enumerate(lines)
        for tok in line.split()
    ]
    # Passing `columns` explicitly keeps the schema stable for zero rows.
    return pd.DataFrame(rows, columns=columns)
# ───────────────────────── callbacks ────────────────────────────
def load_csv(file):
    """Read the uploaded CSV, tokenise it and populate the editing table.

    Args:
        file: Value delivered by the upload component. May be ``None`` (no
            upload yet), an object exposing the path as ``.name``, or the
            path itself.

    Returns:
        A 5-tuple matching the wired outputs: table update, status message,
        action-row update, token-file update, IOB-file update. On any
        failure the table slot is ``None``, the status carries the error and
        the three remaining components are hidden; the global ``token_df``
        is left untouched in that case.
    """
    global token_df

    def _failure(message):
        # Same 5-slot shape as the success path so the outputs stay aligned.
        return (
            None,
            message,
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=False),
        )

    if file is None:
        return _failure("❌ Please upload a CSV file first.")

    # NOTE(review): depending on Gradio version/config the File component may
    # hand over a temp-file wrapper (path in `.name`) or a plain path string;
    # accept both rather than assuming `.name` exists.
    csv_path = getattr(file, "name", file)
    try:
        df = pd.read_csv(csv_path)
    except (
        OSError,
        UnicodeDecodeError,
        pd.errors.EmptyDataError,
        pd.errors.ParserError,
    ) as exc:
        return _failure(f"❌ Could not read CSV: {exc}")

    has_text = "text" in df.columns
    has_dialog = {"user", "assistant"}.issubset(df.columns)
    if not (has_text or has_dialog):
        return _failure("❌ CSV must contain `text` OR `user`+`assistant` columns.")

    token_df = explode_to_tokens(df)
    return (
        gr.update(value=token_df, visible=True,  # show table
                  row_count=len(token_df)),
        f"✅ Loaded {len(df)} rows – {len(token_df)} tokens.",
        gr.update(visible=True),   # show action row
        gr.update(visible=False),  # hide token file
        gr.update(visible=False),  # hide iob file
    )
def save_table(table_data):
    """Validate the edited table and, if valid, store it as the token table.

    Args:
        table_data: Current contents of the editing table; anything
            ``pd.DataFrame`` accepts together with the three expected column
            names.

    Returns:
        A status message. When any value in ``label`` is outside ``LABELS``
        a warning is returned and the global ``token_df`` is NOT replaced,
        so a later export or push cannot pick up invalid tags.
    """
    global token_df
    edited = pd.DataFrame(table_data, columns=["sentence_id", "token", "label"])
    # Validate before committing: the previous good state survives a bad edit.
    if not set(edited["label"]).issubset(LABELS):
        return "⚠️ Unknown label detected (not saved). Allowed: PER / ORG / LOC / EV / O"
    token_df = edited
    return "💾 Saved."
def export_tokens():
    """Write the current token table to ``raw_tokens.csv`` and reveal it.

    Returns:
        A component update that points the download widget at the written
        file and makes it visible.
    """
    csv_path = Path("raw_tokens.csv")
    token_df.to_csv(csv_path, index=False)
    return gr.update(value=csv_path, visible=True)
def export_iob():
    """Add an ``iob`` column to the token table and write ``ner_iob.csv``.

    A token labelled ``"O"`` is tagged ``"O"``. Any other label gets the
    prefix ``I-`` when the previous token seen for the same ``sentence_id``
    carried the same label, and ``B-`` otherwise.

    Returns:
        A component update that points the download widget at the written
        file and makes it visible.
    """
    last_label = {}  # sentence_id -> label of the most recent token (None after "O")
    tags = []
    for _, row in token_df.iterrows():
        sentence, label = row["sentence_id"], row["label"]
        if label != "O":
            prefix = "I-" if last_label.get(sentence) == label else "B-"
            tags.append(prefix + label)
            last_label[sentence] = label
            continue
        tags.append("O")
        last_label[sentence] = None

    result = token_df.copy()
    result["iob"] = tags
    csv_path = Path("ner_iob.csv")
    result.to_csv(csv_path, index=False)
    return gr.update(value=csv_path, visible=True)
def push_to_hub(repo_id, token):
    """Upload the current token table as ``data.csv`` to a Hub dataset repo.

    The repository is created if it does not exist yet. The CSV is uploaded
    straight from memory through the HTTP API, so no local git clone is
    made and nothing has to be cleaned up between pushes. The former
    cleanup called ``unlink()`` on every entry of the clone directory, which
    fails on the ``.git`` sub-directory and broke every repeated push.

    Args:
        repo_id: Target dataset repository, ``username/name``.
        token: Hugging Face access token used for both API calls.

    Returns:
        A status message: the dataset URL on success, or a message starting
        with ``"❌ Push failed:"`` otherwise. Exceptions are never raised
        to the caller.
    """
    if token_df.empty:
        return "❌ Push failed: no tokens loaded yet."
    try:
        api = HfApi()
        api.create_repo(repo_id, token=token, repo_type="dataset", exist_ok=True)
        api.upload_file(
            path_or_fileobj=token_df.to_csv(index=False).encode("utf-8"),
            path_in_repo="data.csv",
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
            commit_message="Add annotated NER data",
        )
        return f"🚀 Pushed to https://huggingface.co/datasets/{repo_id}"
    except Exception as e:
        # UI boundary: surface any network/auth/validation error as a status
        # message instead of crashing the callback.
        return f"❌ Push failed: {e}"
# ───────────────────────── UI ───────────────────────────────────
# NOTE(review): the nesting below (which widgets sit inside each Row /
# Accordion) was reconstructed from context — confirm against the intended
# layout.
with gr.Blocks() as demo:
    gr.Markdown("# 🏷️ Label It! Mini-NER")
    gr.Markdown("**Step 1** – upload a CSV with a `text` column **or** a `user`+`assistant` pair.")
    with gr.Row():
        csv_file = gr.File(file_types=[".csv"], label="📁 Upload CSV")
        load_btn = gr.Button("Load")
    status = gr.Textbox(label="Status", interactive=False)
    # Editable token table; hidden until a CSV has been loaded.
    tok_table = gr.Dataframe(
        headers=["sentence_id", "token", "label"],
        datatype=["number", "str", "str"],
        row_count=0, col_count=3,
        visible=False
    )
    with gr.Row(visible=False) as action_row:
        save_btn = gr.Button("💾 Save")
        dl_tok_btn = gr.Button("⬇︎ Tokens CSV")
        dl_iob_btn = gr.Button("⬇︎ IOB CSV")
    # Download slots; revealed by the export callbacks.
    file_tok = gr.File(visible=False)
    file_iob = gr.File(visible=False)
    # Push accordion — hidden until data load. Visibility is passed to the
    # constructor rather than set on the instance afterwards.
    with gr.Accordion("📦 Push to Hugging Face Hub", open=False, visible=False) as push_acc:
        repo_in = gr.Textbox(label="dataset repo (username/name)")
        token_in = gr.Textbox(label="HF Token", type="password")
        push_btn = gr.Button("Push")
        push_out = gr.Textbox(label="Push Status", interactive=False)
    # ── wiring
    load_btn.click(load_csv, inputs=csv_file,
                   outputs=[tok_table, status, action_row, file_tok, file_iob])
    load_btn.click(lambda: gr.update(visible=True), None, push_acc)  # show accordion after load
    save_btn.click(save_table, inputs=tok_table, outputs=status)
    dl_tok_btn.click(export_tokens, outputs=file_tok)
    dl_iob_btn.click(export_iob, outputs=file_iob)
    push_btn.click(push_to_hub, inputs=[repo_in, token_in], outputs=push_out)
    gr.Markdown(
        "**Step 2** – edit the `label` column (`PER`, `ORG`, `LOC`, `EV`, `O`) ➜ Save ➜ Download / Push."
    )
demo.launch()