Spaces:
Sleeping
Sleeping
import gradio as gr | |
import pandas as pd | |
from pathlib import Path | |
# Entity tags a token's label is allowed to take ("O" = outside any entity).
LABELS = {"PER", "ORG", "LOC", "EV", "O"}

# Module-wide token table shared by the callbacks below; starts out empty.
token_df = pd.DataFrame()
# ───────────────── tokenization ────────────────────────────────
def tokenize(df: pd.DataFrame) -> pd.DataFrame:
    """Split every row of *df* into whitespace-separated tokens.

    When *df* has a ``text`` column, each value (converted to ``str``) is one
    sentence. Otherwise the ``user`` and ``assistant`` columns are combined
    into ``"User: <user> Assistant: <assistant>"`` per row.

    Returns:
        A frame with one row per token and the columns ``sentence_id``
        (0-based position of the source row), ``token`` and ``label``
        (always ``"O"``). The three columns are present even when no token
        was produced, so callers can index them unconditionally.
    """
    columns = ["sentence_id", "token", "label"]

    if "text" in df.columns:
        lines = df["text"].astype(str)
    else:
        # Pair the two columns directly instead of a row-wise apply: each
        # value is formatted as-is, without per-row dtype upcasting.
        lines = [
            f"User: {user} Assistant: {assistant}"
            for user, assistant in zip(df["user"], df["assistant"])
        ]

    rows = [
        {"sentence_id": sid, "token": tok, "label": "O"}
        for sid, line in enumerate(lines)
        for tok in line.split()
    ]
    # Passing `columns` keeps the schema stable for an empty `rows` list.
    return pd.DataFrame(rows, columns=columns)
# ───────────────── callbacks ───────────────────────────────────
def load_csv(file):
    """Read the uploaded CSV, tokenize it and store the result in ``token_df``.

    Args:
        file: The value delivered by the upload component. It may be ``None``
            (nothing uploaded), an object exposing the path as ``.name``, or
            the path itself.

    Returns:
        A 4-tuple matching the outputs bound to the Load button: the token
        table update (``None`` on failure), a status message, an update for
        the button row, and an update that hides the token download file.
    """
    global token_df
    hidden = gr.update(visible=False)

    if file is None:
        return None, "❌ Please upload a CSV file first.", hidden, hidden

    # Accept both an upload object carrying `.name` and a plain path string.
    path = getattr(file, "name", file)
    try:
        df = pd.read_csv(path)
    except (OSError, ValueError) as exc:
        # pandas' parser/empty-data errors and decode errors are ValueErrors.
        return None, f"❌ Could not read CSV: {exc}", hidden, hidden

    has_text = "text" in df.columns
    has_chat = {"user", "assistant"}.issubset(df.columns)
    if not (has_text or has_chat):
        return (
            None,
            "❌ CSV must have `text` OR `user`+`assistant` columns.",
            hidden,
            hidden,
        )

    token_df = tokenize(df)
    return (
        gr.update(value=token_df, visible=True),
        f"✅ Loaded {len(df)} rows → {len(token_df)} tokens.",
        gr.update(visible=True),
        hidden,
    )
def save_table(tbl):
    """Validate the edited table and, if valid, store it in ``token_df``.

    Args:
        tbl: The table contents delivered by the Dataframe component; it is
            converted to a frame with the columns ``sentence_id``, ``token``
            and ``label``.

    Returns:
        A status message: a warning when any label is not in ``LABELS``
        (in which case the previously stored table is left untouched),
        otherwise a confirmation.
    """
    global token_df
    edited = pd.DataFrame(tbl, columns=["sentence_id", "token", "label"])
    # Tolerate stray whitespace typed around a label in the table cell.
    edited["label"] = edited["label"].astype(str).str.strip()

    # Validate before committing so a rejected table never reaches the exports.
    if not edited["label"].isin(LABELS).all():
        return "⚠️ Unknown labels found. Allowed: PER / ORG / LOC / EV / O"

    token_df = edited
    return "💾 Saved."
def to_tokens_csv():
    """Write the current ``token_df`` to ``raw_tokens.csv`` and return its path.

    The file is written relative to the working directory, without the
    index column.
    """
    destination = Path("raw_tokens.csv")
    token_df.to_csv(destination, index=False)
    return destination
def to_iob_csv():
    """Export ``token_df`` with an extra ``iob`` column to ``ner_iob.csv``.

    Tagging rule, applied row by row: an ``"O"`` label stays ``"O"``; any
    other label gets ``"I-"`` when the previous row seen for the same
    ``sentence_id`` carried the same label, and ``"B-"`` otherwise.

    Returns:
        The ``Path`` of the written file.
    """
    last_label = {}  # sentence_id -> label of the most recent row (None after "O")
    tags = []
    for _, row in token_df.iterrows():
        sentence = row["sentence_id"]
        label = row["label"]
        if label == "O":
            tags.append("O")
            last_label[sentence] = None
            continue
        continues_entity = last_label.get(sentence) == label
        tags.append(("I-" if continues_entity else "B-") + label)
        last_label[sentence] = label

    result = token_df.copy()
    result["iob"] = tags
    destination = "ner_iob.csv"
    result.to_csv(destination, index=False)
    return Path(destination)
# ───────────────── UI ──────────────────────────────────────────
def _as_download(path):
    """Wrap a generated file path in an update that also reveals the File output.

    The download File components are created hidden, so returning only the
    path would set their value without ever showing them to the user.
    """
    return gr.update(value=str(path), visible=True)


# Layout and event wiring for the labelling app.
with gr.Blocks() as demo:
    gr.Markdown("# 🏷️ Label It! Mini-NER")
    gr.Markdown(
        "**Step 1** – upload a CSV containing a `text` column *or* "
        "`user`+`assistant` columns."
    )

    with gr.Row():
        csv_file = gr.File(file_types=[".csv"])
        load_btn = gr.Button("Load")

    status = gr.Textbox(label="Status", interactive=False)

    # Editable token table; hidden until a CSV has been loaded.
    tok_table = gr.Dataframe(
        headers=["sentence_id", "token", "label"],
        datatype=["number", "str", "str"],
        row_count=0,
        col_count=3,
        visible=False,
    )

    # Action buttons; revealed by load_csv on success.
    with gr.Row(visible=False) as btn_row:
        save_btn = gr.Button("💾 Save")
        dl_tok = gr.Button("⬇️ Tokens CSV")
        dl_iob = gr.Button("⬇️ IOB CSV")

    # Download targets; revealed once the matching export has been generated.
    file_tok = gr.File(visible=False)
    file_iob = gr.File(visible=False)

    # bind
    load_btn.click(
        load_csv,
        inputs=csv_file,
        outputs=[tok_table, status, btn_row, file_tok],
    )
    save_btn.click(save_table, inputs=tok_table, outputs=status)
    dl_tok.click(lambda: _as_download(to_tokens_csv()), outputs=file_tok)
    dl_iob.click(lambda: _as_download(to_iob_csv()), outputs=file_iob)

    gr.Markdown(
        "**Step 2** – type `PER`, `ORG`, `LOC`, `EV`, or `O` in the **label** "
        "column → Save → Download."
    )

demo.launch()