# Hugging Face Spaces status banner ("Spaces: Sleeping") captured with the page; not part of the app.
from pathlib import Path

import gradio as gr
import pandas as pd
# Module-level store for the current token table. It starts as an empty frame
# and is rebound by load_data() and save_edits(); the download helpers read it.
token_df: pd.DataFrame = pd.DataFrame()
def make_sample_data(n=100):
    """Build a DataFrame of ``n`` synthetic sentences for trying out the labeler.

    Each sentence has the form ``"<person> <verb> <org> in <location>."`` and
    is assembled by indexing four fixed word pools with the same counter.
    Every pool holds five entries, so the output repeats with a period of
    five sentences.

    Args:
        n: Number of sentences to generate.

    Returns:
        A DataFrame with a single ``text`` column and ``n`` rows. The column
        exists even when ``n`` is 0, so callers can always rely on it.
    """
    people = ["Alice", "Bob", "Charlie", "Diane", "Eve"]
    orgs = ["Acme Corp", "Globex", "Initech", "Umbrella", "Stark Inc"]
    locs = ["Paris", "New York", "London", "Tokyo", "Sydney"]
    verbs = ["visited", "joined", "founded", "traveled to", "met with"]

    texts = [
        f"{people[i % len(people)]} {verbs[i % len(verbs)]} "
        f"{orgs[i % len(orgs)]} in {locs[i % len(locs)]}."
        for i in range(n)
    ]
    # Building from a dict keeps the "text" column present for an empty list;
    # a list of zero row-dicts would produce a frame with no columns at all.
    return pd.DataFrame({"text": texts})
def load_data(file):
    """Load sentences, split them into tokens, and publish the token table.

    Reads the uploaded CSV when ``file`` is truthy, otherwise falls back to
    ``make_sample_data(100)``. Each ``text`` value is split on whitespace and
    every token becomes one row labelled ``"O"``. On success the result is
    stored in the module-level ``token_df``.

    Args:
        file: The value of the upload component. Either a path-like value or
            an object exposing the path as ``.name``; falsy means "no upload".

    Returns:
        A 3-tuple matching the outputs ``[table, status, actions]``:
        an update for the token table, a status message, and an update
        toggling the visibility of the action row. On any failure the table
        and action row are hidden and ``token_df`` is left untouched.
    """
    global token_df

    if file:
        # Accept both a plain path and an object that carries it in `.name`.
        csv_path = getattr(file, "name", file)
        try:
            df = pd.read_csv(csv_path)
        except (OSError, ValueError) as exc:
            # pandas' EmptyDataError/ParserError and UnicodeDecodeError are all
            # ValueError subclasses; report them in the UI instead of crashing.
            return (
                gr.update(visible=False),
                f"❌ Could not read CSV: {exc}",
                gr.update(visible=False),
            )
    else:
        df = make_sample_data(100)

    if "text" not in df.columns:
        return (
            gr.update(visible=False),
            "❌ CSV must contain a `text` column.",
            gr.update(visible=False),
        )

    # Missing cells become empty sentences and non-string cells are
    # stringified, so `.split()` below never sees a float/NaN.
    texts = df["text"].fillna("").astype(str)

    # Tokenize into (sentence_id, token, label) rows.
    records = []
    for sid, txt in enumerate(texts):
        records.extend(
            {"sentence_id": sid, "token": tok, "label": "O"} for tok in txt.split()
        )

    # Explicit columns keep the schema intact even when there are no tokens.
    token_df = pd.DataFrame(records, columns=["sentence_id", "token", "label"])

    return (
        gr.update(value=token_df, visible=True),
        f"✅ Loaded {len(df)} sentences → {len(token_df)} tokens.",
        gr.update(visible=True),
    )
def save_edits(table):
    """Replace the stored token table with the user's edited version.

    The incoming data is coerced to the three expected columns. Labels are
    then cleaned so the export step never sees an unusable value: surrounding
    whitespace is stripped, and missing or blank labels fall back to ``"O"``.

    Args:
        table: Edited table contents; anything ``pd.DataFrame`` accepts
            (a DataFrame or a list of rows).

    Returns:
        A confirmation message for the status box.
    """
    global token_df

    edited = pd.DataFrame(table, columns=["sentence_id", "token", "label"])

    labels = edited["label"].fillna("O").astype(str).str.strip()
    labels = labels.where(labels != "", "O")

    # assign() returns a new frame, so the caller's object is never mutated.
    token_df = edited.assign(label=labels)
    return "💾 Edits saved."
def download_tokens():
    """Write the current token table to ``raw_tokens.csv`` and return its path.

    The file is written relative to the current working directory, without
    the DataFrame index, and is overwritten on every call.

    Returns:
        ``Path("raw_tokens.csv")``.
    """
    out_path = Path("raw_tokens.csv")
    token_df.to_csv(out_path, index=False)
    return out_path
def _iob_tags(sentence_ids, labels):
    """Turn per-token entity labels into IOB tags, tracking spans per sentence."""
    tags = []
    # sentence id -> label of that sentence's previous token (None after an "O").
    last_label = {}
    for sid, raw in zip(sentence_ids, labels):
        # Missing or blank labels count as "outside"; whitespace is ignored so
        # a padded label still compares equal to its neighbour.
        label = "" if pd.isna(raw) else str(raw).strip()
        if label in ("", "O"):
            tags.append("O")
            last_label[sid] = None
        else:
            prefix = "I-" if last_label.get(sid) == label else "B-"
            tags.append(prefix + label)
            last_label[sid] = label
    return tags


def download_iob():
    """Write the token table plus an ``iob`` column to ``ner_iob.csv``.

    A token labelled ``O`` (or with a missing/blank label) is tagged ``O``.
    Any other label gets ``I-<label>`` when the previous token of the same
    sentence carried the same label, and ``B-<label>`` otherwise.

    The stored ``token_df`` is not modified; the export works on a copy. The
    file is written relative to the current working directory and is
    overwritten on every call.

    Returns:
        ``Path("ner_iob.csv")``.
    """
    if token_df.empty:
        # Nothing loaded yet: the frame may not even have the label columns.
        tags = []
    else:
        tags = _iob_tags(token_df["sentence_id"], token_df["label"])

    out = token_df.copy()
    out["iob"] = tags

    out_path = Path("ner_iob.csv")
    out.to_csv(out_path, index=False)
    return out_path
# UI layout and event wiring for the labeling app.
with gr.Blocks() as app:
    gr.Markdown("# 🏷️ Label It! Mini-NER")
    gr.Markdown("**Step 1:** Upload a CSV with a `text` column, or leave blank for sample sentences.")

    with gr.Row():
        file_in = gr.File(label="📁 Upload CSV", file_types=[".csv"])
        load_btn = gr.Button("Load Data")

    status = gr.Textbox(label="Status", interactive=False)

    # `interactive=True` is the Dataframe option that makes cells editable.
    table = gr.Dataframe(
        headers=["sentence_id", "token", "label"],
        interactive=True,
        visible=False,
        label="📝 Annotate Tokens",
    )

    # Hidden until data has been loaded; load_data() toggles its visibility.
    with gr.Row(visible=False) as actions:
        save_btn = gr.Button("💾 Save Edits")
        # A DownloadButton serves whatever file its `value` points at, so the
        # buttons start empty and are given fresh files by the events below.
        dl_tokens = gr.DownloadButton(label="⬇️ Download Tokens CSV")
        dl_iob = gr.DownloadButton(label="⬇️ Download IOB CSV")

    # Bind events. After each load or save, both export files are regenerated
    # and attached to their buttons, so a download reflects the stored table.
    load_btn.click(
        load_data,
        inputs=file_in,
        outputs=[table, status, actions],
    ).then(
        download_tokens,
        outputs=dl_tokens,
    ).then(
        download_iob,
        outputs=dl_iob,
    )

    save_btn.click(
        save_edits,
        inputs=table,
        outputs=status,
    ).then(
        download_tokens,
        outputs=dl_tokens,
    ).then(
        download_iob,
        outputs=dl_iob,
    )

    gr.Markdown("""
**Step 2:**
- Click into the **label** column and type one of:
`PER`, `ORG`, `LOC`, or leave as `O`.
- **Save Edits**, then download your token CSV or IOB‐tagged CSV.
""")

# Start the server only when run as a script, so importing the module
# (e.g. for testing the helpers) does not launch the app.
if __name__ == "__main__":
    app.launch()