# Page header captured together with the file (not part of the program):
# Suzana's picture · Create app.py · f3b49b2 verified · raw · history blame · 3.64 kB
import gradio as gr
import pandas as pd
from pathlib import Path
# Global token storage.
# Starts empty; load_data() and save_edits() rebind it to a frame with the
# columns sentence_id / token / label, and the download helpers read it.
# It lives at module level, so every callback in this process shares it.
token_df = pd.DataFrame()
# Generate generic sample sentences
def make_sample_data(n=100):
    """Build a frame of ``n`` generic sample sentences.

    Each sentence has the form ``"<person> <verb> <org> in <location>."``.
    The i-th sentence takes the i-th entry of every word list, wrapping
    around at the end of the list.

    Args:
        n: Number of sentences to generate.

    Returns:
        A DataFrame with a single ``text`` column and ``n`` rows. The
        column is present even when ``n`` is 0, so callers can rely on it.
    """
    people = ["Alice", "Bob", "Charlie", "Diane", "Eve"]
    orgs = ["Acme Corp", "Globex", "Initech", "Umbrella", "Stark Inc"]
    locs = ["Paris", "New York", "London", "Tokyo", "Sydney"]
    verbs = ["visited", "joined", "founded", "traveled to", "met with"]

    sentences = [
        f"{people[i % len(people)]} {verbs[i % len(verbs)]} "
        f"{orgs[i % len(orgs)]} in {locs[i % len(locs)]}."
        for i in range(n)
    ]
    # Building from a dict keeps the "text" column even for an empty list.
    return pd.DataFrame({"text": sentences})
def load_data(file):
    """Load sentences, tokenize them, and fill the annotation table.

    Reads the uploaded CSV when ``file`` is truthy, otherwise falls back to
    100 generated sample sentences. Every whitespace-separated token becomes
    one row with the default label ``"O"``; the result replaces the global
    ``token_df``.

    Args:
        file: The value delivered by the upload component, or a falsy value
            when nothing was uploaded.

    Returns:
        A 3-tuple ``(table_update, status_message, actions_update)``. On
        failure (unreadable CSV or missing ``text`` column) both updates
        hide their component and the message explains the problem; the
        global ``token_df`` is left untouched in that case.
    """
    global token_df

    # Load user CSV or fallback to sample
    if file:
        # The upload may arrive as an object carrying its path in `.name`
        # or as a plain path string; accept either.
        csv_path = getattr(file, "name", file)
        try:
            df = pd.read_csv(csv_path)
        except (OSError, ValueError) as err:
            # pandas' parser/empty-data errors and decode errors are all
            # ValueError subclasses; report them like the column check below.
            return (
                gr.update(visible=False),
                f"❌ Could not read CSV: {err}",
                gr.update(visible=False),
            )
    else:
        df = make_sample_data(100)

    if "text" not in df.columns:
        return (
            gr.update(visible=False),
            "❌ CSV must contain a `text` column.",
            gr.update(visible=False),
        )

    # Tokenize into (sentence_id, token, label)
    records = []
    for sid, txt in enumerate(df["text"]):
        if pd.isna(txt):
            # Empty cell: the sentence keeps its id but contributes no tokens.
            continue
        # str() so numeric cells do not break on .split().
        for tok in str(txt).split():
            records.append({"sentence_id": sid, "token": tok, "label": "O"})

    # Explicit columns keep the schema stable even when no token was found.
    token_df = pd.DataFrame(records, columns=["sentence_id", "token", "label"])

    return (
        gr.update(value=token_df, visible=True),
        f"✅ Loaded {len(df)} sentences → {len(token_df)} tokens.",
        gr.update(visible=True),
    )
def save_edits(table):
    """Store the edited grid as the new global token table.

    Args:
        table: Tabular data from the annotation grid; it is passed straight
            to ``pd.DataFrame`` with the three expected column names.

    Returns:
        A confirmation message for the status box.
    """
    global token_df

    expected_columns = ["sentence_id", "token", "label"]
    edited = pd.DataFrame(table, columns=expected_columns)
    token_df = edited
    return "💾 Edits saved."
def download_tokens():
    """Write the current token table to ``raw_tokens.csv`` and return its path.

    The file is written relative to the working directory, without the
    DataFrame index.
    """
    csv_path = Path("raw_tokens.csv")
    token_df.to_csv(str(csv_path), index=False)
    return csv_path
def download_iob():
    """Write the token table with an added IOB tag column to ``ner_iob.csv``.

    Rows are read in table order. A token gets ``I-<label>`` when the
    previous token of the same sentence carried the same label, ``B-<label>``
    when it starts a new entity, and ``O`` when it is outside any entity.

    Labels are normalised before tagging: surrounding whitespace is
    stripped, and a missing/blank label or ``o`` in any case counts as
    ``O``. The stored ``label`` column itself is written unchanged.

    Returns:
        ``Path("ner_iob.csv")``, relative to the working directory.
    """
    # Convert to IOB
    tags = []
    last_label = {}  # sentence_id -> label of that sentence's previous token
    for row in token_df.itertuples(index=False):
        sid = row.sentence_id
        raw = row.label
        # A cleared cell can come back as NaN/None or an empty string.
        label = "" if pd.isna(raw) else str(raw).strip()

        if not label or label.upper() == "O":
            tags.append("O")
            last_label[sid] = None  # an outside token ends any open entity
            continue

        prefix = "I-" if last_label.get(sid) == label else "B-"
        tags.append(prefix + label)
        last_label[sid] = label

    out = token_df.copy()
    out["iob"] = tags
    out.to_csv("ner_iob.csv", index=False)
    return Path("ner_iob.csv")
# UI layout and event wiring for the annotation app.
with gr.Blocks() as app:
    gr.Markdown("# 🏷️ Label It! Mini-NER")
    gr.Markdown("**Step 1:** Upload a CSV with a `text` column, or leave blank for sample sentences.")

    with gr.Row():
        file_in = gr.File(label="📁 Upload CSV", file_types=[".csv"])
        load_btn = gr.Button("Load Data")

    status = gr.Textbox(label="Status", interactive=False)

    # Hidden until load_data() succeeds. `interactive` is the Dataframe
    # keyword that makes cells editable.
    table = gr.Dataframe(
        headers=["sentence_id", "token", "label"],
        interactive=True,
        visible=False,
        label="📝 Annotate Tokens",
    )

    with gr.Row(visible=False) as actions:
        save_btn = gr.Button("💾 Save Edits")
        # DownloadButton serves the file path held in its value; it takes no
        # callback or file-name keyword, so the path is supplied through the
        # event chain below.
        dl_tokens = gr.DownloadButton(label="⬇️ Download Tokens CSV")
        dl_iob = gr.DownloadButton(label="⬇️ Download IOB CSV")

    # Bind events
    load_event = load_btn.click(
        load_data,
        inputs=file_in,
        outputs=[table, status, actions],
    )
    save_event = save_btn.click(
        save_edits,
        inputs=table,
        outputs=status,
    )
    # After every load or save, regenerate both CSVs and hand their paths to
    # the download buttons so a click always serves the current table.
    for event in (load_event, save_event):
        event.then(download_tokens, outputs=dl_tokens).then(
            download_iob, outputs=dl_iob
        )

    gr.Markdown("""
**Step 2:**
- Click into the **label** column and type one of:
`PER`, `ORG`, `LOC`, or leave as `O`.
- **Save Edits**, then download your token CSV or IOB‐tagged CSV.
""")

app.launch()