Spaces:

Suzana
/

labelit-mini-ner

Sleeping

App Files Files Community

labelit-mini-ner / app.py

Suzana

Update app.py

d1f4849 verified 2 months ago

raw

history blame

3.81 kB

	import gradio as gr
	import pandas as pd
	from pathlib import Path

	# Closed set of tags accepted in the label column; "O" is the label every
	# freshly tokenized row starts with.
	LABELS = {"O", "PER", "ORG", "LOC", "EV"}

	# Module-level working table shared by the callbacks below. It starts out
	# empty and is rebound whenever a CSV is loaded or the edited table is saved.
	token_df = pd.DataFrame()

	# ───────────────── tokenization ────────────────────────────────
	def tokenize(df: pd.DataFrame) -> pd.DataFrame:
	    """Split each input row into whitespace-delimited tokens labelled ``"O"``.

	    Rows are taken from the ``text`` column when it exists; otherwise each row
	    is rendered as ``"User: <user> Assistant: <assistant>"`` from the ``user``
	    and ``assistant`` columns before splitting.

	    Args:
	        df: Frame holding either a ``text`` column or both ``user`` and
	            ``assistant`` columns.

	    Returns:
	        A frame with the columns ``sentence_id`` (0-based position of the
	        source row), ``token`` and ``label`` (always ``"O"``). The columns are
	        present even when no token was produced.

	    Raises:
	        KeyError: If ``text`` is absent and ``user`` or ``assistant`` is missing.
	    """
	    columns = ["sentence_id", "token", "label"]

	    def column_text(name):
	        # A missing cell contributes no tokens instead of the bogus token "nan".
	        col = df[name]
	        return ["" if missing else str(value) for value, missing in zip(col, col.isna())]

	    if "text" in df.columns:
	        lines = column_text("text")
	    else:
	        # Columns are zipped rather than using DataFrame.apply(axis=1): on a
	        # frame with no rows, apply hands back a DataFrame, and iterating that
	        # yields the column names, which would then be tokenized as text.
	        lines = [
	            f"User: {user} Assistant: {assistant}"
	            for user, assistant in zip(column_text("user"), column_text("assistant"))
	        ]

	    rows = [
	        {"sentence_id": sid, "token": tok, "label": "O"}
	        for sid, line in enumerate(lines)
	        for tok in line.split()
	    ]
	    # Explicit columns keep the schema stable when `rows` is empty.
	    return pd.DataFrame(rows, columns=columns)

	# ───────────────── callbacks ───────────────────────────────────
	def load_csv(file):
	    """Read the uploaded CSV, tokenize it and publish the result to the UI.

	    Args:
	        file: Value delivered by the ``gr.File`` input: an object whose
	            ``name`` attribute is the path on disk, a plain path string, or
	            ``None`` when nothing has been uploaded.

	    Returns:
	        A 4-tuple in the order of the outputs bound to the Load button: the
	        table value/update (``None`` on failure), the status message, the
	        visibility update for the button row, and an update that hides the
	        tokens download.

	    Side effects:
	        Rebinds the module-level ``token_df`` when the CSV loads successfully.
	    """
	    global token_df

	    def failure(message):
	        # Same shape as the success path so every bound output gets a value.
	        return None, message, gr.update(visible=False), gr.update(visible=False)

	    if file is None:
	        # Load was clicked before a file was chosen; `file.name` would raise.
	        return failure("❌ Please upload a CSV file first.")

	    # Accept both a tempfile-like wrapper (path in `.name`) and a bare path.
	    path = getattr(file, "name", file)
	    try:
	        df = pd.read_csv(path)
	    except (OSError, ValueError) as err:
	        # pandas' EmptyDataError / ParserError and UnicodeDecodeError are all
	        # ValueError subclasses; report them in the status box instead of
	        # letting the callback crash.
	        return failure(f"❌ Could not read CSV: {err}")

	    has_text = "text" in df.columns
	    has_chat = {"user", "assistant"}.issubset(df.columns)
	    if not has_text and not has_chat:
	        return failure("❌ CSV must have `text` OR `user`+`assistant` columns.")

	    token_df = tokenize(df)
	    return (
	        gr.update(value=token_df, visible=True),
	        f"✅ Loaded {len(df)} rows – {len(token_df)} tokens.",
	        gr.update(visible=True),
	        gr.update(visible=False),
	    )

	def save_table(tbl):
	    """Store the edited table in ``token_df`` and report whether labels are valid.

	    Args:
	        tbl: Table contents coming from the dataframe component.

	    Returns:
	        A status string: a warning when any label is outside ``LABELS``,
	        otherwise a confirmation.

	    Side effects:
	        Rebinds the module-level ``token_df``. This happens before validation,
	        so the table is stored even when the warning is returned.
	    """
	    global token_df
	    expected_columns = ["sentence_id", "token", "label"]
	    token_df = pd.DataFrame(tbl, columns=expected_columns)

	    # Validation only inspects the label column.
	    every_label_known = token_df["label"].isin(LABELS).all()
	    if every_label_known:
	        return "💾 Saved."
	    return "⚠️ Unknown labels found. Allowed: PER / ORG / LOC / EV / O"

	def to_tokens_csv():
	    """Write the current ``token_df`` to ``raw_tokens.csv`` and return its path.

	    Returns:
	        ``Path`` of the written file, relative to the working directory.
	    """
	    destination = Path("raw_tokens.csv")
	    # index=False keeps the pandas row index out of the exported file.
	    token_df.to_csv(destination, index=False)
	    return destination

	def to_iob_csv():
	    """Export ``token_df`` with an added ``iob`` column to ``ner_iob.csv``.

	    A token labelled ``"O"`` keeps the tag ``"O"``. Any other label gets the
	    prefix ``"I-"`` when the previous token seen for the same ``sentence_id``
	    carried the same label, and ``"B-"`` otherwise.

	    Returns:
	        ``Path`` of the written file, relative to the working directory.
	    """
	    # sentence_id -> label of the most recent token (None right after an "O").
	    last_label = {}
	    tags = []
	    for _, row in token_df.iterrows():
	        sentence, label = row["sentence_id"], row["label"]
	        if label == "O":
	            tags.append("O")
	            last_label[sentence] = None
	            continue
	        continues_entity = last_label.get(sentence) == label
	        tags.append(("I-" if continues_entity else "B-") + label)
	        last_label[sentence] = label

	    # Work on a copy so the shared table does not gain the extra column.
	    result = token_df.copy()
	    result["iob"] = tags
	    destination = "ner_iob.csv"
	    result.to_csv(destination, index=False)
	    return Path(destination)

	# ───────────────── UI ──────────────────────────────────────────
	# Layout and event wiring for the labelling app.
	with gr.Blocks() as demo:
	    gr.Markdown("# 🏷️ Label It! Mini-NER")
	    gr.Markdown("Step 1 – upload a CSV containing a `text` column or `user`+`assistant` columns.")

	    with gr.Row():
	        csv_file = gr.File(file_types=[".csv"])
	        load_btn = gr.Button("Load")

	    status = gr.Textbox(label="Status", interactive=False)

	    # Editable token table; hidden until a CSV has been loaded.
	    tok_table = gr.Dataframe(
	        headers=["sentence_id", "token", "label"],
	        datatype=["number", "str", "str"],
	        row_count=0,
	        col_count=3,
	        visible=False,
	    )

	    with gr.Row(visible=False) as btn_row:
	        save_btn = gr.Button("💾 Save")
	        dl_tok = gr.Button("⬇︎ Tokens CSV")
	        dl_iob = gr.Button("⬇︎ IOB CSV")

	    # Download slots start hidden and are revealed once a file exists.
	    file_tok = gr.File(visible=False)
	    file_iob = gr.File(visible=False)

	    def _reveal_export(build_csv):
	        """Wrap an exporter so its file is shown in the bound File component."""
	        # Returning the bare path would set the value but leave the component
	        # hidden, because nothing else ever turns its visibility on.
	        return lambda: gr.update(value=str(build_csv()), visible=True)

	    # bind
	    load_btn.click(load_csv, inputs=csv_file,
	                   outputs=[tok_table, status, btn_row, file_tok])
	    # load_csv only hides the tokens download; hide a stale IOB export as well.
	    load_btn.click(lambda: gr.update(visible=False), outputs=file_iob)

	    save_btn.click(save_table, inputs=tok_table, outputs=status)

	    dl_tok.click(_reveal_export(to_tokens_csv), outputs=file_tok)
	    dl_iob.click(_reveal_export(to_iob_csv), outputs=file_iob)

	    gr.Markdown(
	        "Step 2 – type `PER`, `ORG`, `LOC`, `EV`, or `O` in the label column → Save → Download."
	    )

	demo.launch()