import gradio as gr
import pandas as pd
from pathlib import Path

# Tags accepted in the `label` column; "O" is the default assigned to every
# token by the tokenizer and is exported without a B-/I- prefix.
LABELS = {"PER", "ORG", "LOC", "EV", "O"}      # allowed tags
# Module-wide token table shared by the callbacks; it is rebound whenever a
# CSV is loaded or the edited table is saved.
token_df = pd.DataFrame()                      # global

# ───────────────── tokenization ────────────────────────────────
def tokenize(df: pd.DataFrame) -> pd.DataFrame:
    """Split every row of *df* into whitespace-separated tokens.

    If *df* has a ``text`` column, each value of that column is one sentence.
    Otherwise the ``user`` and ``assistant`` columns are combined into
    ``"User: <user> Assistant: <assistant>"`` and that string is the sentence.

    Args:
        df: Input table with either a ``text`` column or both ``user`` and
            ``assistant`` columns.

    Returns:
        A DataFrame with one row per token and the columns ``sentence_id``
        (0-based position of the source row), ``token`` and ``label``
        (always initialised to ``"O"``). The three columns are present even
        when no tokens are produced.

    Raises:
        KeyError: If neither ``text`` nor both ``user``/``assistant`` exist.
    """
    if "text" in df.columns:
        lines = df["text"].astype(str)
    else:
        # Pair the two columns directly instead of a row-wise ``apply``:
        # it is cheaper and formats each cell from its own column, without
        # the dtype coercion that building a per-row Series can introduce.
        lines = [
            f"User: {user} Assistant: {assistant}"
            for user, assistant in zip(df["user"], df["assistant"])
        ]

    rows = [
        {"sentence_id": sid, "token": tok, "label": "O"}
        for sid, line in enumerate(lines)
        for tok in line.split()
    ]
    # Pin the schema so an empty result still exposes the expected columns;
    # a bare ``pd.DataFrame([])`` would have none and break later lookups.
    return pd.DataFrame(rows, columns=["sentence_id", "token", "label"])

# ───────────────── callbacks ───────────────────────────────────
def load_csv(file):
    """Read the uploaded CSV, tokenize it and reveal the labelling widgets.

    Args:
        file: Value delivered by the upload component. Accepted forms are a
            path string, an object whose ``name`` attribute is the path, or
            ``None`` when nothing has been uploaded.

    Returns:
        A 4-tuple in the order the click handler lists its outputs: the
        token table update, the status message, the button-row update and
        the tokens-file update. On failure the table value is ``None`` and
        both trailing updates hide their component.
    """
    global token_df

    def failure(message):
        # Same shape as the success tuple so the output bindings still line up.
        return None, message, gr.update(visible=False), gr.update(visible=False)

    if file is None:
        # "Load" was clicked before a file was chosen.
        return failure("❌ Please upload a CSV file first.")

    # Depending on the component configuration the value is either a plain
    # path or a temp-file wrapper exposing the path as ``.name``.
    path = getattr(file, "name", file)
    try:
        df = pd.read_csv(path)
    except (pd.errors.EmptyDataError, pd.errors.ParserError,
            UnicodeDecodeError, OSError) as exc:
        return failure(f"❌ Could not read CSV: {exc}")

    if "text" not in df.columns and not {"user", "assistant"}.issubset(df.columns):
        return failure("❌ CSV must have `text` OR `user`+`assistant` columns.")

    token_df = tokenize(df)
    return gr.update(value=token_df, visible=True), \
           f"✅ Loaded {len(df)} rows – {len(token_df)} tokens.", \
           gr.update(visible=True), gr.update(visible=False)

def save_table(tbl):
    """Store the edited table in ``token_df`` and validate its labels.

    Labels are normalised (surrounding whitespace removed, upper-cased)
    before being stored, so hand-typed values such as ``" per"`` are
    accepted as ``PER`` instead of being reported as unknown.

    Args:
        tbl: Table contents from the editable grid; anything the
            ``pd.DataFrame`` constructor accepts with the three columns
            ``sentence_id``, ``token`` and ``label``.

    Returns:
        A status string: a warning when any label is outside ``LABELS``,
        otherwise a confirmation. The table is stored in both cases.
    """
    global token_df
    table = pd.DataFrame(tbl, columns=["sentence_id", "token", "label"])
    # Cells are free text, so tolerate casing and stray spaces around the tag.
    normalised = table["label"].astype(str).str.strip().str.upper()
    token_df = table.assign(label=normalised)
    # simple validation
    if not token_df["label"].isin(LABELS).all():
        return "⚠️ Unknown labels found. Allowed: PER / ORG / LOC / EV / O"
    return "💾 Saved."

def to_tokens_csv():
    """Write the current token table to ``raw_tokens.csv`` and return its path.

    The file is written relative to the working directory, without the
    DataFrame index.
    """
    target = Path("raw_tokens.csv")
    token_df.to_csv(target, index=False)
    return target

def to_iob_csv():
    """Export the token table with an extra ``iob`` column to ``ner_iob.csv``.

    A token labelled ``"O"`` stays ``"O"``. Any other label gets the prefix
    ``"I-"`` when the previously seen token of the same sentence carried the
    same label, and ``"B-"`` otherwise.

    Returns:
        Path of the written CSV (relative to the working directory).
    """
    tags = []
    last_label = {}  # sentence_id -> label of the last token seen, None after "O"
    for _, row in token_df.iterrows():
        sentence = row["sentence_id"]
        label = row["label"]
        if label == "O":
            last_label[sentence] = None
            tags.append("O")
            continue
        continues_entity = last_label.get(sentence) == label
        tags.append(("I-" if continues_entity else "B-") + label)
        last_label[sentence] = label

    tagged = token_df.copy()
    tagged["iob"] = tags
    target = Path("ner_iob.csv")
    tagged.to_csv(target, index=False)
    return target

# ───────────────── UI ──────────────────────────────────────────
# Layout and event wiring for the labelling app: upload -> edit labels -> export.
with gr.Blocks() as demo:
    gr.Markdown("# 🏷️ Label It! Mini-NER")
    gr.Markdown("**Step 1** – upload a CSV containing a `text` column *or* `user`+`assistant` columns.")

    with gr.Row():
        csv_file = gr.File(file_types=[".csv"])
        load_btn = gr.Button("Load")

    status = gr.Textbox(label="Status", interactive=False)

    # Editable token grid; hidden until a CSV has been loaded.
    tok_table = gr.Dataframe(
        headers=["sentence_id", "token", "label"],
        datatype=["number", "str", "str"],
        row_count=0,
        col_count=3,
        visible=False
    )

    # Save / export buttons; revealed by the load callback.
    with gr.Row(visible=False) as btn_row:
        save_btn = gr.Button("💾 Save")
        dl_tok   = gr.Button("⬇︎ Tokens CSV")
        dl_iob   = gr.Button("⬇︎ IOB CSV")

    # Download slots start hidden and are shown once a file has been exported.
    file_tok = gr.File(visible=False)
    file_iob = gr.File(visible=False)

    # bind
    load_btn.click(load_csv, inputs=csv_file,
                   outputs=[tok_table, status, btn_row, file_tok])

    save_btn.click(save_table, inputs=tok_table, outputs=status)

    # Returning only a path would set the value of a component that is still
    # invisible, so the export would never appear. Reveal the File component
    # together with the freshly written file.
    dl_tok.click(lambda: gr.update(value=str(to_tokens_csv()), visible=True),
                 outputs=file_tok)
    dl_iob.click(lambda: gr.update(value=str(to_iob_csv()), visible=True),
                 outputs=file_iob)

    gr.Markdown(
        "**Step 2** – type `PER`, `ORG`, `LOC`, `EV`, or `O` in the **label** column → Save → Download."
    )

demo.launch()