import gradio as gr
import pandas as pd
from pathlib import Path
from typing import Dict, List, Tuple
import os
import base64
PROCESSED_DATA_DIR = Path(".")

# The logo file is read once at import time and kept as base64 text so it can
# be embedded in the page as a data URI (avoids Gradio toolbar interactions).
logo_path = "rowsquared-logo-large.png"
with Path(logo_path).open("rb") as f:
    logo_b64 = str(base64.b64encode(f.read()), "utf-8")
# ----------------------------
# Data loading & preprocessing
# ----------------------------
# Table of distinct, fully populated label paths
# (major_label, sub_major_label, minor_label, unit_label).
df_isco = pd.read_excel(
    PROCESSED_DATA_DIR / "isco_imperfect.xlsx",
    # Pass the major/sub_major/minor/unit columns through str while reading.
    converters=dict.fromkeys(["major", "sub_major", "minor", "unit"], str),
)
df_isco = df_isco[["major_label", "sub_major_label", "minor_label", "unit_label"]]
# Drop incomplete rows and exact repeats, then renumber the index from 0.
df_isco = df_isco.dropna().drop_duplicates().reset_index(drop=True)
# Nested lookup built from df_isco: {major: {sub_major: {minor: [unit, ...]}}}
hierarchy: Dict[str, Dict[str, Dict[str, List[str]]]] = {}
for _, row in df_isco.iterrows():
    sub_level = hierarchy.setdefault(row.major_label, {})
    minor_level = sub_level.setdefault(row.sub_major_label, {})
    unit_names = minor_level.setdefault(row.minor_label, [])
    unit_names.append(row.unit_label)

# Replace every leaf list with a de-duplicated, sorted copy.
for sub_level in hierarchy.values():
    for minor_level in sub_level.values():
        for minor_name, unit_names in minor_level.items():
            # Only existing keys are reassigned, so iterating items() is safe.
            minor_level[minor_name] = sorted(set(unit_names))
# Fast helpers for children
def majors() -> List[str]:
    """Return every top-level (major) label in ``hierarchy``, sorted."""
    # Iterating a dict yields its keys, so no explicit .keys() is required.
    return sorted(hierarchy)
def submajors(maj: str) -> List[str]:
    """Return the sorted sub-major labels under ``maj``.

    An unknown ``maj`` yields an empty list.
    """
    children = hierarchy.get(maj, {})
    return sorted(children)
def minors(maj: str, sub: str) -> List[str]:
    """Return the sorted minor labels under the ``maj`` / ``sub`` path.

    An unknown ``maj`` or ``sub`` yields an empty list.
    """
    sub_level = hierarchy.get(maj, {})
    minor_level = sub_level.get(sub, {})
    return sorted(minor_level)
def units(maj: str, sub: str, mn: str) -> List[str]:
    """Return the unit labels stored under the ``maj`` / ``sub`` / ``mn`` path.

    The list held in ``hierarchy`` is returned as-is (not copied, not
    re-sorted here); any unknown level yields an empty list.
    """
    sub_level = hierarchy.get(maj, {})
    minor_level = sub_level.get(sub, {})
    return minor_level.get(mn, [])
# ----------------------------
# Records to annotate
# ----------------------------
# Working table of records to annotate; handlers below edit it in place.
records = pd.read_excel(PROCESSED_DATA_DIR / "isco_predictions.xlsx").copy()

# Guarantee the columns the app writes to: blank labels, annotated=False.
for col in ("major_label", "sub_major_label", "minor_label", "unit_label"):
    if col not in records.columns:
        records[col] = ""
if "annotated" not in records.columns:
    records["annotated"] = False

# Re-assign each working column from an explicit copy so none is a view.
for col in ("major_label", "sub_major_label", "minor_label", "unit_label", "annotated"):
    records[col] = records[col].copy()

# Positional 0..n-1 index: the handlers address rows as records.loc[i].
records = records.reset_index(drop=True)
# -----------------------------------
# Core logic: clamp & state management
# -----------------------------------
def clamp_path(
    maj: str, sub: str, mn: str, un: str
) -> Tuple[str, str, str, str, List[str], List[str], List[str], List[str]]:
    """Coerce a (major, sub-major, minor, unit) selection onto the hierarchy.

    Levels are checked top-down. A level is kept when it is one of the valid
    choices under the already-resolved parent; otherwise it is replaced by the
    first valid choice, or by ``""`` when there are no choices. A level whose
    parent resolved to an empty value gets an empty choice list.

    Returns:
        ``(maj, sub, mn, un, maj_choices, sub_choices, mn_choices,
        un_choices)`` -- the resolved values followed by the choice list for
        each level.
    """

    def _keep_or_first(current, options):
        # Valid values pass through untouched; anything else falls back.
        if current in options:
            return current
        return options[0] if options else ""

    maj_choices = majors()
    maj = _keep_or_first(maj, maj_choices)

    sub_choices = submajors(maj) if maj else []
    sub = _keep_or_first(sub, sub_choices)

    mn_choices = minors(maj, sub) if sub else []
    mn = _keep_or_first(mn, mn_choices)

    un_choices = units(maj, sub, mn) if mn else []
    un = _keep_or_first(un, un_choices)

    return maj, sub, mn, un, maj_choices, sub_choices, mn_choices, un_choices
def save_record(i: int, maj: str, sub: str, mn: str, un: str) -> None:
    """Write the four labels to row ``i`` of ``records`` and flag it annotated."""
    label_columns = ["major_label", "sub_major_label", "minor_label", "unit_label"]
    chosen_labels = [maj, sub, mn, un]
    records.loc[i, label_columns] = chosen_labels
    records.loc[i, "annotated"] = True
def status_text(i: int) -> str:
    """Return the Markdown status line for row ``i``, based on its ``annotated`` flag."""
    if records.loc[i, "annotated"]:
        label = "✅ Annotated"
    else:
        label = "❌ Not Annotated"
    return f"**Status**: {label}"
def load_record(i: int):
    """Build the UI payload for record ``i``.

    The stored labels are clamped onto the hierarchy with ``clamp_path`` and
    the clamped values are written back, so the stored row always matches what
    the radios display. Viewing a record does NOT mark it as annotated.

    Args:
        i: Row label in ``records`` (a 0-based position after the index reset).

    Returns:
        A 6-tuple: the record Markdown (occupation and industry titles), the
        status Markdown, and one ``gr.update(choices=..., value=...)`` for each
        of the major, sub-major, minor and unit radios, in that order.
    """
    rec = records.loc[i]
    maj, sub, mn, un, maj_c, sub_c, mn_c, un_c = clamp_path(
        rec["major_label"], rec["sub_major_label"], rec["minor_label"], rec["unit_label"]
    )
    # Persist only the clamped labels. save_record() is deliberately not used
    # here: it also sets "annotated" to True, which would flag every record as
    # annotated the moment it is displayed and make the "Not Annotated" status
    # unreachable.
    records.loc[
        i, ["major_label", "sub_major_label", "minor_label", "unit_label"]
    ] = [maj, sub, mn, un]
    record_md = f"## Occupation: {rec['occupation_title_main']}\n## Industry: {rec['industry_title_main']}"
    return (
        record_md,
        status_text(i),
        gr.update(choices=maj_c, value=maj),
        gr.update(choices=sub_c, value=sub),
        gr.update(choices=mn_c, value=mn),
        gr.update(choices=un_c, value=un),
    )
# ---------------------
# Event handler helpers
# ---------------------
def on_major_change(new_major: str, i: int):
    """Handle a new major selection for record ``i``.

    Every level below the major is reset to its first available option (or to
    ``""`` with an empty choice list when nothing is available), and the
    resulting path is stored through ``save_record``.

    Returns:
        Updates for the major, sub-major, minor and unit radios, followed by
        the refreshed status Markdown.
    """
    sub_options = submajors(new_major)
    first_sub = next(iter(sub_options), "")

    minor_options = minors(new_major, first_sub) if first_sub else []
    first_minor = next(iter(minor_options), "")

    unit_options = units(new_major, first_sub, first_minor) if first_minor else []
    first_unit = next(iter(unit_options), "")

    save_record(i, new_major, first_sub, first_minor, first_unit)

    major_update = gr.update(choices=majors(), value=new_major)
    sub_update = gr.update(choices=sub_options, value=first_sub)
    minor_update = gr.update(choices=minor_options, value=first_minor)
    unit_update = gr.update(choices=unit_options, value=first_unit)
    return major_update, sub_update, minor_update, unit_update, status_text(i)
def on_sub_change(new_sub: str, i: int, major: str):
    """Handle a new sub-major selection for record ``i`` under ``major``.

    The minor and unit levels are reset to their first available option (or
    ``""``), the three lower-level columns are stored and the row is flagged
    as annotated. The stored major label is not rewritten here.

    Returns:
        Updates for the sub-major, minor and unit radios, followed by the
        refreshed status Markdown.
    """
    minor_options = minors(major, new_sub)
    first_minor = next(iter(minor_options), "")

    unit_options = units(major, new_sub, first_minor) if first_minor else []
    first_unit = next(iter(unit_options), "")

    changed_columns = ["sub_major_label", "minor_label", "unit_label"]
    records.loc[i, changed_columns] = [new_sub, first_minor, first_unit]
    records.loc[i, "annotated"] = True

    sub_update = gr.update(choices=submajors(major), value=new_sub)
    minor_update = gr.update(choices=minor_options, value=first_minor)
    unit_update = gr.update(choices=unit_options, value=first_unit)
    return sub_update, minor_update, unit_update, status_text(i)
def on_minor_change(new_minor: str, i: int, major: str, sub: str):
    """Handle a new minor selection for record ``i`` under ``major`` / ``sub``.

    The unit level is reset to its first available option (or ``""``), the
    minor and unit columns are stored and the row is flagged as annotated.

    Returns:
        Updates for the minor and unit radios, followed by the refreshed
        status Markdown.
    """
    unit_options = units(major, sub, new_minor)
    first_unit = next(iter(unit_options), "")

    changed_columns = ["minor_label", "unit_label"]
    records.loc[i, changed_columns] = [new_minor, first_unit]
    records.loc[i, "annotated"] = True

    minor_update = gr.update(choices=minors(major, sub), value=new_minor)
    unit_update = gr.update(choices=unit_options, value=first_unit)
    return minor_update, unit_update, status_text(i)
def on_unit_change(new_unit: str, i: int, major: str, sub: str, mn: str):
    """Handle a new unit selection for record ``i``.

    A unit that is not valid under ``major`` / ``sub`` / ``mn`` is replaced by
    the first valid unit (or ``""`` when there is none). The unit column is
    stored and the row is flagged as annotated.

    Returns:
        The unit radio update and the refreshed status Markdown.
    """
    valid_units = units(major, sub, mn)
    chosen_unit = new_unit if new_unit in valid_units else next(iter(valid_units), "")

    records.loc[i, "unit_label"] = chosen_unit
    records.loc[i, "annotated"] = True

    unit_update = gr.update(choices=valid_units, value=chosen_unit)
    return unit_update, status_text(i)
def go_next(i: int) -> int:
    """Return the index after ``i``, wrapping from the last record back to 0."""
    total = len(records)
    return (i + 1) % total
def go_prev(i: int) -> int:
    """Return the index before ``i``, wrapping from 0 to the last record."""
    total = len(records)
    return (i - 1) % total
# ---- NAVIGATION: save + move + reload in ONE callback ----
def save_and_jump(i: int, direction: str):
    """Save record ``i``, move one step, and load the destination record.

    The labels currently stored for ``i`` are clamped onto the hierarchy and
    written back through ``save_record`` (which also flags the row as
    annotated). ``direction == "next"`` moves forward; any other value moves
    backward.

    Returns:
        The new index followed by everything ``load_record`` returns for it.
    """
    current = records.loc[i]
    stored_labels = [
        current[column]
        for column in ("major_label", "sub_major_label", "minor_label", "unit_label")
    ]
    # Only the four resolved values are needed here, not the choice lists.
    maj, sub, mn, un = clamp_path(*stored_labels)[:4]
    save_record(i, maj, sub, mn, un)

    if direction == "next":
        target = go_next(i)
    else:
        target = go_prev(i)
    return (target, *load_record(target))
def download_annotations() -> str:
    """Write ``records`` to ``annotated_output.csv`` and return the file path.

    The file is created under ``PROCESSED_DATA_DIR`` without the index column.
    """
    destination = PROCESSED_DATA_DIR.joinpath("annotated_output.csv")
    records.to_csv(destination, index=False)
    return str(destination)
# --------------
# Build the UI
# --------------
def build_gradio_app():
    """Assemble the annotation UI and wire its events.

    The page shows the logo, a title, the current record with its status,
    Previous/Next buttons, one radio group per hierarchy level, and a download
    button for the annotated CSV.

    NOTE(review): the source this was rebuilt from had lost its indentation,
    so the nesting of the layout containers below is a reconstruction --
    confirm it against the intended layout.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column(scale=1):
                # Static logo, non-interactive: embedded as a base64 data URI.
                # NOTE(review): the original markup was an unterminated string
                # (its content had been lost); the <img> attributes and styling
                # here are a reconstruction -- adjust sizing as needed.
                gr.HTML(
                    f'<img src="data:image/png;base64,{logo_b64}" '
                    'alt="Rowsquared logo" '
                    'style="max-width: 100%; height: auto; pointer-events: none;">'
                )
        with gr.Row():
            gr.Markdown("# ISCO Annotation", elem_id="isco-title")
        # NOTE(review): this HTML block contains only a newline; any markup it
        # once held (possibly styling for #isco-title) is missing -- confirm.
        gr.HTML("\n")

        idx_state = gr.State(0)

        with gr.Group():
            record_md = gr.Markdown()
            status_md = gr.Markdown()

        with gr.Row():
            prev_btn = gr.Button("⬅ Previous")
            next_btn = gr.Button("✅ Next")

        with gr.Row():
            with gr.Column():
                major_radio = gr.Radio(label="Level 1: Major", choices=[], interactive=True)
            with gr.Column():
                sub_radio = gr.Radio(label="Level 2: Sub-major", choices=[], interactive=True)
            with gr.Column():
                minor_radio = gr.Radio(label="Level 3: Minor", choices=[], interactive=True)
            with gr.Column():
                unit_radio = gr.Radio(label="Level 4: Unit", choices=[], interactive=True)

        download_btn = gr.Button("📥 Download Annotations")
        download_file = gr.File(label="Annotated CSV", visible=False)

        # Components refreshed whenever a record is (re)loaded.
        record_outputs = [
            idx_state, record_md, status_md,
            major_radio, sub_radio, minor_radio, unit_radio,
        ]

        # Initial load
        demo.load(lambda: (0,) + load_record(0), outputs=record_outputs)

        # Navigation: save + move + reload in one callback each.
        next_btn.click(
            lambda i: save_and_jump(i, "next"),
            inputs=[idx_state],
            outputs=record_outputs,
        )
        prev_btn.click(
            lambda i: save_and_jump(i, "prev"),
            inputs=[idx_state],
            outputs=record_outputs,
        )

        # Selection handlers (also update status).
        # These use .input (fires on user interaction only) instead of
        # .change: .change also fires when a callback sets a radio's value, so
        # loading a record would re-run these handlers, reset the lower levels
        # to their first option, overwrite the stored labels and mark the row
        # as annotated. Each handler already returns the full set of dependent
        # updates, so no event cascade is needed.
        major_radio.input(
            on_major_change,
            inputs=[major_radio, idx_state],
            outputs=[major_radio, sub_radio, minor_radio, unit_radio, status_md],
        )
        sub_radio.input(
            on_sub_change,
            inputs=[sub_radio, idx_state, major_radio],
            outputs=[sub_radio, minor_radio, unit_radio, status_md],
        )
        minor_radio.input(
            on_minor_change,
            inputs=[minor_radio, idx_state, major_radio, sub_radio],
            outputs=[minor_radio, unit_radio, status_md],
        )
        unit_radio.input(
            on_unit_change,
            inputs=[unit_radio, idx_state, major_radio, sub_radio, minor_radio],
            outputs=[unit_radio, status_md],
        )

        # Download: write the CSV, then reveal the file component.
        download_btn.click(download_annotations, outputs=[download_file]).then(
            lambda: gr.update(visible=True), None, [download_file]
        )
    return demo
if __name__ == "__main__":
    demo = build_gradio_app()

    # Enable the login screen only when both credentials are configured.
    # Passing ("", "") when the variables are unset would configure a login
    # that accepts blank credentials, which looks protected but is not.
    app_user = os.getenv("APP_USER", "")
    app_pass = os.getenv("APP_PASS", "")
    if app_user and app_pass:
        auth = (app_user, app_pass)
    else:
        auth = None
        print(
            "WARNING: APP_USER and APP_PASS are not both set; "
            "launching WITHOUT authentication."
        )

    demo.queue().launch(
        show_api=False,
        ssr_mode=False,  # disable experimental SSR
        auth=auth,
        server_name="0.0.0.0",  # optional, but explicit
        server_port=int(os.getenv("PORT", 7860)),
    )