"""Gradio app for reviewing and correcting ISCO label predictions.

At import time the module loads the ISCO label hierarchy from
``isco_imperfect.xlsx`` and the records to annotate from
``isco_predictions.xlsx`` (both resolved against ``PROCESSED_DATA_DIR``).
The UI shows one record at a time with four cascading radio groups
(major -> sub-major -> minor -> unit) and lets the user download the
annotated table as CSV.
"""
import base64
import os
from pathlib import Path
from typing import Dict, List, Tuple

import gradio as gr
import pandas as pd

PROCESSED_DATA_DIR = Path(".")

# The four hierarchy columns, ordered from coarsest to finest level.
LABEL_COLUMNS = ["major_label", "sub_major_label", "minor_label", "unit_label"]

# Embed logo as a base64 data URI to avoid Gradio toolbar interactions
logo_path = "rowsquared-logo-large.png"
with open(logo_path, "rb") as f:
    logo_b64 = base64.b64encode(f.read()).decode("utf-8")

# ----------------------------
# Data loading & preprocessing
# ----------------------------
df_isco = (
    pd.read_excel(
        PROCESSED_DATA_DIR / "isco_imperfect.xlsx",
        converters={"major": str, "sub_major": str, "minor": str, "unit": str},
    )[["major_label", "sub_major_label", "minor_label", "unit_label"]]
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

# Build nested hierarchy dict: {major: {sub: {minor: [units]}}}
hierarchy: Dict[str, Dict[str, Dict[str, List[str]]]] = {}
for row in df_isco.itertuples(index=False):
    (
        hierarchy.setdefault(row.major_label, {})
        .setdefault(row.sub_major_label, {})
        .setdefault(row.minor_label, [])
        .append(row.unit_label)
    )

# Ensure uniqueness & sorting at leaf lists
for sub_map in hierarchy.values():
    for minor_map in sub_map.values():
        for minor_label, unit_labels in minor_map.items():
            minor_map[minor_label] = sorted(set(unit_labels))


# Fast helpers for children
def majors() -> List[str]:
    """Return all major labels, sorted."""
    return sorted(hierarchy.keys())


def submajors(maj: str) -> List[str]:
    """Return the sorted sub-major labels under ``maj`` (empty if unknown)."""
    return sorted(hierarchy.get(maj, {}).keys())


def minors(maj: str, sub: str) -> List[str]:
    """Return the sorted minor labels under ``maj``/``sub`` (empty if unknown)."""
    return sorted(hierarchy.get(maj, {}).get(sub, {}).keys())


def units(maj: str, sub: str, mn: str) -> List[str]:
    """Return the sorted unit labels under ``maj``/``sub``/``mn`` (empty if unknown)."""
    return hierarchy.get(maj, {}).get(sub, {}).get(mn, [])


# ----------------------------
# Records to annotate
# ----------------------------
records = pd.read_excel(PROCESSED_DATA_DIR / "isco_predictions.xlsx").copy()
for col in LABEL_COLUMNS:
    if col not in records:
        records[col] = ""
    # Object dtype lets string labels be written into a column that was
    # loaded as numeric (e.g. an all-empty Excel column read as float NaN).
    records[col] = records[col].astype(object)
if "annotated" not in records:
    records["annotated"] = False
records.reset_index(drop=True, inplace=True)


# -----------------------------------
# Core logic: clamp & state management
# -----------------------------------
def clamp_path(
    maj: str, sub: str, mn: str, un: str
) -> Tuple[str, str, str, str, List[str], List[str], List[str], List[str]]:
    """Return a valid (maj, sub, mn, un) tuple + their choices lists.

    Only replace a level if it's invalid for the hierarchy. An invalid level
    falls back to the first available choice, or ``""`` when the level has no
    choices at all.
    """
    maj_choices = majors()
    if maj not in maj_choices:
        maj = maj_choices[0] if maj_choices else ""

    sub_choices = submajors(maj) if maj else []
    if sub not in sub_choices:
        sub = sub_choices[0] if sub_choices else ""

    mn_choices = minors(maj, sub) if sub else []
    if mn not in mn_choices:
        mn = mn_choices[0] if mn_choices else ""

    un_choices = units(maj, sub, mn) if mn else []
    if un not in un_choices:
        un = un_choices[0] if un_choices else ""

    return maj, sub, mn, un, maj_choices, sub_choices, mn_choices, un_choices


def save_record(i: int, maj: str, sub: str, mn: str, un: str) -> None:
    """Write the four labels into row ``i`` and mark the row as annotated."""
    records.loc[i, LABEL_COLUMNS] = [maj, sub, mn, un]
    records.loc[i, "annotated"] = True


def status_text(i: int) -> str:
    """Return the Markdown status line for row ``i``."""
    return f"**Status**: {'✅ Annotated' if records.loc[i, 'annotated'] else '❌ Not Annotated'}"


def load_record(i: int):
    """Build the UI payload for row ``i``.

    Returns a 6-tuple: record Markdown, status Markdown, and one
    ``gr.update`` per radio (major, sub-major, minor, unit) carrying both the
    valid choices and the selected value.
    """
    rec = records.loc[i]
    maj, sub, mn, un, maj_c, sub_c, mn_c, un_c = clamp_path(
        rec["major_label"], rec["sub_major_label"], rec["minor_label"], rec["unit_label"]
    )
    # Persist clamped values back (only if changed). Merely displaying a
    # record must NOT flag it as annotated, so write the labels directly
    # instead of going through save_record().
    clamped = [maj, sub, mn, un]
    if [rec[col] for col in LABEL_COLUMNS] != clamped:
        records.loc[i, LABEL_COLUMNS] = clamped

    record_md = f"## Occupation: {rec['occupation_title_main']}\n## Industry: {rec['industry_title_main']}"
    return (
        record_md,
        status_text(i),
        gr.update(choices=maj_c, value=maj),
        gr.update(choices=sub_c, value=sub),
        gr.update(choices=mn_c, value=mn),
        gr.update(choices=un_c, value=un),
    )


# ---------------------
# Event handler helpers
# ---------------------
def on_major_change(new_major: str, i: int):
    """Handle a new major selection: reset every lower level to its first choice."""
    sub_c = submajors(new_major)
    sub = sub_c[0] if sub_c else ""
    mn_c = minors(new_major, sub) if sub else []
    mn = mn_c[0] if mn_c else ""
    un_c = units(new_major, sub, mn) if mn else []
    un = un_c[0] if un_c else ""
    save_record(i, new_major, sub, mn, un)
    return (
        gr.update(choices=majors(), value=new_major),
        gr.update(choices=sub_c, value=sub),
        gr.update(choices=mn_c, value=mn),
        gr.update(choices=un_c, value=un),
        status_text(i),
    )


def on_sub_change(new_sub: str, i: int, major: str):
    """Handle a new sub-major selection: reset minor and unit to their first choice."""
    mn_c = minors(major, new_sub)
    mn = mn_c[0] if mn_c else ""
    un_c = units(major, new_sub, mn) if mn else []
    un = un_c[0] if un_c else ""
    records.loc[i, ["sub_major_label", "minor_label", "unit_label"]] = [new_sub, mn, un]
    records.loc[i, "annotated"] = True
    return (
        gr.update(choices=submajors(major), value=new_sub),
        gr.update(choices=mn_c, value=mn),
        gr.update(choices=un_c, value=un),
        status_text(i),
    )


def on_minor_change(new_minor: str, i: int, major: str, sub: str):
    """Handle a new minor selection: reset unit to its first choice."""
    un_c = units(major, sub, new_minor)
    un = un_c[0] if un_c else ""
    records.loc[i, ["minor_label", "unit_label"]] = [new_minor, un]
    records.loc[i, "annotated"] = True
    return (
        gr.update(choices=minors(major, sub), value=new_minor),
        gr.update(choices=un_c, value=un),
        status_text(i),
    )


def on_unit_change(new_unit: str, i: int, major: str, sub: str, mn: str):
    """Handle a new unit selection, falling back to the first valid unit if needed."""
    un_c = units(major, sub, mn)
    if new_unit not in un_c:
        new_unit = un_c[0] if un_c else ""
    records.loc[i, "unit_label"] = new_unit
    records.loc[i, "annotated"] = True
    return gr.update(choices=un_c, value=new_unit), status_text(i)


def go_next(i: int) -> int:
    """Return the index after ``i``, wrapping around at the end."""
    return (i + 1) % len(records)


def go_prev(i: int) -> int:
    """Return the index before ``i``, wrapping around at the start."""
    return (i - 1) % len(records)


# ---- NAVIGATION: save + move + reload in ONE callback ----
def save_and_jump(i: int, direction: str):
    """Confirm row ``i`` as annotated, then load the neighbouring row.

    ``direction == "next"`` moves forward; any other value moves backward.
    Returns ``(new_index,) + load_record(new_index)``.
    """
    # Final safety net: clamp and persist whatever is currently stored
    rec = records.loc[i]
    maj, sub, mn, un, *_ = clamp_path(
        rec["major_label"], rec["sub_major_label"], rec["minor_label"], rec["unit_label"]
    )
    save_record(i, maj, sub, mn, un)
    new_i = go_next(i) if direction == "next" else go_prev(i)
    return (new_i,) + load_record(new_i)


def download_annotations() -> str:
    """Write all records to ``annotated_output.csv`` and return its path."""
    path = PROCESSED_DATA_DIR / "annotated_output.csv"
    records.to_csv(path, index=False)
    return str(path)


# --------------
# Build the UI
# --------------
def build_gradio_app():
    """Assemble the Gradio Blocks UI and wire up its callbacks; return the app."""
    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column(scale=1):
                # Static logo, non-interactive. Rendered from the base64 data
                # URI prepared at import time so no file is served by Gradio.
                gr.HTML(
                    f'<img src="data:image/png;base64,{logo_b64}" '
                    f'alt="Rowsquared logo" style="max-width: 100%; height: auto;">'
                )
        with gr.Row():
            gr.Markdown("# ISCO Annotation", elem_id="isco-title")
        gr.HTML(""" """)

        idx_state = gr.State(0)

        with gr.Group():
            record_md = gr.Markdown()
            status_md = gr.Markdown()

        with gr.Row():
            prev_btn = gr.Button("⬅ Previous")
            next_btn = gr.Button("✅ Next")

        with gr.Row():
            with gr.Column():
                major_radio = gr.Radio(label="Level 1: Major", choices=[], interactive=True)
            with gr.Column():
                sub_radio = gr.Radio(label="Level 2: Sub-major", choices=[], interactive=True)
            with gr.Column():
                minor_radio = gr.Radio(label="Level 3: Minor", choices=[], interactive=True)
            with gr.Column():
                unit_radio = gr.Radio(label="Level 4: Unit", choices=[], interactive=True)

        download_btn = gr.Button("📥 Download Annotations")
        download_file = gr.File(label="Annotated CSV", visible=False)

        # Everything a navigation step has to refresh.
        nav_outputs = [
            idx_state, record_md, status_md,
            major_radio, sub_radio, minor_radio, unit_radio,
        ]

        # Initial load
        demo.load(lambda: (0,) + load_record(0), outputs=nav_outputs)

        next_btn.click(
            lambda i: save_and_jump(i, "next"),
            inputs=[idx_state],
            outputs=nav_outputs,
        )
        prev_btn.click(
            lambda i: save_and_jump(i, "prev"),
            inputs=[idx_state],
            outputs=nav_outputs,
        )

        # Selection handlers (also update status). Bound to `.input`, which
        # fires only on user interaction. `.change` also fires when a
        # callback sets the value, so loading a record would re-enter these
        # handlers, reset the lower levels and flag the record as annotated.
        major_radio.input(
            on_major_change,
            inputs=[major_radio, idx_state],
            outputs=[major_radio, sub_radio, minor_radio, unit_radio, status_md],
        )
        sub_radio.input(
            on_sub_change,
            inputs=[sub_radio, idx_state, major_radio],
            outputs=[sub_radio, minor_radio, unit_radio, status_md],
        )
        minor_radio.input(
            on_minor_change,
            inputs=[minor_radio, idx_state, major_radio, sub_radio],
            outputs=[minor_radio, unit_radio, status_md],
        )
        unit_radio.input(
            on_unit_change,
            inputs=[unit_radio, idx_state, major_radio, sub_radio, minor_radio],
            outputs=[unit_radio, status_md],
        )

        # Download
        download_btn.click(download_annotations, outputs=[download_file]).then(
            lambda: gr.update(visible=True), None, [download_file]
        )

    return demo


if __name__ == "__main__":
    demo = build_gradio_app()
    demo.queue().launch(
        show_api=False,
        ssr_mode=False,  # ← disable experimental SSR
        # NOTE(review): when APP_USER / APP_PASS are unset this configures
        # blank credentials while binding to all interfaces — confirm that is
        # intended, or require both variables to be set before launching.
        auth=(os.getenv("APP_USER", ""), os.getenv("APP_PASS", "")),
        server_name="0.0.0.0",  # optional, but explicit
        server_port=int(os.getenv("PORT", "7860")),
    )