import boto3
import os
import json
import re
import gradio as gr
from typing import List, Dict, Tuple, Optional, Union, Any

# ── S3 CONFIG ─────────────────────────────────────────────────────────────────
s3 = boto3.client(
    "s3",
    aws_access_key_id     = os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY"),
    region_name           = os.getenv("AWS_DEFAULT_REGION", "ap-southeast-2"),
)

# ai4data/datause-annotation
# S3 bucket and keys
BUCKET = "doccano-processed"
# INIT_KEY = "gradio/initial_data_train.json"
INIT_KEY = "gradio/refugee_train_initial_datav5.json"
# VALID_PREFIX = "validated_records/"
VALID_PREFIX = "refugee_train_validated_v5/"

# ── Helpers to load & save from S3 ────────────────────────────────────────────
def load_initial_data() -> List[Dict]:
    obj = s3.get_object(Bucket=BUCKET, Key=INIT_KEY)
    return json.loads(obj["Body"].read())

def load_all_validations() -> Dict[int, Dict]:
    records = {}
    pages = s3.get_paginator("list_objects_v2").paginate(
        Bucket=BUCKET, Prefix=VALID_PREFIX
    )
    for page in pages:
        for obj in page.get("Contents", []):
            key = obj["Key"]
            # filenames are "<index>.json", so the stem is the record index
            idx = int(os.path.splitext(os.path.basename(key))[0])
            data = s3.get_object(Bucket=BUCKET, Key=key)["Body"].read()
            records[idx] = json.loads(data)
    return records

def save_single_validation(idx: int, record: Dict):
    key = f"{VALID_PREFIX}{idx}.json"
    s3.put_object(
        Bucket      = BUCKET,
        Key         = key,
        Body        = json.dumps(record, indent=2).encode("utf-8"),
        ContentType = "application/json",
    )

class DynamicDataset:
    """Cursor-style wrapper over the record list, tracking validation state."""

    def __init__(self, data: List[Dict]):
        self.data = data
        self.len = len(data)
        self.current = 0
        for ex in self.data:
            ex.setdefault("validated", False)

    def example(self, idx: int) -> Dict:
        # clamp idx into the valid range and move the cursor there
        self.current = max(0, min(self.len - 1, idx))
        return self.data[self.current]

    def next(self) -> Dict:
        if self.current < self.len - 1:
            self.current += 1
        return self.data[self.current]

    def prev(self) -> Dict:
        if self.current > 0:
            self.current -= 1
        return self.data[self.current]

    def jump_next_unvalidated(self) -> Dict:
        for i in range(self.current + 1, self.len):
            if not self.data[i]["validated"]:
                self.current = i
                break
        return self.data[self.current]

    def jump_prev_unvalidated(self) -> Dict:
        for i in range(self.current - 1, -1, -1):
            if not self.data[i]["validated"]:
                self.current = i
                break
        return self.data[self.current]

    def validate(self):
        self.data[self.current]["validated"] = True

def tokenize_text(text: str) -> List[str]:
    """Split text into word tokens (keeping -/_ compounds) and punctuation marks."""
    return re.findall(r"\w+(?:[-_]\w+)*|[^\s\w]", text)
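# A quick, hedged sanity check for the tokenizer (the expected output below is
# derived from the regex above; illustrative only, not part of the app's data):
#
#   >>> tokenize_text("agri-food survey (2021)!")
#   ['agri-food', 'survey', '(', '2021', ')', '!']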
def prepare_for_highlight(data: Dict) -> List[Tuple[str, Optional[str]]]:
    """Convert (tokenized_text, ner) into (text, label) segments for gr.HighlightedText."""
    tokens = data["tokenized_text"]
    ner = data["ner"]
    highlighted, curr_ent, ent_buf, norm_buf = [], None, [], []
    for idx, tok in enumerate(tokens):
        # past the end of the current entity (or none active): flush the entity
        # buffer and check whether a new entity starts at this token
        if curr_ent is None or idx > curr_ent[1]:
            if ent_buf:
                highlighted.append((" ".join(ent_buf), curr_ent[2]))
                ent_buf = []
            curr_ent = next((e for e in ner if e[0] == idx), None)
        if curr_ent and curr_ent[0] <= idx <= curr_ent[1]:
            if norm_buf:
                highlighted.append((" ".join(norm_buf), None))
                norm_buf = []
            ent_buf.append(tok)
        else:
            if ent_buf:
                highlighted.append((" ".join(ent_buf), curr_ent[2]))
                ent_buf = []
            norm_buf.append(tok)
    if ent_buf:
        highlighted.append((" ".join(ent_buf), curr_ent[2]))
    if norm_buf:
        highlighted.append((" ".join(norm_buf), None))
    # re-attach punctuation that the joins above separated with a space
    return [(re.sub(r"\s(?=[,\.!?…:;])", "", txt), lbl) for txt, lbl in highlighted]

def extract_tokens_and_labels(highlighted: List[Dict[str, Union[str, None]]]
                              ) -> Tuple[List[str], List[Tuple[int, int, str]]]:
    tokens, ner = [], []
    token_idx = 0
    for entry in highlighted:
        text = entry["token"]
        label = entry.get("class_or_confidence") or entry.get("class") or entry.get("label")
        # split into real tokens
        toks = tokenize_text(text)
        start = token_idx
        end = token_idx + len(toks) - 1
        tokens.extend(toks)
        if label:
            ner.append((start, end, label))
        token_idx = end + 1
    return tokens, ner

def create_demo() -> gr.Blocks:
    data = load_initial_data()
    validated_store = load_all_validations()

    # mark any pre-validated examples
    for idx in validated_store:
        if 0 <= idx < len(data):
            data[idx]["validated"] = True

    dynamic_dataset = DynamicDataset(data)

    def make_info(rec):
        fn = rec.get("filename", "—")
        pg = rec.get("page", "—")
        # Markdown with line break for Gradio (two trailing spaces before \n)
        return f"**File:** `{fn}`  \n**Page:** `{pg}`"

    def align_spans_to_tokens(
        highlighted: List[Dict[str, Union[str, None]]],
        tokens: List[str],
    ) -> List[Tuple[int, int, str]]:
        """
        Align each highlighted chunk to the next matching tokens in the list,
        advancing a pointer so repeated tokens map in the order you clicked them.
        """
        spans = []
        search_start = 0
        for entry in highlighted:
            text = entry["token"]
            label = entry.get("class_or_confidence") or entry.get("label") or entry.get("class")
            if not label:
                continue
            chunk_toks = tokenize_text(text)
            # scan only from the end of the last match
            for i in range(search_start, len(tokens) - len(chunk_toks) + 1):
                if tokens[i:i + len(chunk_toks)] == chunk_toks:
                    spans.append((i, i + len(chunk_toks) - 1, label))
                    search_start = i + len(chunk_toks)
                    break
            else:
                print(f"⚠️ Couldn’t align chunk: {text!r}")
        return spans

    def load_example(idx):
        rec = validated_store.get(idx, dynamic_dataset.example(idx))
        segs = prepare_for_highlight(rec)
        return segs, rec.get("validated", False), idx, make_info(rec)

    def update_example(highlighted, idx: int):
        rec = dynamic_dataset.data[idx]
        # re-tokenize
        orig_tokens = tokenize_text(rec["text"])
        # realign highlights
        new_ner = align_spans_to_tokens(highlighted, orig_tokens)
        # overwrite & mark un-validated
        rec["tokenized_text"] = orig_tokens
        rec["ner"] = new_ner
        rec["validated"] = False
        return prepare_for_highlight(rec), rec["validated"], idx, make_info(rec)

    def do_validate(highlighted, idx: int):
        # in-memory mark
        dynamic_dataset.validate()
        rec = dynamic_dataset.data[idx]
        orig_tokens = tokenize_text(rec["text"])
        new_ner = align_spans_to_tokens(highlighted, orig_tokens)
        rec["tokenized_text"] = orig_tokens
        rec["ner"] = new_ner
        # persist to S3
        save_single_validation(idx, rec)
        return prepare_for_highlight(rec), True, make_info(rec)

    def nav(fn):
        rec = fn()
        segs = prepare_for_highlight(rec)
        return segs, rec.get("validated", False), dynamic_dataset.current, make_info(rec)
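    # Note on repeated phrases: align_spans_to_tokens advances search_start past
    # each match, so identical highlighted chunks bind to successive occurrences.
    # A hedged illustration (toy data, not from the app's records):
    #
    #   toks = tokenize_text("data here and data there")
    #   align_spans_to_tokens(
    #       [{"token": "data", "class_or_confidence": "dataset"},
    #        {"token": "data", "class_or_confidence": "dataset"}],
    #       toks,
    #   )
    #   # -> [(0, 0, 'dataset'), (3, 3, 'dataset')]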
    with gr.Blocks() as demo:
        prog = gr.Slider(0, dynamic_dataset.len - 1, value=0, step=1,
                         label="Example #", interactive=False)
        inp_box = gr.HighlightedText(label="Sentence", interactive=True)
        info_md = gr.Markdown(label="Source")  # ← shows filename & page
        status = gr.Checkbox(label="Validated?", value=False, interactive=False)
        gr.Markdown(
            "[📖 Entity Tag Guide](https://huggingface.co/spaces/rafmacalaba/datause-annotation/blob/main/guidelines.md)"
        )

        with gr.Row():
            prev_btn = gr.Button("◀️ Previous")
            apply_btn = gr.Button("📝 Apply Changes")
            next_btn = gr.Button("Next ▶️")
        with gr.Row():
            skip_prev = gr.Button("⏮️ Prev Unvalidated")
            validate_btn = gr.Button("✅ Validate")
            skip_next = gr.Button("⏭️ Next Unvalidated")

        # initial load
        demo.load(load_example, inputs=prog,
                  outputs=[inp_box, status, prog, info_md])

        # wire up actions (all now also update info_md)
        apply_btn.click(update_example, inputs=[inp_box, prog],
                        outputs=[inp_box, status, prog, info_md])
        prev_btn.click(lambda: nav(dynamic_dataset.prev), inputs=None,
                       outputs=[inp_box, status, prog, info_md])
        next_btn.click(lambda: nav(dynamic_dataset.next), inputs=None,
                       outputs=[inp_box, status, prog, info_md])
        skip_prev.click(lambda: nav(dynamic_dataset.jump_prev_unvalidated), inputs=None,
                        outputs=[inp_box, status, prog, info_md])
        skip_next.click(lambda: nav(dynamic_dataset.jump_next_unvalidated), inputs=None,
                        outputs=[inp_box, status, prog, info_md])
        validate_btn.click(do_validate, inputs=[inp_box, prog],
                           outputs=[inp_box, status, info_md])

    return demo

if __name__ == "__main__":
    demo = create_demo()
    demo.launch(share=True, inline=True, debug=True)
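# Running locally assumes AWS credentials are visible to boto3, e.g. via the
# env vars read at the top of this file (the exact commands and the filename
# "app.py" are assumptions, not stated in the code):
#
#   export AWS_ACCESS_KEY_ID=...
#   export AWS_SECRET_ACCESS_KEY=...
#   python app.py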