rafmacalaba committed on
Commit
1376fd4
·
1 Parent(s): e9d3a0a

init in ai4data org

Browse files
Files changed (3) hide show
  1. app.py +264 -0
  2. guidelines.md +27 -0
  3. requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import boto3
2
+ import os
3
+ import json
4
+ import re
5
+ import gradio as gr
6
+ from typing import List, Dict, Tuple, Optional, Union, Any
7
+
8
# ── S3 CONFIG ─────────────────────────────────────────────────────────────────
# Credentials and region are read from environment variables (e.g. HF Space
# secrets); region defaults to ap-southeast-2 when unset.
s3 = boto3.client(
    "s3",
    aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY"),
    region_name = os.getenv("AWS_DEFAULT_REGION", "ap-southeast-2"),
)

# Bucket holding both the source dataset and the validated output.
BUCKET = "doccano-processed"
# Previous dataset key, kept for reference:
#INIT_KEY = "gradio/initial_data_train.json"
INIT_KEY = "gradio/refugee_train_initial_data.json"  # JSON list of examples to annotate
# Previous output prefix, kept for reference:
#VALID_PREFIX = "validated_records/"
VALID_PREFIX = "refugee_train_validated/"  # one "<index>.json" object per validated record
21
+
22
+ # ── Helpers to load & save from S3 ──────────────────────────────────────────────
23
def load_initial_data() -> List[Dict]:
    """Fetch the initial annotation dataset from S3 and parse it as JSON."""
    response = s3.get_object(Bucket=BUCKET, Key=INIT_KEY)
    payload = response["Body"].read()
    return json.loads(payload)
26
+
27
def load_all_validations() -> Dict[int, Dict]:
    """Load every previously validated record from S3.

    Lists all objects under ``VALID_PREFIX`` (paginated, so more than 1000
    records work) and parses each "<index>.json" object.

    Returns:
        Mapping from example index to its stored record dict.

    Keys whose basename is not an integer — e.g. the zero-byte "folder"
    placeholder key some S3 tools create, whose basename is "" — are skipped
    instead of raising ValueError and aborting the whole load.
    """
    records: Dict[int, Dict] = {}
    pages = s3.get_paginator("list_objects_v2").paginate(
        Bucket=BUCKET, Prefix=VALID_PREFIX
    )
    for page in pages:
        for obj in page.get("Contents", []):
            key = obj["Key"]
            stem = os.path.splitext(os.path.basename(key))[0]
            # Skip non-record keys rather than crashing on int() below.
            if not stem.lstrip("-").isdigit():
                continue
            data = s3.get_object(Bucket=BUCKET, Key=key)["Body"].read()
            records[int(stem)] = json.loads(data)
    return records
39
+
40
def save_single_validation(idx: int, record: Dict) -> None:
    """Persist one validated record to S3 as pretty-printed JSON.

    The object is written to "<VALID_PREFIX><idx>.json".
    """
    payload = json.dumps(record, indent=2).encode('utf-8')
    s3.put_object(
        Bucket=BUCKET,
        Key=f"{VALID_PREFIX}{idx}.json",
        Body=payload,
        ContentType='application/json',
    )
48
+
49
class DynamicDataset:
    """Navigable wrapper around a list of annotation examples.

    Keeps a cursor (``current``) into ``data`` and guarantees every example
    dict carries a boolean "validated" flag.  Navigation methods move the
    cursor and return the example now under it.
    """

    def __init__(self, data: List[Dict]):
        self.data = data
        self.len = len(data)
        self.current = 0
        # Ensure the flag exists without clobbering any stored value.
        for example in self.data:
            example.setdefault("validated", False)

    def example(self, idx: int) -> Dict:
        """Clamp ``idx`` into range, move the cursor there, return the record."""
        clamped = min(self.len - 1, idx)
        self.current = max(0, clamped)
        return self.data[self.current]

    def next(self) -> Dict:
        """Advance one example (stopping at the end) and return it."""
        if self.current + 1 < self.len:
            self.current += 1
        return self.data[self.current]

    def prev(self) -> Dict:
        """Step back one example (stopping at the start) and return it."""
        if self.current - 1 >= 0:
            self.current -= 1
        return self.data[self.current]

    def jump_next_unvalidated(self) -> Dict:
        """Move forward to the next unvalidated example, if any, and return it."""
        later = (i for i in range(self.current + 1, self.len)
                 if not self.data[i]["validated"])
        self.current = next(later, self.current)
        return self.data[self.current]

    def jump_prev_unvalidated(self) -> Dict:
        """Move backward to the nearest earlier unvalidated example, if any."""
        earlier = (i for i in range(self.current - 1, -1, -1)
                   if not self.data[i]["validated"])
        self.current = next(earlier, self.current)
        return self.data[self.current]

    def validate(self) -> None:
        """Mark the example under the cursor as validated (in memory only)."""
        self.data[self.current]["validated"] = True
87
+
88
def tokenize_text(text: str) -> List[str]:
    """Split text into word tokens (keeping -/_ compounds intact) and
    individual punctuation characters."""
    token_pattern = re.compile(r"\w+(?:[-_]\w+)*|[^\s\w]")
    return token_pattern.findall(text)
90
+
91
def prepare_for_highlight(data: Dict) -> List[Tuple[str, Optional[str]]]:
    """Convert a record's "tokenized_text" + "ner" spans into the
    (text, label) segment list consumed by gr.HighlightedText.

    "ner" entries are (start, end, label) with inclusive token indices.
    Tokens covered by a span are joined with spaces into one labeled
    segment; tokens outside any span become unlabeled (None) segments.
    """
    tokens = data["tokenized_text"]
    ner = data["ner"]
    # ent_buf accumulates tokens of the entity span currently being emitted;
    # norm_buf accumulates unlabeled tokens between entities.
    highlighted, curr_ent, ent_buf, norm_buf = [], None, [], []
    for idx, tok in enumerate(tokens):
        # Past the end of the active span (or none active yet): flush the
        # entity buffer, then look up a span that starts exactly here.
        if curr_ent is None or idx > curr_ent[1]:
            if ent_buf:
                highlighted.append((" ".join(ent_buf), curr_ent[2]))
                ent_buf = []
            curr_ent = next((e for e in ner if e[0] == idx), None)
        if curr_ent and curr_ent[0] <= idx <= curr_ent[1]:
            # Inside a span: flush pending plain text, then buffer this token.
            if norm_buf:
                highlighted.append((" ".join(norm_buf), None))
                norm_buf = []
            ent_buf.append(tok)
        else:
            # Outside any span: flush a finished entity, then buffer plain text.
            if ent_buf:
                highlighted.append((" ".join(ent_buf), curr_ent[2]))
                ent_buf = []
            norm_buf.append(tok)
    # Flush whichever buffer is still non-empty at the end of the text.
    if ent_buf:
        highlighted.append((" ".join(ent_buf), curr_ent[2]))
    if norm_buf:
        highlighted.append((" ".join(norm_buf), None))
    # Re-attach punctuation to the preceding word: drop the space that the
    # " ".join above inserted before , . ! ? … : ;
    return [(re.sub(r"\s(?=[,\.!?…:;])", "", txt), lbl) for txt, lbl in highlighted]
116
+
117
+
118
def extract_tokens_and_labels(highlighted: List[Dict[str, Union[str, None]]]
                              ) -> Tuple[List[str], List[Tuple[int, int, str]]]:
    """Rebuild a flat token list plus (start, end, label) spans from
    highlighted segments.

    Each entry holds the segment text under "token" and an optional label
    under "class_or_confidence", "class", or "label" (checked in that
    order).  Unlabeled segments contribute tokens but no span; span indices
    are inclusive.
    """
    tokens: List[str] = []
    ner: List[Tuple[int, int, str]] = []
    cursor = 0

    for segment in highlighted:
        label = (segment.get('class_or_confidence')
                 or segment.get('class')
                 or segment.get('label'))
        # Re-tokenize the segment text so indices line up with tokenize_text.
        segment_tokens = tokenize_text(segment['token'])
        first = cursor
        last = cursor + len(segment_tokens) - 1

        tokens.extend(segment_tokens)
        if label:
            ner.append((first, last, label))

        cursor = last + 1

    return tokens, ner
138
+
139
+
140
+ # ── App factory ────────────────────────────────────────────────────────────────
141
def create_demo() -> gr.Blocks:
    """Build the Gradio annotation/validation UI.

    Loads the source dataset and any previously validated records from S3,
    flags the already-validated examples, and wires navigation, apply, and
    validate controls around a HighlightedText editor.
    """
    data = load_initial_data()
    validated_store = load_all_validations()

    # Flag examples that already have a saved validation on S3.
    for idx in validated_store:
        if 0 <= idx < len(data):
            data[idx]["validated"] = True
    dynamic_dataset = DynamicDataset(data)
    with gr.Blocks() as demo:
        # Read-only slider doubles as a position/progress indicator.
        prog = gr.Slider(0, dynamic_dataset.len-1, value=0, step=1, label="Example #", interactive=False)
        inp_box = gr.HighlightedText(label="Sentence", interactive=True)
        status = gr.Checkbox(label="Validated?", value=False, interactive=False)
        gr.Markdown(
            "[📖 Entity Tag Guide](https://huggingface.co/spaces/rafmacalaba/datause-annotation/blob/main/guidelines.md)"
        )

        with gr.Row():
            prev_btn = gr.Button("◀️ Previous")
            apply_btn = gr.Button("📝 Apply Changes")
            next_btn = gr.Button("Next ▶️")
        with gr.Row():
            skip_prev = gr.Button("⏮️ Prev Unvalidated")
            validate_btn = gr.Button("✅ Validate")
            skip_next = gr.Button("⏭️ Next Unvalidated")

        def load_example(idx):
            """Render example `idx`, preferring its saved validated copy."""
            rec = validated_store.get(idx, dynamic_dataset.example(idx))
            segs = prepare_for_highlight(rec)
            return segs, rec.get("validated", False), idx

        def update_example(highlighted, idx: int):
            """Apply in-browser highlight edits onto record `idx` (not persisted)."""
            # grab the record
            rec = dynamic_dataset.data[idx]

            # re‐tokenize from the raw text (same as do_validate)
            orig_tokens = tokenize_text(rec["text"])

            # realign the user's highlights back to those tokens
            new_ner = align_spans_to_tokens(highlighted, orig_tokens)

            # overwrite both token list and span list (and mark un‐validated)
            rec["tokenized_text"] = orig_tokens
            rec["ner"] = new_ner
            rec["validated"] = False

            # re‐render
            return prepare_for_highlight(rec)

        def align_spans_to_tokens(
            highlighted: List[Dict[str, Union[str, None]]],
            tokens: List[str]
        ) -> List[Tuple[int,int,str]]:
            """
            Align each highlighted chunk to the next matching tokens in the list,
            advancing a pointer so repeated tokens map in the order you clicked them.
            """
            spans = []
            search_start = 0

            for entry in highlighted:
                # NOTE(review): assumes HighlightedText emits dicts with keys
                # "token" and "class_or_confidence" — confirm against the
                # installed gradio version.
                text = entry["token"]
                label = entry.get("class_or_confidence") or entry.get("label") or entry.get("class")
                if not label:
                    continue

                chunk_toks = tokenize_text(text)
                # scan only from the end of the last match
                for i in range(search_start, len(tokens) - len(chunk_toks) + 1):
                    if tokens[i:i+len(chunk_toks)] == chunk_toks:
                        spans.append((i, i + len(chunk_toks) - 1, label))
                        search_start = i + len(chunk_toks)
                        break
                else:
                    # for/else: no token window matched this chunk; it is
                    # dropped from the realigned spans.
                    print(f"⚠️ Couldn’t align chunk: {text!r}")

            return spans

        def do_validate(highlighted, idx: int):
            """Persist the current highlights for record `idx` and mark it validated."""
            # mark validated in memory
            # NOTE(review): validate() flags the record under the dataset
            # cursor; assumes the cursor equals `idx` here — confirm.
            dynamic_dataset.validate()

            # grab the record
            rec = dynamic_dataset.data[idx]

            # re-tokenize from the original text
            orig_tokens = tokenize_text(rec["text"])

            # realign the user's highlighted segments to those tokens
            new_ner = align_spans_to_tokens(highlighted, orig_tokens)

            # overwrite both token list and span list
            rec["tokenized_text"] = orig_tokens
            rec["ner"] = new_ner

            # persist
            save_single_validation(idx, rec)

            # re-render and show checkbox checked
            return prepare_for_highlight(rec), True


        def nav(fn):
            """Call a DynamicDataset navigation method and re-render its result."""
            rec = fn()
            segs = prepare_for_highlight(rec)
            return segs, rec.get("validated", False), dynamic_dataset.current

        # Wire events: page load renders example 0; buttons navigate/apply/save.
        demo.load(load_example, inputs=prog, outputs=[inp_box, status, prog])
        apply_btn.click(
            fn=update_example,
            inputs=[inp_box, prog],  # pass both the highlights *and* the example idx
            outputs=inp_box
        )
        #apply_btn.click(update_spans, inputs=inp_box, outputs=inp_box)
        prev_btn.click(lambda: nav(dynamic_dataset.prev), inputs=None, outputs=[inp_box, status, prog])
        validate_btn.click(do_validate, inputs=[inp_box, prog], outputs=[inp_box, status])
        next_btn.click(lambda: nav(dynamic_dataset.next), inputs=None, outputs=[inp_box, status, prog])
        skip_prev.click(lambda: nav(dynamic_dataset.jump_prev_unvalidated), inputs=None, outputs=[inp_box, status, prog])
        skip_next.click(lambda: nav(dynamic_dataset.jump_next_unvalidated), inputs=None, outputs=[inp_box, status, prog])

    return demo
261
+
262
+ if __name__ == "__main__":
263
+ demo = create_demo()
264
+ demo.launch(share=True, inline=True, debug=True)
guidelines.md ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Entity Tag Guide
2
+
3
+ This document describes the annotation tags you will see in the NER / merged NER output. Each **entity** corresponds to a labeled span in the text.
4
+
5
+ | Entity | Meaning |
6
+ | --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
7
+ | **`match_named`** | Model and ground-truth agree on an explicit, uniquely named dataset span. |
8
+ | **`actual_named`** | A named dataset span present in the ground-truth but missed by the model. |
9
+ | **`pred_named`** | A named dataset span predicted by the model but not in the ground-truth. |
10
+ | **`match_unnamed`** | Model and ground-truth agree on a clearly described but unnamed dataset span. |
11
+ | **`actual_unnamed`** | An unnamed dataset span present in the ground-truth but missed by the model. |
12
+ | **`pred_unnamed`** | An unnamed dataset span predicted by the model but not in the ground-truth. |
13
+ | **`match_vague`** | Model and ground-truth agree on a vague dataset mention (lacking specific identifying details). |
14
+ | **`actual_vague`** | A vague dataset mention present in the ground-truth but missed by the model. |
15
+ | **`pred_vague`** | A vague dataset mention predicted by the model but not in the ground-truth. |
16
+ | **`<span> <> acronym`** | Relation: marks the dataset’s acronym (e.g. `RUV <> acronym`). |
17
+ | **`<span> <> data description`** | Relation: describes what the dataset contains or how it was collected. |
18
+ | **`<span> <> data geography`** | Relation: indicates the geographic coverage of the dataset (e.g. country, region). |
19
+ | **`<span> <> data source`** | Relation: links to the original source or repository of the data. |
20
+ | **`<span> <> data type`** | Relation: specifies the type of data (e.g. survey, census, register). |
21
+ | **`<span> <> geography`** | Relation: connects the dataset to its referenced geography (may duplicate data geography). |
22
+ | **`<span> <> publication year`** | Relation: the year the dataset (or its documentation) was published. |
23
+ | **`<span> <> publisher`** | Relation: the organization or entity that published the dataset. |
24
+ | **`<span> <> reference year`** | Relation: the year the data were actually collected or refer to. |
25
+ | **`<span> <> version`** | Relation: the version identifier of the dataset (e.g. “v5”, “Version 2”). |
26
+
27
+ Use this guide when reviewing model predictions to quickly identify correct matches, false positives, and false negatives, as well as any extracted relations.
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ gradio>=3.40
2
+ boto3