rafmacalaba committed on
Commit
81c36f4
·
1 Parent(s): 1ba6579

add filename and page v2

Browse files
Files changed (1) hide show
  1. app.py +232 -119
app.py CHANGED
@@ -140,137 +140,250 @@ def extract_tokens_and_labels(highlighted: List[Dict[str, Union[str, None]]]
140
 
141
 
142
  # ── App factory ────────────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  def create_demo() -> gr.Blocks:
144
- data = load_initial_data()
145
- validated_store = load_all_validations()
146
 
 
147
  for idx in validated_store:
148
  if 0 <= idx < len(data):
149
  data[idx]["validated"] = True
150
- dynamic_dataset = DynamicDataset(data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  with gr.Blocks() as demo:
152
- prog = gr.Slider(0, dynamic_dataset.len-1, value=0, step=1, label="Example #", interactive=False)
153
- inp_box = gr.HighlightedText(label="Sentence", interactive=True)
154
- status = gr.Checkbox(label="Validated?", value=False, interactive=False)
155
- filename_disp = gr.Markdown(label="Filename") # NEW: shows current filename
156
- page_disp = gr.Markdown(label="Page") # NEW: shows current page number
157
  gr.Markdown(
158
  "[๐Ÿ“– Entity Tag Guide](https://huggingface.co/spaces/rafmacalaba/datause-annotation/blob/main/guidelines.md)"
159
  )
160
-
161
  with gr.Row():
162
- prev_btn = gr.Button("โ—€๏ธ Previous")
163
- apply_btn = gr.Button("๐Ÿ“ Apply Changes")
164
- next_btn = gr.Button("Next โ–ถ๏ธ")
 
165
  with gr.Row():
166
- skip_prev = gr.Button("โฎ๏ธ Prev Unvalidated")
167
  validate_btn = gr.Button("โœ… Validate")
168
- skip_next = gr.Button("โญ๏ธ Next Unvalidated")
169
-
170
- # def load_example(idx):
171
- # rec = validated_store.get(idx, dynamic_dataset.example(idx))
172
- # segs = prepare_for_highlight(rec)
173
- # return segs, rec.get("validated", False), idx
174
-
175
- def load_example(idx):
176
- rec = validated_store.get(idx, dynamic_dataset.example(idx))
177
- segs = prepare_for_highlight(rec)
178
- return (
179
- segs,
180
- rec.get("validated", False),
181
- idx,
182
- rec.get("filename", ""), # <-- returns filename for filename_disp
183
- f"Page {rec.get('page', '')}" # <-- returns page for page_disp
184
- )
185
-
186
- def update_example(highlighted, idx: int):
187
- # grab the record
188
- rec = dynamic_dataset.data[idx]
189
-
190
- # reโ€tokenize from the raw text (same as do_validate)
191
- orig_tokens = tokenize_text(rec["text"])
192
-
193
- # realign the user's highlights back to those tokens
194
- new_ner = align_spans_to_tokens(highlighted, orig_tokens)
195
-
196
- # overwrite both token list and span list (and mark unโ€validated)
197
- rec["tokenized_text"] = orig_tokens
198
- rec["ner"] = new_ner
199
- rec["validated"] = False
200
-
201
- # reโ€render
202
- return prepare_for_highlight(rec)
203
-
204
- def align_spans_to_tokens(
205
- highlighted: List[Dict[str, Union[str, None]]],
206
- tokens: List[str]
207
- ) -> List[Tuple[int,int,str]]:
208
- """
209
- Align each highlighted chunk to the next matching tokens in the list,
210
- advancing a pointer so repeated tokens map in the order you clicked them.
211
- """
212
- spans = []
213
- search_start = 0
214
-
215
- for entry in highlighted:
216
- text = entry["token"]
217
- label = entry.get("class_or_confidence") or entry.get("label") or entry.get("class")
218
- if not label:
219
- continue
220
-
221
- chunk_toks = tokenize_text(text)
222
- # scan only from the end of the last match
223
- for i in range(search_start, len(tokens) - len(chunk_toks) + 1):
224
- if tokens[i:i+len(chunk_toks)] == chunk_toks:
225
- spans.append((i, i + len(chunk_toks) - 1, label))
226
- search_start = i + len(chunk_toks)
227
- break
228
- else:
229
- print(f"โš ๏ธ Couldnโ€™t align chunk: {text!r}")
230
-
231
- return spans
232
-
233
- def do_validate(highlighted, idx: int):
234
- # mark validated in memory
235
- dynamic_dataset.validate()
236
-
237
- # grab the record
238
- rec = dynamic_dataset.data[idx]
239
-
240
- # re-tokenize from the original text
241
- orig_tokens = tokenize_text(rec["text"])
242
-
243
- # realign the user's highlighted segments to those tokens
244
- new_ner = align_spans_to_tokens(highlighted, orig_tokens)
245
-
246
- # overwrite both token list and span list
247
- rec["tokenized_text"] = orig_tokens
248
- rec["ner"] = new_ner
249
-
250
- # persist
251
- save_single_validation(idx, rec)
252
-
253
- # re-render and show checkbox checked
254
- return prepare_for_highlight(rec), True
255
-
256
-
257
- def nav(fn):
258
- rec = fn()
259
- segs = prepare_for_highlight(rec)
260
- return segs, rec.get("validated", False), dynamic_dataset.current
261
-
262
- demo.load(load_example, inputs=prog, outputs=[inp_box, status, prog])
263
- apply_btn.click(
264
- fn=update_example,
265
- inputs=[inp_box, prog], # pass both the highlights *and* the example idx
266
- outputs=inp_box
267
- )
268
- #apply_btn.click(update_spans, inputs=inp_box, outputs=inp_box)
269
- prev_btn.click(lambda: nav(dynamic_dataset.prev), inputs=None, outputs=[inp_box, status, prog])
270
- validate_btn.click(do_validate, inputs=[inp_box, prog], outputs=[inp_box, status])
271
- next_btn.click(lambda: nav(dynamic_dataset.next), inputs=None, outputs=[inp_box, status, prog])
272
- skip_prev.click(lambda: nav(dynamic_dataset.jump_prev_unvalidated), inputs=None, outputs=[inp_box, status, prog])
273
- skip_next.click(lambda: nav(dynamic_dataset.jump_next_unvalidated), inputs=None, outputs=[inp_box, status, prog])
274
 
275
  return demo
276
 
 
140
 
141
 
142
  # ── App factory ────────────────────────────────────────────────────────────────
143
+ # def create_demo() -> gr.Blocks:
144
+ # data = load_initial_data()
145
+ # validated_store = load_all_validations()
146
+
147
+ # for idx in validated_store:
148
+ # if 0 <= idx < len(data):
149
+ # data[idx]["validated"] = True
150
+ # dynamic_dataset = DynamicDataset(data)
151
+ # with gr.Blocks() as demo:
152
+ # prog = gr.Slider(0, dynamic_dataset.len-1, value=0, step=1, label="Example #", interactive=False)
153
+ # inp_box = gr.HighlightedText(label="Sentence", interactive=True)
154
+ # status = gr.Checkbox(label="Validated?", value=False, interactive=False)
155
+ # filename_disp = gr.Markdown(label="Filename") # NEW: shows current filename
156
+ # page_disp = gr.Markdown(label="Page") # NEW: shows current page number
157
+ # gr.Markdown(
158
+ # "[๐Ÿ“– Entity Tag Guide](https://huggingface.co/spaces/rafmacalaba/datause-annotation/blob/main/guidelines.md)"
159
+ # )
160
+
161
+ # with gr.Row():
162
+ # prev_btn = gr.Button("โ—€๏ธ Previous")
163
+ # apply_btn = gr.Button("๐Ÿ“ Apply Changes")
164
+ # next_btn = gr.Button("Next โ–ถ๏ธ")
165
+ # with gr.Row():
166
+ # skip_prev = gr.Button("โฎ๏ธ Prev Unvalidated")
167
+ # validate_btn = gr.Button("โœ… Validate")
168
+ # skip_next = gr.Button("โญ๏ธ Next Unvalidated")
169
+
170
+ # # def load_example(idx):
171
+ # # rec = validated_store.get(idx, dynamic_dataset.example(idx))
172
+ # # segs = prepare_for_highlight(rec)
173
+ # # return segs, rec.get("validated", False), idx
174
+
175
+ # def load_example(idx):
176
+ # rec = validated_store.get(idx, dynamic_dataset.example(idx))
177
+ # segs = prepare_for_highlight(rec)
178
+ # return (
179
+ # segs,
180
+ # rec.get("validated", False),
181
+ # idx,
182
+ # rec.get("filename", ""), # <-- returns filename for filename_disp
183
+ # f"Page {rec.get('page', '')}" # <-- returns page for page_disp
184
+ # )
185
+
186
+ # def update_example(highlighted, idx: int):
187
+ # # grab the record
188
+ # rec = dynamic_dataset.data[idx]
189
+
190
+ # # reโ€tokenize from the raw text (same as do_validate)
191
+ # orig_tokens = tokenize_text(rec["text"])
192
+
193
+ # # realign the user's highlights back to those tokens
194
+ # new_ner = align_spans_to_tokens(highlighted, orig_tokens)
195
+
196
+ # # overwrite both token list and span list (and mark unโ€validated)
197
+ # rec["tokenized_text"] = orig_tokens
198
+ # rec["ner"] = new_ner
199
+ # rec["validated"] = False
200
+
201
+ # # reโ€render
202
+ # return prepare_for_highlight(rec)
203
+
204
+ # def align_spans_to_tokens(
205
+ # highlighted: List[Dict[str, Union[str, None]]],
206
+ # tokens: List[str]
207
+ # ) -> List[Tuple[int,int,str]]:
208
+ # """
209
+ # Align each highlighted chunk to the next matching tokens in the list,
210
+ # advancing a pointer so repeated tokens map in the order you clicked them.
211
+ # """
212
+ # spans = []
213
+ # search_start = 0
214
+
215
+ # for entry in highlighted:
216
+ # text = entry["token"]
217
+ # label = entry.get("class_or_confidence") or entry.get("label") or entry.get("class")
218
+ # if not label:
219
+ # continue
220
+
221
+ # chunk_toks = tokenize_text(text)
222
+ # # scan only from the end of the last match
223
+ # for i in range(search_start, len(tokens) - len(chunk_toks) + 1):
224
+ # if tokens[i:i+len(chunk_toks)] == chunk_toks:
225
+ # spans.append((i, i + len(chunk_toks) - 1, label))
226
+ # search_start = i + len(chunk_toks)
227
+ # break
228
+ # else:
229
+ # print(f"โš ๏ธ Couldnโ€™t align chunk: {text!r}")
230
+
231
+ # return spans
232
+
233
+ # def do_validate(highlighted, idx: int):
234
+ # # mark validated in memory
235
+ # dynamic_dataset.validate()
236
+
237
+ # # grab the record
238
+ # rec = dynamic_dataset.data[idx]
239
+
240
+ # # re-tokenize from the original text
241
+ # orig_tokens = tokenize_text(rec["text"])
242
+
243
+ # # realign the user's highlighted segments to those tokens
244
+ # new_ner = align_spans_to_tokens(highlighted, orig_tokens)
245
+
246
+ # # overwrite both token list and span list
247
+ # rec["tokenized_text"] = orig_tokens
248
+ # rec["ner"] = new_ner
249
+
250
+ # # persist
251
+ # save_single_validation(idx, rec)
252
+
253
+ # # re-render and show checkbox checked
254
+ # return prepare_for_highlight(rec), True
255
+
256
+
257
+ # def nav(fn):
258
+ # rec = fn()
259
+ # segs = prepare_for_highlight(rec)
260
+ # return segs, rec.get("validated", False), dynamic_dataset.current
261
+
262
+ # demo.load(load_example, inputs=prog, outputs=[inp_box, status, prog])
263
+ # apply_btn.click(
264
+ # fn=update_example,
265
+ # inputs=[inp_box, prog], # pass both the highlights *and* the example idx
266
+ # outputs=inp_box
267
+ # )
268
+ # #apply_btn.click(update_spans, inputs=inp_box, outputs=inp_box)
269
+ # prev_btn.click(lambda: nav(dynamic_dataset.prev), inputs=None, outputs=[inp_box, status, prog])
270
+ # validate_btn.click(do_validate, inputs=[inp_box, prog], outputs=[inp_box, status])
271
+ # next_btn.click(lambda: nav(dynamic_dataset.next), inputs=None, outputs=[inp_box, status, prog])
272
+ # skip_prev.click(lambda: nav(dynamic_dataset.jump_prev_unvalidated), inputs=None, outputs=[inp_box, status, prog])
273
+ # skip_next.click(lambda: nav(dynamic_dataset.jump_next_unvalidated), inputs=None, outputs=[inp_box, status, prog])
274
+
275
+ # return demo
276
+
277
  def create_demo() -> gr.Blocks:
278
+ data = load_initial_data()
279
+ validated_store = load_all_validations()
280
 
281
+ # mark any pre-validated examples
282
  for idx in validated_store:
283
  if 0 <= idx < len(data):
284
  data[idx]["validated"] = True
285
+
286
+ dynamic_dataset = DynamicDataset(data)
287
+
288
+ def make_info(rec):
289
+ fn = rec.get("filename", "โ€”")
290
+ pg = rec.get("page", "โ€”")
291
+ # Markdown with line break for Gradio
292
+ return f"**File:** `{fn}` \n**Page:** `{pg}`"
293
+
294
+ def align_spans_to_tokens(
295
+ highlighted: List[Dict[str, Union[str, None]]],
296
+ tokens: List[str]
297
+ ) -> List[Tuple[int, int, str]]:
298
+ """
299
+ Align each highlighted chunk to the next matching tokens in the list,
300
+ advancing a pointer so repeated tokens map in the order you clicked them.
301
+ """
302
+ spans = []
303
+ search_start = 0
304
+
305
+ for entry in highlighted:
306
+ text = entry["token"]
307
+ label = entry.get("class_or_confidence") or entry.get("label") or entry.get("class")
308
+ if not label:
309
+ continue
310
+
311
+ chunk_toks = tokenize_text(text)
312
+ # scan only from the end of the last match
313
+ for i in range(search_start, len(tokens) - len(chunk_toks) + 1):
314
+ if tokens[i:i + len(chunk_toks)] == chunk_toks:
315
+ spans.append((i, i + len(chunk_toks) - 1, label))
316
+ search_start = i + len(chunk_toks)
317
+ break
318
+ else:
319
+ print(f"โš ๏ธ Couldnโ€™t align chunk: {text!r}")
320
+
321
+ return spans
322
+
323
+ def load_example(idx):
324
+ rec = validated_store.get(idx, dynamic_dataset.example(idx))
325
+ segs = prepare_for_highlight(rec)
326
+ return segs, rec.get("validated", False), idx, make_info(rec)
327
+
328
+ def update_example(highlighted, idx: int):
329
+ rec = dynamic_dataset.data[idx]
330
+ # reโ€tokenize
331
+ orig_tokens = tokenize_text(rec["text"])
332
+ # realign highlights
333
+ new_ner = align_spans_to_tokens(highlighted, orig_tokens)
334
+ # overwrite & mark un-validated
335
+ rec["tokenized_text"] = orig_tokens
336
+ rec["ner"] = new_ner
337
+ rec["validated"] = False
338
+ return prepare_for_highlight(rec), rec["validated"], idx, make_info(rec)
339
+
340
+ def do_validate(highlighted, idx: int):
341
+ # in-memory mark
342
+ dynamic_dataset.validate()
343
+ rec = dynamic_dataset.data[idx]
344
+ orig_tokens = tokenize_text(rec["text"])
345
+ new_ner = align_spans_to_tokens(highlighted, orig_tokens)
346
+ rec["tokenized_text"] = orig_tokens
347
+ rec["ner"] = new_ner
348
+ # persist to disk/store
349
+ save_single_validation(idx, rec)
350
+ return prepare_for_highlight(rec), True, make_info(rec)
351
+
352
+ def nav(fn):
353
+ rec = fn()
354
+ segs = prepare_for_highlight(rec)
355
+ return segs, rec.get("validated", False), dynamic_dataset.current, make_info(rec)
356
+
357
  with gr.Blocks() as demo:
358
+ prog = gr.Slider(0, dynamic_dataset.len-1, value=0, step=1, label="Example #", interactive=False)
359
+ inp_box = gr.HighlightedText(label="Sentence", interactive=True)
360
+ info_md = gr.Markdown(label="Source", interactive=False) # โ† shows filename & page
361
+ status = gr.Checkbox(label="Validated?", value=False, interactive=False)
362
+
363
  gr.Markdown(
364
  "[๐Ÿ“– Entity Tag Guide](https://huggingface.co/spaces/rafmacalaba/datause-annotation/blob/main/guidelines.md)"
365
  )
366
+
367
  with gr.Row():
368
+ prev_btn = gr.Button("โ—€๏ธ Previous")
369
+ apply_btn = gr.Button("๐Ÿ“ Apply Changes")
370
+ next_btn = gr.Button("Next โ–ถ๏ธ")
371
+
372
  with gr.Row():
373
+ skip_prev = gr.Button("โฎ๏ธ Prev Unvalidated")
374
  validate_btn = gr.Button("โœ… Validate")
375
+ skip_next = gr.Button("โญ๏ธ Next Unvalidated")
376
+
377
+ # initial load
378
+ demo.load(load_example, inputs=prog, outputs=[inp_box, status, prog, info_md])
379
+
380
+ # wire up actions (all now also update info_md)
381
+ apply_btn.click(update_example, inputs=[inp_box, prog], outputs=[inp_box, status, prog, info_md])
382
+ prev_btn.click(lambda: nav(dynamic_dataset.prev), inputs=None, outputs=[inp_box, status, prog, info_md])
383
+ next_btn.click(lambda: nav(dynamic_dataset.next), inputs=None, outputs=[inp_box, status, prog, info_md])
384
+ skip_prev.click(lambda: nav(dynamic_dataset.jump_prev_unvalidated), inputs=None, outputs=[inp_box, status, prog, info_md])
385
+ skip_next.click(lambda: nav(dynamic_dataset.jump_next_unvalidated), inputs=None, outputs=[inp_box, status, prog, info_md])
386
+ validate_btn.click(do_validate, inputs=[inp_box, prog], outputs=[inp_box, status, info_md])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
387
 
388
  return demo
389