rafmacalaba committed on
Commit
4ebd8ec
·
1 Parent(s): e587c99
Files changed (1) hide show
  1. app.py +2 -138
app.py CHANGED
@@ -17,9 +17,9 @@ s3 = boto3.client(
17
  # S3 bucket and keys
18
  BUCKET = "doccano-processed"
19
  #INIT_KEY = "gradio/initial_data_train.json"
20
- INIT_KEY = "gradio/refugee_train_initial_data_v4.json"
21
  #VALID_PREFIX = "validated_records/"
22
- VALID_PREFIX = "refugee_train_validated_v4/"
23
 
24
  # ── Helpers to load & save from S3 ──────────────────────────────────────────────
25
  def load_initial_data() -> List[Dict]:
@@ -138,142 +138,6 @@ def extract_tokens_and_labels(highlighted: List[Dict[str, Union[str, None]]]
138
 
139
  return tokens, ner
140
 
141
-
142
- # ── App factory ────────────────────────────────────────────────────────────────
143
- # def create_demo() -> gr.Blocks:
144
- # data = load_initial_data()
145
- # validated_store = load_all_validations()
146
-
147
- # for idx in validated_store:
148
- # if 0 <= idx < len(data):
149
- # data[idx]["validated"] = True
150
- # dynamic_dataset = DynamicDataset(data)
151
- # with gr.Blocks() as demo:
152
- # prog = gr.Slider(0, dynamic_dataset.len-1, value=0, step=1, label="Example #", interactive=False)
153
- # inp_box = gr.HighlightedText(label="Sentence", interactive=True)
154
- # status = gr.Checkbox(label="Validated?", value=False, interactive=False)
155
- # filename_disp = gr.Markdown(label="Filename") # NEW: shows current filename
156
- # page_disp = gr.Markdown(label="Page") # NEW: shows current page number
157
- # gr.Markdown(
158
- # "[๐Ÿ“– Entity Tag Guide](https://huggingface.co/spaces/rafmacalaba/datause-annotation/blob/main/guidelines.md)"
159
- # )
160
-
161
- # with gr.Row():
162
- # prev_btn = gr.Button("โ—€๏ธ Previous")
163
- # apply_btn = gr.Button("๐Ÿ“ Apply Changes")
164
- # next_btn = gr.Button("Next โ–ถ๏ธ")
165
- # with gr.Row():
166
- # skip_prev = gr.Button("โฎ๏ธ Prev Unvalidated")
167
- # validate_btn = gr.Button("โœ… Validate")
168
- # skip_next = gr.Button("โญ๏ธ Next Unvalidated")
169
-
170
- # # def load_example(idx):
171
- # # rec = validated_store.get(idx, dynamic_dataset.example(idx))
172
- # # segs = prepare_for_highlight(rec)
173
- # # return segs, rec.get("validated", False), idx
174
-
175
- # def load_example(idx):
176
- # rec = validated_store.get(idx, dynamic_dataset.example(idx))
177
- # segs = prepare_for_highlight(rec)
178
- # return (
179
- # segs,
180
- # rec.get("validated", False),
181
- # idx,
182
- # rec.get("filename", ""), # <-- returns filename for filename_disp
183
- # f"Page {rec.get('page', '')}" # <-- returns page for page_disp
184
- # )
185
-
186
- # def update_example(highlighted, idx: int):
187
- # # grab the record
188
- # rec = dynamic_dataset.data[idx]
189
-
190
- # # reโ€tokenize from the raw text (same as do_validate)
191
- # orig_tokens = tokenize_text(rec["text"])
192
-
193
- # # realign the user's highlights back to those tokens
194
- # new_ner = align_spans_to_tokens(highlighted, orig_tokens)
195
-
196
- # # overwrite both token list and span list (and mark unโ€validated)
197
- # rec["tokenized_text"] = orig_tokens
198
- # rec["ner"] = new_ner
199
- # rec["validated"] = False
200
-
201
- # # reโ€render
202
- # return prepare_for_highlight(rec)
203
-
204
- # def align_spans_to_tokens(
205
- # highlighted: List[Dict[str, Union[str, None]]],
206
- # tokens: List[str]
207
- # ) -> List[Tuple[int,int,str]]:
208
- # """
209
- # Align each highlighted chunk to the next matching tokens in the list,
210
- # advancing a pointer so repeated tokens map in the order you clicked them.
211
- # """
212
- # spans = []
213
- # search_start = 0
214
-
215
- # for entry in highlighted:
216
- # text = entry["token"]
217
- # label = entry.get("class_or_confidence") or entry.get("label") or entry.get("class")
218
- # if not label:
219
- # continue
220
-
221
- # chunk_toks = tokenize_text(text)
222
- # # scan only from the end of the last match
223
- # for i in range(search_start, len(tokens) - len(chunk_toks) + 1):
224
- # if tokens[i:i+len(chunk_toks)] == chunk_toks:
225
- # spans.append((i, i + len(chunk_toks) - 1, label))
226
- # search_start = i + len(chunk_toks)
227
- # break
228
- # else:
229
- # print(f"โš ๏ธ Couldnโ€™t align chunk: {text!r}")
230
-
231
- # return spans
232
-
233
- # def do_validate(highlighted, idx: int):
234
- # # mark validated in memory
235
- # dynamic_dataset.validate()
236
-
237
- # # grab the record
238
- # rec = dynamic_dataset.data[idx]
239
-
240
- # # re-tokenize from the original text
241
- # orig_tokens = tokenize_text(rec["text"])
242
-
243
- # # realign the user's highlighted segments to those tokens
244
- # new_ner = align_spans_to_tokens(highlighted, orig_tokens)
245
-
246
- # # overwrite both token list and span list
247
- # rec["tokenized_text"] = orig_tokens
248
- # rec["ner"] = new_ner
249
-
250
- # # persist
251
- # save_single_validation(idx, rec)
252
-
253
- # # re-render and show checkbox checked
254
- # return prepare_for_highlight(rec), True
255
-
256
-
257
- # def nav(fn):
258
- # rec = fn()
259
- # segs = prepare_for_highlight(rec)
260
- # return segs, rec.get("validated", False), dynamic_dataset.current
261
-
262
- # demo.load(load_example, inputs=prog, outputs=[inp_box, status, prog])
263
- # apply_btn.click(
264
- # fn=update_example,
265
- # inputs=[inp_box, prog], # pass both the highlights *and* the example idx
266
- # outputs=inp_box
267
- # )
268
- # #apply_btn.click(update_spans, inputs=inp_box, outputs=inp_box)
269
- # prev_btn.click(lambda: nav(dynamic_dataset.prev), inputs=None, outputs=[inp_box, status, prog])
270
- # validate_btn.click(do_validate, inputs=[inp_box, prog], outputs=[inp_box, status])
271
- # next_btn.click(lambda: nav(dynamic_dataset.next), inputs=None, outputs=[inp_box, status, prog])
272
- # skip_prev.click(lambda: nav(dynamic_dataset.jump_prev_unvalidated), inputs=None, outputs=[inp_box, status, prog])
273
- # skip_next.click(lambda: nav(dynamic_dataset.jump_next_unvalidated), inputs=None, outputs=[inp_box, status, prog])
274
-
275
- # return demo
276
-
277
  def create_demo() -> gr.Blocks:
278
  data = load_initial_data()
279
  validated_store = load_all_validations()
 
17
  # S3 bucket and keys
18
  BUCKET = "doccano-processed"
19
  #INIT_KEY = "gradio/initial_data_train.json"
20
+ INIT_KEY = "gradio/refugee_train_initial_datav5.json "
21
  #VALID_PREFIX = "validated_records/"
22
+ VALID_PREFIX = "refugee_train_validated_v5/"
23
 
24
  # ── Helpers to load & save from S3 ──────────────────────────────────────────────
25
  def load_initial_data() -> List[Dict]:
 
138
 
139
  return tokens, ner
140
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  def create_demo() -> gr.Blocks:
142
  data = load_initial_data()
143
  validated_store = load_all_validations()