rafmacalaba committed
Commit 079aa2a · 1 Parent(s): b99c40c

just highlight closest

Files changed (1)
  1. app.py +68 -29
app.py CHANGED
@@ -109,45 +109,84 @@ def prune_acronym_and_self_relations(ner_preds, rel_preds):
 
 # Highlighting function
 
+# def highlight_text(text, ner_threshold, re_threshold):
+#     # Run inference
+#     ner_preds, rel_preds = inference_pipeline(
+#         text,
+#         model=model,
+#         labels=labels,
+#         relation_extractor=relation_extractor,
+#         TYPE2RELS=TYPE2RELS,
+#         ner_threshold=ner_threshold,
+#         re_threshold=re_threshold,
+#         re_multi_label=False
+#     )
+
+#     # Post-process
+#     ner_preds, rel_preds = prune_acronym_and_self_relations(ner_preds, rel_preds)
+
+#     # Gather all spans
+#     spans = []
+#     for ent in ner_preds:
+#         spans.append((ent["start"], ent["end"], ent["label"]))
+#     for src, rels in rel_preds.items():
+#         for r in rels:
+#             for m in re.finditer(re.escape(r["target"]), text):
+#                 spans.append((m.start(), m.end(), f"{src} <> {r['relation']}"))
+
+#     # Merge labels by span
+#     merged = defaultdict(list)
+#     for start, end, lbl in spans:
+#         merged[(start, end)].append(lbl)
+
+#     # Build Gradio entities
+#     entities = []
+#     for (start, end), lbls in sorted(merged.items(), key=lambda x: x[0]):
+#         entities.append({
+#             "entity": ", ".join(lbls),
+#             "start": start,
+#             "end": end
+#         })
 def highlight_text(text, ner_threshold, re_threshold):
-    # Run inference
-    ner_preds, rel_preds = inference_pipeline(
-        text,
-        model=model,
-        labels=labels,
-        relation_extractor=relation_extractor,
-        TYPE2RELS=TYPE2RELS,
-        ner_threshold=ner_threshold,
-        re_threshold=re_threshold,
-        re_multi_label=False
-    )
-
-    # Post-process
+    # inference + pruning …
+    ner_preds, rel_preds = inference_pipeline(…)
     ner_preds, rel_preds = prune_acronym_and_self_relations(ner_preds, rel_preds)
 
-    # Gather all spans
     spans = []
+    # 1) NER spans
     for ent in ner_preds:
         spans.append((ent["start"], ent["end"], ent["label"]))
+
+    # 2) RE spans, closest-match logic (no math import needed)
     for src, rels in rel_preds.items():
-        for r in rels:
-            for m in re.finditer(re.escape(r["target"]), text):
-                spans.append((m.start(), m.end(), f"{src} <> {r['relation']}"))
+        # find the source span center
+        src_ent = next((e for e in ner_preds if e["text"] == src), None)
+        src_center = ((src_ent["start"] + src_ent["end"]) / 2) if src_ent else None
 
-    # Merge labels by span
+        for r in rels:
+            target = r["target"]
+            matches = list(re.finditer(re.escape(target), text))
+            if not matches:
+                continue
+            # pick the match whose center is nearest to src_center
+            if src_center is not None:
+                best = min(
+                    matches,
+                    key=lambda m: abs(((m.start() + m.end()) / 2) - src_center)
+                )
+            else:
+                best = matches[0]
+            spans.append((best.start(), best.end(), f"{src} <> {r['relation']}"))
+
+    # 3) merge & return…
     merged = defaultdict(list)
-    for start, end, lbl in spans:
-        merged[(start, end)].append(lbl)
-
-    # Build Gradio entities
-    entities = []
-    for (start, end), lbls in sorted(merged.items(), key=lambda x: x[0]):
-        entities.append({
-            "entity": ", ".join(lbls),
-            "start": start,
-            "end": end
-        })
+    for s, e, lbl in spans:
+        merged[(s, e)].append(lbl)
 
+    entities = [
+        {"entity": ", ".join(lbls), "start": s, "end": e}
+        for (s, e), lbls in sorted(merged.items(), key=lambda x: x[0])
+    ]
     return {"text": text, "entities": entities}, {"ner": ner_preds, "relations": rel_preds}
 
 # JSON output function
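
For reference, the core of this change (picking the occurrence of a relation target that sits closest to its source entity) can be exercised on its own. The snippet below is a minimal, standalone sketch of that closest-match rule; the helper name pick_closest_match and the sample sentence are illustrative only and are not part of app.py.

import re

def pick_closest_match(text, target, src_start, src_end):
    """Return (start, end) of the occurrence of `target` whose center is
    nearest to the center of the source entity span, or None if absent."""
    matches = list(re.finditer(re.escape(target), text))
    if not matches:
        return None
    src_center = (src_start + src_end) / 2
    best = min(matches, key=lambda m: abs(((m.start() + m.end()) / 2) - src_center))
    return best.start(), best.end()

# "2019" occurs twice; the source entity "census" is adjacent to the second
# occurrence, so that occurrence wins.
text = "Data from 2019 and the 2019 census were merged."
src_start = text.index("census")
print(pick_closest_match(text, "2019", src_start, src_start + len("census")))  # (23, 27)

In app.py the same rule runs per relation inside highlight_text, falling back to the first match when the source entity text cannot be found in ner_preds.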