Commit 2ae65ac
Parent(s): 48711e5

add caching of preds

Files changed:
- app.py +13 -26
- relation_extraction.py +147 -0
- requirements.txt +2 -1
app.py
CHANGED

@@ -9,12 +9,12 @@ _CACHE_DIR = os.environ.get("CACHE_DIR", None)
 
 # Import GLiNER model and relation extractor
 from gliner import GLiNER
-from
+#from relation_extraction import CustomGLiNERRelationExtractor
 
 # Cache and initialize model + relation extractor
 DATA_MODEL_ID = "rafmacalaba/gliner_re_finetuned-v3"
 model = GLiNER.from_pretrained(DATA_MODEL_ID, cache_dir=_CACHE_DIR)
-relation_extractor =
+relation_extractor = CustomGLiNERRelationExtractor(model=model, return_index=True)
 
 # Sample text
 SAMPLE_TEXT = (
@@ -45,11 +45,12 @@ def inference_pipeline(
     text: str,
     model,
     labels: List[str],
-    relation_extractor:
+    relation_extractor: CustomGLiNERRelationExtractor,
     TYPE2RELS: Dict[str, List[str]],
-    ner_threshold: float = 0.
-
+    ner_threshold: float = 0.7,
+    rel_threshold: float = 0.5,
     re_multi_label: bool = False,
+    return_index: bool = False,
 ) -> Tuple[List[Dict[str, Any]], Dict[str, List[Dict[str, Any]]]]:
     ner_preds = model.predict_entities(
         text,
@@ -72,9 +73,9 @@ def inference_pipeline(
             relations=None,
             entities=None,
             relation_labels=slot_labels,
-            threshold=
+            threshold=rel_threshold,
             multi_label=re_multi_label,
-
+            return_index=return_index,
         )[0]
 
         re_results[span] = preds
@@ -109,7 +110,7 @@ def prune_acronym_and_self_relations(ner_preds, rel_preds):
 
 # Highlighting function
 
-def highlight_text(text, ner_threshold, re_threshold):
+def highlight_text(text, ner_threshold, rel_threshold):
     # Run inference
     ner_preds, rel_preds = inference_pipeline(
         text,
@@ -118,8 +119,9 @@ def highlight_text(text, ner_threshold, re_threshold):
         relation_extractor=relation_extractor,
         TYPE2RELS=TYPE2RELS,
         ner_threshold=ner_threshold,
-
-        re_multi_label=False
+        rel_threshold=rel_threshold,
+        re_multi_label=False,
+        return_index=True,
     )
 
     # Post-process
@@ -150,21 +152,6 @@ def highlight_text(text, ner_threshold, re_threshold):
     return {"text": text, "entities": entities}, {"ner": ner_preds, "relations": rel_preds}
 
 # JSON output function
-
-def get_model_predictions(text, ner_threshold, re_threshold):
-    ner_preds, rel_preds = inference_pipeline(
-        text,
-        model=model,
-        labels=labels,
-        relation_extractor=relation_extractor,
-        TYPE2RELS=TYPE2RELS,
-        ner_threshold=ner_threshold,
-        re_threshold=re_threshold,
-        re_multi_label=False
-    )
-    ner_preds, rel_preds = prune_acronym_and_self_relations(ner_preds, rel_preds)
-    return json.dumps({"ner": ner_preds, "relations": rel_preds}, indent=2)
-
 def _cached_predictions(state):
     if not state:
         return "📋 No predictions yet. Click **Submit** first."
@@ -216,4 +203,4 @@ with gr.Blocks() as demo:
 
 # Launch the app
 
-demo.launch(debug=True)
+demo.launch(debug=True, inline=True)
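The caching this commit adds follows the usual Gradio session-state pattern: Submit runs the pipeline once, stores the raw predictions, and the JSON button just serializes what is already in state instead of re-running the model (hence the deletion of `get_model_predictions` above). Below is a minimal sketch of that wiring, assuming standard `gr.State` plumbing; the widget names and layout are illustrative, not the Space's exact UI.

import json
import gradio as gr

def _cached_predictions(state):
    # state holds {"ner": ..., "relations": ...} from the last Submit
    if not state:
        return "📋 No predictions yet. Click **Submit** first."
    return json.dumps(state, indent=2)

with gr.Blocks() as demo:
    text_in = gr.Textbox(label="Input text")
    ner_slider = gr.Slider(0.0, 1.0, value=0.7, label="NER threshold")
    rel_slider = gr.Slider(0.0, 1.0, value=0.5, label="Relation threshold")
    highlighted = gr.HighlightedText(label="Entities")
    preds_state = gr.State()  # cache for the raw predictions
    json_out = gr.Textbox(label="Raw predictions")

    # highlight_text (see app.py above) returns the highlight payload plus the
    # raw predictions; the second output lands in preds_state.
    gr.Button("Submit").click(
        highlight_text,
        inputs=[text_in, ner_slider, rel_slider],
        outputs=[highlighted, preds_state],
    )
    # Reading from state makes this button instant: no second model call.
    gr.Button("Get model predictions").click(
        _cached_predictions, inputs=preds_state, outputs=json_out
    )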
relation_extraction.py
ADDED

@@ -0,0 +1,147 @@
+from gliner import GLiNER
+from gliner.multitask.base import GLiNERBasePipeline
+from typing import Optional, List, Union
+from datasets import load_dataset, Dataset
+
+
+class CustomGLiNERRelationExtractor(GLiNERBasePipeline):
+    """
+    A class to use GLiNER for relation extraction inference and evaluation.
+
+    Attributes:
+        device (str): Device to run the model on, e.g., 'cuda:0' or 'cpu'.
+        model (GLiNER): Loaded GLiNER model instance.
+        prompt (str): Template prompt for relation extraction.
+
+    Methods:
+        process_predictions(predictions):
+            Processes model predictions to extract the most likely labels.
+        prepare_texts(texts, labels):
+            Creates relation extraction prompts for each input text.
+        __call__(texts, labels, threshold=0.5):
+            Runs the model on the given texts and returns predicted labels.
+        evaluate(dataset_id, labels=None, threshold=0.5, max_examples=-1):
+            Evaluates the model on a dataset and computes F1 scores.
+    """
+
+    prompt = "Extract relationships between entities from the text: "
+
+    def __init__(self, model_id: str = None, model: GLiNER = None, device: str = 'cuda:0',
+                 ner_threshold: float = 0.5, rel_threshold: float = 0.5,
+                 return_index: bool = False, prompt: Optional[str] = None):
+        """
+        Initializes the CustomGLiNERRelationExtractor.
+
+        Args:
+            model_id (str, optional): Identifier for the model to be loaded. Defaults to None.
+            model (GLiNER, optional): Preloaded GLiNER model. Defaults to None.
+            device (str, optional): Device to run the model on ('cpu' or 'cuda:X'). Defaults to 'cuda:0'.
+            ner_threshold (float, optional): Named entity recognition threshold. Defaults to 0.5.
+            rel_threshold (float, optional): Relation extraction threshold. Defaults to 0.5.
+            return_index (bool, optional): Whether to attach span offsets to predictions. Defaults to False.
+            prompt (str, optional): Template prompt for relation extraction.
+        """
+        # Use the provided prompt or default to the class-level prompt
+        prompt = prompt if prompt is not None else self.prompt
+        super().__init__(model_id=model_id, model=model, prompt=prompt, device=device)
+        # Stored here because process_predictions reads it later
+        self.return_index = return_index
+
+    def prepare_texts(self, texts: List[str], **kwargs):
+        """
+        Prepends the relation-extraction prompt to each input text.
+
+        Args:
+            texts (list): List of input texts.
+
+        Returns:
+            list: List of formatted prompts.
+        """
+        prompts = []
+        for text in texts:
+            prompts.append(f"{self.prompt} \n {text}")
+        return prompts
+
+    def prepare_source_relation(self, ner_predictions: List[dict], relations: List[str]):
+        # Build one "source <> relation" candidate label per (entity, relation) pair
+        relation_labels = []
+        for prediction in ner_predictions:
+            curr_labels = []
+            unique_entities = {ent['text'] for ent in prediction}
+            for relation in relations:
+                for ent in unique_entities:
+                    curr_labels.append(f"{ent} <> {relation}")
+            relation_labels.append(curr_labels)
+        return relation_labels
+
+    def process_predictions(self, predictions, **kwargs):
+        """
+        Processes predictions to extract the highest-scoring relation(s).
+
+        Args:
+            predictions (list): List of predictions with scores.
+
+        Returns:
+            list: List of predicted labels for each input.
+        """
+        batch_predicted_relations = []
+
+        for prediction in predictions:
+            curr_relations = []
+            for target in prediction:
+                target_ent = target['text']
+                score = target['score']
+                source, relation = target['label'].split('<>')
+                relation = {
+                    "source": source.strip(),
+                    "relation": relation.strip(),
+                    "target": target_ent.strip(),
+                    "score": score,
+                }
+                # Pull through span info if present
+                if self.return_index:
+                    relation['start'] = target.get('start', None)
+                    relation['end'] = target.get('end', None)
+                curr_relations.append(relation)
+            batch_predicted_relations.append(curr_relations)
+
+        return batch_predicted_relations
+
+    def __call__(self, texts: Union[str, List[str]], relations: List[str] = None,
+                 entities: List[str] = ['named entity'],
+                 relation_labels: Optional[List[List[str]]] = None,
+                 ner_threshold: float = 0.5,
+                 rel_threshold: float = 0.5,
+                 batch_size: int = 8, **kwargs):
+        if isinstance(texts, str):
+            texts = [texts]
+
+        prompts = self.prepare_texts(texts, **kwargs)
+
+        if relation_labels is None:
+            # NER pass to find candidate source entities
+            ner_predictions = self.model.run(texts, entities, threshold=ner_threshold, batch_size=batch_size)
+            # Relation-extraction labels built from the entities found
+            relation_labels = self.prepare_source_relation(ner_predictions, relations)
+
+        predictions = self.model.run(prompts, relation_labels, threshold=rel_threshold, batch_size=batch_size)
+        results = self.process_predictions(predictions, **kwargs)
+        return results
+
+    def evaluate(self, dataset_id: Optional[str] = None, dataset: Optional[Dataset] = None,
+                 labels: Optional[List[str]] = None, threshold: float = 0.5, max_examples: int = -1):
+        """
+        Evaluates the model on a specified dataset and computes evaluation metrics.
+
+        Args:
+            dataset_id (str, optional): Identifier for the dataset to load (e.g., from Hugging Face datasets).
+            dataset (Dataset, optional): A pre-loaded dataset to evaluate. If provided, `dataset_id` is ignored.
+            labels (list, optional): List of target labels to consider for relation extraction. Defaults to None (use all).
+            threshold (float): Confidence threshold for predictions. Defaults to 0.5.
+            max_examples (int): Maximum number of examples to evaluate. Defaults to -1 (use all available examples).
+
+        Returns:
+            dict: A dictionary containing evaluation metrics such as F1 scores.
+
+        Raises:
+            ValueError: If neither `dataset_id` nor `dataset` is provided.
+        """
+        raise NotImplementedError("Currently the `evaluate` method is not implemented.")
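For reference, here is a hedged usage sketch of the extractor defined above, outside the Gradio app. The model ID is the one the Space loads; the sample sentence and relation names are purely illustrative.

from gliner import GLiNER
from relation_extraction import CustomGLiNERRelationExtractor

model = GLiNER.from_pretrained("rafmacalaba/gliner_re_finetuned-v3")
extractor = CustomGLiNERRelationExtractor(model=model, device="cpu", return_index=True)

text = "The study draws on the 2021 Global Findex survey published by the World Bank."

# With no relation_labels given, __call__ takes the two-stage path: an NER pass
# finds candidate source entities, then every "entity <> relation" slot is
# scored against the prompted text.
preds = extractor(
    text,
    relations=["published by", "reference year"],
    entities=["named entity"],
    ner_threshold=0.5,
    rel_threshold=0.5,
)[0]  # one result list per input text

for rel in preds:
    # with return_index=True each rel also carries 'start'/'end' offsets
    print(f"{rel['source']} --{rel['relation']}--> {rel['target']} ({rel['score']:.2f})")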
requirements.txt
CHANGED

@@ -2,4 +2,5 @@ gradio
 gliner
 torch
 scipy
-scikit-learn
+scikit-learn
+datasets