Spaces:

Kuberwastaken
/

TREAT

Running

App Files Files Community

Kuberwastaken commited on Jan 9

Commit

1331506

1 Parent(s): 8ea6312

Reverting to classic model

Browse files

Files changed (1) hide show

model/analyzer.py +28 -80

model/analyzer.py CHANGED Viewed

@@ -73,10 +73,10 @@ class ContentAnalyzer:
                 "mapped_name": "Sexual Abuse",
                 "description": (
                     "Any form of non-consensual sexual act, behavior, or interaction, involving coercion, manipulation, or physical force. "
-                    "This includes incidents of sexual assault, molestation, exploitation, harassment, and any acts where an individual is subjected to sexual acts against their will or without their consent. "
-                    "It also covers discussions or depictions of the aftermath of such abuse, such as trauma, emotional distress, legal proceedings, or therapy. "
-                    "References to inappropriate sexual advances, groping, or any other form of sexual misconduct are also included, as well as the psychological and emotional impact on survivors. "
-                    "Scenes where individuals are placed in sexually compromising situations, even if not directly acted upon, may also fall under this category."
                 )
             },
             "Self-Harm": {
@@ -139,51 +139,9 @@ class ContentAnalyzer:
             logger.error(f"Error loading model: {str(e)}")
             raise
-    def _chunk_text(self, text: str, chunk_size: int = 256, overlap: int = 32) -> List[str]:
         """Split text into overlapping chunks for processing."""
-        chunks = []
-        start = 0
-        text_len = len(text)
-        while start < text_len:
-            end = min(start + chunk_size, text_len)
-            # Find the last period or newline in the chunk to avoid cutting sentences
-            if end < text_len:
-                last_period = max(
-                    text.rfind('. ', start, end),
-                    text.rfind('\n', start, end)
-                )
-                if last_period > start:
-                    end = last_period + 1
-            chunks.append(text[start:end])
-            start = end - overlap
-        return chunks
-    def _process_model_response(self, response_text: str) -> float:
-        """Process model response and return a confidence score."""
-        response = response_text.strip().upper()
-        # Check for explicit YES/NO/MAYBE
-        if "YES" in response:
-            evidence_words = ["CLEAR", "DEFINITELY", "EXPLICIT", "STRONG"]
-            return 1.0 if any(word in response for word in evidence_words) else 0.8
-        elif "MAYBE" in response or "POSSIBLE" in response:
-            return 0.5
-        elif "NO" in response:
-            return 0.0
-        # Fallback analysis for unclear responses
-        positive_indicators = ["PRESENT", "FOUND", "CONTAINS", "SHOWS", "INDICATES"]
-        negative_indicators = ["ABSENT", "NONE", "NOTHING", "LACKS"]
-        if any(indicator in response for indicator in positive_indicators):
-            return 0.7
-        elif any(indicator in response for indicator in negative_indicators):
-            return 0.0
-        return 0.0
     async def analyze_chunk(
         self,
@@ -200,17 +158,10 @@ class ContentAnalyzer:
             description = info["description"]
             prompt = f"""
-            Analyze this text carefully for any indication of {mapped_name}.
-            Context: {description}
-            Guidelines:
-            - Consider both explicit and implicit references
-            - Ignore metaphorical or figurative language
-            - Look for concrete evidence in the text
-            Text to analyze: {chunk}
-            Is there evidence of {mapped_name}? Respond with YES, NO, or MAYBE and briefly explain why.
             Answer:
             """
@@ -221,21 +172,20 @@ class ContentAnalyzer:
                 with torch.no_grad():
                     outputs = self.model.generate(
                         **inputs,
-                        max_new_tokens=32,
-                        num_return_sequences=1,
                         do_sample=True,
-                        temperature=0.7,
-                        top_p=0.92,
-                        top_k=50,
-                        repetition_penalty=1.1,
                         pad_token_id=self.tokenizer.eos_token_id
                     )
-                response_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-                confidence = self._process_model_response(response_text)
-                if confidence > 0.5:
-                    chunk_triggers[mapped_name] = chunk_triggers.get(mapped_name, 0) + confidence
                 if progress:
                     current_progress += progress_step
@@ -252,11 +202,11 @@ class ContentAnalyzer:
             await self.load_model(progress)
         chunks = self._chunk_text(script)
-        trigger_scores = {}
         progress_step = 0.4 / (len(chunks) * len(self.trigger_categories))
-        current_progress = 0.5
-        for chunk in chunks:
             chunk_triggers = await self.analyze_chunk(
                 chunk,
                 progress,
@@ -264,17 +214,15 @@ class ContentAnalyzer:
                 progress_step
             )
-            for trigger, score in chunk_triggers.items():
-                trigger_scores[trigger] = trigger_scores.get(trigger, 0) + score
         if progress:
             progress(0.95, "Finalizing results...")
-        # Normalize scores by number of chunks and apply threshold
-        chunk_count = len(chunks)
         final_triggers = [
-            trigger for trigger, score in trigger_scores.items()
-            if score / chunk_count > 0.3  # Adjusted threshold for better balance
         ]
         return final_triggers if final_triggers else ["None"]
@@ -312,7 +260,7 @@ async def analyze_content(
         }
 if __name__ == "__main__":
-    # Gradio interface
     iface = gr.Interface(
         fn=analyze_content,
         inputs=gr.Textbox(lines=8, label="Input Text"),

                 "mapped_name": "Sexual Abuse",
                 "description": (
                     "Any form of non-consensual sexual act, behavior, or interaction, involving coercion, manipulation, or physical force. "
+                  "This includes incidents of sexual assault, molestation, exploitation, harassment, and any acts where an individual is subjected to sexual acts against their will or without their consent. "
+                  "It also covers discussions or depictions of the aftermath of such abuse, such as trauma, emotional distress, legal proceedings, or therapy. "
+                  "References to inappropriate sexual advances, groping, or any other form of sexual misconduct are also included, as well as the psychological and emotional impact on survivors. "
+                  "Scenes where individuals are placed in sexually compromising situations, even if not directly acted upon, may also fall under this category."
                 )
             },
             "Self-Harm": {
             logger.error(f"Error loading model: {str(e)}")
             raise
+    def _chunk_text(self, text: str, chunk_size: int = 256, overlap: int = 15) -> List[str]:
         """Split text into overlapping chunks for processing."""
+        return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size - overlap)]
     async def analyze_chunk(
         self,
             description = info["description"]
             prompt = f"""
+            Check this text for any indication of {mapped_name} ({description}).
+            Be sensitive to subtle references or implications, make sure the text is not metaphorical.
+            Respond concisely with: YES, NO, or MAYBE.
+            Text: {chunk}
             Answer:
             """
                 with torch.no_grad():
                     outputs = self.model.generate(
                         **inputs,
+                        max_new_tokens=3,
                         do_sample=True,
+                        temperature=0.5,
+                        top_p=0.9,
                         pad_token_id=self.tokenizer.eos_token_id
                     )
+                response_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True).strip().upper()
+                first_word = response_text.split("\n")[-1].split()[0] if response_text else "NO"
+                if first_word == "YES":
+                    chunk_triggers[mapped_name] = chunk_triggers.get(mapped_name, 0) + 1
+                elif first_word == "MAYBE":
+                    chunk_triggers[mapped_name] = chunk_triggers.get(mapped_name, 0) + 0.5
                 if progress:
                     current_progress += progress_step
             await self.load_model(progress)
         chunks = self._chunk_text(script)
+        identified_triggers = {}
         progress_step = 0.4 / (len(chunks) * len(self.trigger_categories))
+        current_progress = 0.5  # Starting after model loading
+        for chunk_idx, chunk in enumerate(chunks, 1):
             chunk_triggers = await self.analyze_chunk(
                 chunk,
                 progress,
                 progress_step
             )
+            for trigger, count in chunk_triggers.items():
+                identified_triggers[trigger] = identified_triggers.get(trigger, 0) + count
         if progress:
             progress(0.95, "Finalizing results...")
         final_triggers = [
+            trigger for trigger, count in identified_triggers.items()
+            if count > 0.5
         ]
         return final_triggers if final_triggers else ["None"]
         }
 if __name__ == "__main__":
+    # This section is mainly for testing the analyzer directly
     iface = gr.Interface(
         fn=analyze_content,
         inputs=gr.Textbox(lines=8, label="Input Text"),