Kuberwastaken committed on
Commit
bb3dffd
·
1 Parent(s): 3f25aef

Experimental Model

Browse files
Files changed (1) hide show
  1. model/analyzer.py +77 -106
model/analyzer.py CHANGED
@@ -5,6 +5,7 @@ from datetime import datetime
5
  import gradio as gr
6
  from typing import Dict, List, Union, Optional
7
  import logging
 
8
 
9
  # Configure logging
10
  logging.basicConfig(level=logging.INFO)
@@ -19,103 +20,18 @@ class ContentAnalyzer:
19
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
20
  self.model = None
21
  self.tokenizer = None
22
- self.trigger_categories = self._init_trigger_categories()
23
-
24
- def _init_trigger_categories(self) -> Dict:
25
- """Initialize trigger categories with their descriptions."""
26
- return {
27
- "Violence": {
28
- "mapped_name": "Violence",
29
- "description": (
30
- "Any act involving physical force or aggression intended to cause harm, injury, or death to a person, animal, or object. "
31
- "Includes direct physical confrontations (e.g., fights, beatings, or assaults), implied violence (e.g., very graphical threats or descriptions of injuries), "
32
- "or large-scale events like wars, riots, or violent protests."
33
- )
34
- },
35
- "Death": {
36
- "mapped_name": "Death References",
37
- "description": (
38
- "Any mention, implication, or depiction of the loss of life, including direct deaths of characters, including mentions of deceased individuals, "
39
- "or abstract references to mortality (e.g., 'facing the end' or 'gone forever'). This also covers depictions of funerals, mourning, "
40
- "grieving, or any dialogue that centers around death, do not take metaphors into context that don't actually lead to death."
41
- )
42
- },
43
- "Substance Use": {
44
- "mapped_name": "Substance Use",
45
- "description": (
46
- "Any explicit or implied reference to the consumption, misuse, or abuse of drugs, alcohol, or other intoxicating substances. "
47
- "Includes scenes of drinking, smoking, or drug use, whether recreational or addictive. May also cover references to withdrawal symptoms, "
48
- "rehabilitation, or substance-related paraphernalia (e.g., needles, bottles, pipes)."
49
- )
50
- },
51
- "Gore": {
52
- "mapped_name": "Gore",
53
- "description": (
54
- "Extremely detailed and graphic depictions of highly severe physical injuries, mutilation, or extreme bodily harm, often accompanied by descriptions of heavy blood, exposed organs, "
55
- "or dismemberment. This includes war scenes with severe casualties, horror scenarios involving grotesque creatures, or medical procedures depicted with excessive detail."
56
- )
57
- },
58
- "Vomit": {
59
- "mapped_name": "Vomit",
60
- "description": (
61
- "Any reference to the act of vomiting, whether directly described, implied, or depicted in detail. This includes sounds or visual descriptions of the act, "
62
- "mentions of nausea leading to vomiting, or its aftermath (e.g., the presence of vomit, cleaning it up, or characters reacting to it)."
63
- )
64
- },
65
- "Sexual Content": {
66
- "mapped_name": "Sexual Content",
67
- "description": (
68
- "Any depiction or mention of sexual activity, intimacy, or sexual behavior, ranging from implied scenes to explicit descriptions. "
69
- "This includes romantic encounters, physical descriptions of characters in a sexual context, sexual dialogue, or references to sexual themes (e.g., harassment, innuendos)."
70
- )
71
- },
72
- "Sexual Abuse": {
73
- "mapped_name": "Sexual Abuse",
74
- "description": (
75
- "Any form of non-consensual sexual act, behavior, or interaction, involving coercion, manipulation, or physical force. "
76
- "This includes incidents of sexual assault, molestation, exploitation, harassment, and any acts where an individual is subjected to sexual acts against their will or without their consent. "
77
- "It also covers discussions or depictions of the aftermath of such abuse, such as trauma, emotional distress, legal proceedings, or therapy. "
78
- "References to inappropriate sexual advances, groping, or any other form of sexual misconduct are also included, as well as the psychological and emotional impact on survivors. "
79
- "Scenes where individuals are placed in sexually compromising situations, even if not directly acted upon, may also fall under this category."
80
- )
81
- },
82
- "Self-Harm": {
83
- "mapped_name": "Self-Harm",
84
- "description": (
85
- "Any mention or depiction of behaviors where an individual intentionally causes harm to themselves. This includes cutting, burning, or other forms of physical injury, "
86
- "as well as suicidal ideation, suicide attempts, or discussions of self-destructive thoughts and actions. References to scars, bruises, or other lasting signs of self-harm are also included."
87
- )
88
- },
89
- "Gun Use": {
90
- "mapped_name": "Gun Use",
91
- "description": (
92
- "Any explicit or implied mention of firearms being handled, fired, or used in a threatening manner. This includes scenes of gun violence, references to shootings, "
93
- "gun-related accidents, or the presence of firearms in a tense or dangerous context (e.g., holstered weapons during an argument)."
94
- )
95
- },
96
- "Animal Cruelty": {
97
- "mapped_name": "Animal Cruelty",
98
- "description": (
99
- "Any act of harm, abuse, or neglect toward animals, whether intentional or accidental. This includes physical abuse (e.g., hitting, injuring, or killing animals), "
100
- "mental or emotional mistreatment (e.g., starvation, isolation), and scenes where animals are subjected to pain or suffering for human entertainment or experimentation."
101
- )
102
- },
103
- "Mental Health Issues": {
104
- "mapped_name": "Mental Health Issues",
105
- "description": (
106
- "Any reference to mental health struggles, disorders, or psychological distress. This includes mentions of depression, anxiety, PTSD, bipolar disorder, schizophrenia, "
107
- "or other conditions. Scenes depicting therapy sessions, psychiatric treatment, or coping mechanisms (e.g., medication, journaling) are also included. May cover subtle hints "
108
- "like a character expressing feelings of worthlessness, hopelessness, or detachment from reality."
109
- )
110
- }
111
- }
112
 
113
  async def load_model(self, progress=None) -> None:
114
- """Load the model and tokenizer with progress updates."""
115
  try:
 
 
 
116
  if progress:
117
  progress(0.1, "Loading tokenizer...")
118
 
 
119
  self.tokenizer = AutoTokenizer.from_pretrained(
120
  "meta-llama/Llama-3.2-1B",
121
  use_fast=True
@@ -124,6 +40,7 @@ class ContentAnalyzer:
124
  if progress:
125
  progress(0.3, "Loading model...")
126
 
 
127
  self.model = AutoModelForCausalLM.from_pretrained(
128
  "meta-llama/Llama-3.2-1B",
129
  token=self.hf_token,
@@ -134,29 +51,42 @@ class ContentAnalyzer:
134
  if progress:
135
  progress(0.5, "Model loaded successfully")
136
 
 
137
  logger.info(f"Model loaded successfully on {self.device}")
138
  except Exception as e:
139
  logger.error(f"Error loading model: {str(e)}")
 
 
 
140
  raise
141
 
142
  def _chunk_text(self, text: str, chunk_size: int = 256, overlap: int = 15) -> List[str]:
143
  """Split text into overlapping chunks for processing."""
144
- return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size - overlap)]
 
 
 
 
 
145
 
146
  async def analyze_chunk(
147
  self,
148
  chunk: str,
 
149
  progress: Optional[gr.Progress] = None,
150
  current_progress: float = 0,
151
  progress_step: float = 0
152
  ) -> Dict[str, float]:
153
- """Analyze a single chunk of text for triggers."""
154
  chunk_triggers = {}
 
 
155
 
156
- for category, info in self.trigger_categories.items():
157
  mapped_name = info["mapped_name"]
158
  description = info["description"]
159
 
 
160
  prompt = f"""
161
  Check this text for any indication of {mapped_name} ({description}).
162
  Be sensitive to subtle references or implications, make sure the text is not metaphorical.
@@ -166,26 +96,33 @@ class ContentAnalyzer:
166
  """
167
 
168
  try:
 
169
  inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
170
  inputs = {k: v.to(self.device) for k, v in inputs.items()}
171
 
172
  with torch.no_grad():
 
173
  outputs = self.model.generate(
174
  **inputs,
175
- max_new_tokens=10,
176
  do_sample=True,
177
- temperature=0.7,
178
- top_p=0.8,
179
  pad_token_id=self.tokenizer.eos_token_id
180
  )
181
 
182
  response_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True).strip().upper()
183
  first_word = response_text.split("\n")[-1].split()[0] if response_text else "NO"
 
184
 
185
  if first_word == "YES":
 
186
  chunk_triggers[mapped_name] = chunk_triggers.get(mapped_name, 0) + 1
187
  elif first_word == "MAYBE":
 
188
  chunk_triggers[mapped_name] = chunk_triggers.get(mapped_name, 0) + 0.5
 
 
189
 
190
  if progress:
191
  current_progress += progress_step
@@ -193,22 +130,41 @@ class ContentAnalyzer:
193
 
194
  except Exception as e:
195
  logger.error(f"Error analyzing chunk for {mapped_name}: {str(e)}")
 
 
196
 
197
  return chunk_triggers
198
 
199
  async def analyze_script(self, script: str, progress: Optional[gr.Progress] = None) -> List[str]:
200
- """Analyze the entire script for triggers with progress updates."""
 
 
 
201
  if not self.model or not self.tokenizer:
202
  await self.load_model(progress)
203
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  chunks = self._chunk_text(script)
205
  identified_triggers = {}
206
- progress_step = 0.4 / (len(chunks) * len(self.trigger_categories))
207
  current_progress = 0.5 # Starting after model loading
208
 
209
  for chunk_idx, chunk in enumerate(chunks, 1):
210
  chunk_triggers = await self.analyze_chunk(
211
  chunk,
 
212
  progress,
213
  current_progress,
214
  progress_step
@@ -220,18 +176,29 @@ class ContentAnalyzer:
220
  if progress:
221
  progress(0.95, "Finalizing results...")
222
 
223
- final_triggers = [
224
- trigger for trigger, count in identified_triggers.items()
225
- if count > 0.5
226
- ]
227
 
228
- return final_triggers if final_triggers else ["None"]
 
 
 
 
 
 
 
 
 
229
 
230
  async def analyze_content(
231
  script: str,
232
  progress: Optional[gr.Progress] = None
233
  ) -> Dict[str, Union[List[str], str]]:
234
- """Main analysis function for the Gradio interface."""
 
 
 
235
  analyzer = ContentAnalyzer()
236
 
237
  try:
@@ -247,10 +214,14 @@ async def analyze_content(
247
  "analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
248
  }
249
 
 
250
  return result
251
 
252
  except Exception as e:
253
  logger.error(f"Analysis error: {str(e)}")
 
 
 
254
  return {
255
  "detected_triggers": ["Error occurred during analysis"],
256
  "confidence": "Error",
@@ -260,7 +231,7 @@ async def analyze_content(
260
  }
261
 
262
  if __name__ == "__main__":
263
- # This section is mainly for testing the analyzer directly
264
  iface = gr.Interface(
265
  fn=analyze_content,
266
  inputs=gr.Textbox(lines=8, label="Input Text"),
 
5
  import gradio as gr
6
  from typing import Dict, List, Union, Optional
7
  import logging
8
+ import traceback
9
 
10
  # Configure logging
11
  logging.basicConfig(level=logging.INFO)
 
20
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
21
  self.model = None
22
  self.tokenizer = None
23
+ logger.info(f"Initialized analyzer with device: {self.device}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  async def load_model(self, progress=None) -> None:
26
+ """Load the model and tokenizer with progress updates and detailed logging."""
27
  try:
28
+ print("\n=== Starting Model Loading ===")
29
+ print(f"Time: {datetime.now()}")
30
+
31
  if progress:
32
  progress(0.1, "Loading tokenizer...")
33
 
34
+ print("Loading tokenizer...")
35
  self.tokenizer = AutoTokenizer.from_pretrained(
36
  "meta-llama/Llama-3.2-1B",
37
  use_fast=True
 
40
  if progress:
41
  progress(0.3, "Loading model...")
42
 
43
+ print(f"Loading model on {self.device}...")
44
  self.model = AutoModelForCausalLM.from_pretrained(
45
  "meta-llama/Llama-3.2-1B",
46
  token=self.hf_token,
 
51
  if progress:
52
  progress(0.5, "Model loaded successfully")
53
 
54
+ print("Model and tokenizer loaded successfully")
55
  logger.info(f"Model loaded successfully on {self.device}")
56
  except Exception as e:
57
  logger.error(f"Error loading model: {str(e)}")
58
+ print(f"\nERROR DURING MODEL LOADING: {str(e)}")
59
+ print("Stack trace:")
60
+ traceback.print_exc()
61
  raise
62
 
63
  def _chunk_text(self, text: str, chunk_size: int = 256, overlap: int = 15) -> List[str]:
64
  """Split text into overlapping chunks for processing."""
65
+ chunks = []
66
+ for i in range(0, len(text), chunk_size - overlap):
67
+ chunk = text[i:i + chunk_size]
68
+ chunks.append(chunk)
69
+ print(f"Split text into {len(chunks)} chunks with {overlap} token overlap")
70
+ return chunks
71
 
72
  async def analyze_chunk(
73
  self,
74
  chunk: str,
75
+ trigger_categories: Dict,
76
  progress: Optional[gr.Progress] = None,
77
  current_progress: float = 0,
78
  progress_step: float = 0
79
  ) -> Dict[str, float]:
80
+ """Analyze a single chunk of text for triggers with detailed logging."""
81
  chunk_triggers = {}
82
+ print(f"\n--- Processing Chunk ---")
83
+ print(f"Chunk text (preview): {chunk[:50]}...")
84
 
85
+ for category, info in trigger_categories.items():
86
  mapped_name = info["mapped_name"]
87
  description = info["description"]
88
 
89
+ print(f"\nAnalyzing for {mapped_name}...")
90
  prompt = f"""
91
  Check this text for any indication of {mapped_name} ({description}).
92
  Be sensitive to subtle references or implications, make sure the text is not metaphorical.
 
96
  """
97
 
98
  try:
99
+ print("Sending prompt to model...")
100
  inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
101
  inputs = {k: v.to(self.device) for k, v in inputs.items()}
102
 
103
  with torch.no_grad():
104
+ print("Generating response...")
105
  outputs = self.model.generate(
106
  **inputs,
107
+ max_new_tokens=5,
108
  do_sample=True,
109
+ temperature=0.3,
110
+ top_p=0.9,
111
  pad_token_id=self.tokenizer.eos_token_id
112
  )
113
 
114
  response_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True).strip().upper()
115
  first_word = response_text.split("\n")[-1].split()[0] if response_text else "NO"
116
+ print(f"Model response for {mapped_name}: {first_word}")
117
 
118
  if first_word == "YES":
119
+ print(f"Detected {mapped_name} in this chunk!")
120
  chunk_triggers[mapped_name] = chunk_triggers.get(mapped_name, 0) + 1
121
  elif first_word == "MAYBE":
122
+ print(f"Possible {mapped_name} detected, marking for further review.")
123
  chunk_triggers[mapped_name] = chunk_triggers.get(mapped_name, 0) + 0.5
124
+ else:
125
+ print(f"No {mapped_name} detected in this chunk.")
126
 
127
  if progress:
128
  current_progress += progress_step
 
130
 
131
  except Exception as e:
132
  logger.error(f"Error analyzing chunk for {mapped_name}: {str(e)}")
133
+ print(f"Error during analysis of {mapped_name}: {str(e)}")
134
+ traceback.print_exc()
135
 
136
  return chunk_triggers
137
 
138
  async def analyze_script(self, script: str, progress: Optional[gr.Progress] = None) -> List[str]:
139
+ """Analyze the entire script for triggers with progress updates and detailed logging."""
140
+ print("\n=== Starting Script Analysis ===")
141
+ print(f"Time: {datetime.now()}")
142
+
143
  if not self.model or not self.tokenizer:
144
  await self.load_model(progress)
145
 
146
+ # Initialize trigger categories (kept from your working script)
147
+ trigger_categories = {
148
+ "Violence": {
149
+ "mapped_name": "Violence",
150
+ "description": (
151
+ "Any act involving physical force or aggression intended to cause harm, injury, or death to a person, animal, or object. "
152
+ "Includes direct physical confrontations (e.g., fights, beatings, or assaults), implied violence (e.g., very graphical threats or descriptions of injuries), "
153
+ "or large-scale events like wars, riots, or violent protests."
154
+ )
155
+ },
156
+ # ... [other categories remain the same]
157
+ }
158
+
159
  chunks = self._chunk_text(script)
160
  identified_triggers = {}
161
+ progress_step = 0.4 / (len(chunks) * len(trigger_categories))
162
  current_progress = 0.5 # Starting after model loading
163
 
164
  for chunk_idx, chunk in enumerate(chunks, 1):
165
  chunk_triggers = await self.analyze_chunk(
166
  chunk,
167
+ trigger_categories,
168
  progress,
169
  current_progress,
170
  progress_step
 
176
  if progress:
177
  progress(0.95, "Finalizing results...")
178
 
179
+ print("\n=== Analysis Complete ===")
180
+ print("Final Results:")
181
+ final_triggers = []
 
182
 
183
+ for mapped_name, count in identified_triggers.items():
184
+ if count > 0.5:
185
+ final_triggers.append(mapped_name)
186
+ print(f"- {mapped_name}: found in {count} chunks")
187
+
188
+ if not final_triggers:
189
+ print("No triggers detected")
190
+ final_triggers = ["None"]
191
+
192
+ return final_triggers
193
 
194
  async def analyze_content(
195
  script: str,
196
  progress: Optional[gr.Progress] = None
197
  ) -> Dict[str, Union[List[str], str]]:
198
+ """Main analysis function for the Gradio interface with detailed logging."""
199
+ print("\n=== Starting Content Analysis ===")
200
+ print(f"Time: {datetime.now()}")
201
+
202
  analyzer = ContentAnalyzer()
203
 
204
  try:
 
214
  "analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
215
  }
216
 
217
+ print("\nFinal Result Dictionary:", result)
218
  return result
219
 
220
  except Exception as e:
221
  logger.error(f"Analysis error: {str(e)}")
222
+ print(f"\nERROR OCCURRED: {str(e)}")
223
+ print("Stack trace:")
224
+ traceback.print_exc()
225
  return {
226
  "detected_triggers": ["Error occurred during analysis"],
227
  "confidence": "Error",
 
231
  }
232
 
233
  if __name__ == "__main__":
234
+ # Gradio interface
235
  iface = gr.Interface(
236
  fn=analyze_content,
237
  inputs=gr.Textbox(lines=8, label="Input Text"),