Kuberwastaken committed on
Commit
bb3dffd
·
1 Parent(s): 3f25aef

Experimental Model

Browse files
Files changed (1) hide show
  1. model/analyzer.py +77 -106
model/analyzer.py CHANGED
@@ -5,6 +5,7 @@ from datetime import datetime
5
  import gradio as gr
6
  from typing import Dict, List, Union, Optional
7
  import logging
 
8
 
9
  # Configure logging
10
  logging.basicConfig(level=logging.INFO)
@@ -19,103 +20,18 @@ class ContentAnalyzer:
19
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
20
  self.model = None
21
  self.tokenizer = None
22
- self.trigger_categories = self._init_trigger_categories()
23
-
24
- def _init_trigger_categories(self) -> Dict:
25
- """Initialize trigger categories with their descriptions."""
26
- return {
27
- "Violence": {
28
- "mapped_name": "Violence",
29
- "description": (
30
- "Any act involving physical force or aggression intended to cause harm, injury, or death to a person, animal, or object. "
31
- "Includes direct physical confrontations (e.g., fights, beatings, or assaults), implied violence (e.g., very graphical threats or descriptions of injuries), "
32
- "or large-scale events like wars, riots, or violent protests."
33
- )
34
- },
35
- "Death": {
36
- "mapped_name": "Death References",
37
- "description": (
38
- "Any mention, implication, or depiction of the loss of life, including direct deaths of characters, including mentions of deceased individuals, "
39
- "or abstract references to mortality (e.g., 'facing the end' or 'gone forever'). This also covers depictions of funerals, mourning, "
40
- "grieving, or any dialogue that centers around death, do not take metaphors into context that don't actually lead to death."
41
- )
42
- },
43
- "Substance Use": {
44
- "mapped_name": "Substance Use",
45
- "description": (
46
- "Any explicit or implied reference to the consumption, misuse, or abuse of drugs, alcohol, or other intoxicating substances. "
47
- "Includes scenes of drinking, smoking, or drug use, whether recreational or addictive. May also cover references to withdrawal symptoms, "
48
- "rehabilitation, or substance-related paraphernalia (e.g., needles, bottles, pipes)."
49
- )
50
- },
51
- "Gore": {
52
- "mapped_name": "Gore",
53
- "description": (
54
- "Extremely detailed and graphic depictions of highly severe physical injuries, mutilation, or extreme bodily harm, often accompanied by descriptions of heavy blood, exposed organs, "
55
- "or dismemberment. This includes war scenes with severe casualties, horror scenarios involving grotesque creatures, or medical procedures depicted with excessive detail."
56
- )
57
- },
58
- "Vomit": {
59
- "mapped_name": "Vomit",
60
- "description": (
61
- "Any reference to the act of vomiting, whether directly described, implied, or depicted in detail. This includes sounds or visual descriptions of the act, "
62
- "mentions of nausea leading to vomiting, or its aftermath (e.g., the presence of vomit, cleaning it up, or characters reacting to it)."
63
- )
64
- },
65
- "Sexual Content": {
66
- "mapped_name": "Sexual Content",
67
- "description": (
68
- "Any depiction or mention of sexual activity, intimacy, or sexual behavior, ranging from implied scenes to explicit descriptions. "
69
- "This includes romantic encounters, physical descriptions of characters in a sexual context, sexual dialogue, or references to sexual themes (e.g., harassment, innuendos)."
70
- )
71
- },
72
- "Sexual Abuse": {
73
- "mapped_name": "Sexual Abuse",
74
- "description": (
75
- "Any form of non-consensual sexual act, behavior, or interaction, involving coercion, manipulation, or physical force. "
76
- "This includes incidents of sexual assault, molestation, exploitation, harassment, and any acts where an individual is subjected to sexual acts against their will or without their consent. "
77
- "It also covers discussions or depictions of the aftermath of such abuse, such as trauma, emotional distress, legal proceedings, or therapy. "
78
- "References to inappropriate sexual advances, groping, or any other form of sexual misconduct are also included, as well as the psychological and emotional impact on survivors. "
79
- "Scenes where individuals are placed in sexually compromising situations, even if not directly acted upon, may also fall under this category."
80
- )
81
- },
82
- "Self-Harm": {
83
- "mapped_name": "Self-Harm",
84
- "description": (
85
- "Any mention or depiction of behaviors where an individual intentionally causes harm to themselves. This includes cutting, burning, or other forms of physical injury, "
86
- "as well as suicidal ideation, suicide attempts, or discussions of self-destructive thoughts and actions. References to scars, bruises, or other lasting signs of self-harm are also included."
87
- )
88
- },
89
- "Gun Use": {
90
- "mapped_name": "Gun Use",
91
- "description": (
92
- "Any explicit or implied mention of firearms being handled, fired, or used in a threatening manner. This includes scenes of gun violence, references to shootings, "
93
- "gun-related accidents, or the presence of firearms in a tense or dangerous context (e.g., holstered weapons during an argument)."
94
- )
95
- },
96
- "Animal Cruelty": {
97
- "mapped_name": "Animal Cruelty",
98
- "description": (
99
- "Any act of harm, abuse, or neglect toward animals, whether intentional or accidental. This includes physical abuse (e.g., hitting, injuring, or killing animals), "
100
- "mental or emotional mistreatment (e.g., starvation, isolation), and scenes where animals are subjected to pain or suffering for human entertainment or experimentation."
101
- )
102
- },
103
- "Mental Health Issues": {
104
- "mapped_name": "Mental Health Issues",
105
- "description": (
106
- "Any reference to mental health struggles, disorders, or psychological distress. This includes mentions of depression, anxiety, PTSD, bipolar disorder, schizophrenia, "
107
- "or other conditions. Scenes depicting therapy sessions, psychiatric treatment, or coping mechanisms (e.g., medication, journaling) are also included. May cover subtle hints "
108
- "like a character expressing feelings of worthlessness, hopelessness, or detachment from reality."
109
- )
110
- }
111
- }
112
 
113
  async def load_model(self, progress=None) -> None:
114
- """Load the model and tokenizer with progress updates."""
115
  try:
 
 
 
116
  if progress:
117
  progress(0.1, "Loading tokenizer...")
118
 
 
119
  self.tokenizer = AutoTokenizer.from_pretrained(
120
  "meta-llama/Llama-3.2-1B",
121
  use_fast=True
@@ -124,6 +40,7 @@ class ContentAnalyzer:
124
  if progress:
125
  progress(0.3, "Loading model...")
126
 
 
127
  self.model = AutoModelForCausalLM.from_pretrained(
128
  "meta-llama/Llama-3.2-1B",
129
  token=self.hf_token,
@@ -134,29 +51,42 @@ class ContentAnalyzer:
134
  if progress:
135
  progress(0.5, "Model loaded successfully")
136
 
 
137
  logger.info(f"Model loaded successfully on {self.device}")
138
  except Exception as e:
139
  logger.error(f"Error loading model: {str(e)}")
 
 
 
140
  raise
141
 
142
  def _chunk_text(self, text: str, chunk_size: int = 256, overlap: int = 15) -> List[str]:
143
  """Split text into overlapping chunks for processing."""
144
- return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size - overlap)]
 
 
 
 
 
145
 
146
  async def analyze_chunk(
147
  self,
148
  chunk: str,
 
149
  progress: Optional[gr.Progress] = None,
150
  current_progress: float = 0,
151
  progress_step: float = 0
152
  ) -> Dict[str, float]:
153
- """Analyze a single chunk of text for triggers."""
154
  chunk_triggers = {}
 
 
155
 
156
- for category, info in self.trigger_categories.items():
157
  mapped_name = info["mapped_name"]
158
  description = info["description"]
159
 
 
160
  prompt = f"""
161
  Check this text for any indication of {mapped_name} ({description}).
162
  Be sensitive to subtle references or implications, make sure the text is not metaphorical.
@@ -166,26 +96,33 @@ class ContentAnalyzer:
166
  """
167
 
168
  try:
 
169
  inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
170
  inputs = {k: v.to(self.device) for k, v in inputs.items()}
171
 
172
  with torch.no_grad():
 
173
  outputs = self.model.generate(
174
  **inputs,
175
- max_new_tokens=10,
176
  do_sample=True,
177
- temperature=0.7,
178
- top_p=0.8,
179
  pad_token_id=self.tokenizer.eos_token_id
180
  )
181
 
182
  response_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True).strip().upper()
183
  first_word = response_text.split("\n")[-1].split()[0] if response_text else "NO"
 
184
 
185
  if first_word == "YES":
 
186
  chunk_triggers[mapped_name] = chunk_triggers.get(mapped_name, 0) + 1
187
  elif first_word == "MAYBE":
 
188
  chunk_triggers[mapped_name] = chunk_triggers.get(mapped_name, 0) + 0.5
 
 
189
 
190
  if progress:
191
  current_progress += progress_step
@@ -193,22 +130,41 @@ class ContentAnalyzer:
193
 
194
  except Exception as e:
195
  logger.error(f"Error analyzing chunk for {mapped_name}: {str(e)}")
 
 
196
 
197
  return chunk_triggers
198
 
199
  async def analyze_script(self, script: str, progress: Optional[gr.Progress] = None) -> List[str]:
200
- """Analyze the entire script for triggers with progress updates."""
 
 
 
201
  if not self.model or not self.tokenizer:
202
  await self.load_model(progress)
203
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  chunks = self._chunk_text(script)
205
  identified_triggers = {}
206
- progress_step = 0.4 / (len(chunks) * len(self.trigger_categories))
207
  current_progress = 0.5 # Starting after model loading
208
 
209
  for chunk_idx, chunk in enumerate(chunks, 1):
210
  chunk_triggers = await self.analyze_chunk(
211
  chunk,
 
212
  progress,
213
  current_progress,
214
  progress_step
@@ -220,18 +176,29 @@ class ContentAnalyzer:
220
  if progress:
221
  progress(0.95, "Finalizing results...")
222
 
223
- final_triggers = [
224
- trigger for trigger, count in identified_triggers.items()
225
- if count > 0.5
226
- ]
227
 
228
- return final_triggers if final_triggers else ["None"]
 
 
 
 
 
 
 
 
 
229
 
230
  async def analyze_content(
231
  script: str,
232
  progress: Optional[gr.Progress] = None
233
  ) -> Dict[str, Union[List[str], str]]:
234
- """Main analysis function for the Gradio interface."""
 
 
 
235
  analyzer = ContentAnalyzer()
236
 
237
  try:
@@ -247,10 +214,14 @@ async def analyze_content(
247
  "analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
248
  }
249
 
 
250
  return result
251
 
252
  except Exception as e:
253
  logger.error(f"Analysis error: {str(e)}")
 
 
 
254
  return {
255
  "detected_triggers": ["Error occurred during analysis"],
256
  "confidence": "Error",
@@ -260,7 +231,7 @@ async def analyze_content(
260
  }
261
 
262
  if __name__ == "__main__":
263
- # This section is mainly for testing the analyzer directly
264
  iface = gr.Interface(
265
  fn=analyze_content,
266
  inputs=gr.Textbox(lines=8, label="Input Text"),
 
5
  import gradio as gr
6
  from typing import Dict, List, Union, Optional
7
  import logging
8
+ import traceback
9
 
10
  # Configure logging
11
  logging.basicConfig(level=logging.INFO)
 
20
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
21
  self.model = None
22
  self.tokenizer = None
23
+ logger.info(f"Initialized analyzer with device: {self.device}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  async def load_model(self, progress=None) -> None:
26
+ """Load the model and tokenizer with progress updates and detailed logging."""
27
  try:
28
+ print("\n=== Starting Model Loading ===")
29
+ print(f"Time: {datetime.now()}")
30
+
31
  if progress:
32
  progress(0.1, "Loading tokenizer...")
33
 
34
+ print("Loading tokenizer...")
35
  self.tokenizer = AutoTokenizer.from_pretrained(
36
  "meta-llama/Llama-3.2-1B",
37
  use_fast=True
 
40
  if progress:
41
  progress(0.3, "Loading model...")
42
 
43
+ print(f"Loading model on {self.device}...")
44
  self.model = AutoModelForCausalLM.from_pretrained(
45
  "meta-llama/Llama-3.2-1B",
46
  token=self.hf_token,
 
51
  if progress:
52
  progress(0.5, "Model loaded successfully")
53
 
54
+ print("Model and tokenizer loaded successfully")
55
  logger.info(f"Model loaded successfully on {self.device}")
56
  except Exception as e:
57
  logger.error(f"Error loading model: {str(e)}")
58
+ print(f"\nERROR DURING MODEL LOADING: {str(e)}")
59
+ print("Stack trace:")
60
+ traceback.print_exc()
61
  raise
62
 
63
  def _chunk_text(self, text: str, chunk_size: int = 256, overlap: int = 15) -> List[str]:
64
  """Split text into overlapping chunks for processing."""
65
+ chunks = []
66
+ for i in range(0, len(text), chunk_size - overlap):
67
+ chunk = text[i:i + chunk_size]
68
+ chunks.append(chunk)
69
+ print(f"Split text into {len(chunks)} chunks with {overlap} token overlap")
70
+ return chunks
71
 
72
  async def analyze_chunk(
73
  self,
74
  chunk: str,
75
+ trigger_categories: Dict,
76
  progress: Optional[gr.Progress] = None,
77
  current_progress: float = 0,
78
  progress_step: float = 0
79
  ) -> Dict[str, float]:
80
+ """Analyze a single chunk of text for triggers with detailed logging."""
81
  chunk_triggers = {}
82
+ print(f"\n--- Processing Chunk ---")
83
+ print(f"Chunk text (preview): {chunk[:50]}...")
84
 
85
+ for category, info in trigger_categories.items():
86
  mapped_name = info["mapped_name"]
87
  description = info["description"]
88
 
89
+ print(f"\nAnalyzing for {mapped_name}...")
90
  prompt = f"""
91
  Check this text for any indication of {mapped_name} ({description}).
92
  Be sensitive to subtle references or implications, make sure the text is not metaphorical.
 
96
  """
97
 
98
  try:
99
+ print("Sending prompt to model...")
100
  inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
101
  inputs = {k: v.to(self.device) for k, v in inputs.items()}
102
 
103
  with torch.no_grad():
104
+ print("Generating response...")
105
  outputs = self.model.generate(
106
  **inputs,
107
+ max_new_tokens=5,
108
  do_sample=True,
109
+ temperature=0.3,
110
+ top_p=0.9,
111
  pad_token_id=self.tokenizer.eos_token_id
112
  )
113
 
114
  response_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True).strip().upper()
115
  first_word = response_text.split("\n")[-1].split()[0] if response_text else "NO"
116
+ print(f"Model response for {mapped_name}: {first_word}")
117
 
118
  if first_word == "YES":
119
+ print(f"Detected {mapped_name} in this chunk!")
120
  chunk_triggers[mapped_name] = chunk_triggers.get(mapped_name, 0) + 1
121
  elif first_word == "MAYBE":
122
+ print(f"Possible {mapped_name} detected, marking for further review.")
123
  chunk_triggers[mapped_name] = chunk_triggers.get(mapped_name, 0) + 0.5
124
+ else:
125
+ print(f"No {mapped_name} detected in this chunk.")
126
 
127
  if progress:
128
  current_progress += progress_step
 
130
 
131
  except Exception as e:
132
  logger.error(f"Error analyzing chunk for {mapped_name}: {str(e)}")
133
+ print(f"Error during analysis of {mapped_name}: {str(e)}")
134
+ traceback.print_exc()
135
 
136
  return chunk_triggers
137
 
138
  async def analyze_script(self, script: str, progress: Optional[gr.Progress] = None) -> List[str]:
139
+ """Analyze the entire script for triggers with progress updates and detailed logging."""
140
+ print("\n=== Starting Script Analysis ===")
141
+ print(f"Time: {datetime.now()}")
142
+
143
  if not self.model or not self.tokenizer:
144
  await self.load_model(progress)
145
 
146
+ # Initialize trigger categories (kept from your working script)
147
+ trigger_categories = {
148
+ "Violence": {
149
+ "mapped_name": "Violence",
150
+ "description": (
151
+ "Any act involving physical force or aggression intended to cause harm, injury, or death to a person, animal, or object. "
152
+ "Includes direct physical confrontations (e.g., fights, beatings, or assaults), implied violence (e.g., very graphical threats or descriptions of injuries), "
153
+ "or large-scale events like wars, riots, or violent protests."
154
+ )
155
+ },
156
+ # ... [other categories remain the same]
157
+ }
158
+
159
  chunks = self._chunk_text(script)
160
  identified_triggers = {}
161
+ progress_step = 0.4 / (len(chunks) * len(trigger_categories))
162
  current_progress = 0.5 # Starting after model loading
163
 
164
  for chunk_idx, chunk in enumerate(chunks, 1):
165
  chunk_triggers = await self.analyze_chunk(
166
  chunk,
167
+ trigger_categories,
168
  progress,
169
  current_progress,
170
  progress_step
 
176
  if progress:
177
  progress(0.95, "Finalizing results...")
178
 
179
+ print("\n=== Analysis Complete ===")
180
+ print("Final Results:")
181
+ final_triggers = []
 
182
 
183
+ for mapped_name, count in identified_triggers.items():
184
+ if count > 0.5:
185
+ final_triggers.append(mapped_name)
186
+ print(f"- {mapped_name}: found in {count} chunks")
187
+
188
+ if not final_triggers:
189
+ print("No triggers detected")
190
+ final_triggers = ["None"]
191
+
192
+ return final_triggers
193
 
194
  async def analyze_content(
195
  script: str,
196
  progress: Optional[gr.Progress] = None
197
  ) -> Dict[str, Union[List[str], str]]:
198
+ """Main analysis function for the Gradio interface with detailed logging."""
199
+ print("\n=== Starting Content Analysis ===")
200
+ print(f"Time: {datetime.now()}")
201
+
202
  analyzer = ContentAnalyzer()
203
 
204
  try:
 
214
  "analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
215
  }
216
 
217
+ print("\nFinal Result Dictionary:", result)
218
  return result
219
 
220
  except Exception as e:
221
  logger.error(f"Analysis error: {str(e)}")
222
+ print(f"\nERROR OCCURRED: {str(e)}")
223
+ print("Stack trace:")
224
+ traceback.print_exc()
225
  return {
226
  "detected_triggers": ["Error occurred during analysis"],
227
  "confidence": "Error",
 
231
  }
232
 
233
  if __name__ == "__main__":
234
+ # Gradio interface
235
  iface = gr.Interface(
236
  fn=analyze_content,
237
  inputs=gr.Textbox(lines=8, label="Input Text"),