Kuberwastaken committed
Commit e15502b · 1 Parent(s): e4c9f8c

Fixed Analyse Script

Files changed (1):
  1. model/analyzer.py  +56 −11
model/analyzer.py CHANGED
@@ -26,7 +26,7 @@ class ContentAnalyzer:
             self.tokenizer = AutoTokenizer.from_pretrained(
                 self.model_name,
                 use_fast=True,
-                token=os.environ.get("HF_TOKEN") # Add token here
+                token=os.environ.get("HF_TOKEN")
             )
 
             print(f"Loading model on {self.device}...")
@@ -35,14 +35,13 @@ class ContentAnalyzer:
                 torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                 low_cpu_mem_usage=True,
                 device_map="auto",
-                token=os.environ.get("HF_TOKEN") # Add token here
+                token=os.environ.get("HF_TOKEN")
             )
             return True
         except Exception as e:
             print(f"Model loading error: {str(e)}")
             return False
 
-    # Rest of your code remains exactly the same
     def cleanup(self):
         if self.device == "cuda":
             torch.cuda.empty_cache()
@@ -52,6 +51,7 @@ class ContentAnalyzer:
         mapped_name = category_info["mapped_name"]
         description = category_info["description"]
 
+        print(f"\nAnalyzing for {mapped_name}...")
         prompt = f"""Check this text for any indication of {mapped_name} ({description}).
         Be sensitive to subtle references or implications, make sure the text is not metaphorical.
         Respond concisely with: YES, NO, or MAYBE.
@@ -59,10 +59,12 @@
         Answer:"""
 
         try:
+            print(f"Sending prompt to model...")
             inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
             inputs = {k: v.to(self.device) for k, v in inputs.items()}
 
             with torch.no_grad():
+                print("Generating response...")
                 outputs = self.model.generate(
                     **inputs,
                     max_new_tokens=10,
@@ -75,6 +77,14 @@ class ContentAnalyzer:
             response = self.tokenizer.decode(outputs[0], skip_special_tokens=True).strip().upper()
             first_word = response.split("\n")[-1].split()[0] if response else "NO"
 
+            print(f"Model response for {mapped_name}: {first_word}")
+            if first_word == "YES":
+                print(f"Detected {mapped_name} in this chunk!")
+            elif first_word == "MAYBE":
+                print(f"Possible {mapped_name} detected, marking for review.")
+            else:
+                print(f"No {mapped_name} detected in this chunk.")
+
             score = 1 if first_word == "YES" else 0.5 if first_word == "MAYBE" else 0
             return score, first_word
 
@@ -83,6 +93,9 @@ class ContentAnalyzer:
             return 0, "NO"
 
     def analyze_text(self, text):
+        print("\n=== Starting Analysis ===")
+        print(f"Time: {datetime.now()}")
+
        if not self.load_model():
            return {
                "detected_triggers": {"0": "Error"},
@@ -95,14 +108,44 @@ class ContentAnalyzer:
         overlap = 15
         script_chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size - overlap)]
 
+        # Using the more detailed trigger categories
         trigger_categories = {
-            "Violence": {"mapped_name": "Violence", "description": "Any act involving physical force or aggression intended to cause harm, injury, or death."},
-            "Death": {"mapped_name": "Death References", "description": "Any mention, implication, or depiction of the loss of life, including direct deaths or abstract references to mortality."},
-            "Substance_Use": {"mapped_name": "Substance Use", "description": "References to consumption, misuse, or abuse of drugs, alcohol, or other intoxicating substances."},
-            "Gore": {"mapped_name": "Gore", "description": "Graphic depictions of severe physical injuries, mutilation, or extreme bodily harm."},
-            "Sexual_Content": {"mapped_name": "Sexual Content", "description": "Depictions or mentions of sexual activity, intimacy, or sexual behavior."},
-            "Self_Harm": {"mapped_name": "Self-Harm", "description": "Behaviors where an individual intentionally causes harm to themselves."},
-            "Mental_Health": {"mapped_name": "Mental Health Issues", "description": "References to mental health struggles, disorders, or psychological distress."}
+            "Violence": {
+                "mapped_name": "Violence",
+                "description": "Any act involving physical force or aggression intended to cause harm, injury, or death to a person, animal, or object. Includes direct physical confrontations, implied violence, or large-scale events like wars, riots, or violent protests."
+            },
+            "Death": {
+                "mapped_name": "Death References",
+                "description": "Any mention, implication, or depiction of the loss of life, including direct deaths of characters, mentions of deceased individuals, or abstract references to mortality."
+            },
+            "Substance_Use": {
+                "mapped_name": "Substance Use",
+                "description": "Any explicit or implied reference to the consumption, misuse, or abuse of drugs, alcohol, or other intoxicating substances."
+            },
+            "Gore": {
+                "mapped_name": "Gore",
+                "description": "Extremely detailed and graphic depictions of highly severe physical injuries, mutilation, or extreme bodily harm."
+            },
+            "Sexual_Content": {
+                "mapped_name": "Sexual Content",
+                "description": "Any depiction or mention of sexual activity, intimacy, or sexual behavior."
+            },
+            "Self_Harm": {
+                "mapped_name": "Self-Harm",
+                "description": "Any mention or depiction of behaviors where an individual intentionally causes harm to themselves."
+            },
+            "Gun_Use": {
+                "mapped_name": "Gun Use",
+                "description": "Any explicit or implied mention of firearms being handled, fired, or used in a threatening manner."
+            },
+            "Animal_Cruelty": {
+                "mapped_name": "Animal Cruelty",
+                "description": "Any act of harm, abuse, or neglect toward animals, whether intentional or accidental."
+            },
+            "Mental_Health": {
+                "mapped_name": "Mental Health Issues",
+                "description": "Any reference to mental health struggles, disorders, or psychological distress."
+            }
         }
 
         identified_triggers = {}
@@ -119,6 +162,7 @@ class ContentAnalyzer:
         final_triggers = [category for category, count in identified_triggers.items() if count > 0.5]
         self.cleanup()
 
+        print("\n=== Analysis Complete ===")
         if not final_triggers:
             result = {
                 "detected_triggers": {"0": "None"},
@@ -134,7 +178,8 @@ class ContentAnalyzer:
             "model": self.model_name,
             "analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
         }
-
+
+        print("\nFinal Result:", result)
         return result
 
 def analyze_content(text):
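The substantive fix is passing the Hugging Face access token to both from_pretrained calls via the HF_TOKEN environment variable, which gated models require for download. A minimal sketch of the pattern follows; the model name is a placeholder, since the script's actual self.model_name is not visible in this diff:

import os
from transformers import AutoTokenizer

# Assumed placeholder; the real self.model_name is not shown in the diff.
MODEL_NAME = "some-org/some-gated-model"

# HF_TOKEN must already be set in the shell, e.g. `export HF_TOKEN=...`
# (a token saved on disk via `huggingface-cli login` also works, since
# transformers falls back to it when no token argument is given).
token = os.environ.get("HF_TOKEN")
if not token:
    raise RuntimeError("HF_TOKEN is not set; gated models cannot be loaded without it.")

# Same pattern the commit uses: pass the token straight to from_pretrained.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, token=token)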
 
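For completeness, a hypothetical way to exercise the fixed script; the no-argument constructor and the import path are assumptions, since only fragments of model/analyzer.py appear in this diff:

# Hypothetical usage sketch. Only analyze_text and the result dict's keys
# are confirmed by the diff; the constructor signature is assumed.
from model.analyzer import ContentAnalyzer

analyzer = ContentAnalyzer()
result = analyzer.analyze_text("Sample script text to scan for triggers.")

# Per the diff, the result dict carries "detected_triggers", "model",
# and "analysis_timestamp" keys.
print(result["detected_triggers"])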