Spaces:
Running
Running
Commit
·
bb3dffd
1
Parent(s):
3f25aef
Experimental Model
Browse files- model/analyzer.py +77 -106
model/analyzer.py
CHANGED
@@ -5,6 +5,7 @@ from datetime import datetime
|
|
5 |
import gradio as gr
|
6 |
from typing import Dict, List, Union, Optional
|
7 |
import logging
|
|
|
8 |
|
9 |
# Configure logging
|
10 |
logging.basicConfig(level=logging.INFO)
|
@@ -19,103 +20,18 @@ class ContentAnalyzer:
|
|
19 |
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
20 |
self.model = None
|
21 |
self.tokenizer = None
|
22 |
-
|
23 |
-
|
24 |
-
def _init_trigger_categories(self) -> Dict:
    """Build the catalogue of trigger categories.

    Returns:
        A dict keyed by category name. Each value is a dict holding the
        ``mapped_name`` label for that category and the ``description``
        text that explains what the category covers.
    """
    # One row per category: (category key, mapped name, description).
    # Row order is the insertion order of the returned dict.
    catalogue = (
        (
            "Violence",
            "Violence",
            "Any act involving physical force or aggression intended to cause harm, injury, or death to a person, animal, or object. "
            "Includes direct physical confrontations (e.g., fights, beatings, or assaults), implied violence (e.g., very graphical threats or descriptions of injuries), "
            "or large-scale events like wars, riots, or violent protests.",
        ),
        (
            "Death",
            "Death References",
            "Any mention, implication, or depiction of the loss of life, including direct deaths of characters, including mentions of deceased individuals, "
            "or abstract references to mortality (e.g., 'facing the end' or 'gone forever'). This also covers depictions of funerals, mourning, "
            "grieving, or any dialogue that centers around death, do not take metaphors into context that don't actually lead to death.",
        ),
        (
            "Substance Use",
            "Substance Use",
            "Any explicit or implied reference to the consumption, misuse, or abuse of drugs, alcohol, or other intoxicating substances. "
            "Includes scenes of drinking, smoking, or drug use, whether recreational or addictive. May also cover references to withdrawal symptoms, "
            "rehabilitation, or substance-related paraphernalia (e.g., needles, bottles, pipes).",
        ),
        (
            "Gore",
            "Gore",
            "Extremely detailed and graphic depictions of highly severe physical injuries, mutilation, or extreme bodily harm, often accompanied by descriptions of heavy blood, exposed organs, "
            "or dismemberment. This includes war scenes with severe casualties, horror scenarios involving grotesque creatures, or medical procedures depicted with excessive detail.",
        ),
        (
            "Vomit",
            "Vomit",
            "Any reference to the act of vomiting, whether directly described, implied, or depicted in detail. This includes sounds or visual descriptions of the act, "
            "mentions of nausea leading to vomiting, or its aftermath (e.g., the presence of vomit, cleaning it up, or characters reacting to it).",
        ),
        (
            "Sexual Content",
            "Sexual Content",
            "Any depiction or mention of sexual activity, intimacy, or sexual behavior, ranging from implied scenes to explicit descriptions. "
            "This includes romantic encounters, physical descriptions of characters in a sexual context, sexual dialogue, or references to sexual themes (e.g., harassment, innuendos).",
        ),
        (
            "Sexual Abuse",
            "Sexual Abuse",
            "Any form of non-consensual sexual act, behavior, or interaction, involving coercion, manipulation, or physical force. "
            "This includes incidents of sexual assault, molestation, exploitation, harassment, and any acts where an individual is subjected to sexual acts against their will or without their consent. "
            "It also covers discussions or depictions of the aftermath of such abuse, such as trauma, emotional distress, legal proceedings, or therapy. "
            "References to inappropriate sexual advances, groping, or any other form of sexual misconduct are also included, as well as the psychological and emotional impact on survivors. "
            "Scenes where individuals are placed in sexually compromising situations, even if not directly acted upon, may also fall under this category.",
        ),
        (
            "Self-Harm",
            "Self-Harm",
            "Any mention or depiction of behaviors where an individual intentionally causes harm to themselves. This includes cutting, burning, or other forms of physical injury, "
            "as well as suicidal ideation, suicide attempts, or discussions of self-destructive thoughts and actions. References to scars, bruises, or other lasting signs of self-harm are also included.",
        ),
        (
            "Gun Use",
            "Gun Use",
            "Any explicit or implied mention of firearms being handled, fired, or used in a threatening manner. This includes scenes of gun violence, references to shootings, "
            "gun-related accidents, or the presence of firearms in a tense or dangerous context (e.g., holstered weapons during an argument).",
        ),
        (
            "Animal Cruelty",
            "Animal Cruelty",
            "Any act of harm, abuse, or neglect toward animals, whether intentional or accidental. This includes physical abuse (e.g., hitting, injuring, or killing animals), "
            "mental or emotional mistreatment (e.g., starvation, isolation), and scenes where animals are subjected to pain or suffering for human entertainment or experimentation.",
        ),
        (
            "Mental Health Issues",
            "Mental Health Issues",
            "Any reference to mental health struggles, disorders, or psychological distress. This includes mentions of depression, anxiety, PTSD, bipolar disorder, schizophrenia, "
            "or other conditions. Scenes depicting therapy sessions, psychiatric treatment, or coping mechanisms (e.g., medication, journaling) are also included. May cover subtle hints "
            "like a character expressing feelings of worthlessness, hopelessness, or detachment from reality.",
        ),
    )

    return {
        key: {"mapped_name": label, "description": blurb}
        for key, label, blurb in catalogue
    }
|
112 |
|
113 |
async def load_model(self, progress=None) -> None:
|
114 |
-
"""Load the model and tokenizer with progress updates."""
|
115 |
try:
|
|
|
|
|
|
|
116 |
if progress:
|
117 |
progress(0.1, "Loading tokenizer...")
|
118 |
|
|
|
119 |
self.tokenizer = AutoTokenizer.from_pretrained(
|
120 |
"meta-llama/Llama-3.2-1B",
|
121 |
use_fast=True
|
@@ -124,6 +40,7 @@ class ContentAnalyzer:
|
|
124 |
if progress:
|
125 |
progress(0.3, "Loading model...")
|
126 |
|
|
|
127 |
self.model = AutoModelForCausalLM.from_pretrained(
|
128 |
"meta-llama/Llama-3.2-1B",
|
129 |
token=self.hf_token,
|
@@ -134,29 +51,42 @@ class ContentAnalyzer:
|
|
134 |
if progress:
|
135 |
progress(0.5, "Model loaded successfully")
|
136 |
|
|
|
137 |
logger.info(f"Model loaded successfully on {self.device}")
|
138 |
except Exception as e:
|
139 |
logger.error(f"Error loading model: {str(e)}")
|
|
|
|
|
|
|
140 |
raise
|
141 |
|
142 |
def _chunk_text(self, text: str, chunk_size: int = 256, overlap: int = 15) -> List[str]:
|
143 |
"""Split text into overlapping chunks for processing."""
|
144 |
-
|
|
|
|
|
|
|
|
|
|
|
145 |
|
146 |
async def analyze_chunk(
|
147 |
self,
|
148 |
chunk: str,
|
|
|
149 |
progress: Optional[gr.Progress] = None,
|
150 |
current_progress: float = 0,
|
151 |
progress_step: float = 0
|
152 |
) -> Dict[str, float]:
|
153 |
-
"""Analyze a single chunk of text for triggers."""
|
154 |
chunk_triggers = {}
|
|
|
|
|
155 |
|
156 |
-
for category, info in
|
157 |
mapped_name = info["mapped_name"]
|
158 |
description = info["description"]
|
159 |
|
|
|
160 |
prompt = f"""
|
161 |
Check this text for any indication of {mapped_name} ({description}).
|
162 |
Be sensitive to subtle references or implications, make sure the text is not metaphorical.
|
@@ -166,26 +96,33 @@ class ContentAnalyzer:
|
|
166 |
"""
|
167 |
|
168 |
try:
|
|
|
169 |
inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
|
170 |
inputs = {k: v.to(self.device) for k, v in inputs.items()}
|
171 |
|
172 |
with torch.no_grad():
|
|
|
173 |
outputs = self.model.generate(
|
174 |
**inputs,
|
175 |
-
max_new_tokens=
|
176 |
do_sample=True,
|
177 |
-
temperature=0.
|
178 |
-
top_p=0.
|
179 |
pad_token_id=self.tokenizer.eos_token_id
|
180 |
)
|
181 |
|
182 |
response_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True).strip().upper()
|
183 |
first_word = response_text.split("\n")[-1].split()[0] if response_text else "NO"
|
|
|
184 |
|
185 |
if first_word == "YES":
|
|
|
186 |
chunk_triggers[mapped_name] = chunk_triggers.get(mapped_name, 0) + 1
|
187 |
elif first_word == "MAYBE":
|
|
|
188 |
chunk_triggers[mapped_name] = chunk_triggers.get(mapped_name, 0) + 0.5
|
|
|
|
|
189 |
|
190 |
if progress:
|
191 |
current_progress += progress_step
|
@@ -193,22 +130,41 @@ class ContentAnalyzer:
|
|
193 |
|
194 |
except Exception as e:
|
195 |
logger.error(f"Error analyzing chunk for {mapped_name}: {str(e)}")
|
|
|
|
|
196 |
|
197 |
return chunk_triggers
|
198 |
|
199 |
async def analyze_script(self, script: str, progress: Optional[gr.Progress] = None) -> List[str]:
|
200 |
-
"""Analyze the entire script for triggers with progress updates."""
|
|
|
|
|
|
|
201 |
if not self.model or not self.tokenizer:
|
202 |
await self.load_model(progress)
|
203 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
204 |
chunks = self._chunk_text(script)
|
205 |
identified_triggers = {}
|
206 |
-
progress_step = 0.4 / (len(chunks) * len(
|
207 |
current_progress = 0.5 # Starting after model loading
|
208 |
|
209 |
for chunk_idx, chunk in enumerate(chunks, 1):
|
210 |
chunk_triggers = await self.analyze_chunk(
|
211 |
chunk,
|
|
|
212 |
progress,
|
213 |
current_progress,
|
214 |
progress_step
|
@@ -220,18 +176,29 @@ class ContentAnalyzer:
|
|
220 |
if progress:
|
221 |
progress(0.95, "Finalizing results...")
|
222 |
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
]
|
227 |
|
228 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
229 |
|
230 |
async def analyze_content(
|
231 |
script: str,
|
232 |
progress: Optional[gr.Progress] = None
|
233 |
) -> Dict[str, Union[List[str], str]]:
|
234 |
-
"""Main analysis function for the Gradio interface."""
|
|
|
|
|
|
|
235 |
analyzer = ContentAnalyzer()
|
236 |
|
237 |
try:
|
@@ -247,10 +214,14 @@ async def analyze_content(
|
|
247 |
"analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
248 |
}
|
249 |
|
|
|
250 |
return result
|
251 |
|
252 |
except Exception as e:
|
253 |
logger.error(f"Analysis error: {str(e)}")
|
|
|
|
|
|
|
254 |
return {
|
255 |
"detected_triggers": ["Error occurred during analysis"],
|
256 |
"confidence": "Error",
|
@@ -260,7 +231,7 @@ async def analyze_content(
|
|
260 |
}
|
261 |
|
262 |
if __name__ == "__main__":
|
263 |
-
#
|
264 |
iface = gr.Interface(
|
265 |
fn=analyze_content,
|
266 |
inputs=gr.Textbox(lines=8, label="Input Text"),
|
|
|
5 |
import gradio as gr
|
6 |
from typing import Dict, List, Union, Optional
|
7 |
import logging
|
8 |
+
import traceback
|
9 |
|
10 |
# Configure logging
|
11 |
logging.basicConfig(level=logging.INFO)
|
|
|
20 |
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
21 |
self.model = None
|
22 |
self.tokenizer = None
|
23 |
+
logger.info(f"Initialized analyzer with device: {self.device}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
async def load_model(self, progress=None) -> None:
|
26 |
+
"""Load the model and tokenizer with progress updates and detailed logging."""
|
27 |
try:
|
28 |
+
print("\n=== Starting Model Loading ===")
|
29 |
+
print(f"Time: {datetime.now()}")
|
30 |
+
|
31 |
if progress:
|
32 |
progress(0.1, "Loading tokenizer...")
|
33 |
|
34 |
+
print("Loading tokenizer...")
|
35 |
self.tokenizer = AutoTokenizer.from_pretrained(
|
36 |
"meta-llama/Llama-3.2-1B",
|
37 |
use_fast=True
|
|
|
40 |
if progress:
|
41 |
progress(0.3, "Loading model...")
|
42 |
|
43 |
+
print(f"Loading model on {self.device}...")
|
44 |
self.model = AutoModelForCausalLM.from_pretrained(
|
45 |
"meta-llama/Llama-3.2-1B",
|
46 |
token=self.hf_token,
|
|
|
51 |
if progress:
|
52 |
progress(0.5, "Model loaded successfully")
|
53 |
|
54 |
+
print("Model and tokenizer loaded successfully")
|
55 |
logger.info(f"Model loaded successfully on {self.device}")
|
56 |
except Exception as e:
|
57 |
logger.error(f"Error loading model: {str(e)}")
|
58 |
+
print(f"\nERROR DURING MODEL LOADING: {str(e)}")
|
59 |
+
print("Stack trace:")
|
60 |
+
traceback.print_exc()
|
61 |
raise
|
62 |
|
63 |
def _chunk_text(self, text: str, chunk_size: int = 256, overlap: int = 15) -> List[str]:
    """Split text into overlapping, fixed-width character windows.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the one before it.
    Sizes are measured in characters (plain string slicing), not tokens.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Characters shared by neighbouring chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in reading order; an empty list when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # A stride of zero makes range() fail with an unrelated-looking error, a
    # negative stride silently yields no chunks at all, and a negative overlap
    # would leave unanalysed gaps between chunks. Reject all three up front.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    stride = chunk_size - overlap
    chunks = [text[start:start + chunk_size] for start in range(0, len(text), stride)]
    print(f"Split text into {len(chunks)} chunks with {overlap} character overlap")
    return chunks
|
71 |
|
72 |
async def analyze_chunk(
|
73 |
self,
|
74 |
chunk: str,
|
75 |
+
trigger_categories: Dict,
|
76 |
progress: Optional[gr.Progress] = None,
|
77 |
current_progress: float = 0,
|
78 |
progress_step: float = 0
|
79 |
) -> Dict[str, float]:
|
80 |
+
"""Analyze a single chunk of text for triggers with detailed logging."""
|
81 |
chunk_triggers = {}
|
82 |
+
print(f"\n--- Processing Chunk ---")
|
83 |
+
print(f"Chunk text (preview): {chunk[:50]}...")
|
84 |
|
85 |
+
for category, info in trigger_categories.items():
|
86 |
mapped_name = info["mapped_name"]
|
87 |
description = info["description"]
|
88 |
|
89 |
+
print(f"\nAnalyzing for {mapped_name}...")
|
90 |
prompt = f"""
|
91 |
Check this text for any indication of {mapped_name} ({description}).
|
92 |
Be sensitive to subtle references or implications, make sure the text is not metaphorical.
|
|
|
96 |
"""
|
97 |
|
98 |
try:
|
99 |
+
print("Sending prompt to model...")
|
100 |
inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
|
101 |
inputs = {k: v.to(self.device) for k, v in inputs.items()}
|
102 |
|
103 |
with torch.no_grad():
|
104 |
+
print("Generating response...")
|
105 |
outputs = self.model.generate(
|
106 |
**inputs,
|
107 |
+
max_new_tokens=5,
|
108 |
do_sample=True,
|
109 |
+
temperature=0.3,
|
110 |
+
top_p=0.9,
|
111 |
pad_token_id=self.tokenizer.eos_token_id
|
112 |
)
|
113 |
|
114 |
response_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True).strip().upper()
|
115 |
first_word = response_text.split("\n")[-1].split()[0] if response_text else "NO"
|
116 |
+
print(f"Model response for {mapped_name}: {first_word}")
|
117 |
|
118 |
if first_word == "YES":
|
119 |
+
print(f"Detected {mapped_name} in this chunk!")
|
120 |
chunk_triggers[mapped_name] = chunk_triggers.get(mapped_name, 0) + 1
|
121 |
elif first_word == "MAYBE":
|
122 |
+
print(f"Possible {mapped_name} detected, marking for further review.")
|
123 |
chunk_triggers[mapped_name] = chunk_triggers.get(mapped_name, 0) + 0.5
|
124 |
+
else:
|
125 |
+
print(f"No {mapped_name} detected in this chunk.")
|
126 |
|
127 |
if progress:
|
128 |
current_progress += progress_step
|
|
|
130 |
|
131 |
except Exception as e:
|
132 |
logger.error(f"Error analyzing chunk for {mapped_name}: {str(e)}")
|
133 |
+
print(f"Error during analysis of {mapped_name}: {str(e)}")
|
134 |
+
traceback.print_exc()
|
135 |
|
136 |
return chunk_triggers
|
137 |
|
138 |
async def analyze_script(self, script: str, progress: Optional[gr.Progress] = None) -> List[str]:
|
139 |
+
"""Analyze the entire script for triggers with progress updates and detailed logging."""
|
140 |
+
print("\n=== Starting Script Analysis ===")
|
141 |
+
print(f"Time: {datetime.now()}")
|
142 |
+
|
143 |
if not self.model or not self.tokenizer:
|
144 |
await self.load_model(progress)
|
145 |
|
146 |
+
# Trigger categories to screen for. Each entry supplies the "mapped_name"
# label and the "description" text for one category. Every category has to
# be written out in full: a "... remain the same" placeholder comment leaves
# only "Violence" in the dict, so nothing else would ever be checked.
trigger_categories = {
    "Violence": {
        "mapped_name": "Violence",
        "description": (
            "Any act involving physical force or aggression intended to cause harm, injury, or death to a person, animal, or object. "
            "Includes direct physical confrontations (e.g., fights, beatings, or assaults), implied violence (e.g., very graphical threats or descriptions of injuries), "
            "or large-scale events like wars, riots, or violent protests."
        ),
    },
    "Death": {
        "mapped_name": "Death References",
        "description": (
            "Any mention, implication, or depiction of the loss of life, including direct deaths of characters, including mentions of deceased individuals, "
            "or abstract references to mortality (e.g., 'facing the end' or 'gone forever'). This also covers depictions of funerals, mourning, "
            "grieving, or any dialogue that centers around death, do not take metaphors into context that don't actually lead to death."
        ),
    },
    "Substance Use": {
        "mapped_name": "Substance Use",
        "description": (
            "Any explicit or implied reference to the consumption, misuse, or abuse of drugs, alcohol, or other intoxicating substances. "
            "Includes scenes of drinking, smoking, or drug use, whether recreational or addictive. May also cover references to withdrawal symptoms, "
            "rehabilitation, or substance-related paraphernalia (e.g., needles, bottles, pipes)."
        ),
    },
    "Gore": {
        "mapped_name": "Gore",
        "description": (
            "Extremely detailed and graphic depictions of highly severe physical injuries, mutilation, or extreme bodily harm, often accompanied by descriptions of heavy blood, exposed organs, "
            "or dismemberment. This includes war scenes with severe casualties, horror scenarios involving grotesque creatures, or medical procedures depicted with excessive detail."
        ),
    },
    "Vomit": {
        "mapped_name": "Vomit",
        "description": (
            "Any reference to the act of vomiting, whether directly described, implied, or depicted in detail. This includes sounds or visual descriptions of the act, "
            "mentions of nausea leading to vomiting, or its aftermath (e.g., the presence of vomit, cleaning it up, or characters reacting to it)."
        ),
    },
    "Sexual Content": {
        "mapped_name": "Sexual Content",
        "description": (
            "Any depiction or mention of sexual activity, intimacy, or sexual behavior, ranging from implied scenes to explicit descriptions. "
            "This includes romantic encounters, physical descriptions of characters in a sexual context, sexual dialogue, or references to sexual themes (e.g., harassment, innuendos)."
        ),
    },
    "Sexual Abuse": {
        "mapped_name": "Sexual Abuse",
        "description": (
            "Any form of non-consensual sexual act, behavior, or interaction, involving coercion, manipulation, or physical force. "
            "This includes incidents of sexual assault, molestation, exploitation, harassment, and any acts where an individual is subjected to sexual acts against their will or without their consent. "
            "It also covers discussions or depictions of the aftermath of such abuse, such as trauma, emotional distress, legal proceedings, or therapy. "
            "References to inappropriate sexual advances, groping, or any other form of sexual misconduct are also included, as well as the psychological and emotional impact on survivors. "
            "Scenes where individuals are placed in sexually compromising situations, even if not directly acted upon, may also fall under this category."
        ),
    },
    "Self-Harm": {
        "mapped_name": "Self-Harm",
        "description": (
            "Any mention or depiction of behaviors where an individual intentionally causes harm to themselves. This includes cutting, burning, or other forms of physical injury, "
            "as well as suicidal ideation, suicide attempts, or discussions of self-destructive thoughts and actions. References to scars, bruises, or other lasting signs of self-harm are also included."
        ),
    },
    "Gun Use": {
        "mapped_name": "Gun Use",
        "description": (
            "Any explicit or implied mention of firearms being handled, fired, or used in a threatening manner. This includes scenes of gun violence, references to shootings, "
            "gun-related accidents, or the presence of firearms in a tense or dangerous context (e.g., holstered weapons during an argument)."
        ),
    },
    "Animal Cruelty": {
        "mapped_name": "Animal Cruelty",
        "description": (
            "Any act of harm, abuse, or neglect toward animals, whether intentional or accidental. This includes physical abuse (e.g., hitting, injuring, or killing animals), "
            "mental or emotional mistreatment (e.g., starvation, isolation), and scenes where animals are subjected to pain or suffering for human entertainment or experimentation."
        ),
    },
    "Mental Health Issues": {
        "mapped_name": "Mental Health Issues",
        "description": (
            "Any reference to mental health struggles, disorders, or psychological distress. This includes mentions of depression, anxiety, PTSD, bipolar disorder, schizophrenia, "
            "or other conditions. Scenes depicting therapy sessions, psychiatric treatment, or coping mechanisms (e.g., medication, journaling) are also included. May cover subtle hints "
            "like a character expressing feelings of worthlessness, hopelessness, or detachment from reality."
        ),
    },
}
|
158 |
+
|
159 |
chunks = self._chunk_text(script)
|
160 |
identified_triggers = {}
|
161 |
+
progress_step = 0.4 / (len(chunks) * len(trigger_categories))
|
162 |
current_progress = 0.5 # Starting after model loading
|
163 |
|
164 |
for chunk_idx, chunk in enumerate(chunks, 1):
|
165 |
chunk_triggers = await self.analyze_chunk(
|
166 |
chunk,
|
167 |
+
trigger_categories,
|
168 |
progress,
|
169 |
current_progress,
|
170 |
progress_step
|
|
|
176 |
if progress:
|
177 |
progress(0.95, "Finalizing results...")
|
178 |
|
179 |
+
print("\n=== Analysis Complete ===")
|
180 |
+
print("Final Results:")
|
181 |
+
final_triggers = []
|
|
|
182 |
|
183 |
+
for mapped_name, count in identified_triggers.items():
|
184 |
+
if count > 0.5:
|
185 |
+
final_triggers.append(mapped_name)
|
186 |
+
print(f"- {mapped_name}: found in {count} chunks")
|
187 |
+
|
188 |
+
if not final_triggers:
|
189 |
+
print("No triggers detected")
|
190 |
+
final_triggers = ["None"]
|
191 |
+
|
192 |
+
return final_triggers
|
193 |
|
194 |
async def analyze_content(
|
195 |
script: str,
|
196 |
progress: Optional[gr.Progress] = None
|
197 |
) -> Dict[str, Union[List[str], str]]:
|
198 |
+
"""Main analysis function for the Gradio interface with detailed logging."""
|
199 |
+
print("\n=== Starting Content Analysis ===")
|
200 |
+
print(f"Time: {datetime.now()}")
|
201 |
+
|
202 |
analyzer = ContentAnalyzer()
|
203 |
|
204 |
try:
|
|
|
214 |
"analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
215 |
}
|
216 |
|
217 |
+
print("\nFinal Result Dictionary:", result)
|
218 |
return result
|
219 |
|
220 |
except Exception as e:
|
221 |
logger.error(f"Analysis error: {str(e)}")
|
222 |
+
print(f"\nERROR OCCURRED: {str(e)}")
|
223 |
+
print("Stack trace:")
|
224 |
+
traceback.print_exc()
|
225 |
return {
|
226 |
"detected_triggers": ["Error occurred during analysis"],
|
227 |
"confidence": "Error",
|
|
|
231 |
}
|
232 |
|
233 |
if __name__ == "__main__":
|
234 |
+
# Gradio interface
|
235 |
iface = gr.Interface(
|
236 |
fn=analyze_content,
|
237 |
inputs=gr.Textbox(lines=8, label="Input Text"),
|