|
|
|
|
|
|
|
import numpy as np

import pandas as pd

from typing import List, Dict, Any

import logging

import re

from sklearn.base import BaseEstimator, TransformerMixin

from collections import Counter

import warnings

# NOTE(review): this globally silences ALL warnings for every importer of this
# module, not just this file — consider scoping to specific warning categories
# (e.g. warnings.filterwarnings('ignore', category=FutureWarning)) or using a
# warnings.catch_warnings() context around the noisy calls instead.
warnings.filterwarnings('ignore')


# Module-level logger following the standard `getLogger(__name__)` convention.
logger = logging.getLogger(__name__)
|
|
|
|
|
class SentimentAnalyzer(BaseEstimator, TransformerMixin):

    """
    Advanced sentiment analysis for fake news detection.

    Focuses on emotional manipulation patterns common in misinformation.

    Implements the scikit-learn transformer API (``fit`` / ``transform``):
    ``transform`` maps an iterable of texts to a 2-D float feature matrix of
    shape ``(n_texts, 20)`` — 6 emotion-lexicon ratios, 5 manipulation-pattern
    ratios, and 9 advanced punctuation/contrast features, in the order given
    by :meth:`get_feature_names`.
    """

    # Number of features appended by _extract_advanced_sentiment_features().
    # Keep in sync with that method and with get_feature_names().
    _N_ADVANCED_FEATURES = 9

    def __init__(self):
        # Word sets are built eagerly so transform() does O(1) membership
        # tests per token.
        self.emotion_lexicons = self._load_emotion_lexicons()
        self.manipulation_patterns = self._load_manipulation_patterns()
        self.is_fitted_ = False

    def _load_emotion_lexicons(self) -> Dict[str, set]:
        """Return the emotion-name -> lowercase word-set lexicons."""
        lexicons = {
            'positive': {
                'amazing', 'awesome', 'brilliant', 'excellent', 'fantastic', 'great',
                'incredible', 'outstanding', 'perfect', 'wonderful', 'superb', 'magnificent',
                'love', 'adore', 'cherish', 'treasure', 'admire', 'appreciate',
                'happy', 'joyful', 'pleased', 'delighted', 'thrilled', 'ecstatic',
                'hope', 'optimistic', 'confident', 'assured', 'certain', 'positive'
            },
            'negative': {
                'awful', 'terrible', 'horrible', 'disgusting', 'appalling', 'shocking',
                'hate', 'despise', 'loathe', 'detest', 'abhor', 'resent',
                'angry', 'furious', 'outraged', 'livid', 'irate', 'enraged',
                'sad', 'depressed', 'miserable', 'devastated', 'heartbroken', 'grief',
                'fear', 'afraid', 'terrified', 'scared', 'anxious', 'worried',
                'disaster', 'catastrophe', 'crisis', 'emergency', 'danger', 'threat'
            },
            'anger': {
                'angry', 'furious', 'outraged', 'livid', 'irate', 'enraged', 'mad',
                'rage', 'fury', 'wrath', 'indignant', 'resentful', 'hostile',
                'attack', 'assault', 'violence', 'aggression', 'combat', 'fight'
            },
            'fear': {
                'fear', 'afraid', 'terrified', 'scared', 'anxious', 'worried', 'panic',
                'terror', 'horror', 'dread', 'nightmare', 'threat', 'danger',
                'risk', 'warning', 'alert', 'caution', 'alarm', 'emergency'
            },
            'trust': {
                'trust', 'believe', 'faith', 'confidence', 'reliable', 'honest',
                'truthful', 'sincere', 'genuine', 'authentic', 'credible', 'trustworthy'
            },
            'disgust': {
                'disgusting', 'revolting', 'repulsive', 'nauseating', 'sickening',
                'corrupt', 'contaminated', 'polluted', 'tainted', 'filthy', 'dirty'
            }
        }
        return lexicons

    def _load_manipulation_patterns(self) -> Dict[str, set]:
        """Return pattern-name -> word-set mappings common in emotional manipulation."""
        patterns = {
            'urgency_words': {
                'urgent', 'immediate', 'emergency', 'crisis', 'breaking', 'alert',
                'now', 'quickly', 'hurry', 'rush', 'asap', 'immediately'
            },
            'authority_claims': {
                'experts', 'scientists', 'doctors', 'officials', 'authorities',
                'government', 'studies', 'research', 'proven', 'confirmed'
            },
            'conspiracy_words': {
                'conspiracy', 'cover-up', 'hidden', 'secret', 'expose', 'reveal',
                'truth', 'lies', 'deception', 'agenda', 'plot', 'scheme'
            },
            'absolute_terms': {
                'always', 'never', 'all', 'none', 'every', 'everyone', 'nobody',
                'everywhere', 'nowhere', 'completely', 'totally', 'absolutely'
            },
            'divisive_language': {
                'us', 'them', 'enemy', 'traitor', 'patriot', 'real', 'fake',
                'elite', 'establishment', 'mainstream', 'alternative'
            }
        }
        return patterns

    def fit(self, X, y=None):
        """Fit the sentiment analyzer (stateless; exists for API consistency).

        Returns
        -------
        self
        """
        self.is_fitted_ = True
        return self

    def transform(self, X) -> np.ndarray:
        """Extract sentiment and emotional-manipulation features.

        Parameters
        ----------
        X : pd.Series, list, or array-like of str
            Texts to featurize. Non-string entries are coerced via ``str()``.

        Returns
        -------
        np.ndarray of shape (n_texts, 20)

        Raises
        ------
        ValueError
            If called before :meth:`fit`.
        """
        if not self.is_fitted_:
            raise ValueError("SentimentAnalyzer must be fitted before transform")

        if isinstance(X, pd.Series):
            X = X.values
        elif isinstance(X, list):
            X = np.array(X)

        features = [self._extract_sentiment_features(str(text)) for text in X]
        return np.array(features)

    def fit_transform(self, X, y=None) -> np.ndarray:
        """Fit and transform in one step."""
        return self.fit(X, y).transform(X)

    def _extract_sentiment_features(self, text: str) -> List[float]:
        """Extract the full 20-element feature vector for a single text."""
        text_lower = text.lower()
        words = re.findall(r'\b\w+\b', text_lower)
        total_words = len(words)

        if total_words == 0:
            # BUG FIX: this previously returned [0.0] * 25, but the real
            # feature vector has len(lexicons) + len(patterns) + 9 = 20
            # entries (see get_feature_names()).  The length mismatch made
            # np.array() in transform() produce a ragged/object array when a
            # batch mixed empty and non-empty texts.
            n_features = (len(self.emotion_lexicons)
                          + len(self.manipulation_patterns)
                          + self._N_ADVANCED_FEATURES)
            return [0.0] * n_features

        features: List[float] = []

        # Per-emotion lexicon hit ratios (dicts preserve insertion order, so
        # feature order matches get_feature_names()).
        for emotion, lexicon in self.emotion_lexicons.items():
            emotion_count = sum(1 for word in words if word in lexicon)
            features.append(emotion_count / total_words)

        # Per-pattern manipulation-word hit ratios.
        for pattern_type, pattern_words in self.manipulation_patterns.items():
            pattern_count = sum(1 for word in words if word in pattern_words)
            features.append(pattern_count / total_words)

        features.extend(self._extract_advanced_sentiment_features(text, words))
        return features

    def _extract_advanced_sentiment_features(self, text: str, words: List[str]) -> List[float]:
        """Extract 9 advanced sentiment/manipulation features.

        Parameters
        ----------
        text : str
            The original (case-preserving) text.
        words : list of str
            Lowercased word tokens of ``text`` (assumed non-empty; the caller
            short-circuits empty texts).
        """
        features: List[float] = []
        text_len = len(text)

        # Punctuation densities (per character).
        features.append(text.count('!') / text_len if text_len > 0 else 0)
        features.append(text.count('?') / text_len if text_len > 0 else 0)

        # BUG FIX: `words` comes from the lowercased text, so `isupper()` was
        # always False and this ratio was stuck at 0.0.  Re-tokenize the
        # original-case text to count genuine ALL-CAPS words.
        original_words = re.findall(r'\b\w+\b', text)
        caps_words = sum(1 for w in original_words if w.isupper() and len(w) > 1)
        features.append(caps_words / len(original_words) if original_words else 0)

        # Runs of repeated terminal punctuation ("!!", "???") signal intensity.
        intense_exclamation = len(re.findall(r'!{2,}', text))
        intense_question = len(re.findall(r'\?{2,}', text))
        features.append(intense_exclamation / text_len if text_len > 0 else 0)
        features.append(intense_question / text_len if text_len > 0 else 0)

        # Emotional contrast: how balanced positive vs. negative wording is
        # (0 = one-sided, 0.5 = perfectly mixed).
        positive_count = sum(1 for word in words if word in self.emotion_lexicons['positive'])
        negative_count = sum(1 for word in words if word in self.emotion_lexicons['negative'])
        if positive_count + negative_count > 0:
            emotional_contrast = min(positive_count, negative_count) / (positive_count + negative_count)
        else:
            emotional_contrast = 0
        features.append(emotional_contrast)

        # Share of authority-style claims among all claim-type words; neutral
        # 0.5 when neither kind appears.
        authority_count = sum(1 for word in words if word in self.manipulation_patterns['authority_claims'])
        conspiracy_count = sum(1 for word in words if word in self.manipulation_patterns['conspiracy_words'])
        total_claims = authority_count + conspiracy_count
        authority_ratio = authority_count / total_claims if total_claims > 0 else 0.5
        features.append(authority_ratio)

        # Urgency-word density (per word).
        urgency_count = sum(1 for word in words if word in self.manipulation_patterns['urgency_words'])
        features.append(urgency_count / len(words) if len(words) > 0 else 0)

        # Direct-address / in-group pronoun density (per word).
        personal_pronouns = {'you', 'your', 'yours', 'we', 'us', 'our', 'ours'}
        pronoun_count = sum(1 for word in words if word in personal_pronouns)
        features.append(pronoun_count / len(words) if len(words) > 0 else 0)

        return features

    def get_feature_names(self) -> List[str]:
        """Return the 20 feature names in the order emitted by transform()."""
        feature_names = [f'sentiment_{emotion}_ratio'
                         for emotion in self.emotion_lexicons.keys()]

        feature_names.extend(f'sentiment_{pattern}_ratio'
                             for pattern in self.manipulation_patterns.keys())

        # Must stay in sync with _extract_advanced_sentiment_features().
        feature_names.extend([
            'sentiment_exclamation_density',
            'sentiment_question_density',
            'sentiment_caps_words_ratio',
            'sentiment_intense_exclamation_density',
            'sentiment_intense_question_density',
            'sentiment_emotional_contrast',
            'sentiment_authority_ratio',
            'sentiment_urgency_density',
            'sentiment_personal_pronouns_ratio'
        ])
        return feature_names

    def analyze_text_sentiment(self, text: str) -> Dict[str, Any]:
        """Return a detailed, human-readable sentiment analysis of one text.

        Returns
        -------
        dict with keys: text_length, word_count, emotions,
        manipulation_patterns, overall_sentiment ('positive' / 'negative' /
        'neutral'), manipulation_score, emotional_intensity.

        Raises
        ------
        ValueError
            If called before :meth:`fit`.
        """
        if not self.is_fitted_:
            raise ValueError("SentimentAnalyzer must be fitted before analysis")

        text_lower = text.lower()
        words = re.findall(r'\b\w+\b', text_lower)

        analysis: Dict[str, Any] = {
            'text_length': len(text),
            'word_count': len(words),
            'emotions': {},
            'manipulation_patterns': {},
            'overall_sentiment': 'neutral',
            'manipulation_score': 0.0,
            'emotional_intensity': 0.0
        }

        total_words = len(words)
        if total_words == 0:
            return analysis

        # Per-emotion counts, ratios, and up to 5 example matches.
        for emotion, lexicon in self.emotion_lexicons.items():
            emotion_count = sum(1 for word in words if word in lexicon)
            analysis['emotions'][emotion] = {
                'count': emotion_count,
                'ratio': emotion_count / total_words,
                'words_found': [word for word in words if word in lexicon][:5]
            }

        # Per-pattern counts, ratios, and up to 3 example matches.
        for pattern_type, pattern_words in self.manipulation_patterns.items():
            pattern_count = sum(1 for word in words if word in pattern_words)
            analysis['manipulation_patterns'][pattern_type] = {
                'count': pattern_count,
                'ratio': pattern_count / total_words,
                'words_found': [word for word in words if word in pattern_words][:3]
            }

        # Overall polarity with a 0.02-ratio dead band to avoid flip-flopping
        # on near-ties.
        positive_score = analysis['emotions']['positive']['ratio']
        negative_score = analysis['emotions']['negative']['ratio']
        if positive_score > negative_score + 0.02:
            analysis['overall_sentiment'] = 'positive'
        elif negative_score > positive_score + 0.02:
            analysis['overall_sentiment'] = 'negative'
        else:
            analysis['overall_sentiment'] = 'neutral'

        # Manipulation score: mean ratio over the four manipulation-oriented
        # pattern families (authority_claims intentionally excluded).
        manipulation_indicators = [
            analysis['manipulation_patterns']['urgency_words']['ratio'],
            analysis['manipulation_patterns']['conspiracy_words']['ratio'],
            analysis['manipulation_patterns']['absolute_terms']['ratio'],
            analysis['manipulation_patterns']['divisive_language']['ratio']
        ]
        analysis['manipulation_score'] = sum(manipulation_indicators) / len(manipulation_indicators)

        # Emotional intensity: mean of fear+anger word ratio, exclamation
        # density, and uppercase-character density.
        fear_anger_score = (analysis['emotions']['fear']['ratio'] +
                            analysis['emotions']['anger']['ratio'])
        exclamation_density = text.count('!') / len(text) if len(text) > 0 else 0
        caps_density = sum(1 for c in text if c.isupper()) / len(text) if len(text) > 0 else 0
        analysis['emotional_intensity'] = (fear_anger_score + exclamation_density + caps_density) / 3

        return analysis

    def get_manipulation_indicators(self, text: str) -> Dict[str, Any]:
        """Return boolean manipulation indicators plus an overall risk label.

        Thresholds are heuristic ratio cut-offs over the per-text analysis
        produced by :meth:`analyze_text_sentiment`.
        """
        analysis = self.analyze_text_sentiment(text)

        indicators: Dict[str, Any] = {
            'high_emotional_intensity': analysis['emotional_intensity'] > 0.1,
            'urgency_manipulation': analysis['manipulation_patterns']['urgency_words']['ratio'] > 0.02,
            'conspiracy_language': analysis['manipulation_patterns']['conspiracy_words']['ratio'] > 0.01,
            'absolute_claims': analysis['manipulation_patterns']['absolute_terms']['ratio'] > 0.03,
            'divisive_framing': analysis['manipulation_patterns']['divisive_language']['ratio'] > 0.02,
            'emotional_overload': (analysis['emotions']['fear']['ratio'] +
                                   analysis['emotions']['anger']['ratio']) > 0.05
        }

        # Fraction of boolean indicators that fired -> coarse risk bucket.
        risk_score = sum(indicators.values()) / len(indicators)
        indicators['overall_manipulation_risk'] = 'high' if risk_score > 0.5 else 'medium' if risk_score > 0.3 else 'low'

        return indicators