|
|
|
|
|
|
|
import numpy as np

import pandas as pd

from typing import List, Dict, Any

import logging

import re

from sklearn.base import BaseEstimator, TransformerMixin

from collections import Counter

import warnings

# NOTE(review): this globally silences ALL warnings for every importer of this
# module, not just this file — consider scoping to specific warning categories
# (e.g. warnings.filterwarnings('ignore', category=FutureWarning)) or using a
# warnings.catch_warnings() context around the noisy calls instead.
warnings.filterwarnings('ignore')


# Module-level logger following the standard `getLogger(__name__)` convention.
logger = logging.getLogger(__name__)
|
|
|
|
|
class SentimentAnalyzer(BaseEstimator, TransformerMixin):

    """
    Advanced sentiment analysis for fake news detection.

    Focuses on emotional manipulation patterns common in misinformation.

    Implements the scikit-learn transformer API (``fit`` / ``transform``):
    ``transform`` maps an iterable of texts to a 2-D float feature matrix of
    shape ``(n_texts, 20)`` — 6 emotion-lexicon ratios, 5 manipulation-pattern
    ratios, and 9 advanced punctuation/contrast features, in the order given
    by :meth:`get_feature_names`.
    """

    # Number of features appended by _extract_advanced_sentiment_features().
    # Keep in sync with that method and with get_feature_names().
    _N_ADVANCED_FEATURES = 9

    def __init__(self):
        # Word sets are built eagerly so transform() does O(1) membership
        # tests per token.
        self.emotion_lexicons = self._load_emotion_lexicons()
        self.manipulation_patterns = self._load_manipulation_patterns()
        self.is_fitted_ = False

    def _load_emotion_lexicons(self) -> Dict[str, set]:
        """Return the emotion-name -> lowercase word-set lexicons."""
        lexicons = {
            'positive': {
                'amazing', 'awesome', 'brilliant', 'excellent', 'fantastic', 'great',
                'incredible', 'outstanding', 'perfect', 'wonderful', 'superb', 'magnificent',
                'love', 'adore', 'cherish', 'treasure', 'admire', 'appreciate',
                'happy', 'joyful', 'pleased', 'delighted', 'thrilled', 'ecstatic',
                'hope', 'optimistic', 'confident', 'assured', 'certain', 'positive'
            },
            'negative': {
                'awful', 'terrible', 'horrible', 'disgusting', 'appalling', 'shocking',
                'hate', 'despise', 'loathe', 'detest', 'abhor', 'resent',
                'angry', 'furious', 'outraged', 'livid', 'irate', 'enraged',
                'sad', 'depressed', 'miserable', 'devastated', 'heartbroken', 'grief',
                'fear', 'afraid', 'terrified', 'scared', 'anxious', 'worried',
                'disaster', 'catastrophe', 'crisis', 'emergency', 'danger', 'threat'
            },
            'anger': {
                'angry', 'furious', 'outraged', 'livid', 'irate', 'enraged', 'mad',
                'rage', 'fury', 'wrath', 'indignant', 'resentful', 'hostile',
                'attack', 'assault', 'violence', 'aggression', 'combat', 'fight'
            },
            'fear': {
                'fear', 'afraid', 'terrified', 'scared', 'anxious', 'worried', 'panic',
                'terror', 'horror', 'dread', 'nightmare', 'threat', 'danger',
                'risk', 'warning', 'alert', 'caution', 'alarm', 'emergency'
            },
            'trust': {
                'trust', 'believe', 'faith', 'confidence', 'reliable', 'honest',
                'truthful', 'sincere', 'genuine', 'authentic', 'credible', 'trustworthy'
            },
            'disgust': {
                'disgusting', 'revolting', 'repulsive', 'nauseating', 'sickening',
                'corrupt', 'contaminated', 'polluted', 'tainted', 'filthy', 'dirty'
            }
        }
        return lexicons

    def _load_manipulation_patterns(self) -> Dict[str, set]:
        """Return pattern-name -> word-set mappings common in emotional manipulation."""
        patterns = {
            'urgency_words': {
                'urgent', 'immediate', 'emergency', 'crisis', 'breaking', 'alert',
                'now', 'quickly', 'hurry', 'rush', 'asap', 'immediately'
            },
            'authority_claims': {
                'experts', 'scientists', 'doctors', 'officials', 'authorities',
                'government', 'studies', 'research', 'proven', 'confirmed'
            },
            'conspiracy_words': {
                'conspiracy', 'cover-up', 'hidden', 'secret', 'expose', 'reveal',
                'truth', 'lies', 'deception', 'agenda', 'plot', 'scheme'
            },
            'absolute_terms': {
                'always', 'never', 'all', 'none', 'every', 'everyone', 'nobody',
                'everywhere', 'nowhere', 'completely', 'totally', 'absolutely'
            },
            'divisive_language': {
                'us', 'them', 'enemy', 'traitor', 'patriot', 'real', 'fake',
                'elite', 'establishment', 'mainstream', 'alternative'
            }
        }
        return patterns

    def fit(self, X, y=None):
        """Fit the sentiment analyzer (stateless; exists for API consistency).

        Returns
        -------
        self
        """
        self.is_fitted_ = True
        return self

    def transform(self, X) -> np.ndarray:
        """Extract sentiment and emotional-manipulation features.

        Parameters
        ----------
        X : pd.Series, list, or array-like of str
            Texts to featurize. Non-string entries are coerced via ``str()``.

        Returns
        -------
        np.ndarray of shape (n_texts, 20)

        Raises
        ------
        ValueError
            If called before :meth:`fit`.
        """
        if not self.is_fitted_:
            raise ValueError("SentimentAnalyzer must be fitted before transform")

        if isinstance(X, pd.Series):
            X = X.values
        elif isinstance(X, list):
            X = np.array(X)

        features = [self._extract_sentiment_features(str(text)) for text in X]
        return np.array(features)

    def fit_transform(self, X, y=None) -> np.ndarray:
        """Fit and transform in one step."""
        return self.fit(X, y).transform(X)

    def _extract_sentiment_features(self, text: str) -> List[float]:
        """Extract the full 20-element feature vector for a single text."""
        text_lower = text.lower()
        words = re.findall(r'\b\w+\b', text_lower)
        total_words = len(words)

        if total_words == 0:
            # BUG FIX: this previously returned [0.0] * 25, but the real
            # feature vector has len(lexicons) + len(patterns) + 9 = 20
            # entries (see get_feature_names()).  The length mismatch made
            # np.array() in transform() produce a ragged/object array when a
            # batch mixed empty and non-empty texts.
            n_features = (len(self.emotion_lexicons)
                          + len(self.manipulation_patterns)
                          + self._N_ADVANCED_FEATURES)
            return [0.0] * n_features

        features: List[float] = []

        # Per-emotion lexicon hit ratios (dicts preserve insertion order, so
        # feature order matches get_feature_names()).
        for emotion, lexicon in self.emotion_lexicons.items():
            emotion_count = sum(1 for word in words if word in lexicon)
            features.append(emotion_count / total_words)

        # Per-pattern manipulation-word hit ratios.
        for pattern_type, pattern_words in self.manipulation_patterns.items():
            pattern_count = sum(1 for word in words if word in pattern_words)
            features.append(pattern_count / total_words)

        features.extend(self._extract_advanced_sentiment_features(text, words))
        return features

    def _extract_advanced_sentiment_features(self, text: str, words: List[str]) -> List[float]:
        """Extract 9 advanced sentiment/manipulation features.

        Parameters
        ----------
        text : str
            The original (case-preserving) text.
        words : list of str
            Lowercased word tokens of ``text`` (assumed non-empty; the caller
            short-circuits empty texts).
        """
        features: List[float] = []
        text_len = len(text)

        # Punctuation densities (per character).
        features.append(text.count('!') / text_len if text_len > 0 else 0)
        features.append(text.count('?') / text_len if text_len > 0 else 0)

        # BUG FIX: `words` comes from the lowercased text, so `isupper()` was
        # always False and this ratio was stuck at 0.0.  Re-tokenize the
        # original-case text to count genuine ALL-CAPS words.
        original_words = re.findall(r'\b\w+\b', text)
        caps_words = sum(1 for w in original_words if w.isupper() and len(w) > 1)
        features.append(caps_words / len(original_words) if original_words else 0)

        # Runs of repeated terminal punctuation ("!!", "???") signal intensity.
        intense_exclamation = len(re.findall(r'!{2,}', text))
        intense_question = len(re.findall(r'\?{2,}', text))
        features.append(intense_exclamation / text_len if text_len > 0 else 0)
        features.append(intense_question / text_len if text_len > 0 else 0)

        # Emotional contrast: how balanced positive vs. negative wording is
        # (0 = one-sided, 0.5 = perfectly mixed).
        positive_count = sum(1 for word in words if word in self.emotion_lexicons['positive'])
        negative_count = sum(1 for word in words if word in self.emotion_lexicons['negative'])
        if positive_count + negative_count > 0:
            emotional_contrast = min(positive_count, negative_count) / (positive_count + negative_count)
        else:
            emotional_contrast = 0
        features.append(emotional_contrast)

        # Share of authority-style claims among all claim-type words; neutral
        # 0.5 when neither kind appears.
        authority_count = sum(1 for word in words if word in self.manipulation_patterns['authority_claims'])
        conspiracy_count = sum(1 for word in words if word in self.manipulation_patterns['conspiracy_words'])
        total_claims = authority_count + conspiracy_count
        authority_ratio = authority_count / total_claims if total_claims > 0 else 0.5
        features.append(authority_ratio)

        # Urgency-word density (per word).
        urgency_count = sum(1 for word in words if word in self.manipulation_patterns['urgency_words'])
        features.append(urgency_count / len(words) if len(words) > 0 else 0)

        # Direct-address / in-group pronoun density (per word).
        personal_pronouns = {'you', 'your', 'yours', 'we', 'us', 'our', 'ours'}
        pronoun_count = sum(1 for word in words if word in personal_pronouns)
        features.append(pronoun_count / len(words) if len(words) > 0 else 0)

        return features

    def get_feature_names(self) -> List[str]:
        """Return the 20 feature names in the order emitted by transform()."""
        feature_names = [f'sentiment_{emotion}_ratio'
                         for emotion in self.emotion_lexicons.keys()]

        feature_names.extend(f'sentiment_{pattern}_ratio'
                             for pattern in self.manipulation_patterns.keys())

        # Must stay in sync with _extract_advanced_sentiment_features().
        feature_names.extend([
            'sentiment_exclamation_density',
            'sentiment_question_density',
            'sentiment_caps_words_ratio',
            'sentiment_intense_exclamation_density',
            'sentiment_intense_question_density',
            'sentiment_emotional_contrast',
            'sentiment_authority_ratio',
            'sentiment_urgency_density',
            'sentiment_personal_pronouns_ratio'
        ])
        return feature_names

    def analyze_text_sentiment(self, text: str) -> Dict[str, Any]:
        """Return a detailed, human-readable sentiment analysis of one text.

        Returns
        -------
        dict with keys: text_length, word_count, emotions,
        manipulation_patterns, overall_sentiment ('positive' / 'negative' /
        'neutral'), manipulation_score, emotional_intensity.

        Raises
        ------
        ValueError
            If called before :meth:`fit`.
        """
        if not self.is_fitted_:
            raise ValueError("SentimentAnalyzer must be fitted before analysis")

        text_lower = text.lower()
        words = re.findall(r'\b\w+\b', text_lower)

        analysis: Dict[str, Any] = {
            'text_length': len(text),
            'word_count': len(words),
            'emotions': {},
            'manipulation_patterns': {},
            'overall_sentiment': 'neutral',
            'manipulation_score': 0.0,
            'emotional_intensity': 0.0
        }

        total_words = len(words)
        if total_words == 0:
            return analysis

        # Per-emotion counts, ratios, and up to 5 example matches.
        for emotion, lexicon in self.emotion_lexicons.items():
            emotion_count = sum(1 for word in words if word in lexicon)
            analysis['emotions'][emotion] = {
                'count': emotion_count,
                'ratio': emotion_count / total_words,
                'words_found': [word for word in words if word in lexicon][:5]
            }

        # Per-pattern counts, ratios, and up to 3 example matches.
        for pattern_type, pattern_words in self.manipulation_patterns.items():
            pattern_count = sum(1 for word in words if word in pattern_words)
            analysis['manipulation_patterns'][pattern_type] = {
                'count': pattern_count,
                'ratio': pattern_count / total_words,
                'words_found': [word for word in words if word in pattern_words][:3]
            }

        # Overall polarity with a 0.02-ratio dead band to avoid flip-flopping
        # on near-ties.
        positive_score = analysis['emotions']['positive']['ratio']
        negative_score = analysis['emotions']['negative']['ratio']
        if positive_score > negative_score + 0.02:
            analysis['overall_sentiment'] = 'positive'
        elif negative_score > positive_score + 0.02:
            analysis['overall_sentiment'] = 'negative'
        else:
            analysis['overall_sentiment'] = 'neutral'

        # Manipulation score: mean ratio over the four manipulation-oriented
        # pattern families (authority_claims intentionally excluded).
        manipulation_indicators = [
            analysis['manipulation_patterns']['urgency_words']['ratio'],
            analysis['manipulation_patterns']['conspiracy_words']['ratio'],
            analysis['manipulation_patterns']['absolute_terms']['ratio'],
            analysis['manipulation_patterns']['divisive_language']['ratio']
        ]
        analysis['manipulation_score'] = sum(manipulation_indicators) / len(manipulation_indicators)

        # Emotional intensity: mean of fear+anger word ratio, exclamation
        # density, and uppercase-character density.
        fear_anger_score = (analysis['emotions']['fear']['ratio'] +
                            analysis['emotions']['anger']['ratio'])
        exclamation_density = text.count('!') / len(text) if len(text) > 0 else 0
        caps_density = sum(1 for c in text if c.isupper()) / len(text) if len(text) > 0 else 0
        analysis['emotional_intensity'] = (fear_anger_score + exclamation_density + caps_density) / 3

        return analysis

    def get_manipulation_indicators(self, text: str) -> Dict[str, Any]:
        """Return boolean manipulation indicators plus an overall risk label.

        Thresholds are heuristic ratio cut-offs over the per-text analysis
        produced by :meth:`analyze_text_sentiment`.
        """
        analysis = self.analyze_text_sentiment(text)

        indicators: Dict[str, Any] = {
            'high_emotional_intensity': analysis['emotional_intensity'] > 0.1,
            'urgency_manipulation': analysis['manipulation_patterns']['urgency_words']['ratio'] > 0.02,
            'conspiracy_language': analysis['manipulation_patterns']['conspiracy_words']['ratio'] > 0.01,
            'absolute_claims': analysis['manipulation_patterns']['absolute_terms']['ratio'] > 0.03,
            'divisive_framing': analysis['manipulation_patterns']['divisive_language']['ratio'] > 0.02,
            'emotional_overload': (analysis['emotions']['fear']['ratio'] +
                                   analysis['emotions']['anger']['ratio']) > 0.05
        }

        # Fraction of boolean indicators that fired -> coarse risk bucket.
        risk_score = sum(indicators.values()) / len(indicators)
        indicators['overall_manipulation_risk'] = 'high' if risk_score > 0.5 else 'medium' if risk_score > 0.3 else 'low'

        return indicators