File size: 6,156 Bytes
b1acf7e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 |
"""
Fused sentiment analysis model combining text, audio, and vision models.
"""
import logging
from typing import Tuple, Optional, List
from PIL import Image
from .text_model import predict_text_sentiment
from .audio_model import predict_audio_sentiment
from .vision_model import predict_vision_sentiment
logger = logging.getLogger(__name__)
def predict_fused_sentiment(
    text: Optional[str] = None,
    audio_bytes: Optional[bytes] = None,
    image: Optional[Image.Image] = None,
) -> Tuple[str, float]:
    """
    Fuse text, audio, and vision sentiment predictions via weighted voting.

    Each supplied modality is scored by its model; per-sentiment weighted
    confidences are accumulated and the sentiment with the highest total wins.

    Args:
        text: Input text for text sentiment analysis (skipped if None/empty).
        audio_bytes: Audio bytes for audio sentiment analysis (skipped if None/empty).
        image: Input image for vision sentiment analysis (skipped if None).

    Returns:
        Tuple of (fused_sentiment, overall_confidence). Returns
        ("No inputs provided", 0.0) when no modality is supplied.
    """
    results: List[Tuple[str, str, float]] = []
    if text:
        text_sentiment, text_conf = predict_text_sentiment(text)
        results.append(("Text", text_sentiment, text_conf))
    if audio_bytes:
        audio_sentiment, audio_conf = predict_audio_sentiment(audio_bytes)
        results.append(("Audio", audio_sentiment, audio_conf))
    # Compare PIL images against None explicitly rather than truth-testing.
    if image is not None:
        vision_sentiment, vision_conf = predict_vision_sentiment(image)
        results.append(("Vision", vision_sentiment, vision_conf))
    if not results:
        return "No inputs provided", 0.0
    # Weighted voting: each modality contributes confidence * modality weight.
    modality_weights = {"Text": 0.3, "Audio": 0.35, "Vision": 0.35}
    weighted_conf_by_sentiment: dict = {}
    total_weighted_conf = 0.0
    total_weight = 0.0
    for modality, sentiment, confidence in results:
        weight = modality_weights.get(modality, 0.33)  # fallback for unknown modality
        contribution = confidence * weight
        weighted_conf_by_sentiment[sentiment] = (
            weighted_conf_by_sentiment.get(sentiment, 0.0) + contribution
        )
        total_weighted_conf += contribution
        total_weight += weight
    # Winner: sentiment with the highest accumulated weighted confidence.
    final_sentiment = max(
        weighted_conf_by_sentiment, key=weighted_conf_by_sentiment.get
    )
    # BUG FIX: overall confidence is now a true weighted average. The original
    # computed a simple mean (total / len(results)) despite the comment
    # promising a weighted average.
    avg_confidence = total_weighted_conf / total_weight if total_weight else 0.0
    # Lazy %-style args avoid formatting cost when INFO logging is disabled.
    logger.info(
        "Fused sentiment analysis completed: %s (confidence: %.2f)",
        final_sentiment,
        avg_confidence,
    )
    logger.info("Individual results: %s", results)
    return final_sentiment, avg_confidence
def get_fusion_strategy_info() -> dict:
    """Return a static description of the weighted-ensemble fusion strategy."""
    # Same weights used by predict_fused_sentiment.
    weights = {"Text": 0.3, "Audio": 0.35, "Vision": 0.35}
    advantages = [
        "Robust to individual model failures",
        "Leverages complementary information from different modalities",
        "Configurable modality weights",
        "Real-time ensemble prediction",
    ]
    use_cases = [
        "Multi-modal content analysis",
        "Enhanced sentiment accuracy",
        "Cross-validation of predictions",
        "Comprehensive emotional understanding",
    ]
    return {
        "strategy_name": "Weighted Ensemble Fusion",
        "description": "Combines predictions from text, audio, and vision models using weighted voting",
        "modality_weights": weights,
        "fusion_method": "Weighted majority voting with confidence averaging",
        "advantages": advantages,
        "use_cases": use_cases,
    }
def analyze_modality_agreement(
    text: Optional[str] = None,
    audio_bytes: Optional[bytes] = None,
    image: Optional[Image.Image] = None,
) -> dict:
    """
    Analyze agreement between the sentiment predictions of different modalities.

    Args:
        text: Input text (skipped if None/empty).
        audio_bytes: Audio bytes (skipped if None/empty).
        image: Input image (skipped if None).

    Returns:
        Dictionary with agreement level/score, per-sentiment distribution,
        confidence consistency (population standard deviation of the per-modality
        confidences; lower means more consistent), the individual results, and a
        textual recommendation. If fewer than two modalities are supplied,
        returns {"agreement_level": "insufficient_modalities", "details": ...}.
    """
    results = {}
    if text:
        text_sentiment, text_conf = predict_text_sentiment(text)
        results["text"] = {"sentiment": text_sentiment, "confidence": text_conf}
    if audio_bytes:
        audio_sentiment, audio_conf = predict_audio_sentiment(audio_bytes)
        results["audio"] = {"sentiment": audio_sentiment, "confidence": audio_conf}
    # Compare PIL images against None explicitly rather than truth-testing.
    if image is not None:
        vision_sentiment, vision_conf = predict_vision_sentiment(image)
        results["vision"] = {"sentiment": vision_sentiment, "confidence": vision_conf}
    # Agreement is only meaningful with at least two modalities.
    if len(results) < 2:
        return {"agreement_level": "insufficient_modalities", "details": results}
    sentiments = [result["sentiment"] for result in results.values()]
    unique_sentiments = set(sentiments)
    if len(unique_sentiments) == 1:
        agreement_level, agreement_score = "perfect", 1.0
    elif len(unique_sentiments) == 2:
        agreement_level, agreement_score = "partial", 0.5
    else:
        agreement_level, agreement_score = "low", 0.0
    # BUG FIX: the original assigned the *mean* confidence to a variable named
    # confidence_std. Compute the actual population standard deviation so that
    # "confidence_consistency" reports spread (0.0 = perfectly consistent).
    confidences = [result["confidence"] for result in results.values()]
    mean_conf = sum(confidences) / len(confidences)
    confidence_std = (
        sum((c - mean_conf) ** 2 for c in confidences) / len(confidences)
    ) ** 0.5
    return {
        "agreement_level": agreement_level,
        "agreement_score": agreement_score,
        "modalities_analyzed": len(results),
        "sentiment_distribution": {s: sentiments.count(s) for s in unique_sentiments},
        "confidence_consistency": confidence_std,
        "individual_results": results,
        "recommendation": _get_agreement_recommendation(agreement_level, len(results)),
    }
def _get_agreement_recommendation(agreement_level: str, num_modalities: int) -> str:
"""Get recommendation based on agreement level."""
if agreement_level == "perfect":
return "High confidence in prediction - all modalities agree"
elif agreement_level == "partial":
return "Moderate confidence - consider modality-specific factors"
elif agreement_level == "low":
return "Low confidence - modalities disagree, consider context"
else:
return "Insufficient data for reliable fusion"
|