"""
Refactored Sentiment Fused - Multimodal Sentiment Analysis Application

This is the main entry point for the application, now using a modular structure.
"""

import streamlit as st
import pandas as pd
from PIL import Image
import logging

# Import our modular components
from src.config.settings import (
    APP_NAME,
    APP_VERSION,
    APP_ICON,
    APP_LAYOUT,
    CUSTOM_CSS,
    SUPPORTED_IMAGE_FORMATS,
    SUPPORTED_AUDIO_FORMATS,
    SUPPORTED_VIDEO_FORMATS,
)
from src.models.text_model import predict_text_sentiment
from src.models.audio_model import predict_audio_sentiment, load_audio_model
from src.models.vision_model import predict_vision_sentiment, load_vision_model
from src.models.fused_model import predict_fused_sentiment
from src.utils.preprocessing import (
    extract_frames_from_video,
    extract_audio_from_video,
    transcribe_audio,
)
from src.utils.file_handling import get_file_info, format_file_size
from src.utils.sentiment_mapping import get_sentiment_colors, format_sentiment_result

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Page configuration
st.set_page_config(
    page_title=APP_NAME,
    page_icon=APP_ICON,
    layout=APP_LAYOUT,
    initial_sidebar_state="expanded",
)

# Apply custom CSS
st.markdown(CUSTOM_CSS, unsafe_allow_html=True)


def render_home_page():
    """Render the home page with model information."""
    st.markdown(
        f"{APP_NAME}",
        unsafe_allow_html=True,
    )

    st.markdown(
        """
        Welcome to your Multi-Modal Sentiment Analysis Testing Platform!

        This application provides a comprehensive testing environment for your three independent sentiment analysis models:
        """,
        unsafe_allow_html=True,
    )

    col1, col2, col3 = st.columns(3)

    with col1:
        st.markdown(
            """
            Text Sentiment Model

            READY TO USE - Analyze sentiment from text input using TextBlob
            """,
            unsafe_allow_html=True,
        )

    with col2:
        st.markdown(
            """
            Audio Sentiment Model

            READY TO USE - Analyze sentiment from audio files using fine-tuned Wav2Vec2
            """,
            unsafe_allow_html=True,
        )

    with col3:
        st.markdown(
            """
            Vision Sentiment Model

            READY TO USE - Analyze sentiment from images using fine-tuned ResNet-50
            """,
            unsafe_allow_html=True,
        )

    st.markdown(
        """
        Fused Model

        Combine predictions from all three models for enhanced accuracy
        """,
        unsafe_allow_html=True,
    )

    st.markdown(
        """
        đŸŽŦ Max Fusion

        Ultimate video-based sentiment analysis combining all three modalities
        """,
        unsafe_allow_html=True,
    )

    st.markdown("---")

    st.markdown(
        """
        Note: This application now has ALL THREE MODELS fully integrated and ready to use!

        TextBlob (Text) + Wav2Vec2 (Audio) + ResNet-50 (Vision)

        Models are now loaded from Google Drive automatically!
        """,
        unsafe_allow_html=True,
    )
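
# NOTE (illustrative only): the result pages below call get_sentiment_colors() from
# src.utils.sentiment_mapping and fall back to "❓" for unknown labels. A minimal
# sketch of what such a label-to-emoji mapping might look like is given here for
# reference; the label names ("Positive", "Neutral", "Negative") are an assumption,
# and this helper is not used anywhere in the app.
def _example_sentiment_colors() -> dict:
    """Hypothetical label-to-emoji mapping mirroring get_sentiment_colors()."""
    return {
        "Positive": "😊",
        "Neutral": "😐",
        "Negative": "😞",
    }
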
def render_text_sentiment_page():
    """Render the text sentiment analysis page."""
    st.title("Text Sentiment Analysis")
    st.markdown("Analyze the sentiment of your text using our TextBlob-based model.")

    # Text input
    text_input = st.text_area(
        "Enter your text here:",
        height=150,
        placeholder="Type or paste your text here to analyze its sentiment...",
    )

    # Analyze button
    if st.button("Analyze Sentiment", type="primary", use_container_width=True):
        if text_input and text_input.strip():
            with st.spinner("Analyzing text sentiment..."):
                sentiment, confidence = predict_text_sentiment(text_input)

            # Display results
            st.markdown("### Results")

            # Display results in columns
            col1, col2 = st.columns(2)
            with col1:
                st.metric("Sentiment", sentiment)
            with col2:
                st.metric("Confidence", f"{confidence:.2f}")

            # Color-coded sentiment display
            sentiment_colors = get_sentiment_colors()
            emoji = sentiment_colors.get(sentiment, "❓")
            st.markdown(
                f"""
                {emoji} Sentiment: {sentiment}

                Confidence: {confidence:.2f}

                Input Text: "{text_input[:100]}{'...' if len(text_input) > 100 else ''}"

                Model: TextBlob (Natural Language Processing)
                """,
                unsafe_allow_html=True,
            )
        else:
            st.error("Please enter some text to analyze.")
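
# NOTE (illustrative only): predict_text_sentiment() is imported from
# src.models.text_model, whose internals are not shown in this file. A minimal
# sketch of how a TextBlob-based classifier could turn polarity into a label and
# confidence is given below; the thresholds and label names are assumptions, not
# the project's actual implementation, and this helper is never called by the app.
def _example_textblob_sentiment(text: str) -> tuple:
    """Hypothetical TextBlob polarity -> (label, confidence) mapping."""
    from textblob import TextBlob

    polarity = TextBlob(text).sentiment.polarity  # value in [-1.0, 1.0]
    if polarity > 0.1:
        label = "Positive"
    elif polarity < -0.1:
        label = "Negative"
    else:
        label = "Neutral"
    # Use the magnitude of the polarity as a crude confidence proxy.
    return label, abs(polarity)
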
def render_audio_sentiment_page():
    """Render the audio sentiment analysis page."""
    st.title("Audio Sentiment Analysis")
    st.markdown(
        "Analyze the sentiment of your audio files using our fine-tuned Wav2Vec2 model."
    )

    # Preprocessing information
    st.info(
        "**Audio Preprocessing**: Audio will be automatically processed to match CREMA-D + RAVDESS training format: "
        "16kHz sampling rate, max 5 seconds, with automatic resampling and feature extraction."
    )

    # Model status
    model, device, num_classes, feature_extractor = load_audio_model()
    if model is None:
        st.error(
            "Audio model could not be loaded. Please check the Google Drive setup."
        )
        st.info(
            "Expected: Models should be configured in Google Drive and accessible via the model manager."
        )
    else:
        st.success(
            f"Audio model loaded successfully on {device} with {num_classes} classes!"
        )

    # Input method selection
    st.subheader("Choose Input Method")
    input_method = st.radio(
        "Select how you want to provide audio:",
        ["Upload Audio File", "Record Audio"],
        horizontal=True,
    )

    if input_method == "Upload Audio File":
        # File uploader
        uploaded_audio = st.file_uploader(
            "Choose an audio file",
            type=SUPPORTED_AUDIO_FORMATS,
            help="Supported formats: WAV, MP3, M4A, FLAC",
        )
        audio_source = "uploaded_file"
        audio_name = uploaded_audio.name if uploaded_audio else None
    else:
        # Audio recording
        st.markdown(
            """
            Audio Recording

            Record audio directly with your microphone (max 5 seconds).

            Note: Make sure your microphone is accessible and you have permission to use it.
            """,
            unsafe_allow_html=True,
        )

        # Audio recorder
        recorded_audio = st.audio_input(
            label="Click to start recording",
            help="Click the microphone button to start/stop recording. Maximum recording time is 5 seconds.",
        )

        if recorded_audio is not None:
            # Display recorded audio
            st.audio(recorded_audio, format="audio/wav")
            st.success("Audio recorded successfully!")

            # Convert recorded audio to bytes for processing
            uploaded_audio = recorded_audio
            audio_source = "recorded"
            audio_name = "Recorded Audio"
        else:
            uploaded_audio = None
            audio_source = None
            audio_name = None

    if uploaded_audio is not None:
        # Display audio player
        if audio_source == "recorded":
            st.audio(uploaded_audio, format="audio/wav")
            st.info(f"{audio_name} | Source: Microphone Recording")
        else:
            st.audio(
                uploaded_audio, format=f'audio/{uploaded_audio.name.split(".")[-1]}'
            )

            # File info for uploaded files
            file_info = get_file_info(uploaded_audio)
            st.info(
                f"File: {file_info['name']} | Size: {format_file_size(file_info['size_bytes'])}"
            )

        # Analyze button
        if st.button(
            "Analyze Audio Sentiment", type="primary", use_container_width=True
        ):
            if model is None:
                st.error("Model not loaded. Cannot analyze audio.")
            else:
                with st.spinner("Analyzing audio sentiment..."):
                    audio_bytes = uploaded_audio.getvalue()
                    sentiment, confidence = predict_audio_sentiment(audio_bytes)

                # Display results
                st.markdown("### Results")
                col1, col2 = st.columns(2)
                with col1:
                    st.metric("Sentiment", sentiment)
                with col2:
                    st.metric("Confidence", f"{confidence:.2f}")

                # Color-coded sentiment display
                sentiment_colors = get_sentiment_colors()
                emoji = sentiment_colors.get(sentiment, "❓")
                st.markdown(
                    f"""
                    {emoji} Sentiment: {sentiment}

                    Confidence: {confidence:.2f}

                    Audio Source: {audio_name}

                    Model: Wav2Vec2 (Fine-tuned on RAVDESS + CREMA-D)
                    """,
                    unsafe_allow_html=True,
                )
    else:
        if input_method == "Upload Audio File":
            st.info("Please upload an audio file to begin analysis.")
        else:
            st.info("Click the microphone button above to record audio for analysis.")
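
# NOTE (illustrative only): the info box on the audio page describes the
# preprocessing expected from src.models.audio_model / src.utils.preprocessing
# (resample to 16 kHz, keep at most 5 seconds). A minimal sketch of that
# resample-and-truncate step is given below, assuming librosa is available; it is
# for reference only and is not called anywhere in the app.
def _example_prepare_audio(audio_bytes: bytes, target_sr: int = 16000, max_seconds: int = 5):
    """Hypothetical helper: decode in-memory audio, resample to 16 kHz, truncate to 5 s."""
    import io

    import librosa

    # librosa resamples to target_sr while decoding the in-memory file object.
    waveform, sr = librosa.load(io.BytesIO(audio_bytes), sr=target_sr, mono=True)
    # Keep at most `max_seconds` of audio, matching the stated training setup.
    return waveform[: target_sr * max_seconds], sr
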
def render_vision_sentiment_page():
    """Render the vision sentiment analysis page."""
    st.title("Vision Sentiment Analysis")
    st.markdown(
        "Analyze the sentiment of your images using our fine-tuned ResNet-50 model."
    )

    st.info(
        "**Note**: Images will be automatically preprocessed to match FER2013 format: face detection, grayscale conversion, and 224x224 resize (converted to 3-channel RGB)."
    )

    # Face cropping is set to 0% (no padding) for the tightest crop
    st.info("**Face Cropping**: Set to 0% padding for tightest crop on facial features")

    # Model status
    model, device, num_classes = load_vision_model()
    if model is None:
        st.error(
            "Vision model could not be loaded. Please check the Google Drive setup."
        )
        st.info(
            "Expected: Models should be configured in Google Drive and accessible via the model manager."
        )
    else:
        st.success(
            f"Vision model loaded successfully on {device} with {num_classes} classes!"
        )

    # Input method selection
    st.subheader("Choose Input Method")
    input_method = st.radio(
        "Select how you want to provide an image:",
        ["Upload Image File", "Take Photo with Camera"],
        horizontal=True,
    )

    # Initialize both inputs so the hints at the bottom of the page work even
    # when nothing has been provided yet (a locals() check is unreliable here).
    uploaded_image = None
    camera_photo = None

    if input_method == "Upload Image File":
        # File uploader
        uploaded_image = st.file_uploader(
            "Choose an image file",
            type=SUPPORTED_IMAGE_FORMATS,
            help="Supported formats: PNG, JPG, JPEG, BMP, TIFF",
        )

        if uploaded_image is not None:
            # Display image
            image = Image.open(uploaded_image)
            st.image(
                image,
                caption=f"Uploaded Image: {uploaded_image.name}",
                use_container_width=True,
            )

            # File info
            file_info = get_file_info(uploaded_image)
            st.info(
                f"File: {file_info['name']} | Size: {format_file_size(file_info['size_bytes'])} | Dimensions: {image.size[0]}x{image.size[1]}"
            )

            # Analyze button
            if st.button(
                "Analyze Image Sentiment", type="primary", use_container_width=True
            ):
                if model is None:
                    st.error("Model not loaded. Cannot analyze image.")
                else:
                    with st.spinner("Analyzing image sentiment..."):
                        sentiment, confidence = predict_vision_sentiment(image)

                    # Display results
                    st.markdown("### Results")
                    col1, col2 = st.columns(2)
                    with col1:
                        st.metric("Sentiment", sentiment)
                    with col2:
                        st.metric("Confidence", f"{confidence:.2f}")

                    # Color-coded sentiment display
                    sentiment_colors = get_sentiment_colors()
                    emoji = sentiment_colors.get(sentiment, "❓")
                    st.markdown(
                        f"""
                        {emoji} Sentiment: {sentiment}

                        Confidence: {confidence:.2f}

                        Image File: {uploaded_image.name}

                        Model: ResNet-50 (Fine-tuned on FER2013)
                        """,
                        unsafe_allow_html=True,
                    )
    else:
        # Camera capture
        st.markdown(
            """
            Camera Capture

            Take a photo directly with your camera to analyze its sentiment.

            Note: Make sure your camera is accessible and you have permission to use it.
            """,
            unsafe_allow_html=True,
        )

        # Camera input
        camera_photo = st.camera_input(
            "Take a photo",
            help="Click the camera button to take a photo, or use the upload button to select an existing photo",
        )

        if camera_photo is not None:
            # Display captured image
            image = Image.open(camera_photo)
            st.image(
                image,
                caption="Captured Photo",
                use_container_width=True,
            )

            # Image info
            st.info(
                f"Captured Photo | Dimensions: {image.size[0]}x{image.size[1]} | Format: {image.format}"
            )

            # Analyze button
            if st.button(
                "Analyze Photo Sentiment", type="primary", use_container_width=True
            ):
                if model is None:
                    st.error("Model not loaded. Cannot analyze image.")
                else:
                    with st.spinner("Analyzing photo sentiment..."):
                        sentiment, confidence = predict_vision_sentiment(image)

                    # Display results
                    st.markdown("### Results")
                    col1, col2 = st.columns(2)
                    with col1:
                        st.metric("Sentiment", sentiment)
                    with col2:
                        st.metric("Confidence", f"{confidence:.2f}")

                    # Color-coded sentiment display
                    sentiment_colors = get_sentiment_colors()
                    emoji = sentiment_colors.get(sentiment, "❓")
                    st.markdown(
                        f"""
                        {emoji} Sentiment: {sentiment}

                        Confidence: {confidence:.2f}

                        Image Source: Camera Capture

                        Model: ResNet-50 (Fine-tuned on FER2013)
                        """,
                        unsafe_allow_html=True,
                    )

    # Show a hint if no image has been provided yet
    if input_method == "Upload Image File" and uploaded_image is None:
        st.info("Please upload an image file to begin analysis.")
    elif input_method == "Take Photo with Camera" and camera_photo is None:
        st.info("Click the camera button above to take a photo for analysis.")
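
# NOTE (illustrative only): the vision page states that images are preprocessed to
# the FER2013 training format (face crop, grayscale, 224x224, replicated to three
# channels). A minimal PIL-only sketch of the resize/grayscale part is given below;
# face detection is deliberately omitted, and the real logic lives in
# src.models.vision_model / src.utils.preprocessing. This helper is unused.
def _example_prepare_image(image):
    """Hypothetical helper: grayscale -> 224x224 -> 3-channel RGB PIL image."""
    grayscale = image.convert("L")          # single-channel grayscale
    resized = grayscale.resize((224, 224))  # FER2013-style input size
    return resized.convert("RGB")           # replicate to 3 channels for ResNet-50
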
def render_fused_model_page():
    """Render the fused model analysis page."""
    st.title("Fused Model Analysis")
    st.markdown(
        "Combine predictions from all three models for enhanced sentiment analysis."
    )

    st.markdown(
        """
        Multi-Modal Sentiment Analysis

        This page allows you to input text, audio, and/or image data to get a comprehensive sentiment analysis using all three models combined.
        """,
        unsafe_allow_html=True,
    )

    # Input sections
    col1, col2 = st.columns(2)

    with col1:
        st.subheader("Text Input")
        text_input = st.text_area(
            "Enter text (optional):",
            height=100,
            placeholder="Type or paste your text here...",
        )

        st.subheader("Audio Input")
        # Audio preprocessing information for the fused model
        st.info(
            "**Audio Preprocessing**: Audio will be automatically processed to match CREMA-D + RAVDESS training format: "
            "16kHz sampling rate, max 5 seconds, with automatic resampling and feature extraction."
        )

        # Audio input method for the fused model
        audio_input_method = st.radio(
            "Audio input method:",
            ["Upload File", "Record Audio"],
            key="fused_audio_method",
            horizontal=True,
        )

        if audio_input_method == "Upload File":
            uploaded_audio = st.file_uploader(
                "Upload audio file (optional):",
                type=SUPPORTED_AUDIO_FORMATS,
                key="fused_audio",
            )
            audio_source = "uploaded_file"
            audio_name = uploaded_audio.name if uploaded_audio else None
        else:
            # Audio recorder for the fused model
            recorded_audio = st.audio_input(
                label="Record audio (optional):",
                key="fused_audio_recorder",
                help="Click to record audio for sentiment analysis",
            )
            if recorded_audio is not None:
                st.audio(recorded_audio, format="audio/wav")
                st.success("Audio recorded successfully!")
                uploaded_audio = recorded_audio
                audio_source = "recorded"
                audio_name = "Recorded Audio"
            else:
                uploaded_audio = None
                audio_source = None
                audio_name = None

    with col2:
        st.subheader("Image Input")
        # Face cropping is set to 0% (no padding) for the tightest crop
        st.info(
            "**Face Cropping**: Set to 0% padding for tightest crop on facial features"
        )

        # Image input method for the fused model
        image_input_method = st.radio(
            "Image input method:",
            ["Upload File", "Take Photo"],
            key="fused_image_method",
            horizontal=True,
        )

        if image_input_method == "Upload File":
            uploaded_image = st.file_uploader(
                "Upload image file (optional):",
                type=SUPPORTED_IMAGE_FORMATS,
                key="fused_image",
            )
            if uploaded_image:
                image = Image.open(uploaded_image)
                st.image(image, caption="Uploaded Image", use_container_width=True)
        else:
            # Camera capture for the fused model
            camera_photo = st.camera_input(
                "Take a photo (optional):",
                key="fused_camera",
                help="Click to take a photo for sentiment analysis",
            )
            # Default to no image so the checks below never hit an undefined name
            uploaded_image = None
            if camera_photo:
                image = Image.open(camera_photo)
                st.image(image, caption="Captured Photo", use_container_width=True)
                # Set uploaded_image to camera_photo for processing
                uploaded_image = camera_photo

    if uploaded_audio:
        st.audio(
            uploaded_audio, format=f'audio/{uploaded_audio.name.split(".")[-1]}'
        )

    # Analyze button
    if st.button("Run Fused Analysis", type="primary", use_container_width=True):
        if text_input or uploaded_audio or uploaded_image:
            with st.spinner("Running fused sentiment analysis..."):
                # Prepare inputs
                audio_bytes = uploaded_audio.getvalue() if uploaded_audio else None
                image = Image.open(uploaded_image) if uploaded_image else None

                # Get fused prediction
                sentiment, confidence = predict_fused_sentiment(
                    text=text_input if text_input else None,
                    audio_bytes=audio_bytes,
                    image=image,
                )

                # Display results
                st.markdown("### Fused Model Results")
                col1, col2 = st.columns(2)
                with col1:
                    st.metric("Final Sentiment", sentiment)
                with col2:
                    st.metric("Overall Confidence", f"{confidence:.2f}")

                # Show individual model results
                st.markdown("### Individual Model Results")
                results_data = []

                if text_input:
                    text_sentiment, text_conf = predict_text_sentiment(text_input)
                    results_data.append(
                        {
                            "Model": "Text (TextBlob)",
                            "Input": f"Text: {text_input[:50]}...",
                            "Sentiment": text_sentiment,
                            "Confidence": f"{text_conf:.2f}",
                        }
                    )

                if uploaded_audio:
                    audio_sentiment, audio_conf = predict_audio_sentiment(audio_bytes)
                    results_data.append(
                        {
                            "Model": "Audio (Wav2Vec2)",
                            "Input": f"Audio: {audio_name}",
                            "Sentiment": audio_sentiment,
                            "Confidence": f"{audio_conf:.2f}",
                        }
                    )

                if uploaded_image:
                    # Face cropping is set to 0% (no padding) for the tightest crop
                    vision_sentiment, vision_conf = predict_vision_sentiment(
                        image, crop_tightness=0.0
                    )
                    results_data.append(
                        {
                            "Model": "Vision (ResNet-50)",
                            "Input": f"Image: {uploaded_image.name}",
                            "Sentiment": vision_sentiment,
                            "Confidence": f"{vision_conf:.2f}",
                        }
                    )

                if results_data:
                    df = pd.DataFrame(results_data)
                    st.dataframe(df, use_container_width=True)

                # Final result display
                sentiment_colors = get_sentiment_colors()
                emoji = sentiment_colors.get(sentiment, "❓")
                st.markdown(
                    f"""
                    {emoji} Final Fused Sentiment: {sentiment}

                    Overall Confidence: {confidence:.2f}

                    Models Used: {len(results_data)}
                    """,
                    unsafe_allow_html=True,
                )
        else:
            st.warning(
                "Please provide at least one input (text, audio, or image) for fused analysis."
            )
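
# NOTE (illustrative only): predict_fused_sentiment() is imported from
# src.models.fused_model and its actual fusion rule is not visible in this file.
# A confidence-weighted vote over per-modality (label, confidence) pairs is one
# plausible late-fusion strategy and is sketched below as an assumption, not the
# project's implementation; the "Neutral" fallback label is also assumed.
def _example_late_fusion(predictions):
    """Hypothetical fusion: sum confidences per label and return the best label."""
    if not predictions:
        return "Neutral", 0.0

    scores = {}
    total = 0.0
    for label, confidence in predictions:  # e.g. [("Positive", 0.8), ("Neutral", 0.4)]
        scores[label] = scores.get(label, 0.0) + confidence
        total += confidence

    best_label = max(scores, key=scores.get)
    # Normalise the winning score so the fused confidence stays in [0, 1].
    return best_label, (scores[best_label] / total) if total else 0.0
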
def render_max_fusion_page():
    """Render the Max Fusion page for video-based analysis."""
    st.title("Max Fusion - Multi-Modal Sentiment Analysis")
    st.markdown(
        """
        Ultimate Multi-Modal Sentiment Analysis

        Take photos with your camera or upload videos to get comprehensive sentiment analysis from multiple modalities:
        """,
        unsafe_allow_html=True,
    )

    # Video input method selection
    st.subheader("Video Input")
    video_input_method = st.radio(
        "Choose input method:",
        ["Upload Video File", "Record Video (Coming Soon)"],
        horizontal=True,
        index=0,  # Default to upload video
    )

    if video_input_method == "Record Video (Coming Soon)":
        # Coming Soon message for video recording
        st.info("đŸŽĨ Video recording feature is coming soon!")
        st.info("📁 Please use the Upload Video File option for now.")

        # Show a nice coming soon message
        st.markdown("---")
        col1, col2, col3 = st.columns([1, 2, 1])
        with col2:
            st.markdown(
                """
                🚧 Coming Soon 🚧

                Video recording feature is under development

                Use Upload Video File for now!
                """,
                unsafe_allow_html=True,
            )

        # Placeholder for future recording functionality
        st.markdown(
            """
            **Future Features:**
            - Real-time video recording with camera
            - Audio capture during recording
            - Automatic frame extraction
            - Live transcription
            - WebRTC integration for low-latency streaming
            """
        )

        # Skip all the recording logic for now
        uploaded_video = None
        video_source = None
        video_name = None
        video_file = None

    elif video_input_method == "Upload Video File":
        # File upload option
        st.markdown(
            """
            📁 Upload Video File

            Upload a video file for comprehensive multimodal analysis.

            Supported Formats: MP4, AVI, MOV, MKV, WMV, FLV

            Recommended: Videos with clear audio and visual content
            """,
            unsafe_allow_html=True,
        )

        uploaded_video = st.file_uploader(
            "Choose a video file",
            type=SUPPORTED_VIDEO_FORMATS,
            help="Supported formats: MP4, AVI, MOV, MKV, WMV, FLV",
        )
        video_source = "uploaded_file"
        video_name = uploaded_video.name if uploaded_video else None
        video_file = uploaded_video

    if video_file is not None:
        # Display video or photo
        if video_source == "camera_photo":
            # Retained for a future camera-photo mode; the current input methods
            # only ever set video_source to "uploaded_file" or None.
            # For camera photos, the image has already been displayed above.
            st.info("Source: Camera Photo | Ready for vision analysis")

            # Add audio upload option for camera photo mode
            st.subheader("đŸŽĩ Audio Input for Analysis")
            st.info(
                "Since we're using a photo, please upload an audio file for audio sentiment analysis:"
            )
            uploaded_audio = st.file_uploader(
                "Upload audio file for audio analysis:",
                type=SUPPORTED_AUDIO_FORMATS,
                key="camera_audio",
                help="Upload an audio file to complement the photo analysis",
            )
            if uploaded_audio:
                st.audio(
                    uploaded_audio, format=f'audio/{uploaded_audio.name.split(".")[-1]}'
                )
                st.success("✅ Audio uploaded successfully!")
                audio_bytes = uploaded_audio.getvalue()
            else:
                audio_bytes = None
                st.warning("âš ī¸ Please upload an audio file for complete analysis")
        else:
            # For uploaded videos
            st.video(video_file)
            file_info = get_file_info(video_file)
            st.info(
                f"File: {file_info['name']} | Size: {format_file_size(file_info['size_bytes'])}"
            )
            audio_bytes = None  # Will be extracted from the video

        # Video Processing Pipeline
        st.subheader("đŸŽŦ Video Processing Pipeline")

        # Initialize variables
        frames = []
        audio_bytes = None
        transcribed_text = ""

        # Process uploaded video
        if uploaded_video:
            st.info("📁 Processing uploaded video file...")

            # Extract frames
            st.markdown("**1. đŸŽĨ Frame Extraction**")
            frames = extract_frames_from_video(uploaded_video, max_frames=5)
            if frames:
                st.success(f"✅ Extracted {len(frames)} representative frames")

                # Display extracted frames
                cols = st.columns(len(frames))
                for i, frame in enumerate(frames):
                    with cols[i]:
                        st.image(
                            frame, caption=f"Frame {i+1}", use_container_width=True
                        )
            else:
                st.warning("âš ī¸ Could not extract frames from video")
                frames = []

            # Extract audio
            st.markdown("**2. đŸŽĩ Audio Extraction**")
            audio_bytes = extract_audio_from_video(uploaded_video)
            if audio_bytes:
                st.success("✅ Audio extracted successfully")
                st.audio(audio_bytes, format="audio/wav")
            else:
                st.warning("âš ī¸ Could not extract audio from video")
                audio_bytes = None

            # Transcribe audio
            st.markdown("**3. 📝 Audio Transcription**")
            if audio_bytes:
                transcribed_text = transcribe_audio(audio_bytes)
                if transcribed_text:
                    st.success("✅ Audio transcribed successfully")
                    st.markdown(f'**Transcribed Text:** "{transcribed_text}"')
                else:
                    st.warning("âš ī¸ Could not transcribe audio")
                    transcribed_text = ""
            else:
                transcribed_text = ""
                st.info("â„šī¸ No audio available for transcription")

        # Analysis button
        if st.button(
            "🚀 Run Max Fusion Analysis", type="primary", use_container_width=True
        ):
            with st.spinner(
                "🔄 Processing video and running comprehensive analysis..."
            ):
                # Run individual analyses
                st.subheader("🔍 Individual Model Analysis")
                results_data = []

                # Vision analysis (use the first frame for uploaded videos)
                if frames:
                    st.markdown("**Vision Analysis:**")
                    # For uploaded videos, use the first frame
                    vision_sentiment, vision_conf = predict_vision_sentiment(
                        frames[0], crop_tightness=0.0
                    )
                    results_data.append(
                        {
                            "Model": "Vision (ResNet-50)",
                            "Input": "Video Frame 1",
                            "Sentiment": vision_sentiment,
                            "Confidence": f"{vision_conf:.2f}",
                        }
                    )
                    st.success(
                        f"Vision: {vision_sentiment} (Confidence: {vision_conf:.2f})"
                    )

                # Audio analysis
                if audio_bytes:
                    st.markdown("**Audio Analysis:**")
                    audio_sentiment, audio_conf = predict_audio_sentiment(audio_bytes)
                    results_data.append(
                        {
                            "Model": "Audio (Wav2Vec2)",
                            "Input": "Video Audio",
                            "Sentiment": audio_sentiment,
                            "Confidence": f"{audio_conf:.2f}",
                        }
                    )
                    st.success(
                        f"Audio: {audio_sentiment} (Confidence: {audio_conf:.2f})"
                    )

                # Text analysis
                if transcribed_text:
                    st.markdown("**Text Analysis:**")
                    text_sentiment, text_conf = predict_text_sentiment(transcribed_text)
                    results_data.append(
                        {
                            "Model": "Text (TextBlob)",
                            "Input": f"Transcribed: {transcribed_text[:50]}...",
                            "Sentiment": text_sentiment,
                            "Confidence": f"{text_conf:.2f}",
                        }
                    )
                    st.success(f"Text: {text_sentiment} (Confidence: {text_conf:.2f})")

                # Run fused analysis
                st.subheader("đŸŽ¯ Max Fusion Results")
                if results_data:
                    # Display results table
                    df = pd.DataFrame(results_data)
                    st.dataframe(df, use_container_width=True)

                    # Calculate fused sentiment
                    image_for_fusion = frames[0] if frames else None
                    sentiment, confidence = predict_fused_sentiment(
                        text=transcribed_text if transcribed_text else None,
                        audio_bytes=audio_bytes,
                        image=image_for_fusion,
                    )

                    # Display final results
                    col1, col2 = st.columns(2)
                    with col1:
                        st.metric("đŸŽ¯ Final Sentiment", sentiment)
                    with col2:
                        st.metric("📊 Overall Confidence", f"{confidence:.2f}")

                    # Color-coded sentiment display
                    sentiment_colors = get_sentiment_colors()
                    emoji = sentiment_colors.get(sentiment, "❓")
                    st.markdown(
                        f"""
                        {emoji} Max Fusion Sentiment: {sentiment}

                        Overall Confidence: {confidence:.2f}

                        Modalities Analyzed: {len(results_data)}

                        Video Source: {video_name}

                        Analysis Type: Comprehensive Multi-Modal Sentiment Analysis
                        """,
                        unsafe_allow_html=True,
                    )
                else:
                    st.error(
                        "❌ No analysis could be performed. Please check your video input."
                    )
    else:
        if video_input_method == "Record Video (Coming Soon)":
            st.info(
                "đŸŽĨ Video recording feature is coming soon! Please use Upload Video File for now."
            )
        else:
            st.info("📁 Please upload a video file to begin Max Fusion analysis.")
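
# NOTE (illustrative only): extract_frames_from_video() is imported from
# src.utils.preprocessing. One common way to grab a handful of evenly spaced
# frames with OpenCV is sketched below (write the uploaded bytes to a temporary
# file, then seek by frame index); this is an assumed approach for reference, not
# the project's actual code, and this helper is never called by the app.
def _example_extract_frames(video_bytes: bytes, max_frames: int = 5):
    """Hypothetical helper: return up to `max_frames` evenly spaced RGB frames."""
    import tempfile

    import cv2

    frames = []
    with tempfile.NamedTemporaryFile(suffix=".mp4") as tmp:
        tmp.write(video_bytes)
        tmp.flush()
        capture = cv2.VideoCapture(tmp.name)
        total = int(capture.get(cv2.CAP_PROP_FRAME_COUNT)) or 1
        for i in range(max_frames):
            # Jump to an evenly spaced position in the video.
            capture.set(cv2.CAP_PROP_POS_FRAMES, i * total // max_frames)
            ok, frame = capture.read()
            if not ok:
                break
            # OpenCV decodes as BGR; convert to RGB before handing to PIL-based models.
            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
        capture.release()
    return frames
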
def main():
    """Main application function."""
    # Sidebar navigation
    st.sidebar.title("Sentiment Analysis")
    st.sidebar.markdown("---")

    # Navigation
    page = st.sidebar.selectbox(
        "Choose a page:",
        [
            "Home",
            "Text Sentiment",
            "Audio Sentiment",
            "Vision Sentiment",
            "Fused Model",
            "Max Fusion",
        ],
    )

    # Page routing
    if page == "Home":
        render_home_page()
    elif page == "Text Sentiment":
        render_text_sentiment_page()
    elif page == "Audio Sentiment":
        render_audio_sentiment_page()
    elif page == "Vision Sentiment":
        render_vision_sentiment_page()
    elif page == "Fused Model":
        render_fused_model_page()
    elif page == "Max Fusion":
        render_max_fusion_page()

    # Footer
    st.markdown("---")
    st.markdown(
        """
        Built with ❤ī¸ | by iamfaham

        Version: {version}
        """.format(version=APP_VERSION),
        unsafe_allow_html=True,
    )


if __name__ == "__main__":
    main()