#!/usr/bin/env python3
"""
MCP Video Analysis Client with Llama 3 Integration

This application serves as an MCP (Model Context Protocol) client that:
1. Connects to video analysis tools via MCP
2. Integrates with a Llama 3 model hosted on Modal for intelligent video understanding
3. Provides a Gradio interface for user interaction
"""

import os
import json
import logging
from typing import Dict, Any, Optional

import gradio as gr
import httpx

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class MCPVideoAnalysisClient:
    """MCP Client for video analysis with Llama 3 integration."""

    def __init__(self):
        # Modal backend for video processing; falls back to the deployed
        # default when the environment variable is not set.
        self.video_analysis_endpoint = os.getenv(
            "MODAL_VIDEO_ANALYSIS_ENDPOINT_URL",
            "https://jomasego--video-analysis-gradio-pipeline-process-video-analysis.modal.run",
        )
        # Modal backend for Llama 3 insights. Intentionally has NO default:
        # it must be set to the deployed Llama 3 app URL, e.g.
        # "https://jomasego--llama3-inference-service-summarize.modal.run"
        self.llama_endpoint = os.getenv("MODAL_LLAMA3_ENDPOINT_URL")

        logger.info("Initialized MCP Client.")
        logger.info("Video Analysis Endpoint: %s", self.video_analysis_endpoint)
        if not self.llama_endpoint:
            logger.warning(
                "MODAL_LLAMA3_ENDPOINT_URL not set. LLM insights will be unavailable."
            )
        else:
            logger.info("Llama 3 Endpoint: %s", self.llama_endpoint)

    async def analyze_video_with_modal(self, video_url: str) -> Dict[str, Any]:
        """Call the Modal backend for comprehensive video analysis.

        Args:
            video_url: URL of the video to analyze (YouTube or direct link).

        Returns:
            The backend's JSON response as a dict, or an ``{"error": ...}``
            dict if the request failed (this method never raises).
        """
        try:
            # Generous timeout: video download + transcription can take minutes.
            async with httpx.AsyncClient(timeout=300.0) as client:
                logger.info("Calling video analysis backend: %s", video_url)
                response = await client.post(
                    self.video_analysis_endpoint,
                    json={"video_url": video_url},
                    headers={"Content-Type": "application/json"},
                )
                response.raise_for_status()
                return response.json()
        except Exception as e:
            # Boundary catch-all: surface the failure as data so the UI can
            # display it instead of crashing the Gradio handler.
            logger.error("Error calling video analysis backend: %s", e)
            return {"error": f"Video analysis backend error: {str(e)}"}

    async def get_insights_from_llama3(
        self,
        analysis_data: Dict[str, Any],
        user_query: Optional[str] = None,
    ) -> str:
        """Call the Llama 3 Modal backend for intelligent insights.

        Args:
            analysis_data: Raw analysis dict from ``analyze_video_with_modal``.
            user_query: Optional user question to focus the summary on.

        Returns:
            The generated summary text, or a human-readable error message.
        """
        if not self.llama_endpoint:
            return "Llama 3 endpoint is not configured. Cannot generate insights."
        try:
            payload = {
                "analysis_data": analysis_data,
                "user_query": user_query,
            }
            async with httpx.AsyncClient(timeout=300.0) as client:
                logger.info("Calling Llama 3 Modal backend for insights.")
                response = await client.post(
                    self.llama_endpoint,
                    json=payload,
                    headers={"Content-Type": "application/json"},
                )
                response.raise_for_status()
                result = response.json()
                return result.get("summary", "No summary returned from Llama 3 service.")
        except Exception as e:
            logger.error("Error calling Llama 3 backend: %s", e)
            return f"Error generating Llama 3 insights: {str(e)}"

    async def process_video_request(
        self, video_url: str, user_query: Optional[str] = None
    ) -> tuple[str, str]:
        """Process a complete video analysis request with Llama 3 enhancement.

        Args:
            video_url: URL of the video to analyze.
            user_query: Optional specific question about the video.

        Returns:
            A ``(llama_insights, raw_analysis_json)`` tuple; on error the
            first element carries the error message and the second is ``""``.
        """
        if not video_url or not video_url.strip():
            return "Please provide a valid video URL.", ""
        try:
            # Step 1: Get video analysis from Modal backend
            logger.info("Starting video analysis for: %s", video_url)
            video_analysis = await self.analyze_video_with_modal(video_url.strip())

            # Step 2: Format the raw analysis for display
            raw_analysis = json.dumps(video_analysis, indent=2)

            # Step 3: Enhance with Llama 3 insights
            logger.info("Generating Llama 3 insights...")
            llama_insights = await self.get_insights_from_llama3(video_analysis, user_query)

            return llama_insights, raw_analysis
        except Exception as e:
            error_msg = f"Error processing video request: {str(e)}"
            logger.error(error_msg)
            return error_msg, ""


# Initialize the MCP client at import time so the Gradio handlers can use it.
try:
    mcp_client = MCPVideoAnalysisClient()
    logger.info("MCP Video Analysis Client initialized successfully")
except Exception as e:
    logger.error("Failed to initialize MCP client: %s", e)
    mcp_client = None


# Gradio Interface Functions
async def analyze_video_interface(
    video_url: str, user_query: Optional[str] = None
) -> tuple[str, str]:
    """Gradio interface function for video analysis.

    Thin async wrapper that guards against a failed client initialization
    before delegating to ``MCPVideoAnalysisClient.process_video_request``.
    """
    if not mcp_client:
        return "MCP Client not initialized. Please check your environment variables.", ""
    return await mcp_client.process_video_request(video_url, user_query)


def create_gradio_interface():
    """Create and configure the Gradio interface.

    Returns:
        The assembled ``gr.Blocks`` interface (not yet launched).
    """
    with gr.Blocks(
        title="MCP Video Analysis with Llama 3",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container { max-width: 1200px !important; }
        .main-header { text-align: center; margin-bottom: 30px; }
        .analysis-output { max-height: 600px; overflow-y: auto; }
        """,
    ) as interface:
        # NOTE(review): the original HTML markup was lost in extraction;
        # reconstructed minimally around the surviving text content.
        gr.HTML("""
        <div class="main-header">
            <h1>đŸŽĨ MCP Video Analysis with Llama 3 AI</h1>
            <p>Intelligent video content analysis powered by a Modal backend and Llama 3</p>
        </div>
        """)

        with gr.Tab("🔍 Video Analysis"):
            with gr.Row():
                with gr.Column(scale=1):
                    video_url_input = gr.Textbox(
                        label="Video URL",
                        placeholder="Enter YouTube URL or direct video link...",
                        lines=2,
                    )
                    user_query_input = gr.Textbox(
                        label="Specific Question (Optional)",
                        placeholder="Ask a specific question about the video...",
                        lines=2,
                    )
                    with gr.Row():
                        analyze_btn = gr.Button("🚀 Analyze Video", variant="primary", size="lg")
                        clear_btn = gr.Button("đŸ—‘ī¸ Clear", variant="secondary")
                with gr.Column(scale=2):
                    llama_output = gr.Textbox(
                        label="🤖 Llama 3 AI Insights",
                        lines=20,
                        elem_classes=["analysis-output"],
                        interactive=False,
                    )
            with gr.Row():
                raw_analysis_output = gr.JSON(
                    label="📊 Raw Analysis Data",
                    elem_classes=["analysis-output"],
                )

            # Example videos
            gr.HTML("<h3>📝 Example Videos to Try:</h3>")
            with gr.Row():
                example_urls = [
                    "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
                    "https://www.youtube.com/watch?v=jNQXAC9IVRw",
                    "https://www.youtube.com/watch?v=9bZkp7q19f0",
                ]
                for i, url in enumerate(example_urls, 1):
                    # Bind url as a default arg to avoid the late-binding
                    # closure pitfall inside the loop.
                    gr.Button(f"Example {i}", size="sm").click(
                        lambda url=url: url,
                        outputs=video_url_input,
                    )

        with gr.Tab("â„šī¸ About"):
            gr.Markdown("""
            ## About MCP Video Analysis

            This application combines multiple AI technologies to provide comprehensive video analysis:

            ### 🔧 Technology Stack
            - **Modal Backend**: Scalable cloud compute for video processing and LLM inference
            - **Whisper**: Speech-to-text transcription
            - **Computer Vision Models**: Object detection, action recognition, and captioning
            - **Meta Llama 3**: Advanced AI for intelligent content analysis
            - **MCP Protocol**: Model Context Protocol for seamless integration

            ### đŸŽ¯ Features
            - **Transcription**: Extract spoken content from videos
            - **Visual Analysis**: Identify objects, actions, and scenes
            - **Content Understanding**: AI-powered insights and summaries
            - **Custom Queries**: Ask specific questions about video content

            ### 🚀 Usage
            1. Enter a video URL (YouTube or direct link)
            2. Optionally ask a specific question
            3. Click "Analyze Video" to get comprehensive insights
            4. Review both Llama 3's intelligent analysis and raw data

            ### 🔒 Privacy & Security
            - Video processing is handled securely in the cloud
            - No video data is stored permanently
            - API keys are handled securely via environment variables
            """)

        # Event handlers
        def clear_all():
            # Two text inputs, the insights textbox, and the JSON panel.
            # gr.JSON is cleared with None, not an empty string.
            return "", "", "", None

        analyze_btn.click(
            fn=analyze_video_interface,
            inputs=[video_url_input, user_query_input],
            outputs=[llama_output, raw_analysis_output],
            show_progress=True,
        )
        clear_btn.click(
            fn=clear_all,
            outputs=[video_url_input, user_query_input, llama_output, raw_analysis_output],
        )

    return interface


# Create and launch the interface
if __name__ == "__main__":
    interface = create_gradio_interface()
    interface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
    )