# Spaces: jomasego / mcp-video-frontend (Sleeping)
# File size: 10,275 Bytes

#!/usr/bin/env python3
"""
MCP Video Analysis Client with Llama 3 Integration

This application serves as an MCP (Model Context Protocol) client that:
1. Connects to video analysis tools via MCP
2. Integrates with a Llama 3 model hosted on Modal for intelligent video understanding
3. Provides a Gradio interface for user interaction
"""

import os
import json
import logging
from typing import Dict, Any, Optional
import gradio as gr
import httpx

# Configure the root logger at INFO level, then create the module-level
# logger used by everything below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class MCPVideoAnalysisClient:
    """Client that chains a video-analysis backend with a Llama 3 summarizer.

    Both backends are HTTP endpoints called with JSON ``POST`` requests.
    Endpoint URLs are read from environment variables at construction time:

    * ``MODAL_VIDEO_ANALYSIS_ENDPOINT_URL`` -- video analysis endpoint; a
      built-in default URL is used when the variable is unset.
    * ``MODAL_LLAMA3_ENDPOINT_URL`` -- Llama 3 endpoint; it has no default, so
      insights are unavailable until it is set.
    """

    # Timeout, in seconds, applied to every backend request.
    REQUEST_TIMEOUT_SECONDS = 300.0

    def __init__(self):
        # Modal backend for video processing
        self.video_analysis_endpoint = os.getenv(
            "MODAL_VIDEO_ANALYSIS_ENDPOINT_URL",
            "https://jomasego--video-analysis-gradio-pipeline-process-video-analysis.modal.run"
        )

        # Modal backend for Llama 3 insights
        self.llama_endpoint = os.getenv(
            "MODAL_LLAMA3_ENDPOINT_URL"
            # This will be set to the deployed Llama 3 app URL.
            # e.g., "https://jomasego--llama3-inference-service-summarize.modal.run"
        )

        logger.info("Initialized MCP Client.")
        logger.info("Video Analysis Endpoint: %s", self.video_analysis_endpoint)
        if not self.llama_endpoint:
            logger.warning("MODAL_LLAMA3_ENDPOINT_URL not set. LLM insights will be unavailable.")
        else:
            logger.info("Llama 3 Endpoint: %s", self.llama_endpoint)

    @staticmethod
    def _error_document(message: str) -> str:
        """Return *message* wrapped as a JSON document ``{"error": message}``."""
        return json.dumps({"error": message}, indent=2)

    @staticmethod
    def _is_error_only(payload: Any) -> bool:
        """Return True when *payload* is a dict whose only key is ``"error"``.

        That is exactly the shape ``analyze_video_with_modal`` produces when
        the backend call fails.
        """
        return isinstance(payload, dict) and set(payload) == {"error"}

    async def analyze_video_with_modal(self, video_url: str) -> Dict[str, Any]:
        """Call the video analysis backend for *video_url*.

        Returns:
            The decoded JSON response on success. On any failure (transport
            error, non-2xx status, undecodable body) the exception is logged
            and ``{"error": "<message>"}`` is returned instead of raising.
        """
        try:
            async with httpx.AsyncClient(timeout=self.REQUEST_TIMEOUT_SECONDS) as client:
                logger.info("Calling video analysis backend: %s", video_url)
                response = await client.post(
                    self.video_analysis_endpoint,
                    json={"video_url": video_url},
                    headers={"Content-Type": "application/json"}
                )
                response.raise_for_status()
                return response.json()
        except Exception as e:
            # Boundary handler: callers expect an error dict, never an exception.
            logger.exception("Error calling video analysis backend: %s", e)
            return {"error": f"Video analysis backend error: {str(e)}"}

    async def get_insights_from_llama3(self, analysis_data: Dict[str, Any], user_query: Optional[str] = None) -> str:
        """Ask the Llama 3 backend to summarize *analysis_data*.

        Args:
            analysis_data: Analysis payload forwarded as ``"analysis_data"``.
            user_query: Optional question forwarded as ``"user_query"``.

        Returns:
            The ``"summary"`` field of the response, a fixed fallback text when
            the response carries no summary, or a human-readable message when
            the endpoint is not configured or the call fails. Never raises.
        """
        if not self.llama_endpoint:
            return "Llama 3 endpoint is not configured. Cannot generate insights."

        try:
            payload = {
                "analysis_data": analysis_data,
                "user_query": user_query
            }
            async with httpx.AsyncClient(timeout=self.REQUEST_TIMEOUT_SECONDS) as client:
                logger.info("Calling Llama 3 Modal backend for insights.")
                response = await client.post(
                    self.llama_endpoint,
                    json=payload,
                    headers={"Content-Type": "application/json"}
                )
                response.raise_for_status()
                result = response.json()
                if not isinstance(result, dict):
                    # A JSON list/scalar cannot carry a "summary" field.
                    return "No summary returned from Llama 3 service."
                return result.get("summary", "No summary returned from Llama 3 service.")
        except Exception as e:
            # Boundary handler: the UI shows the returned text verbatim.
            logger.exception("Error calling Llama 3 backend: %s", e)
            return f"Error generating Llama 3 insights: {str(e)}"

    async def process_video_request(self, video_url: str, user_query: Optional[str] = None) -> tuple[str, str]:
        """Run video analysis and then Llama 3 summarization for one request.

        Args:
            video_url: URL of the video; surrounding whitespace is stripped.
            user_query: Optional question passed through to the Llama 3 step.

        Returns:
            ``(insights, raw_analysis)``. ``raw_analysis`` is always a valid
            JSON document: the pretty-printed backend response on success, or
            ``{"error": "<message>"}`` when the request could not be processed.
            When the video analysis itself failed, ``insights`` is the backend
            error message and the Llama 3 step is skipped.
        """
        if not video_url or not video_url.strip():
            message = "Please provide a valid video URL."
            return message, self._error_document(message)

        try:
            # Step 1: Get video analysis from Modal backend
            logger.info("Starting video analysis for: %s", video_url)
            video_analysis = await self.analyze_video_with_modal(video_url.strip())

            # Step 2: Format the raw analysis for display
            raw_analysis = json.dumps(video_analysis, indent=2)

            # Do not ask the LLM to "summarize" a failed analysis.
            if self._is_error_only(video_analysis):
                return str(video_analysis["error"]), raw_analysis

            # Step 3: Enhance with Llama 3 insights
            logger.info("Generating Llama 3 insights...")
            llama_insights = await self.get_insights_from_llama3(video_analysis, user_query)

            return llama_insights, raw_analysis

        except Exception as e:
            error_msg = f"Error processing video request: {str(e)}"
            logger.exception(error_msg)
            return error_msg, self._error_document(error_msg)

# Build the shared client once at import time. On failure the global is left
# as None so that callers can detect the missing client instead of crashing.
try:
    mcp_client = MCPVideoAnalysisClient()
except Exception as init_error:
    logger.error(f"Failed to initialize MCP client: {init_error}")
    mcp_client = None
else:
    logger.info("MCP Video Analysis Client initialized successfully")

# Gradio Interface Functions
async def analyze_video_interface(video_url: str, user_query: Optional[str] = None) -> tuple[str, str]:
    """Gradio callback: analyze *video_url* and return ``(insights, raw_json)``.

    Delegates to the module-level ``mcp_client``. When that client could not
    be created, a message is returned as the insights text and the same
    message wrapped as ``{"error": ...}`` is returned as the raw analysis, so
    the second value is always a parseable JSON document for the JSON output
    component (an empty string is not valid JSON).
    """
    if mcp_client is None:
        message = "MCP Client not initialized. Please check your environment variables."
        return message, json.dumps({"error": message}, indent=2)

    return await mcp_client.process_video_request(video_url, user_query)

def create_gradio_interface():
    """Build the Gradio Blocks UI and wire up its event handlers.

    The layout has two tabs: an analysis tab (URL and question inputs, the
    Llama 3 text output, the raw JSON output and three example-URL buttons)
    and a static "About" tab.

    Returns:
        The configured ``gr.Blocks`` instance; the caller is responsible for
        launching it.
    """

    with gr.Blocks(
        title="MCP Video Analysis with Llama 3",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            max-width: 1200px !important;
        }
        .main-header {
            text-align: center;
            margin-bottom: 30px;
        }
        .analysis-output {
            max-height: 600px;
            overflow-y: auto;
        }
        """
    ) as interface:

        gr.HTML("""
        <div class="main-header">
            <h1>🎥 MCP Video Analysis with Llama 3 AI</h1>
            <p>Intelligent video content analysis powered by a Modal backend and Llama 3</p>
        </div>
        """)

        with gr.Tab("🔍 Video Analysis"):
            with gr.Row():
                with gr.Column(scale=1):
                    video_url_input = gr.Textbox(
                        label="Video URL",
                        placeholder="Enter YouTube URL or direct video link...",
                        lines=2
                    )
                    user_query_input = gr.Textbox(
                        label="Specific Question (Optional)",
                        placeholder="Ask a specific question about the video...",
                        lines=2
                    )

                    with gr.Row():
                        analyze_btn = gr.Button("🚀 Analyze Video", variant="primary", size="lg")
                        clear_btn = gr.Button("🗑️ Clear", variant="secondary")

                with gr.Column(scale=2):
                    llama_output = gr.Textbox(
                        label="🤖 Llama 3 AI Insights",
                        lines=20,
                        elem_classes=["analysis-output"],
                        interactive=False
                    )

            with gr.Row():
                raw_analysis_output = gr.JSON(
                    label="📊 Raw Analysis Data",
                    elem_classes=["analysis-output"]
                )

            # Example videos
            gr.HTML("<h3>📝 Example Videos to Try:</h3>")
            with gr.Row():
                example_urls = [
                    "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
                    "https://www.youtube.com/watch?v=jNQXAC9IVRw",
                    "https://www.youtube.com/watch?v=9bZkp7q19f0"
                ]
                for i, url in enumerate(example_urls, 1):
                    # Bind ``url`` as a default argument so each button keeps
                    # its own URL instead of the loop's final value.
                    gr.Button(f"Example {i}", size="sm").click(
                        lambda url=url: url, outputs=video_url_input
                    )

        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
            ## About MCP Video Analysis
            
            This application combines multiple AI technologies to provide comprehensive video analysis:
            
            ### 🔧 Technology Stack
            - **Modal Backend**: Scalable cloud compute for video processing and LLM inference
            - **Whisper**: Speech-to-text transcription
            - **Computer Vision Models**: Object detection, action recognition, and captioning
            - **Meta Llama 3**: Advanced AI for intelligent content analysis
            - **MCP Protocol**: Model Context Protocol for seamless integration
            
            ### 🎯 Features
            - **Transcription**: Extract spoken content from videos
            - **Visual Analysis**: Identify objects, actions, and scenes
            - **Content Understanding**: AI-powered insights and summaries
            - **Custom Queries**: Ask specific questions about video content
            
            ### 🚀 Usage
            1. Enter a video URL (YouTube or direct link)
            2. Optionally ask a specific question
            3. Click "Analyze Video" to get comprehensive insights
            4. Review both Llama 3's intelligent analysis and raw data
            
            ### 🔒 Privacy & Security
            - Video processing is handled securely in the cloud
            - No video data is stored permanently
            - API keys are handled securely via environment variables
            """)

        # Event handlers
        def clear_all():
            """Reset both inputs and both outputs.

            The JSON output is cleared with ``None``: a string sent to a JSON
            component is parsed as JSON, and an empty string is not valid JSON.
            """
            return "", "", "", None

        analyze_btn.click(
            fn=analyze_video_interface,
            inputs=[video_url_input, user_query_input],
            outputs=[llama_output, raw_analysis_output],
            show_progress=True
        )

        clear_btn.click(
            fn=clear_all,
            outputs=[video_url_input, user_query_input, llama_output, raw_analysis_output]
        )

    return interface

# Script entry point: build the UI and serve it on all interfaces, port 7860,
# without a public share link and with errors surfaced in the UI.
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    interface = create_gradio_interface()
    interface.launch(**launch_options)