#!/usr/bin/env python3
"""
MCP Video Analysis Client with Llama 3 Integration
This application serves as an MCP (Model Context Protocol) client that:
1. Connects to video analysis tools via MCP
2. Integrates with a Llama 3 model hosted on Modal for intelligent video understanding
3. Provides a Gradio interface for user interaction
"""
import os
import logging
from typing import Dict, Any, Optional
import gradio as gr
import httpx
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class MCPVideoAnalysisClient:
    """MCP client for video analysis with Llama 3 integration."""

    def __init__(self):
        # Modal backend for video processing
        self.video_analysis_endpoint = os.getenv(
            "MODAL_VIDEO_ANALYSIS_ENDPOINT_URL",
            "https://jomasego--video-analysis-gradio-pipeline-process-video-analysis.modal.run"
        )
        # Modal backend for Llama 3 insights
        self.llama_endpoint = os.getenv(
            "MODAL_LLAMA3_ENDPOINT_URL"
            # This will be set to the deployed Llama 3 app URL,
            # e.g. "https://jomasego--llama3-inference-service-summarize.modal.run"
        )
        logger.info("Initialized MCP Client.")
        logger.info(f"Video Analysis Endpoint: {self.video_analysis_endpoint}")
        if not self.llama_endpoint:
            logger.warning("MODAL_LLAMA3_ENDPOINT_URL not set. LLM insights will be unavailable.")
        else:
            logger.info(f"Llama 3 Endpoint: {self.llama_endpoint}")
    async def analyze_video_with_modal(self, video_url: str) -> Dict[str, Any]:
        """Call the Modal backend for comprehensive video analysis."""
        try:
            async with httpx.AsyncClient(timeout=300.0) as client:
                logger.info(f"Calling video analysis backend: {video_url}")
                response = await client.post(
                    self.video_analysis_endpoint,
                    json={"video_url": video_url},
                    headers={"Content-Type": "application/json"}
                )
                response.raise_for_status()
                return response.json()
        except Exception as e:
            logger.error(f"Error calling video analysis backend: {e}")
            return {"error": f"Video analysis backend error: {str(e)}"}
    async def get_insights_from_llama3(self, analysis_data: Dict[str, Any], user_query: Optional[str] = None) -> str:
        """Call the Llama 3 Modal backend for intelligent insights."""
        if not self.llama_endpoint:
            return "Llama 3 endpoint is not configured. Cannot generate insights."
        try:
            payload = {
                "analysis_data": analysis_data,
                "user_query": user_query
            }
            async with httpx.AsyncClient(timeout=300.0) as client:
                logger.info("Calling Llama 3 Modal backend for insights.")
                response = await client.post(
                    self.llama_endpoint,
                    json=payload,
                    headers={"Content-Type": "application/json"}
                )
                response.raise_for_status()
                result = response.json()
                return result.get("summary", "No summary returned from Llama 3 service.")
        except Exception as e:
            logger.error(f"Error calling Llama 3 backend: {e}")
            return f"Error generating Llama 3 insights: {str(e)}"
    async def process_video_request(self, video_url: str, user_query: Optional[str] = None) -> tuple[str, Dict[str, Any]]:
        """Process a complete video analysis request with Llama 3 enhancement.

        Returns (Llama 3 insights, raw analysis dict for the gr.JSON output).
        """
        if not video_url or not video_url.strip():
            return "Please provide a valid video URL.", {}
        try:
            # Step 1: Get the video analysis from the Modal backend
            logger.info(f"Starting video analysis for: {video_url}")
            video_analysis = await self.analyze_video_with_modal(video_url.strip())
            # Step 2: Enhance with Llama 3 insights
            logger.info("Generating Llama 3 insights...")
            llama_insights = await self.get_insights_from_llama3(video_analysis, user_query)
            # Return the raw dict as-is; gr.JSON renders it directly
            return llama_insights, video_analysis
        except Exception as e:
            error_msg = f"Error processing video request: {str(e)}"
            logger.error(error_msg)
            return error_msg, {}
# Initialize the MCP client
try:
    mcp_client = MCPVideoAnalysisClient()
    logger.info("MCP Video Analysis Client initialized successfully")
except Exception as e:
    logger.error(f"Failed to initialize MCP client: {e}")
    mcp_client = None
# Gradio Interface Functions
async def analyze_video_interface(video_url: str, user_query: Optional[str] = None) -> tuple[str, Dict[str, Any]]:
    """Gradio interface function for video analysis."""
    if not mcp_client:
        return "MCP Client not initialized. Please check your environment variables.", {}
    return await mcp_client.process_video_request(video_url, user_query)
def create_gradio_interface():
    """Create and configure the Gradio interface."""
    with gr.Blocks(
        title="MCP Video Analysis with Llama 3",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            max-width: 1200px !important;
        }
        .main-header {
            text-align: center;
            margin-bottom: 30px;
        }
        .analysis-output {
            max-height: 600px;
            overflow-y: auto;
        }
        """
    ) as interface:
        gr.HTML("""
            <div class="main-header">
                <h1>πŸŽ₯ MCP Video Analysis with Llama 3 AI</h1>
                <p>Intelligent video content analysis powered by a Modal backend and Llama 3</p>
            </div>
        """)
        with gr.Tab("πŸ” Video Analysis"):
            with gr.Row():
                with gr.Column(scale=1):
                    video_url_input = gr.Textbox(
                        label="Video URL",
                        placeholder="Enter YouTube URL or direct video link...",
                        lines=2
                    )
                    user_query_input = gr.Textbox(
                        label="Specific Question (Optional)",
                        placeholder="Ask a specific question about the video...",
                        lines=2
                    )
                    with gr.Row():
                        analyze_btn = gr.Button("πŸš€ Analyze Video", variant="primary", size="lg")
                        clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
                with gr.Column(scale=2):
                    llama_output = gr.Textbox(
                        label="πŸ€– Llama 3 AI Insights",
                        lines=20,
                        elem_classes=["analysis-output"],
                        interactive=False
                    )
            with gr.Row():
                raw_analysis_output = gr.JSON(
                    label="πŸ“Š Raw Analysis Data",
                    elem_classes=["analysis-output"]
                )
            # Example videos
            gr.HTML("<h3>πŸ“ Example Videos to Try:</h3>")
            with gr.Row():
                example_urls = [
                    "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
                    "https://www.youtube.com/watch?v=jNQXAC9IVRw",
                    "https://www.youtube.com/watch?v=9bZkp7q19f0"
                ]
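                # The url=url default argument freezes each iteration's URL in
                # the lambda below, so every example button fills in its own link.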
                for i, url in enumerate(example_urls, 1):
                    gr.Button(f"Example {i}", size="sm").click(
                        lambda url=url: url, outputs=video_url_input
                    )
        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
            ## About MCP Video Analysis

            This application combines multiple AI technologies to provide comprehensive video analysis:

            ### πŸ”§ Technology Stack
            - **Modal Backend**: Scalable cloud compute for video processing and LLM inference
            - **Whisper**: Speech-to-text transcription
            - **Computer Vision Models**: Object detection, action recognition, and captioning
            - **Meta Llama 3**: Advanced AI for intelligent content analysis
            - **MCP Protocol**: Model Context Protocol for seamless integration

            ### 🎯 Features
            - **Transcription**: Extract spoken content from videos
            - **Visual Analysis**: Identify objects, actions, and scenes
            - **Content Understanding**: AI-powered insights and summaries
            - **Custom Queries**: Ask specific questions about video content

            ### πŸš€ Usage
            1. Enter a video URL (YouTube or direct link)
            2. Optionally ask a specific question
            3. Click "Analyze Video" to get comprehensive insights
            4. Review both Llama 3's intelligent analysis and the raw data

            ### πŸ”’ Privacy & Security
            - Video processing is handled securely in the cloud
            - No video data is stored permanently
            - API keys are handled securely via environment variables
            """)
        # Event handlers
        def clear_all():
            # Reset both inputs, the insights box, and the JSON output
            return "", "", "", None

        analyze_btn.click(
            fn=analyze_video_interface,
            inputs=[video_url_input, user_query_input],
            outputs=[llama_output, raw_analysis_output],
            show_progress=True
        )
        clear_btn.click(
            fn=clear_all,
            outputs=[video_url_input, user_query_input, llama_output, raw_analysis_output]
        )
    return interface
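# Example of exercising the client directly, outside Gradio (a minimal sketch;
# the URL is one of the example links above, purely illustrative):
#
#   import asyncio
#   client = MCPVideoAnalysisClient()
#   insights, raw = asyncio.run(
#       client.process_video_request("https://www.youtube.com/watch?v=dQw4w9WgXcQ")
#   )
#   print(insights)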
# Create and launch the interface
if __name__ == "__main__":
    interface = create_gradio_interface()
    interface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )