#!/usr/bin/env python3
"""
MCP Video Analysis Client with Llama 3 Integration

This application serves as an MCP (Model Context Protocol) client that:
1. Connects to video analysis tools via MCP
2. Integrates with a Llama 3 model hosted on Modal for intelligent video understanding
3. Provides a Gradio interface for user interaction
"""

import os
import json
import logging
from typing import Dict, Any, Optional

import gradio as gr
import httpx

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
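
# Endpoint configuration is read from the environment:
#   MODAL_VIDEO_ANALYSIS_ENDPOINT_URL - video analysis backend (a default URL is provided below)
#   MODAL_LLAMA3_ENDPOINT_URL         - Llama 3 inference service (no default; insights are disabled if unset)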

class MCPVideoAnalysisClient:
    """MCP client for video analysis with Llama 3 integration."""

    def __init__(self):
        # Modal backend for video processing
        self.video_analysis_endpoint = os.getenv(
            "MODAL_VIDEO_ANALYSIS_ENDPOINT_URL",
            "https://jomasego--video-analysis-gradio-pipeline-process-video-analysis.modal.run"
        )
        # Modal backend for Llama 3 insights
        self.llama_endpoint = os.getenv(
            "MODAL_LLAMA3_ENDPOINT_URL"
            # This will be set to the deployed Llama 3 app URL,
            # e.g. "https://jomasego--llama3-inference-service-summarize.modal.run"
        )

        logger.info("Initialized MCP Client.")
        logger.info(f"Video Analysis Endpoint: {self.video_analysis_endpoint}")
        if not self.llama_endpoint:
            logger.warning("MODAL_LLAMA3_ENDPOINT_URL not set. LLM insights will be unavailable.")
        else:
            logger.info(f"Llama 3 Endpoint: {self.llama_endpoint}")
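
    # Note: the video analysis backend is assumed to accept a JSON body of the form
    # {"video_url": "<url>"}; the structure of the JSON it returns depends on the
    # deployed Modal pipeline and is passed through to the UI unchanged.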

    async def analyze_video_with_modal(self, video_url: str) -> Dict[str, Any]:
        """Call the Modal backend for comprehensive video analysis."""
        try:
            async with httpx.AsyncClient(timeout=300.0) as client:
                logger.info(f"Calling video analysis backend: {video_url}")
                response = await client.post(
                    self.video_analysis_endpoint,
                    json={"video_url": video_url},
                    headers={"Content-Type": "application/json"}
                )
                response.raise_for_status()
                return response.json()
        except Exception as e:
            logger.error(f"Error calling video analysis backend: {e}")
            return {"error": f"Video analysis backend error: {str(e)}"}
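
    # Note: the Llama 3 service is assumed to accept a payload of the form
    # {"analysis_data": {...}, "user_query": "..."} and to respond with a JSON
    # object containing a "summary" field.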

    async def get_insights_from_llama3(self, analysis_data: Dict[str, Any], user_query: Optional[str] = None) -> str:
        """Call the Llama 3 Modal backend for intelligent insights."""
        if not self.llama_endpoint:
            return "Llama 3 endpoint is not configured. Cannot generate insights."
        try:
            payload = {
                "analysis_data": analysis_data,
                "user_query": user_query
            }
            async with httpx.AsyncClient(timeout=300.0) as client:
                logger.info("Calling Llama 3 Modal backend for insights.")
                response = await client.post(
                    self.llama_endpoint,
                    json=payload,
                    headers={"Content-Type": "application/json"}
                )
                response.raise_for_status()
                result = response.json()
                return result.get("summary", "No summary returned from Llama 3 service.")
        except Exception as e:
            logger.error(f"Error calling Llama 3 backend: {e}")
            return f"Error generating Llama 3 insights: {str(e)}"

    async def process_video_request(self, video_url: str, user_query: Optional[str] = None) -> tuple[str, str]:
        """Process a complete video analysis request with Llama 3 enhancement."""
        if not video_url or not video_url.strip():
            return "Please provide a valid video URL.", ""
        try:
            # Step 1: Get video analysis from the Modal backend
            logger.info(f"Starting video analysis for: {video_url}")
            video_analysis = await self.analyze_video_with_modal(video_url.strip())

            # Step 2: Format the raw analysis for display
            raw_analysis = json.dumps(video_analysis, indent=2)

            # Step 3: Enhance with Llama 3 insights
            logger.info("Generating Llama 3 insights...")
            llama_insights = await self.get_insights_from_llama3(video_analysis, user_query)

            return llama_insights, raw_analysis
        except Exception as e:
            error_msg = f"Error processing video request: {str(e)}"
            logger.error(error_msg)
            return error_msg, ""
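
# The client can also be used programmatically, outside the Gradio UI.
# A minimal sketch (the URL is a placeholder; requires an asyncio event loop):
#
#     import asyncio
#     client = MCPVideoAnalysisClient()
#     insights, raw = asyncio.run(
#         client.process_video_request("https://example.com/video.mp4", "What happens in this video?")
#     )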

# Initialize the MCP client
try:
    mcp_client = MCPVideoAnalysisClient()
    logger.info("MCP Video Analysis Client initialized successfully")
except Exception as e:
    logger.error(f"Failed to initialize MCP client: {e}")
    mcp_client = None

# Gradio Interface Functions
async def analyze_video_interface(video_url: str, user_query: Optional[str] = None) -> tuple[str, str]:
    """Gradio interface function for video analysis."""
    if not mcp_client:
        return "MCP Client not initialized. Please check your environment variables.", ""
    return await mcp_client.process_video_request(video_url, user_query)
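
# Gradio accepts async functions as event handlers, so analyze_video_interface
# can be passed directly to the click() events wired up below.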

def create_gradio_interface():
    """Create and configure the Gradio interface."""
    with gr.Blocks(
        title="MCP Video Analysis with Llama 3",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            max-width: 1200px !important;
        }
        .main-header {
            text-align: center;
            margin-bottom: 30px;
        }
        .analysis-output {
            max-height: 600px;
            overflow-y: auto;
        }
        """
    ) as interface:
        gr.HTML("""
            <div class="main-header">
                <h1>🎥 MCP Video Analysis with Llama 3 AI</h1>
                <p>Intelligent video content analysis powered by a Modal backend and Llama 3</p>
            </div>
        """)

        with gr.Tab("🎬 Video Analysis"):
            with gr.Row():
                with gr.Column(scale=1):
                    video_url_input = gr.Textbox(
                        label="Video URL",
                        placeholder="Enter YouTube URL or direct video link...",
                        lines=2
                    )
                    user_query_input = gr.Textbox(
                        label="Specific Question (Optional)",
                        placeholder="Ask a specific question about the video...",
                        lines=2
                    )
                    with gr.Row():
                        analyze_btn = gr.Button("🔍 Analyze Video", variant="primary", size="lg")
                        clear_btn = gr.Button("🗑️ Clear", variant="secondary")
                with gr.Column(scale=2):
                    llama_output = gr.Textbox(
                        label="🤖 Llama 3 AI Insights",
                        lines=20,
                        elem_classes=["analysis-output"],
                        interactive=False
                    )
            with gr.Row():
                raw_analysis_output = gr.JSON(
                    label="📊 Raw Analysis Data",
                    elem_classes=["analysis-output"]
                )

            # Example videos
            gr.HTML("<h3>📋 Example Videos to Try:</h3>")
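            # Each example button pre-fills the URL box with its link; the lambda
            # binds the URL through a default argument so every button keeps its own value.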
            with gr.Row():
                example_urls = [
                    "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
                    "https://www.youtube.com/watch?v=jNQXAC9IVRw",
                    "https://www.youtube.com/watch?v=9bZkp7q19f0"
                ]
                for i, url in enumerate(example_urls, 1):
                    gr.Button(f"Example {i}", size="sm").click(
                        lambda url=url: url, outputs=video_url_input
                    )

        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
            ## About MCP Video Analysis

            This application combines multiple AI technologies to provide comprehensive video analysis:

            ### 🔧 Technology Stack
            - **Modal Backend**: Scalable cloud compute for video processing and LLM inference
            - **Whisper**: Speech-to-text transcription
            - **Computer Vision Models**: Object detection, action recognition, and captioning
            - **Meta Llama 3**: Advanced AI for intelligent content analysis
            - **MCP Protocol**: Model Context Protocol for seamless integration

            ### 🎯 Features
            - **Transcription**: Extract spoken content from videos
            - **Visual Analysis**: Identify objects, actions, and scenes
            - **Content Understanding**: AI-powered insights and summaries
            - **Custom Queries**: Ask specific questions about video content

            ### 🚀 Usage
            1. Enter a video URL (YouTube or direct link)
            2. Optionally ask a specific question
            3. Click "Analyze Video" to get comprehensive insights
            4. Review both Llama 3's intelligent analysis and the raw data

            ### 🔒 Privacy & Security
            - Video processing is handled securely in the cloud
            - No video data is stored permanently
            - API keys are handled securely via environment variables
            """)

        # Event handlers
        def clear_all():
            # None (rather than "") clears the JSON component safely
            return "", "", "", None

        analyze_btn.click(
            fn=analyze_video_interface,
            inputs=[video_url_input, user_query_input],
            outputs=[llama_output, raw_analysis_output],
            show_progress=True
        )

        clear_btn.click(
            fn=clear_all,
            outputs=[video_url_input, user_query_input, llama_output, raw_analysis_output]
        )

    return interface

# Create and launch the interface
if __name__ == "__main__":
    interface = create_gradio_interface()
    interface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )
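
# Running this file directly serves the UI on 0.0.0.0:7860, the host/port a
# Hugging Face Space expects by default. Set MODAL_LLAMA3_ENDPOINT_URL in the
# environment before launching to enable the Llama 3 insights panel.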