import mimetypes
from typing import Optional, Dict, Any

import requests

from .utils import encode_image_to_base64, validate_file_exists, get_env_var, logger


class MultimodalTools:
    """Free multimodal AI tools using OpenRouter and other free services."""

    def __init__(self, openrouter_key: Optional[str] = None):
        self.openrouter_key = openrouter_key or get_env_var("OPENROUTER_API_KEY", None)
        if not self.openrouter_key:
            logger.warning("No OpenRouter API key provided; API requests will fail until one is set")
        self.openrouter_url = "https://openrouter.ai/api/v1/chat/completions"
        self.headers = {
            "Authorization": f"Bearer {self.openrouter_key}",
            "Content-Type": "application/json",
            # Optional OpenRouter attribution headers; replace with your own app's URL and name
            "HTTP-Referer": "https://your-app.com",
            "X-Title": "Multimodal Tools",
        }

        # Vision and text tasks currently share the same Gemini preview model
        self.vision_model = "google/gemini-2.5-flash-preview-05-20"
        self.text_model = "google/gemini-2.5-flash-preview-05-20"

    def _make_openrouter_request(self, payload: Dict[str, Any]) -> str:
        """Make request to OpenRouter API with error handling"""
        try:
            response = requests.post(
                self.openrouter_url,
                headers=self.headers,
                json=payload,
                timeout=30,
            )
            response.raise_for_status()

            result = response.json()
            if result.get("choices"):
                return result["choices"][0]["message"]["content"]
            logger.error(f"Unexpected response format: {result}")
            return "Error: Invalid response format"

        except requests.exceptions.RequestException as e:
            logger.error(f"OpenRouter API request failed: {e}")
            return f"Error making API request: {e}"
        except Exception as e:
            logger.error(f"Unexpected error: {e}")
            return f"Unexpected error: {e}"
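
    # For reference, a sketch of the payload shape this helper expects. OpenRouter
    # exposes an OpenAI-style chat completions schema, so the fields mirror that
    # format; the model string is whatever the caller selects:
    #
    #     {
    #         "model": "google/gemini-2.5-flash-preview-05-20",
    #         "messages": [{"role": "user", "content": "Hello"}],
    #         "temperature": 0,
    #         "max_tokens": 1024,
    #     }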

    def analyze_image(self, image_path: str, question: str = "Describe this image in detail") -> str:
        """
        Analyze image content using multimodal AI

        Args:
            image_path: Path to image file
            question: Question about the image

        Returns:
            AI analysis of the image
        """
        if not validate_file_exists(image_path):
            return f"Error: Image file not found at {image_path}"

        try:
            encoded_image = encode_image_to_base64(image_path)
            # Guess the MIME type from the file extension rather than assuming
            # JPEG; fall back to image/jpeg when it cannot be determined
            mime_type = mimetypes.guess_type(image_path)[0] or "image/jpeg"

            payload = {
                "model": self.vision_model,
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": question},
                            {
                                "type": "image_url",
                                "image_url": {"url": f"data:{mime_type};base64,{encoded_image}"},
                            },
                        ],
                    }
                ],
                "temperature": 0,
                "max_tokens": 1024,
            }

            return self._make_openrouter_request(payload)

        except Exception as e:
            error_msg = f"Error analyzing image: {e}"
            logger.error(error_msg)
            return error_msg

    def extract_text_from_image(self, image_path: str) -> str:
        """
        Extract text from image using OCR via multimodal AI

        Args:
            image_path: Path to image file

        Returns:
            Extracted text from image
        """
        ocr_prompt = (
            "Extract all visible text from this image. "
            "Return only the text content without any additional commentary or formatting. "
            "If no text is visible, return 'No text found'."
        )

        return self.analyze_image(image_path, ocr_prompt)

    def analyze_audio_transcript(self, transcript: str, question: str = "Summarize this audio content") -> str:
        """
        Analyze audio content via transcript

        Args:
            transcript: Audio transcript text
            question: Question about the audio content

        Returns:
            AI analysis of the audio content
        """
        if not transcript.strip():
            return "Error: Empty transcript provided"

        try:
            payload = {
                "model": self.text_model,
                "messages": [
                    {
                        "role": "user",
                        "content": f"Audio transcript: {transcript}\n\nQuestion: {question}",
                    }
                ],
                "temperature": 0,
                "max_tokens": 1024,
            }

            return self._make_openrouter_request(payload)

        except Exception as e:
            error_msg = f"Error analyzing audio transcript: {e}"
            logger.error(error_msg)
            return error_msg

    def describe_image(self, image_path: str) -> str:
        """Get a detailed description of an image"""
        return self.analyze_image(
            image_path,
            "Provide a detailed, objective description of this image including "
            "objects, people, colors, setting, and any notable details.",
        )

    def answer_visual_question(self, image_path: str, question: str) -> str:
        """Answer a specific question about an image"""
        return self.analyze_image(image_path, question)


def analyze_image(image_path: str, question: str = "Describe this image in detail") -> str:
    """Standalone function to analyze an image"""
    tools = MultimodalTools()
    return tools.analyze_image(image_path, question)


def extract_text(image_path: str) -> str:
    """Standalone function to extract text from an image"""
    tools = MultimodalTools()
    return tools.extract_text_from_image(image_path)


def analyze_transcript(transcript: str, question: str = "Summarize this content") -> str:
    """Standalone function to analyze audio transcript"""
    tools = MultimodalTools()
    return tools.analyze_audio_transcript(transcript, question)
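

if __name__ == "__main__":
    # Minimal usage sketch. Assumes OPENROUTER_API_KEY is set in the environment;
    # "sample.jpg" is a hypothetical placeholder path, not a file shipped with
    # this module.
    tools = MultimodalTools()
    print(tools.describe_image("sample.jpg"))
    print(tools.extract_text_from_image("sample.jpg"))
    print(tools.analyze_audio_transcript(
        "Welcome to the show. Today we discuss free multimodal tooling.",
        "What is the main topic?",
    ))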