SCGR's picture
multi upload
351d460
from .vlm_service import VLMService, ModelType
from typing import Dict, Any, List
import openai
import base64
import asyncio
import json
class GPT4VService(VLMService):
    """GPT-4 Vision service implementation.

    Sends a text prompt plus one or more images to the OpenAI chat
    completions endpoint and parses the JSON object found in the reply into
    ``description`` / ``analysis`` / ``recommended_actions`` fields.
    """

    # Model identifier passed to the chat completions call. ``model_name``
    # (set in ``__init__``) is a separate attribute and is left untouched.
    _API_MODEL = "gpt-4o"
    # Opening marker of a fenced JSON block in the model's reply.
    _JSON_FENCE = "```json"

    def __init__(self, api_key: str):
        """Create the OpenAI client used for every request.

        Args:
            api_key: Key handed to ``openai.OpenAI``.
        """
        super().__init__("GPT4V", ModelType.GPT4V)
        self.client = openai.OpenAI(api_key=api_key)
        self.model_name = "GPT-4O"

    async def generate_caption(self, image_bytes: bytes, prompt: str, metadata_instructions: str = "") -> Dict[str, Any]:
        """Generate caption using GPT-4 Vision.

        Args:
            image_bytes: Raw bytes of the image to describe.
            prompt: Main instruction text.
            metadata_instructions: Extra text appended to ``prompt`` after a
                blank line.

        Returns:
            Dict with keys ``caption``, ``raw_response``, ``metadata``,
            ``description``, ``analysis`` and ``recommended_actions``.

        Raises:
            Exception: Any failure is re-raised as
                ``"GPT-4 Vision API error: ..."`` with the original exception
                chained as ``__cause__``.
        """
        return await self._request_caption(
            [image_bytes],
            prompt,
            metadata_instructions,
            max_tokens=800,
            include_image_count=False,
        )

    async def generate_multi_image_caption(self, image_bytes_list: List[bytes], prompt: str, metadata_instructions: str = "") -> Dict[str, Any]:
        """Generate caption for multiple images using GPT-4 Vision.

        Args:
            image_bytes_list: Raw bytes of each image, sent in list order.
            prompt: Main instruction text.
            metadata_instructions: Extra text appended to ``prompt`` after a
                blank line.

        Returns:
            Same shape as :meth:`generate_caption`, with an additional
            ``raw_response["image_count"]`` entry.

        Raises:
            Exception: Any failure is re-raised as
                ``"GPT-4 Vision API error: ..."`` with the original exception
                chained as ``__cause__``.
        """
        return await self._request_caption(
            image_bytes_list,
            prompt,
            metadata_instructions,
            max_tokens=1200,  # Increased for multiple images
            include_image_count=True,
        )

    async def _request_caption(
        self,
        images: List[bytes],
        prompt: str,
        metadata_instructions: str,
        max_tokens: int,
        include_image_count: bool,
    ) -> Dict[str, Any]:
        """Send the prompt and images in one user message and build the result dict."""
        try:
            # One text part followed by one image part per input image.
            parts: List[Dict[str, Any]] = [
                {"type": "text", "text": prompt + "\n\n" + metadata_instructions}
            ]
            parts.extend(self._image_part(image) for image in images)

            # The OpenAI client call is blocking; run it in a worker thread
            # so the event loop is not stalled.
            response = await asyncio.to_thread(
                self.client.chat.completions.create,
                model=self._API_MODEL,
                messages=[{"role": "user", "content": parts}],
                max_tokens=max_tokens,
            )

            # The reply text may be None; treat that as an empty reply so it
            # flows through the same "nothing parseable" path as free text.
            reply_text = response.choices[0].message.content or ""
            metadata = self._extract_json_metadata(reply_text)
            result = self._format_caption_result(reply_text, metadata)
            if include_image_count:
                result["raw_response"]["image_count"] = len(images)
            return result
        except Exception as e:
            # Same exception type and message as before; ``from e`` keeps the
            # original traceback available to callers and logs.
            raise Exception(f"GPT-4 Vision API error: {str(e)}") from e

    @staticmethod
    def _image_part(image_bytes: bytes) -> Dict[str, Any]:
        """Wrap raw image bytes as a base64 data-URL content part."""
        # NOTE(review): the data URL always declares image/jpeg, whatever the
        # actual format of the bytes is - confirm this is intended.
        image_base64 = base64.b64encode(image_bytes).decode('utf-8')
        return {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{image_base64}"
            }
        }

    @classmethod
    def _extract_json_metadata(cls, content: str) -> Dict[str, Any]:
        """Best-effort extraction of a JSON object from the model's reply.

        Tries, in order: the whole reply with any surrounding code fence
        removed; the first fenced ``json`` block anywhere in the reply; the
        first decodable JSON object embedded in surrounding text. Only dict
        results are accepted. Returns ``{}`` when nothing parses.
        """
        fence = cls._JSON_FENCE
        candidates: List[str] = []

        # Candidate 1: the whole reply, minus a leading/trailing fence.
        whole = content.strip()
        if whole.startswith(fence):
            whole = whole[len(fence):]
        elif whole.startswith("```"):
            whole = whole[3:]
        if whole.endswith("```"):
            whole = whole[:-3]
        candidates.append(whole.strip())

        # Candidate 2: a fenced JSON block preceded or followed by prose.
        fence_at = content.find(fence)
        if fence_at != -1:
            body_start = fence_at + len(fence)
            body_end = content.find("```", body_start)
            if body_end > body_start:
                candidates.append(content[body_start:body_end].strip())

        last_error = None
        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError as e:
                last_error = e
                continue
            if isinstance(parsed, dict):
                return parsed

        # Candidate 3: scan for an object embedded in free text. raw_decode
        # stops at the end of the first complete JSON value, so trailing prose
        # after the closing brace is tolerated.
        decoder = json.JSONDecoder()
        brace_at = content.find("{")
        while brace_at != -1:
            try:
                parsed, _ = decoder.raw_decode(content, brace_at)
            except json.JSONDecodeError as e:
                last_error = e
            else:
                if isinstance(parsed, dict):
                    return parsed
            brace_at = content.find("{", brace_at + 1)

        if last_error is not None:
            print(f"JSON parse error: {last_error}")
        return {}

    @staticmethod
    def _format_caption_result(content: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Assemble the response dict returned by the public caption methods."""
        # Extract the three parts from the parsed JSON
        description = metadata.get("description", "")
        analysis = metadata.get("analysis", "")
        recommended_actions = metadata.get("recommended_actions", "")

        # Combine all three parts for backward compatibility
        combined_content = f"Description: {description}\n\nAnalysis: {analysis}\n\nRecommended Actions: {recommended_actions}"

        return {
            "caption": combined_content,
            "raw_response": {
                "content": content,
                "metadata": metadata,
                "extracted_metadata": metadata
            },
            "metadata": metadata,
            "description": description,
            "analysis": analysis,
            "recommended_actions": recommended_actions
        }