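"""GPT-4 Vision (gpt-4o) implementation of the VLMService interface."""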
import asyncio
import base64
import json
import re
from typing import Any, Dict, List

import openai

from .vlm_service import VLMService, ModelType


class GPT4VService(VLMService):
    """GPT-4 Vision service implementation."""

    def __init__(self, api_key: str):
        super().__init__("GPT4V", ModelType.GPT4V)
        self.client = openai.OpenAI(api_key=api_key)
        self.model_name = "gpt-4o"
    async def generate_caption(self, image_bytes: bytes, prompt: str, metadata_instructions: str = "") -> Dict[str, Any]:
        """Generate a caption for a single image using GPT-4 Vision."""
        try:
            # Encode the raw image bytes as base64 for the data-URL payload
            image_base64 = base64.b64encode(image_bytes).decode('utf-8')
            # Run the blocking OpenAI call in a worker thread so the event loop stays free
            response = await asyncio.to_thread(
                self.client.chat.completions.create,
                model="gpt-4o",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt + "\n\n" + metadata_instructions},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{image_base64}"
                                }
                            }
                        ]
                    }
                ],
                max_tokens=800
            )
            content = response.choices[0].message.content
            # Strip Markdown code fences the model often wraps around JSON output
            cleaned_content = content.strip()
            if cleaned_content.startswith("```json"):
                cleaned_content = cleaned_content[7:]
            if cleaned_content.endswith("```"):
                cleaned_content = cleaned_content[:-3]
            cleaned_content = cleaned_content.strip()
            metadata = {}
            try:
                metadata = json.loads(cleaned_content)
            except json.JSONDecodeError:
                # Fallback 1: pull the JSON out of an embedded ```json ... ``` block
                if "```json" in content:
                    json_start = content.find("```json") + 7
                    json_end = content.find("```", json_start)
                    if json_end > json_start:
                        json_str = content[json_start:json_end].strip()
                        try:
                            metadata = json.loads(json_str)
                        except json.JSONDecodeError as e:
                            print(f"JSON parse error: {e}")
                else:
                    # Fallback 2: match a balanced JSON object containing a
                    # "metadata" key anywhere in the response text
                    json_match = re.search(r'\{[^{}]*"metadata"\s*:\s*\{[^{}]*\}[^{}]*\}', content)
                    if json_match:
                        try:
                            metadata = json.loads(json_match.group())
                        except json.JSONDecodeError:
                            pass
            # Extract the three parts from the parsed JSON
            description = metadata.get("description", "")
            analysis = metadata.get("analysis", "")
            recommended_actions = metadata.get("recommended_actions", "")
            # Combine all three parts for backward compatibility
            combined_content = f"Description: {description}\n\nAnalysis: {analysis}\n\nRecommended Actions: {recommended_actions}"
            return {
                "caption": combined_content,
                "raw_response": {
                    "content": content,
                    "metadata": metadata,
                    "extracted_metadata": metadata
                },
                "metadata": metadata,
                "description": description,
                "analysis": analysis,
                "recommended_actions": recommended_actions
            }
        except Exception as e:
            # Re-raise with context while preserving the original traceback
            raise Exception(f"GPT-4 Vision API error: {e}") from e

    async def generate_multi_image_caption(self, image_bytes_list: List[bytes], prompt: str, metadata_instructions: str = "") -> Dict[str, Any]:
        """Generate a caption for multiple images using GPT-4 Vision."""
        try:
            # Build the message content: the text prompt followed by every image.
            # Named message_content so the response text below does not shadow it.
            message_content = [{"type": "text", "text": prompt + "\n\n" + metadata_instructions}]
            for image_bytes in image_bytes_list:
                image_base64 = base64.b64encode(image_bytes).decode('utf-8')
                message_content.append({
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{image_base64}"
                    }
                })
            response = await asyncio.to_thread(
                self.client.chat.completions.create,
                model="gpt-4o",
                messages=[
                    {
                        "role": "user",
                        "content": message_content
                    }
                ],
                max_tokens=1200  # Increased to cover responses about multiple images
            )
            content = response.choices[0].message.content
            # Strip Markdown code fences the model often wraps around JSON output
            cleaned_content = content.strip()
            if cleaned_content.startswith("```json"):
                cleaned_content = cleaned_content[7:]
            if cleaned_content.endswith("```"):
                cleaned_content = cleaned_content[:-3]
            cleaned_content = cleaned_content.strip()
            metadata = {}
            try:
                metadata = json.loads(cleaned_content)
            except json.JSONDecodeError:
                # Fallback 1: pull the JSON out of an embedded ```json ... ``` block
                if "```json" in content:
                    json_start = content.find("```json") + 7
                    json_end = content.find("```", json_start)
                    if json_end > json_start:
                        json_str = content[json_start:json_end].strip()
                        try:
                            metadata = json.loads(json_str)
                        except json.JSONDecodeError as e:
                            print(f"JSON parse error: {e}")
                else:
                    # Fallback 2: match a balanced JSON object containing a
                    # "metadata" key, as in generate_caption above
                    json_match = re.search(r'\{[^{}]*"metadata"\s*:\s*\{[^{}]*\}[^{}]*\}', content)
                    if json_match:
                        try:
                            metadata = json.loads(json_match.group())
                        except json.JSONDecodeError:
                            pass
            # Extract the three parts from the parsed JSON
            description = metadata.get("description", "")
            analysis = metadata.get("analysis", "")
            recommended_actions = metadata.get("recommended_actions", "")
            # Combine all three parts for backward compatibility
            combined_content = f"Description: {description}\n\nAnalysis: {analysis}\n\nRecommended Actions: {recommended_actions}"
            return {
                "caption": combined_content,
                "raw_response": {
                    "content": content,
                    "metadata": metadata,
                    "extracted_metadata": metadata,
                    "image_count": len(image_bytes_list)
                },
                "metadata": metadata,
                "description": description,
                "analysis": analysis,
                "recommended_actions": recommended_actions
            }
        except Exception as e:
            # Re-raise with context while preserving the original traceback
            raise Exception(f"GPT-4 Vision API error: {e}") from e