import asyncio
import base64
import json
import re
from typing import Any, Dict, List

import openai

from .vlm_service import VLMService, ModelType

class GPT4VService(VLMService):
    """GPT-4 Vision service implementation"""
    
    def __init__(self, api_key: str):
        super().__init__("GPT4V", ModelType.GPT4V)
        self.client = openai.OpenAI(api_key=api_key)
        self.model_name = "gpt-4o"  # OpenAI model identifier passed to every chat.completions call
    
    async def generate_caption(self, image_bytes: bytes, prompt: str, metadata_instructions: str = "") -> Dict[str, Any]:
        """Generate caption using GPT-4 Vision"""
        try:
            image_base64 = base64.b64encode(image_bytes).decode('utf-8')
            
            # The OpenAI client is synchronous, so run the request in a worker thread
            response = await asyncio.to_thread(
                self.client.chat.completions.create,
                model=self.model_name,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt + "\n\n" + metadata_instructions},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{image_base64}"
                                }
                            }
                        ]
                    }
                ],
                max_tokens=800
            )
            
            content = response.choices[0].message.content
            
            # Strip a surrounding ```json ... ``` fence if the model wrapped its reply in one
            cleaned_content = content.strip()
            if cleaned_content.startswith("```json"):
                cleaned_content = cleaned_content[7:]
            if cleaned_content.endswith("```"):
                cleaned_content = cleaned_content[:-3]
            cleaned_content = cleaned_content.strip()
            
            metadata = {}
            try:
                # First attempt: the whole (unfenced) reply is a JSON object
                metadata = json.loads(cleaned_content)
            except json.JSONDecodeError:
                if "```json" in content:
                    # Second attempt: pull the body out of an embedded ```json fence
                    json_start = content.find("```json") + 7
                    json_end = content.find("```", json_start)
                    if json_end > json_start:
                        json_str = content[json_start:json_end].strip()
                        try:
                            metadata = json.loads(json_str)
                        except json.JSONDecodeError as e:
                            print(f"JSON parse error: {e}")
                else:
                    # Last resort: grab an object containing a nested "metadata" object;
                    # the trailing brace keeps the match a complete JSON document
                    json_match = re.search(r'\{[^{}]*"metadata"[^{}]*\{[^{}]*\}[^{}]*\}', content)
                    if json_match:
                        try:
                            metadata = json.loads(json_match.group())
                        except json.JSONDecodeError:
                            pass
            
            # Extract the three parts from the parsed JSON
            description = metadata.get("description", "")
            analysis = metadata.get("analysis", "")
            recommended_actions = metadata.get("recommended_actions", "")
            
            # Combine all three parts for backward compatibility
            combined_content = f"Description: {description}\n\nAnalysis: {analysis}\n\nRecommended Actions: {recommended_actions}"
            
            return {
                "caption": combined_content,
                "raw_response": {
                    "content": content, 
                    "metadata": metadata,
                    "extracted_metadata": metadata
                },
                "metadata": metadata,
                "description": description,
                "analysis": analysis,
                "recommended_actions": recommended_actions
            }
            
        except Exception as e:
            # Surface the failure with context and chain the original error for debugging
            raise Exception(f"GPT-4 Vision API error: {e}") from e
    
    async def generate_multi_image_caption(self, image_bytes_list: List[bytes], prompt: str, metadata_instructions: str = "") -> Dict[str, Any]:
        """Generate caption for multiple images using GPT-4 Vision"""
        try:
            # Build the message content: the text prompt followed by every image
            message_content = [{"type": "text", "text": prompt + "\n\n" + metadata_instructions}]
            
            for image_bytes in image_bytes_list:
                image_base64 = base64.b64encode(image_bytes).decode('utf-8')
                message_content.append({
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{image_base64}"
                    }
                })
            
            # The OpenAI client is synchronous, so run the request in a worker thread
            response = await asyncio.to_thread(
                self.client.chat.completions.create,
                model=self.model_name,
                messages=[
                    {
                        "role": "user",
                        "content": message_content
                    }
                ],
                max_tokens=1200  # Larger token budget to cover multiple images
            )
            
            content = response.choices[0].message.content
            
            # Strip a surrounding ```json ... ``` fence if the model wrapped its reply in one
            cleaned_content = content.strip()
            if cleaned_content.startswith("```json"):
                cleaned_content = cleaned_content[7:]
            if cleaned_content.endswith("```"):
                cleaned_content = cleaned_content[:-3]
            cleaned_content = cleaned_content.strip()
            
            metadata = {}
            try:
                # First attempt: the whole (unfenced) reply is a JSON object
                metadata = json.loads(cleaned_content)
            except json.JSONDecodeError:
                if "```json" in content:
                    # Second attempt: pull the body out of an embedded ```json fence
                    json_start = content.find("```json") + 7
                    json_end = content.find("```", json_start)
                    if json_end > json_start:
                        json_str = content[json_start:json_end].strip()
                        try:
                            metadata = json.loads(json_str)
                        except json.JSONDecodeError as e:
                            print(f"JSON parse error: {e}")
                else:
                    # Last resort: grab an object containing a nested "metadata" object;
                    # the trailing brace keeps the match a complete JSON document
                    json_match = re.search(r'\{[^{}]*"metadata"[^{}]*\{[^{}]*\}[^{}]*\}', content)
                    if json_match:
                        try:
                            metadata = json.loads(json_match.group())
                        except json.JSONDecodeError:
                            pass
            
            # Extract the three parts from the parsed JSON
            description = metadata.get("description", "")
            analysis = metadata.get("analysis", "")
            recommended_actions = metadata.get("recommended_actions", "")
            
            # Combine all three parts for backward compatibility
            combined_content = f"Description: {description}\n\nAnalysis: {analysis}\n\nRecommended Actions: {recommended_actions}"
            
            return {
                "caption": combined_content,
                "raw_response": {
                    "content": content, 
                    "metadata": metadata,
                    "extracted_metadata": metadata,
                    "image_count": len(image_bytes_list)
                },
                "metadata": metadata,
                "description": description,
                "analysis": analysis,
                "recommended_actions": recommended_actions
            }
            
        except Exception as e:
            # Surface the failure with context and chain the original error for debugging
            raise Exception(f"GPT-4 Vision API error: {e}") from e
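
# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module). It drives the
# single-image path end to end and assumes OPENAI_API_KEY is set in the
# environment; the import path, image file and prompts below are placeholders.
# ---------------------------------------------------------------------------
#
#   import asyncio
#   import os
#
#   from app.services.gpt4v_service import GPT4VService  # adjust to the real package path
#
#   async def _demo():
#       service = GPT4VService(api_key=os.environ["OPENAI_API_KEY"])
#       with open("example.jpg", "rb") as f:
#           image_bytes = f.read()
#       result = await service.generate_caption(
#           image_bytes,
#           prompt="Describe the scene in this image.",
#           metadata_instructions=(
#               'Respond with a JSON object containing "description", '
#               '"analysis" and "recommended_actions".'
#           ),
#       )
#       print(result["caption"])
#
#   asyncio.run(_demo())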