File size: 5,211 Bytes
d7291ef
351d460
d7291ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe5d98f
d7291ef
fe5d98f
d7291ef
 
 
 
 
 
 
5778774
 
 
 
4c43a48
5778774
 
 
 
cb372e4
5778774
 
872dec2
 
 
5778774
872dec2
 
 
 
5778774
 
 
 
 
 
872dec2
 
 
5778774
 
cb372e4
5778774
4c43a48
5778774
 
 
 
 
 
872dec2
 
 
5778774
d7291ef
351d460
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d7291ef
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
from .vlm_service import VLMService, ModelType
from typing import Dict, Any, List
import asyncio
import time
import re
import json

import google.generativeai as genai


class GeminiService(VLMService):
    """Google Gemini Vision service implementation.

    Sends an instruction prompt plus one or more JPEG images to a Gemini
    model and normalizes the (expected-JSON) reply into the caption dict
    shape shared by all VLM services.
    """

    # EPSG codes the pipeline accepts in metadata; anything else is
    # normalized to the sentinel "OTHER".
    _ALLOWED_EPSG = ("4326", "3857", "32617", "32633", "32634", "OTHER")

    def __init__(self, api_key: str, model: str = "gemini-1.5-flash"):
        """Configure the Gemini SDK and instantiate the generative model.

        Args:
            api_key: Google Generative AI API key.
            model: Gemini model identifier (defaults to gemini-1.5-flash).
        """
        super().__init__("Gemini", ModelType.GEMINI_PRO_VISION)
        self.model_name = "GEMINI15"
        genai.configure(api_key=api_key)
        self.model_id = model
        self.model = genai.GenerativeModel(self.model_id)

    @staticmethod
    def _strip_json_fence(text: str) -> str:
        """Remove a leading ```json fence and trailing ``` if present."""
        if text.startswith("```json"):
            text = re.sub(r"^```json\s*", "", text)
            text = re.sub(r"\s*```$", "", text)
        return text

    @classmethod
    def _parse_content(cls, content: str) -> Dict[str, Any]:
        """Parse model output into description/analysis/actions/metadata.

        Attempts to parse *content* as (possibly fenced) JSON. On success,
        extracts the structured fields, normalizes any out-of-whitelist
        EPSG code to "OTHER", and builds the combined caption text kept
        for backward compatibility. On JSON failure, the raw text is
        returned as both ``analysis`` and ``caption``.

        Returns:
            Dict with keys: description, analysis, recommended_actions,
            caption, metadata.
        """
        cleaned = cls._strip_json_fence(content)
        try:
            parsed = json.loads(cleaned)
        except json.JSONDecodeError:
            # Not JSON: preserve the raw text so nothing is lost downstream.
            return {
                "description": "",
                "analysis": content,
                "recommended_actions": "",
                "caption": content,
                "metadata": {},
            }

        description = parsed.get("description", "")
        analysis = parsed.get("analysis", "")
        recommended_actions = parsed.get("recommended_actions", "")
        metadata = parsed.get("metadata", {})

        if metadata.get("epsg") and metadata["epsg"] not in cls._ALLOWED_EPSG:
            metadata["epsg"] = "OTHER"

        # Combine all three parts for backward compatibility.
        caption_text = (
            f"Description: {description}\n\nAnalysis: {analysis}\n\n"
            f"Recommended Actions: {recommended_actions}"
        )
        return {
            "description": description,
            "analysis": analysis,
            "recommended_actions": recommended_actions,
            "caption": caption_text,
            "metadata": metadata,
        }

    def _build_result(self, parsed: Dict[str, Any], elapsed: float,
                      raw_response: Dict[str, Any]) -> Dict[str, Any]:
        """Assemble the common caption-result dict returned to callers."""
        return {
            "caption": parsed["caption"],
            "metadata": parsed["metadata"],
            "confidence": None,  # Gemini does not expose a confidence score
            "processing_time": elapsed,
            "raw_response": raw_response,
            "description": parsed["description"],
            "analysis": parsed["analysis"],
            "recommended_actions": parsed["recommended_actions"],
        }

    async def generate_caption(self, image_bytes: bytes, prompt: str, metadata_instructions: str = "") -> Dict[str, Any]:
        """Generate caption using Google Gemini Vision.

        Args:
            image_bytes: JPEG-encoded image payload.
            prompt: Base instruction for the model.
            metadata_instructions: Extra instructions appended to the prompt.

        Returns:
            Caption-result dict (caption, metadata, confidence,
            processing_time, raw_response, description, analysis,
            recommended_actions).
        """
        instruction = prompt + "\n\n" + metadata_instructions

        image_part = {
            "mime_type": "image/jpeg",
            "data": image_bytes,
        }

        start = time.time()
        # The SDK call is blocking; run it off the event loop.
        response = await asyncio.to_thread(self.model.generate_content, [instruction, image_part])
        elapsed = time.time() - start

        text = getattr(response, "text", None) or ""
        parsed = self._parse_content(text)
        return self._build_result(parsed, elapsed, {"model": self.model_id})

    async def generate_multi_image_caption(self, image_bytes_list: List[bytes], prompt: str, metadata_instructions: str = "") -> Dict[str, Any]:
        """Generate caption for multiple images using Google Gemini Vision.

        Args:
            image_bytes_list: JPEG-encoded image payloads.
            prompt: Base instruction for the model.
            metadata_instructions: Extra instructions appended to the prompt.

        Returns:
            Same caption-result dict as generate_caption; raw_response
            additionally carries image_count.
        """
        instruction = prompt + "\n\n" + metadata_instructions

        # Request payload: instruction first, then every image part.
        # (Named distinctly from the response text to avoid rebinding.)
        request_parts: List[Any] = [instruction]
        for image_bytes in image_bytes_list:
            request_parts.append({
                "mime_type": "image/jpeg",
                "data": image_bytes,
            })

        start = time.time()
        # The SDK call is blocking; run it off the event loop.
        response = await asyncio.to_thread(self.model.generate_content, request_parts)
        elapsed = time.time() - start

        text = getattr(response, "text", None) or ""
        parsed = self._parse_content(text)
        raw_response: Dict[str, Any] = {
            "model": self.model_id,
            "image_count": len(image_bytes_list),
        }
        return self._build_result(parsed, elapsed, raw_response)