Spaces:
Running
Running
File size: 5,211 Bytes
d7291ef 351d460 d7291ef fe5d98f d7291ef fe5d98f d7291ef 5778774 4c43a48 5778774 cb372e4 5778774 872dec2 5778774 872dec2 5778774 872dec2 5778774 cb372e4 5778774 4c43a48 5778774 872dec2 5778774 d7291ef 351d460 d7291ef |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
from .vlm_service import VLMService, ModelType
from typing import Dict, Any, List
import asyncio
import time
import re
import json
import google.generativeai as genai
class GeminiService(VLMService):
    """Google Gemini Vision service implementation.

    Sends a text instruction together with one or more images to a Gemini
    model and converts the reply (expected to be a JSON object, optionally
    wrapped in a Markdown code fence) into the caption dictionary returned
    to callers.
    """

    # EPSG codes passed through unchanged; any other non-empty value is
    # replaced by "OTHER".
    _ALLOWED_EPSG = frozenset({"4326", "3857", "32617", "32633", "32634", "OTHER"})
    # Opening Markdown fence, with or without a "json" language tag.
    _FENCE_OPEN = re.compile(r"^```(?:json)?\s*", re.IGNORECASE)
    # Closing Markdown fence at the very end of the text.
    _FENCE_CLOSE = re.compile(r"\s*```$")

    def __init__(self, api_key: str, model: str = "gemini-1.5-flash"):
        """Configure the Gemini client and create the generative model.

        Args:
            api_key: API key handed to ``genai.configure``.
            model: Gemini model identifier used to build the
                ``GenerativeModel`` and reported in ``raw_response``.
        """
        super().__init__("Gemini", ModelType.GEMINI_PRO_VISION)
        # Fixed label; it does not follow the ``model`` argument.
        self.model_name = "GEMINI15"
        genai.configure(api_key=api_key)
        self.model_id = model
        self.model = genai.GenerativeModel(self.model_id)

    @staticmethod
    def _image_part(image_bytes: bytes) -> Dict[str, Any]:
        """Wrap raw image bytes in the inline-data dict sent to the model."""
        # NOTE(review): the MIME type is always declared as JPEG, whatever the
        # actual encoding of ``image_bytes`` is - confirm callers only send JPEG.
        return {
            "mime_type": "image/jpeg",
            "data": image_bytes,
        }

    @classmethod
    def _strip_code_fence(cls, text: str) -> str:
        """Return ``text`` trimmed and without a surrounding Markdown fence."""
        cleaned = text.strip()
        if cleaned.startswith("```"):
            cleaned = cls._FENCE_OPEN.sub("", cleaned)
            cleaned = cls._FENCE_CLOSE.sub("", cleaned)
        return cleaned

    @classmethod
    def _parse_response(cls, content: str) -> Dict[str, Any]:
        """Turn the model's text reply into the structured caption fields.

        Args:
            content: Raw text returned by the model.

        Returns:
            A dict with the keys ``caption``, ``metadata``, ``description``,
            ``analysis`` and ``recommended_actions``. When ``content`` is not
            a JSON object, the raw text is used as both ``caption`` and
            ``analysis`` and the remaining fields are empty.
        """
        try:
            parsed = json.loads(cls._strip_code_fence(content))
        except json.JSONDecodeError:
            parsed = None

        # Valid JSON that is not an object (list, string, number, ...) has no
        # usable fields, so it takes the same fallback as undecodable text.
        if not isinstance(parsed, dict):
            return {
                "caption": content,
                "metadata": {},
                "description": "",
                "analysis": content,
                "recommended_actions": "",
            }

        description = parsed.get("description", "")
        analysis = parsed.get("analysis", "")
        recommended_actions = parsed.get("recommended_actions", "")

        # A missing, null or non-object "metadata" entry becomes an empty dict.
        metadata = parsed.get("metadata")
        if not isinstance(metadata, dict):
            metadata = {}

        epsg_value = metadata.get("epsg")
        if epsg_value:
            # Compare as text so that a numeric code such as 4326 is accepted
            # instead of being rewritten to "OTHER".
            epsg_text = str(epsg_value).strip()
            metadata["epsg"] = epsg_text if epsg_text in cls._ALLOWED_EPSG else "OTHER"

        # Combine all three parts for backward compatibility
        caption_text = (
            f"Description: {description}\n\n"
            f"Analysis: {analysis}\n\n"
            f"Recommended Actions: {recommended_actions}"
        )
        return {
            "caption": caption_text,
            "metadata": metadata,
            "description": description,
            "analysis": analysis,
            "recommended_actions": recommended_actions,
        }

    async def _request_caption(self, parts: List[Any], raw_response: Dict[str, Any]) -> Dict[str, Any]:
        """Call the model with ``parts`` and build the result dictionary.

        Args:
            parts: Instruction text followed by the image parts.
            raw_response: Request details echoed back under ``raw_response``.

        Returns:
            The caption dictionary described in ``generate_caption``.
        """
        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by wall-clock adjustments.
        start = time.perf_counter()
        # generate_content is a blocking call; run it in a worker thread so the
        # event loop stays responsive.
        response = await asyncio.to_thread(self.model.generate_content, parts)
        elapsed = time.perf_counter() - start

        # NOTE(review): getattr only covers a missing attribute; if the ``text``
        # accessor itself raises (e.g. for a blocked or empty response) the
        # error propagates to the caller - confirm that is the desired behavior.
        content = getattr(response, "text", None) or ""
        fields = self._parse_response(content)

        return {
            "caption": fields["caption"],
            "metadata": fields["metadata"],
            "confidence": None,
            "processing_time": elapsed,
            "raw_response": raw_response,
            "description": fields["description"],
            "analysis": fields["analysis"],
            "recommended_actions": fields["recommended_actions"],
        }

    async def generate_caption(self, image_bytes: bytes, prompt: str, metadata_instructions: str = "") -> Dict[str, Any]:
        """Generate caption using Google Gemini Vision.

        Args:
            image_bytes: Raw bytes of the image to describe.
            prompt: Main instruction text.
            metadata_instructions: Extra instructions appended to ``prompt``
                after a blank line.

        Returns:
            A dict with ``caption``, ``metadata``, ``confidence`` (always
            ``None``), ``processing_time`` (seconds spent in the model call),
            ``raw_response`` (the model id), ``description``, ``analysis``
            and ``recommended_actions``.
        """
        instruction = prompt + "\n\n" + metadata_instructions
        parts: List[Any] = [instruction, self._image_part(image_bytes)]
        return await self._request_caption(parts, {"model": self.model_id})

    async def generate_multi_image_caption(self, image_bytes_list: List[bytes], prompt: str, metadata_instructions: str = "") -> Dict[str, Any]:
        """Generate caption for multiple images using Google Gemini Vision.

        Args:
            image_bytes_list: Raw bytes of each image, sent in list order.
            prompt: Main instruction text.
            metadata_instructions: Extra instructions appended to ``prompt``
                after a blank line.

        Returns:
            The same dict as ``generate_caption``; ``raw_response``
            additionally carries ``image_count``.
        """
        instruction = prompt + "\n\n" + metadata_instructions
        # Instruction first, then one inline-data part per image.
        parts: List[Any] = [instruction]
        parts.extend(self._image_part(image_bytes) for image_bytes in image_bytes_list)
        raw_response: Dict[str, Any] = {
            "model": self.model_id,
            "image_count": len(image_bytes_list),
        }
        return await self._request_caption(parts, raw_response)
|