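"""GPT-4 Vision (gpt-4o) implementation of the VLMService base class."""
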
from .vlm_service import VLMService, ModelType
from typing import Dict, Any, List
import openai
import base64
import asyncio
import json
import re

class GPT4VService(VLMService):
    """GPT-4 Vision service implementation"""

    def __init__(self, api_key: str):
        super().__init__("GPT4V", ModelType.GPT4V)
        self.client = openai.OpenAI(api_key=api_key)
        self.model_name = "gpt-4o"  # aligned with the model identifier passed to the API below

    async def generate_caption(self, image_bytes: bytes, prompt: str, metadata_instructions: str = "") -> Dict[str, Any]:
        """Generate caption using GPT-4 Vision"""
        try:
            image_base64 = base64.b64encode(image_bytes).decode('utf-8')
            # Run the blocking OpenAI client call in a worker thread so the event loop stays responsive
            response = await asyncio.to_thread(
                self.client.chat.completions.create,
                model="gpt-4o",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt + "\n\n" + metadata_instructions},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{image_base64}"
                                }
                            }
                        ]
                    }
                ],
                max_tokens=800
            )
            content = response.choices[0].message.content
            # Strip a Markdown ```json fence if the model wrapped its JSON answer in one
            cleaned_content = content.strip()
            if cleaned_content.startswith("```json"):
                cleaned_content = cleaned_content[7:]
            if cleaned_content.endswith("```"):
                cleaned_content = cleaned_content[:-3]
            cleaned_content = cleaned_content.strip()
            metadata = {}
            try:
                metadata = json.loads(cleaned_content)
            except json.JSONDecodeError:
                # Fallback 1: extract a fenced ```json block embedded in a longer reply
                if "```json" in content:
                    json_start = content.find("```json") + 7
                    json_end = content.find("```", json_start)
                    if json_end > json_start:
                        json_str = content[json_start:json_end].strip()
                        try:
                            metadata = json.loads(json_str)
                        except json.JSONDecodeError as e:
                            print(f"JSON parse error: {e}")
                else:
                    # Fallback 2: last-resort heuristic for an unfenced object with a nested "metadata" key
                    json_match = re.search(r'\{[^{}]*"metadata"[^{}]*\{[^{}]*\}[^{}]*\}', content)
                    if json_match:
                        try:
                            metadata = json.loads(json_match.group())
                        except json.JSONDecodeError:
                            pass
            # Extract the three parts from the parsed JSON
            description = metadata.get("description", "")
            analysis = metadata.get("analysis", "")
            recommended_actions = metadata.get("recommended_actions", "")
            # Combine all three parts for backward compatibility
            combined_content = f"Description: {description}\n\nAnalysis: {analysis}\n\nRecommended Actions: {recommended_actions}"
            return {
                "caption": combined_content,
                "raw_response": {
                    "content": content,
                    "metadata": metadata,
                    "extracted_metadata": metadata
                },
                "metadata": metadata,
                "description": description,
                "analysis": analysis,
                "recommended_actions": recommended_actions
            }
        except Exception as e:
            raise Exception(f"GPT-4 Vision API error: {str(e)}") from e

    async def generate_multi_image_caption(self, image_bytes_list: List[bytes], prompt: str, metadata_instructions: str = "") -> Dict[str, Any]:
        """Generate caption for multiple images using GPT-4 Vision"""
        try:
            # Create the message content array with text and multiple images
            user_content = [{"type": "text", "text": prompt + "\n\n" + metadata_instructions}]
            # Add each image to the content
            for image_bytes in image_bytes_list:
                image_base64 = base64.b64encode(image_bytes).decode('utf-8')
                user_content.append({
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{image_base64}"
                    }
                })
            response = await asyncio.to_thread(
                self.client.chat.completions.create,
                model="gpt-4o",
                messages=[
                    {
                        "role": "user",
                        "content": user_content
                    }
                ],
                max_tokens=1200  # Increased for multiple images
            )
            content = response.choices[0].message.content
            # Strip a Markdown ```json fence if the model wrapped its JSON answer in one
            cleaned_content = content.strip()
            if cleaned_content.startswith("```json"):
                cleaned_content = cleaned_content[7:]
            if cleaned_content.endswith("```"):
                cleaned_content = cleaned_content[:-3]
            cleaned_content = cleaned_content.strip()
            metadata = {}
            try:
                metadata = json.loads(cleaned_content)
            except json.JSONDecodeError:
                # Fallback 1: extract a fenced ```json block embedded in a longer reply
                if "```json" in content:
                    json_start = content.find("```json") + 7
                    json_end = content.find("```", json_start)
                    if json_end > json_start:
                        json_str = content[json_start:json_end].strip()
                        try:
                            metadata = json.loads(json_str)
                        except json.JSONDecodeError as e:
                            print(f"JSON parse error: {e}")
                else:
                    # Fallback 2: last-resort heuristic for an unfenced object with a nested "metadata" key
                    json_match = re.search(r'\{[^{}]*"metadata"[^{}]*\{[^{}]*\}[^{}]*\}', content)
                    if json_match:
                        try:
                            metadata = json.loads(json_match.group())
                        except json.JSONDecodeError:
                            pass
            # Extract the three parts from the parsed JSON
            description = metadata.get("description", "")
            analysis = metadata.get("analysis", "")
            recommended_actions = metadata.get("recommended_actions", "")
            # Combine all three parts for backward compatibility
            combined_content = f"Description: {description}\n\nAnalysis: {analysis}\n\nRecommended Actions: {recommended_actions}"
            return {
                "caption": combined_content,
                "raw_response": {
                    "content": content,
                    "metadata": metadata,
                    "extracted_metadata": metadata,
                    "image_count": len(image_bytes_list)
                },
                "metadata": metadata,
                "description": description,
                "analysis": analysis,
                "recommended_actions": recommended_actions
            }
        except Exception as e:
            raise Exception(f"GPT-4 Vision API error: {str(e)}") from e
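

# Example usage: a minimal sketch, kept as comments because this module uses a relative
# import of .vlm_service and is only importable as part of its package. The environment
# variable, file name, prompt, and metadata instructions below are illustrative
# assumptions, not part of the service itself.
#
# import os
#
# async def _demo():
#     service = GPT4VService(api_key=os.environ["OPENAI_API_KEY"])
#     with open("sample.jpg", "rb") as f:
#         image_bytes = f.read()
#     result = await service.generate_caption(
#         image_bytes,
#         prompt="Describe this image.",
#         metadata_instructions=(
#             'Respond with JSON containing "description", "analysis" '
#             'and "recommended_actions" keys.'
#         ),
#     )
#     print(result["caption"])
#
# asyncio.run(_demo())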