Spaces:
Paused
Paused
""" | |
Vision-based query module using GPT-5 Vision. | |
Supports multimodal queries combining text and images. | |
""" | |
import base64 | |
import json | |
import logging | |
import sqlite3 | |
from typing import List, Tuple, Optional, Dict, Any | |
import numpy as np | |
from PIL import Image | |
from openai import OpenAI | |
from config import * | |
from utils import ImageProcessor, classify_image | |
logger = logging.getLogger(__name__) | |
class VisionRetriever: | |
"""Vision-based retrieval using GPT-5 Vision for image analysis and classification.""" | |
def __init__(self): | |
self.client = OpenAI(api_key=OPENAI_API_KEY) | |
self.image_processor = ImageProcessor() | |
def get_similar_images(self, query_image_path: str, top_k: int = 5) -> List[Dict[str, Any]]: | |
"""Find similar images in the database based on classification similarity.""" | |
try: | |
# Uses GPT-5 Vision for classification-based similarity search | |
# Note: This implementation uses classification similarity rather than embeddings | |
# Classify the query image | |
query_classification = classify_image(query_image_path) | |
# Query database for similar images | |
conn = sqlite3.connect(IMAGES_DB) | |
cursor = conn.cursor() | |
# Search for images with similar classification | |
cursor.execute(""" | |
SELECT image_id, image_path, classification, metadata | |
FROM images | |
WHERE classification LIKE ? | |
ORDER BY created_at DESC | |
LIMIT ? | |
""", (f"%{query_classification}%", top_k)) | |
results = cursor.fetchall() | |
conn.close() | |
similar_images = [] | |
for row in results: | |
image_id, image_path, classification, metadata_json = row | |
metadata = json.loads(metadata_json) if metadata_json else {} | |
similar_images.append({ | |
'image_id': image_id, | |
'image_path': image_path, | |
'classification': classification, | |
'metadata': metadata, | |
'similarity_score': 0.8 # Classification-based similarity score | |
}) | |
logger.info(f"Found {len(similar_images)} similar images for query") | |
return similar_images | |
except Exception as e: | |
logger.error(f"Error finding similar images: {e}") | |
return [] | |
def analyze_image_safety(self, image_path: str, question: str = None) -> str: | |
"""Analyze image for safety concerns using GPT-5 Vision.""" | |
try: | |
# Convert image to base64 | |
with open(image_path, "rb") as image_file: | |
image_b64 = base64.b64encode(image_file.read()).decode() | |
# Create analysis prompt | |
if question: | |
analysis_prompt = ( | |
f"Analyze this image in the context of the following question: {question}\n\n" | |
"Please provide a detailed safety analysis covering:\n" | |
"1. What equipment, machinery, or workplace elements are visible\n" | |
"2. Any potential safety hazards or compliance issues\n" | |
"3. Relevant OSHA standards or regulations that may apply\n" | |
"4. Recommendations for safety improvements\n" | |
"5. How this relates to the specific question asked" | |
) | |
else: | |
analysis_prompt = ( | |
"Analyze this image for occupational safety and health concerns. Provide:\n" | |
"1. Description of what's shown in the image\n" | |
"2. Identification of potential safety hazards\n" | |
"3. Relevant OSHA standards or safety regulations\n" | |
"4. Recommendations for improving safety" | |
) | |
messages = [{ | |
"role": "user", | |
"content": [ | |
{"type": "text", "text": analysis_prompt}, | |
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}", "detail": "high"}} | |
] | |
}] | |
# For GPT-5 vision, temperature must be default (1.0) and reasoning is not supported | |
response = self.client.chat.completions.create( | |
model=OPENAI_CHAT_MODEL, | |
messages=messages, | |
max_completion_tokens=DEFAULT_MAX_TOKENS | |
) | |
return response.choices[0].message.content.strip() | |
except Exception as e: | |
logger.error(f"Error analyzing image: {e}") | |
return f"I encountered an error while analyzing the image: {e}" | |
def retrieve_relevant_text(self, image_classification: str, question: str, top_k: int = 3) -> List[Dict[str, Any]]: | |
"""Retrieve text documents relevant to the image classification and question.""" | |
# This would integrate with other retrieval methods to find relevant text | |
# For now, we'll create a simple keyword-based search | |
try: | |
# Import other query modules for text retrieval | |
from query_vanilla import query as vanilla_query | |
# Create an enhanced query combining image classification and original question | |
enhanced_question = f"safety requirements for {image_classification} {question}" | |
# Use vanilla retrieval to find relevant text | |
_, text_citations = vanilla_query(enhanced_question, top_k=top_k) | |
return text_citations | |
except Exception as e: | |
logger.error(f"Error retrieving relevant text: {e}") | |
return [] | |
def generate_multimodal_answer(self, question: str, image_analysis: str, | |
text_citations: List[Dict], similar_images: List[Dict]) -> str: | |
"""Generate answer combining image analysis and text retrieval.""" | |
try: | |
# Prepare context from text citations | |
text_context = "" | |
if text_citations: | |
text_parts = [] | |
for i, citation in enumerate(text_citations, 1): | |
if 'text' in citation: | |
text_parts.append(f"[Text Source {i}] {citation['source']}: {citation['text'][:500]}...") | |
else: | |
text_parts.append(f"[Text Source {i}] {citation['source']}") | |
text_context = "\n\n".join(text_parts) | |
# Prepare context from similar images | |
image_context = "" | |
if similar_images: | |
image_parts = [] | |
for img in similar_images[:3]: # Limit to top 3 | |
source = img['metadata'].get('source', 'Unknown') | |
classification = img.get('classification', 'unknown') | |
image_parts.append(f"Similar image from {source}: classified as {classification}") | |
image_context = "\n".join(image_parts) | |
# Create comprehensive prompt | |
system_message = ( | |
"You are an expert in occupational safety and health. " | |
"You have been provided with an image analysis, relevant text documents, " | |
"and information about similar images in the database. " | |
"Provide a comprehensive answer that integrates all this information." | |
) | |
user_message = f"""Question: {question} | |
Image Analysis: | |
{image_analysis} | |
Relevant Text Documentation: | |
{text_context} | |
Similar Images Context: | |
{image_context} | |
Please provide a comprehensive answer that: | |
1. Addresses the specific question asked | |
2. Incorporates insights from the image analysis | |
3. References relevant regulatory information from the text sources | |
4. Notes any connections to similar cases or images | |
5. Provides actionable recommendations based on safety standards""" | |
# For GPT-5, temperature must be default (1.0) and reasoning is not supported | |
response = self.client.chat.completions.create( | |
model=OPENAI_CHAT_MODEL, | |
messages=[ | |
{"role": "system", "content": system_message}, | |
{"role": "user", "content": user_message} | |
], | |
max_completion_tokens=DEFAULT_MAX_TOKENS * 2 # Allow longer response for comprehensive analysis | |
) | |
return response.choices[0].message.content.strip() | |
except Exception as e: | |
logger.error(f"Error generating multimodal answer: {e}") | |
return "I apologize, but I encountered an error while generating the comprehensive answer." | |
# Global retriever instance | |
_retriever = None | |
def get_retriever() -> VisionRetriever: | |
"""Get or create global vision retriever instance.""" | |
global _retriever | |
if _retriever is None: | |
_retriever = VisionRetriever() | |
return _retriever | |
def query(question: str, image_path: Optional[str] = None, top_k: int = DEFAULT_TOP_K) -> Tuple[str, List[Dict]]: | |
""" | |
Main vision-based query function with unified signature. | |
Args: | |
question: User question | |
image_path: Path to image file (required for vision queries) | |
top_k: Number of relevant results to retrieve | |
Returns: | |
Tuple of (answer, citations) | |
""" | |
if not image_path: | |
return "Vision queries require an image. Please provide an image file.", [] | |
try: | |
retriever = get_retriever() | |
# Step 1: Analyze the provided image | |
logger.info(f"Analyzing image: {image_path}") | |
image_analysis = retriever.analyze_image_safety(image_path, question) | |
# Step 2: Classify the image | |
image_classification = classify_image(image_path) | |
# Step 3: Find similar images | |
similar_images = retriever.get_similar_images(image_path, top_k=3) | |
# Step 4: Retrieve relevant text documents | |
text_citations = retriever.retrieve_relevant_text(image_classification, question, top_k) | |
# Step 5: Generate comprehensive multimodal answer | |
answer = retriever.generate_multimodal_answer( | |
question, image_analysis, text_citations, similar_images | |
) | |
# Step 6: Prepare citations | |
citations = [] | |
# Add image analysis as primary citation | |
citations.append({ | |
'rank': 1, | |
'type': 'image_analysis', | |
'source': f"Analysis of {image_path.split('/')[-1] if '/' in image_path else image_path.split('\\')[-1]}", | |
'method': 'vision', | |
'classification': image_classification, | |
'score': 1.0 | |
}) | |
# Add text citations | |
for i, citation in enumerate(text_citations, 2): | |
citation_copy = citation.copy() | |
citation_copy['rank'] = i | |
citation_copy['method'] = 'vision_text' | |
citations.append(citation_copy) | |
# Add similar images | |
for i, img in enumerate(similar_images): | |
citations.append({ | |
'rank': len(citations) + 1, | |
'type': 'similar_image', | |
'source': img['metadata'].get('source', 'Image Database'), | |
'method': 'vision', | |
'classification': img.get('classification', 'unknown'), | |
'similarity_score': img.get('similarity_score', 0.0), | |
'image_id': img.get('image_id') | |
}) | |
logger.info(f"Vision query completed. Generated {len(citations)} citations.") | |
return answer, citations | |
except Exception as e: | |
logger.error(f"Error in vision query: {e}") | |
error_message = "I apologize, but I encountered an error while processing your vision-based question." | |
return error_message, [] | |
def query_image_only(image_path: str, question: str = None) -> Tuple[str, List[Dict]]: | |
""" | |
Analyze image without text retrieval (faster for simple image analysis). | |
Args: | |
image_path: Path to image file | |
question: Optional specific question about the image | |
Returns: | |
Tuple of (analysis, citations) | |
""" | |
try: | |
retriever = get_retriever() | |
# Analyze image | |
analysis = retriever.analyze_image_safety(image_path, question) | |
# Classify image | |
classification = classify_image(image_path) | |
# Create citation for image analysis | |
citations = [{ | |
'rank': 1, | |
'type': 'image_analysis', | |
'source': f"Analysis of {image_path.split('/')[-1] if '/' in image_path else image_path.split('\\')[-1]}", | |
'method': 'vision_only', | |
'classification': classification, | |
'score': 1.0 | |
}] | |
return analysis, citations | |
except Exception as e: | |
logger.error(f"Error in image-only analysis: {e}") | |
return "Error analyzing image.", [] | |
def query_with_details(question: str, image_path: Optional[str] = None, | |
top_k: int = DEFAULT_TOP_K) -> Tuple[str, List[Dict], List[Tuple]]: | |
""" | |
Vision query function that returns detailed chunk information (for compatibility). | |
Returns: | |
Tuple of (answer, citations, chunks) | |
""" | |
answer, citations = query(question, image_path, top_k) | |
# Convert citations to chunk format for backward compatibility | |
chunks = [] | |
for citation in citations: | |
if citation['type'] == 'image_analysis': | |
chunks.append(( | |
f"Image Analysis ({citation['classification']})", | |
citation['score'], | |
"Analysis of uploaded image for safety compliance", | |
citation['source'] | |
)) | |
elif citation['type'] == 'similar_image': | |
chunks.append(( | |
f"Similar Image (Score: {citation.get('similarity_score', 0):.3f})", | |
citation.get('similarity_score', 0), | |
f"Similar image classified as {citation['classification']}", | |
citation['source'] | |
)) | |
else: | |
chunks.append(( | |
f"Text Reference {citation['rank']}", | |
citation.get('score', 0.5), | |
citation.get('text', 'Referenced document'), | |
citation['source'] | |
)) | |
return answer, citations, chunks | |
if __name__ == "__main__": | |
# Test the vision system (requires an actual image file) | |
import sys | |
if len(sys.argv) > 1: | |
test_image_path = sys.argv[1] | |
test_question = "What safety issues can you identify in this image?" | |
print("Testing vision retrieval system...") | |
print(f"Image: {test_image_path}") | |
print(f"Question: {test_question}") | |
print("-" * 50) | |
try: | |
answer, citations = query(test_question, test_image_path) | |
print("Answer:") | |
print(answer) | |
print(f"\nCitations ({len(citations)}):") | |
for citation in citations: | |
print(f"- {citation['source']} (Type: {citation.get('type', 'unknown')})") | |
except Exception as e: | |
print(f"Error during testing: {e}") | |
else: | |
print("To test vision system, provide an image path as argument:") | |
print("python query_vision.py /path/to/image.jpg") |