Spaces:
Running
on
Zero
Running
on
Zero
import numpy as np | |
import logging | |
import traceback | |
from typing import List, Dict, Tuple, Optional, Union, Any | |
from PIL import Image | |
class ImageAnalyzer: | |
""" | |
專注於圖像分析和預處理,包括多尺度金字塔分析、視角分析、建築特徵識別和圖像增強等功能 | |
""" | |
def __init__(self): | |
""" | |
初始化圖像分析器 | |
""" | |
self.logger = logging.getLogger(__name__) | |
def get_image_hash(self, image: Union[Image.Image, np.ndarray]) -> int: | |
""" | |
為圖像生成簡單的 hash 值用於快取 | |
Args: | |
image: PIL Image 或 numpy 數組 | |
Returns: | |
int: 圖像的 hash 值 | |
""" | |
try: | |
if isinstance(image, np.ndarray): | |
# 對於 numpy 數組,降採樣並計算簡單 hash | |
small_img = image[::10, ::10] if image.ndim == 3 else image | |
return hash(small_img.tobytes()) | |
else: | |
# 對於 PIL 圖像,調整大小後轉換為 bytes | |
small_img = image.resize((32, 32)) | |
return hash(small_img.tobytes()) | |
except Exception as e: | |
self.logger.error(f"Error generating image hash: {e}") | |
self.logger.error(traceback.format_exc()) | |
return 0 | |
def enhance_features(self, image: Union[Image.Image, np.ndarray]) -> Image.Image: | |
""" | |
增強圖像特徵以改善地標檢測 | |
Args: | |
image: 輸入圖像 | |
Returns: | |
PIL.Image: 增強後的圖像 | |
""" | |
try: | |
# ensure PIL format | |
if not isinstance(image, Image.Image): | |
if isinstance(image, np.ndarray): | |
image = Image.fromarray(image) | |
else: | |
raise ValueError("Unsupported image format. Expected PIL Image or numpy array.") | |
# 轉換為numpy進行處理 | |
img_array = np.array(image) | |
# 跳過灰度圖像的處理 | |
if len(img_array.shape) < 3: | |
return image | |
# 應用自適應對比度增強 | |
try: | |
from skimage import color, exposure | |
# 轉換到LAB色彩空間 | |
if img_array.shape[2] == 4: # 處理RGBA | |
img_array = img_array[:,:,:3] | |
lab = color.rgb2lab(img_array[:,:,:3] / 255.0) | |
l_channel = lab[:,:,0] | |
# 增強L通道的對比度 | |
p2, p98 = np.percentile(l_channel, (2, 98)) | |
l_channel_enhanced = exposure.rescale_intensity(l_channel, in_range=(p2, p98)) | |
# 替換L通道並轉換回RGB | |
lab[:,:,0] = l_channel_enhanced | |
enhanced_img = color.lab2rgb(lab) * 255.0 | |
enhanced_img = enhanced_img.astype(np.uint8) | |
return Image.fromarray(enhanced_img) | |
except ImportError: | |
self.logger.warning("skimage not available for feature enhancement") | |
return image | |
except Exception as e: | |
self.logger.error(f"Error in feature enhancement: {e}") | |
self.logger.error(traceback.format_exc()) | |
return image | |
def analyze_viewpoint(self, image: Union[Image.Image, np.ndarray], | |
clip_model_manager) -> Dict[str, Any]: | |
""" | |
分析圖像視角以調整檢測參數 | |
Args: | |
image: 輸入圖像 | |
clip_model_manager: CLIP模型管理器實例 | |
Returns: | |
Dict: 視角分析結果 | |
""" | |
try: | |
viewpoint_prompts = { | |
"aerial_view": "an aerial view from above looking down", | |
"street_level": "a street level view looking up at a tall structure", | |
"eye_level": "an eye-level horizontal view of a landmark", | |
"distant": "a distant view of a landmark on the horizon", | |
"close_up": "a close-up detailed view of architectural features", | |
"interior": "an interior view inside a structure", | |
"angled_view": "an angled view of a structure", | |
"low_angle": "a low angle view looking up at a building" | |
} | |
# 計算相似度分數 | |
viewpoint_scores = self.calculate_similarity_scores(image, viewpoint_prompts, clip_model_manager) | |
# 找到主要視角 | |
dominant_viewpoint = max(viewpoint_scores.items(), key=lambda x: x[1]) | |
return { | |
"viewpoint_scores": viewpoint_scores, | |
"dominant_viewpoint": dominant_viewpoint[0], | |
"confidence": dominant_viewpoint[1] | |
} | |
except Exception as e: | |
self.logger.error(f"Error in viewpoint analysis: {e}") | |
self.logger.error(traceback.format_exc()) | |
return { | |
"viewpoint_scores": {}, | |
"dominant_viewpoint": "eye_level", | |
"confidence": 0.0 | |
} | |
def calculate_similarity_scores(self, image: Union[Image.Image, np.ndarray], | |
prompts: Dict[str, str], | |
clip_model_manager) -> Dict[str, float]: | |
""" | |
計算圖像與一組特定提示之間的相似度分數 | |
Args: | |
image: 輸入圖像 | |
prompts: 提示詞字典 {名稱: 提示文本} | |
clip_model_manager: CLIP模型管理器實例 | |
Returns: | |
Dict[str, float]: 每個提示的相似度分數 | |
""" | |
try: | |
# ensure PIL format | |
if not isinstance(image, Image.Image): | |
if isinstance(image, np.ndarray): | |
image = Image.fromarray(image) | |
else: | |
raise ValueError("Unsupported image format. Expected PIL Image or numpy array.") | |
# preprocess image | |
image_input = clip_model_manager.preprocess_image(image) | |
# get image features | |
image_features = clip_model_manager.encode_image(image_input) | |
# 計算與每個提示的similarity | |
scores = {} | |
prompt_texts = list(prompts.values()) | |
prompt_features = clip_model_manager.encode_single_text(prompt_texts) | |
# 計算相似度 | |
similarity = clip_model_manager.calculate_similarity(image_features, prompt_features) | |
# result | |
for i, (name, _) in enumerate(prompts.items()): | |
scores[name] = float(similarity[0][i]) | |
return scores | |
except Exception as e: | |
self.logger.error(f"Error calculating similarity scores: {e}") | |
self.logger.error(traceback.format_exc()) | |
return {} | |
def analyze_architectural_features(self, image: Union[Image.Image, np.ndarray], | |
clip_model_manager) -> Dict[str, Any]: | |
""" | |
分析圖像中結構的建築特徵,不硬編碼特定地標 | |
Args: | |
image: 輸入圖像 | |
clip_model_manager: CLIP模型管理器實例 | |
Returns: | |
Dict: 建築特徵分析結果 | |
""" | |
try: | |
# 定義通用建築特徵提示,適用於所有類型的地標 | |
architecture_prompts = { | |
"tall_structure": "a tall vertical structure standing alone", | |
"tiered_building": "a building with multiple stacked tiers or segments", | |
"historical_structure": "a building with historical architectural elements", | |
"modern_design": "a modern structure with contemporary architectural design", | |
"segmented_exterior": "a structure with visible segmented or sectioned exterior", | |
"viewing_platform": "a tall structure with observation area at the top", | |
"time_display": "a structure with timepiece features", | |
"glass_facade": "a building with prominent glass exterior surfaces", | |
"memorial_structure": "a monument or memorial structure", | |
"ancient_construction": "ancient constructed elements or archaeological features", | |
"natural_landmark": "a natural geographic formation or landmark", | |
"slanted_design": "a structure with non-vertical or leaning profile" | |
} | |
# 計算與通用建築模式的相似度分數 | |
context_scores = self.calculate_similarity_scores(image, architecture_prompts, clip_model_manager) | |
# 確定最相關的建築特徵 | |
top_features = sorted(context_scores.items(), key=lambda x: x[1], reverse=True)[:3] | |
# 計算特徵置信度 | |
context_confidence = sum(score for _, score in top_features) / 3 | |
# 根據頂級特徵確定主要建築類別 | |
architectural_categories = { | |
"tower": ["tall_structure", "viewing_platform", "time_display"], | |
"skyscraper": ["tall_structure", "modern_design", "glass_facade"], | |
"historical": ["historical_structure", "ancient_construction", "memorial_structure"], | |
"natural": ["natural_landmark"], | |
"distinctive": ["tiered_building", "segmented_exterior", "slanted_design"] | |
} | |
# 根據頂級特徵為每個類別評分 | |
category_scores = {} | |
for category, features in architectural_categories.items(): | |
category_score = 0 | |
for feature, score in context_scores.items(): | |
if feature in features: | |
category_score += score | |
category_scores[category] = category_score | |
primary_category = max(category_scores.items(), key=lambda x: x[1])[0] | |
return { | |
"architectural_features": top_features, | |
"context_confidence": context_confidence, | |
"primary_category": primary_category, | |
"category_scores": category_scores | |
} | |
except Exception as e: | |
self.logger.error(f"Error in architectural feature analysis: {e}") | |
self.logger.error(traceback.format_exc()) | |
return { | |
"architectural_features": [], | |
"context_confidence": 0.0, | |
"primary_category": "building", | |
"category_scores": {} | |
} | |
def perform_pyramid_analysis(self, image: Union[Image.Image, np.ndarray], | |
clip_model_manager, landmark_data_manager, | |
levels: int = 4, base_threshold: float = 0.25, | |
aspect_ratios: List[float] = [1.0, 0.75, 1.5]) -> Dict[str, Any]: | |
""" | |
對圖像執行多尺度金字塔分析以改善地標檢測 | |
Args: | |
image: 輸入圖像 | |
clip_model_manager: CLIP模型管理器實例 | |
landmark_data_manager: 地標數據管理器實例 | |
levels: 金字塔層級數 | |
base_threshold: 基礎置信度閾值 | |
aspect_ratios: 不同縱橫比列表 | |
Returns: | |
Dict: 金字塔分析結果 | |
""" | |
try: | |
# 確保圖像是PIL格式 | |
if not isinstance(image, Image.Image): | |
if isinstance(image, np.ndarray): | |
image = Image.fromarray(image) | |
else: | |
raise ValueError("Unsupported image format. Expected PIL Image or numpy array.") | |
width, height = image.size | |
pyramid_results = [] | |
# 獲取預計算的地標文本特徵 | |
landmark_prompts = landmark_data_manager.get_landmark_prompts() | |
if not landmark_prompts: | |
return { | |
"is_landmark": False, | |
"results": [], | |
"best_result": None | |
} | |
landmark_text_features = clip_model_manager.encode_text_batch(landmark_prompts) | |
# 對每個縮放和縱橫比組合進行處理 | |
for level in range(levels): | |
# 計算縮放因子 | |
scale_factor = 1.0 - (level * 0.2) | |
for aspect_ratio in aspect_ratios: | |
# 計算新尺寸,保持面積近似不變 | |
if aspect_ratio != 1.0: | |
# 保持面積近似不變的情況下調整縱橫比 | |
new_width = int(width * scale_factor * (1/aspect_ratio)**0.5) | |
new_height = int(height * scale_factor * aspect_ratio**0.5) | |
else: | |
new_width = int(width * scale_factor) | |
new_height = int(height * scale_factor) | |
# 調整圖像大小 | |
scaled_image = image.resize((new_width, new_height), Image.LANCZOS) | |
# 預處理圖像 | |
image_input = clip_model_manager.preprocess_image(scaled_image) | |
# 獲取圖像特徵 | |
image_features = clip_model_manager.encode_image(image_input) | |
# 計算相似度 | |
similarity = clip_model_manager.calculate_similarity(image_features, landmark_text_features) | |
# 找到最佳匹配 | |
best_idx = similarity[0].argmax().item() | |
best_score = similarity[0][best_idx] | |
if best_score >= base_threshold: | |
landmark_id, landmark_info = landmark_data_manager.get_landmark_by_index(best_idx) | |
if landmark_id: | |
pyramid_results.append({ | |
"landmark_id": landmark_id, | |
"landmark_name": landmark_info.get("name", "Unknown"), | |
"confidence": float(best_score), | |
"scale_factor": scale_factor, | |
"aspect_ratio": aspect_ratio, | |
"location": landmark_info.get("location", "Unknown Location") | |
}) | |
# 按置信度排序 | |
pyramid_results.sort(key=lambda x: x["confidence"], reverse=True) | |
return { | |
"is_landmark": len(pyramid_results) > 0, | |
"results": pyramid_results, | |
"best_result": pyramid_results[0] if pyramid_results else None | |
} | |
except Exception as e: | |
self.logger.error(f"Error in pyramid analysis: {e}") | |
self.logger.error(traceback.format_exc()) | |
return { | |
"is_landmark": False, | |
"results": [], | |
"best_result": None | |
} | |