Spaces:

DawnC
/

VisionScout

Running on Zero

App Files Files Community

DawnC committed 9 days ago

Commit

e6a18b7

verified ·

1 Parent(s): 05b8fc5

Upload 59 files

Browse files

Refactored the architecture and improved scene understanding accuracy

Files changed (38) hide show

app.py +1 -1
clip_model_manager.py +276 -0
clip_zero_shot_classifier.py +668 -1194
component_initializer.py +319 -0
confidence_manager.py +283 -0
configuration_manager.py +418 -0
cultural_context_analyzer.py +637 -0
enhanced_scene_describer.py +1254 -0
feature_extractor.py +822 -0
functional_zone_identifier.py +938 -0
image_analyzer.py +365 -0
image_processor.py +29 -4
indoor_outdoor_classifier.py +755 -0
landmark_data_manager.py +283 -0
landmark_processing_manager.py +512 -0
lighting_analyzer.py +0 -0
lighting_condition_analyzer.py +854 -0
llm_enhancer.py +371 -1133
model_manager.py +358 -0
object_description_generator.py +1266 -0
object_extractor.py +358 -0
prompt_template_manager.py +547 -0
region_analyzer.py +487 -0
requirements.txt +0 -1
response_processor.py +1049 -0
result_cache_manager.py +234 -0
scene_analysis_coordinator.py +973 -0
scene_analyzer.py +0 -0
scene_scoring_engine.py +491 -0
scene_viewpoint_analyzer.py +311 -0
scene_zone_identifier.py +1728 -0
spatial_analyzer.py +301 -1753
template_manager.py +2150 -0
text_formatter.py +545 -0
text_quality_validator.py +452 -0
viewpoint_detector.py +437 -0
visualization_helper.py +0 -1
zone_evaluator.py +272 -0

app.py CHANGED Viewed

@@ -642,7 +642,7 @@ def create_interface():
                                 "room_01.jpg",
                                 "street_04.jpg",
                                 "street_05.jpg",
-                                "landmark_Louvre_01.jpg",
                                 ],
                             inputs=image_input,
                             label="Example Images"

                                 "room_01.jpg",
                                 "street_04.jpg",
                                 "street_05.jpg",
+                                "landmark_Louvre_01.jpg"
                                 ],
                             inputs=image_input,
                             label="Example Images"

clip_model_manager.py ADDED Viewed

	@@ -0,0 +1,276 @@

+import torch
+import clip
+import numpy as np
+import logging
+import traceback
+from typing import List, Dict, Tuple, Optional, Union, Any
+from PIL import Image
+class CLIPModelManager:
+    """
+    Manage CLIP model operations: loading the model, choosing the device,
+    and encoding image / text features.
+    """
+    _UNSUPPORTED_IMAGE_MSG = "Unsupported image format. Expected PIL Image or numpy array."
+    def __init__(self, model_name: str = "ViT-B/16", device: Optional[str] = None):
+        """
+        Initialize the manager and load the CLIP model immediately.
+        Args:
+            model_name: CLIP model name, defaults to "ViT-B/16"
+            device: Device to run on; None selects "cuda" when available, else "cpu"
+        Raises:
+            Exception: Any error raised while loading the model is logged and re-raised
+        """
+        self.logger = logging.getLogger(__name__)
+        self.model_name = model_name
+        # Pick the device automatically unless the caller supplied one
+        if device is None:
+            self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        else:
+            self.device = device
+        self.model = None
+        self.preprocess = None
+        self._initialize_model()
+    def _initialize_model(self):
+        """
+        Load the CLIP model and its preprocessing transform onto self.device.
+        Raises:
+            Exception: Any error from clip.load is logged and re-raised
+        """
+        try:
+            self.logger.info(f"Initializing CLIP model ({self.model_name}) on {self.device}")
+            self.model, self.preprocess = clip.load(self.model_name, device=self.device)
+            self.logger.info("Successfully loaded CLIP model")
+        except Exception as e:
+            self._log_error("Error loading CLIP model", e)
+            raise
+    def _log_error(self, message: str, error: Exception) -> None:
+        """
+        Log "<message>: <error>" followed by the active traceback.
+        Must be called from inside an ``except`` block so that
+        traceback.format_exc() sees the exception currently being handled.
+        """
+        self.logger.error(f"{message}: {error}")
+        self.logger.error(traceback.format_exc())
+    def _to_pil(self, image: Union[Image.Image, np.ndarray]) -> Image.Image:
+        """
+        Return the input as a PIL image, converting numpy arrays.
+        Raises:
+            ValueError: If the input is neither a PIL image nor a numpy array
+        """
+        if isinstance(image, Image.Image):
+            return image
+        if isinstance(image, np.ndarray):
+            return Image.fromarray(image)
+        raise ValueError(self._UNSUPPORTED_IMAGE_MSG)
+    @staticmethod
+    def _crop_region(image: Image.Image, box: List[float]) -> Image.Image:
+        """
+        Crop a PIL image to box [x1, y1, x2, y2]; coordinates are truncated to int.
+        """
+        x1, y1, x2, y2 = map(int, box)
+        return image.crop((x1, y1, x2, y2))
+    def _encode_tokens(self, text_prompts: List[str]) -> torch.Tensor:
+        """
+        Tokenize prompts, encode them with the model and L2-normalize each row.
+        Callers are responsible for wrapping this in torch.no_grad().
+        """
+        text_tokens = clip.tokenize(text_prompts).to(self.device)
+        text_features = self.model.encode_text(text_tokens)
+        return text_features / text_features.norm(dim=-1, keepdim=True)
+    def encode_image(self, image_input: torch.Tensor) -> torch.Tensor:
+        """
+        Encode image features.
+        Args:
+            image_input: Preprocessed image tensor
+        Returns:
+            torch.Tensor: Image features normalized along the last dimension
+        Raises:
+            Exception: Any encoding error is logged and re-raised
+        """
+        try:
+            with torch.no_grad():
+                image_features = self.model.encode_image(image_input)
+                return image_features / image_features.norm(dim=-1, keepdim=True)
+        except Exception as e:
+            self._log_error("Error encoding image features", e)
+            raise
+    def encode_text_batch(self, text_prompts: List[str], batch_size: int = 128) -> Optional[torch.Tensor]:
+        """
+        Encode text features in batches to avoid CUDA memory problems.
+        Args:
+            text_prompts: List of text prompts
+            batch_size: Number of prompts encoded per forward pass (must be >= 1)
+        Returns:
+            Optional[torch.Tensor]: Normalized text features, one row per prompt,
+            or None when text_prompts is empty
+        Raises:
+            ValueError: If batch_size is smaller than 1
+            Exception: Any encoding error is logged and re-raised
+        """
+        if not text_prompts:
+            return None
+        # Fail early with a clear message instead of an obscure range()/IndexError
+        if batch_size < 1:
+            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
+        try:
+            with torch.no_grad():
+                features_list = [
+                    self._encode_tokens(text_prompts[start:start + batch_size])
+                    for start in range(0, len(text_prompts), batch_size)
+                ]
+                # Concatenate all batches back into a single tensor
+                return torch.cat(features_list, dim=0)
+        except Exception as e:
+            self._log_error("Error encoding text features", e)
+            raise
+    def encode_single_text(self, text_prompts: List[str]) -> torch.Tensor:
+        """
+        Encode all prompts in a single batch (no chunking).
+        Args:
+            text_prompts: List of text prompts
+        Returns:
+            torch.Tensor: Normalized text features
+        Raises:
+            Exception: Any encoding error is logged and re-raised
+        """
+        try:
+            with torch.no_grad():
+                return self._encode_tokens(text_prompts)
+        except Exception as e:
+            self._log_error("Error encoding single text batch", e)
+            raise
+    def calculate_similarity(self, image_features: torch.Tensor, text_features: torch.Tensor) -> np.ndarray:
+        """
+        Compute similarity between image and text features.
+        The scores are softmax(100.0 * image_features @ text_features.T) taken
+        over the last dimension.
+        Args:
+            image_features: Image feature tensor
+            text_features: Text feature tensor
+        Returns:
+            np.ndarray: Array of similarity scores
+        Raises:
+            Exception: Any computation error is logged and re-raised
+        """
+        try:
+            with torch.no_grad():
+                similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
+                # Always move to CPU first: comparing self.device to the literal
+                # "cuda" misses devices such as "cuda:0" or "mps", and .numpy()
+                # fails on non-CPU tensors. .cpu() is a no-op for CPU tensors.
+                return similarity.cpu().numpy()
+        except Exception as e:
+            self._log_error("Error calculating similarity", e)
+            raise
+    def preprocess_image(self, image: Union[Image.Image, np.ndarray]) -> torch.Tensor:
+        """
+        Preprocess an image for the CLIP model.
+        Args:
+            image: PIL image or numpy array
+        Returns:
+            torch.Tensor: Preprocessed image tensor with a leading batch
+            dimension, placed on self.device
+        Raises:
+            ValueError: If the image is neither a PIL image nor a numpy array
+        """
+        try:
+            pil_image = self._to_pil(image)
+            return self.preprocess(pil_image).unsqueeze(0).to(self.device)
+        except Exception as e:
+            self._log_error("Error preprocessing image", e)
+            raise
+    def process_image_region(self, image: Union[Image.Image, np.ndarray], box: List[float]) -> torch.Tensor:
+        """
+        Encode a specific region of an image.
+        Args:
+            image: Original image
+            box: Bounding box [x1, y1, x2, y2]
+        Returns:
+            torch.Tensor: Normalized features of the cropped region
+        Raises:
+            ValueError: If the image is neither a PIL image nor a numpy array
+        """
+        try:
+            # Make sure the image is in PIL format, then crop the region
+            cropped_image = self._crop_region(self._to_pil(image), box)
+            # Preprocess and encode
+            return self.encode_image(self.preprocess_image(cropped_image))
+        except Exception as e:
+            self._log_error("Error processing image region", e)
+            raise
+    def batch_process_regions(self, image: Union[Image.Image, np.ndarray],
+                             boxes: List[List[float]]) -> torch.Tensor:
+        """
+        Encode several regions of one image in a single forward pass.
+        Args:
+            image: Original image
+            boxes: List of bounding boxes, each [x1, y1, x2, y2]
+        Returns:
+            torch.Tensor: Normalized features for all regions, or torch.empty(0)
+            when there are no boxes
+        Raises:
+            ValueError: If the image is neither a PIL image nor a numpy array
+        """
+        try:
+            pil_image = self._to_pil(image)
+            # len() instead of truthiness so a numpy array of boxes does not
+            # raise "truth value of an array is ambiguous"
+            if boxes is None or len(boxes) == 0:
+                # NOTE: the placeholder lives on torch's default device, not self.device
+                return torch.empty(0)
+            # Crop and preprocess every region, then stack into one batch
+            batch_tensor = torch.stack(
+                [self.preprocess(self._crop_region(pil_image, box)) for box in boxes]
+            ).to(self.device)
+            return self.encode_image(batch_tensor)
+        except Exception as e:
+            self._log_error("Error batch processing regions", e)
+            raise
+    def is_model_loaded(self) -> bool:
+        """
+        Check whether the model was loaded successfully.
+        Returns:
+            bool: True when both the model and the preprocess transform are set
+        """
+        return self.model is not None and self.preprocess is not None
+    def get_device(self) -> str:
+        """
+        Get the current device.
+        Returns:
+            str: Device name
+        """
+        return self.device
+    def get_model_name(self) -> str:
+        """
+        Get the model name.
+        Returns:
+            str: Model name
+        """
+        return self.model_name

clip_zero_shot_classifier.py CHANGED Viewed

@@ -3,16 +3,24 @@ import torch
 import clip
 from PIL import Image
 import numpy as np
 from typing import List, Dict, Tuple, Optional, Union, Any
-from landmark_data import ALL_LANDMARKS, get_all_landmark_prompts
-from landmark_activities import LANDMARK_ACTIVITIES
 class CLIPZeroShotClassifier:
     """
-    使用CLIP模型進行零樣本分類，專注於識別世界知名地標。
-    作為YOLO檢測的補充，處理標準對象檢測無法識別的地標建築。
     """
     def __init__(self, model_name: str = "ViT-B/16", device: str = None):
         """
         初始化CLIP零樣本分類器
@@ -21,87 +29,38 @@ class CLIPZeroShotClassifier:
             model_name: CLIP模型名稱，默認為"ViT-B/16"
             device: 運行設備，None則自動選擇
         """
-        # 設置運行設備
-        if device is None:
-            self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        else:
-            self.device = device
-        print(f"Initializing CLIP Zero-Shot Landmark Classifier ({model_name}) on {self.device}")
-        try:
-            self.model, self.preprocess = clip.load(model_name, device=self.device)
-            print(f"Successfully loaded CLIP model")
-        except Exception as e:
-            print(f"Error loading CLIP model: {e}")
-            raise
-        # 加載地標數據
-        try:
-            self.landmark_data = ALL_LANDMARKS
-            self.landmark_prompts = get_all_landmark_prompts()
-            print(f"Loaded {len(self.landmark_prompts)} landmark prompts for classification")
-            # 預計算地標文本特徵
-            self.landmark_text_features = self._precompute_text_features(self.landmark_prompts)
-            # 創建地標ID到索引的映射，可快速查找
-            self.landmark_id_to_index = {landmark_id: i for i, landmark_id in enumerate(ALL_LANDMARKS.keys())}
-            # 初始化批處理參數
-            self.batch_size = 16  # 默認批處理大小
-            self.confidence_threshold_multipliers = {
-                "close_up": 0.9,     # 近景標準閾值
-                "partial": 0.6,      # 部分可見降低閾值要求
-                "distant": 0.5,      # 遠景更低閾值要求
-                "full_image": 0.7    # 整張圖像需要更高閾值
-            }
-            self.landmark_type_thresholds = {
-                "tower": 0.5,         # 塔型建築需要更高閾值
-                "skyscraper": 0.4,    # 摩天大樓使用較低閾值
-                "building": 0.55,     # 一般建築物閾值略微降低
-                "monument": 0.5,      # 紀念碑閾值
-                "natural": 0.6        # 自然地標可以使用較低閾值
-            }
-            # 初始化結果快取
-            self.results_cache = {}  # 使用圖像hash作為鍵
-            self.cache_max_size = 100  # 最大快取項目數
-        except ImportError:
-            print("Warning: landmark_data.py not found. Landmark classification will be limited")
-            self.landmark_data = {}
-            self.landmark_prompts = []
-            self.landmark_text_features = None
-            self.landmark_id_to_index = {}
-            self.results_cache = {}
-    def _get_image_hash(self, image):
-        """
-        為圖像生成簡單的 hash 值用於快取
-        Args:
-            image: PIL Image 或 numpy 數組
-        Returns:
-            str: 圖像的 hash 值
-        """
-        if isinstance(image, np.ndarray):
-            # 對於 numpy 數組，降採樣並計算簡單 hash
-            small_img = image[::10, ::10] if image.ndim == 3 else image
-            return hash(small_img.tobytes())
-        else:
-            # 對於 PIL 圖像，調整大小後轉換為 bytes
-            small_img = image.resize((32, 32))
-            return hash(small_img.tobytes())
-    def _manage_cache(self):
         """
-        管理結果快取大小
         """
-        if len(self.results_cache) > self.cache_max_size:
-            oldest_key = next(iter(self.results_cache))
-            del self.results_cache[oldest_key]
     def set_batch_size(self, batch_size: int):
         """
@@ -110,436 +69,179 @@ class CLIPZeroShotClassifier:
         Args:
             batch_size: 新的批處理大小
         """
-        self.batch_size = max(1, batch_size)
-        print(f"Batch size set to {self.batch_size}")
     def adjust_confidence_threshold(self, detection_type: str, multiplier: float):
         """
         調整特定檢測類型的置信度閾值乘數
-        Args:
             detection_type: 檢測類型 ('close_up', 'partial', 'distant', 'full_image')
             multiplier: 置信度閾值乘數
         """
-        if detection_type in self.confidence_threshold_multipliers:
-            self.confidence_threshold_multipliers[detection_type] = max(0.1, min(1.5, multiplier))
-            print(f"Adjusted confidence threshold multiplier for {detection_type} to {multiplier}")
-        else:
-            print(f"Unknown detection type: {detection_type}")
-    def _precompute_text_features(self, text_prompts: List[str]) -> torch.Tensor:
         """
-        預計算文本提示的CLIP特徵，提高批處理效率
         Args:
-            text_prompts: 文本提示列表
         Returns:
-            torch.Tensor: 預計算的文本特徵
-        """
-        if not text_prompts:
-            return None
-        with torch.no_grad():
-            # Process in batches to avoid CUDA memory issues
-            batch_size = 128  # Adjust based on GPU memory
-            features_list = []
-            for i in range(0, len(text_prompts), batch_size):
-                batch_prompts = text_prompts[i:i+batch_size]
-                text_tokens = clip.tokenize(batch_prompts).to(self.device)
-                batch_features = self.model.encode_text(text_tokens)
-                batch_features = batch_features / batch_features.norm(dim=-1, keepdim=True)
-                features_list.append(batch_features)
-            # Concatenate all batches
-            if len(features_list) > 1:
-                text_features = torch.cat(features_list, dim=0)
-            else:
-                text_features = features_list[0]
-        return text_features
-    def _perform_pyramid_analysis(self,
-                         image: Union[Image.Image, np.ndarray],
-                         levels: int = 4,
-                         base_threshold: float = 0.25,
-                         aspect_ratios: List[float] = [1.0, 0.75, 1.5]) -> Dict[str, Any]:
         """
-        Performs multi-scale pyramid analysis on the image to improve landmark detection.
-        Args:
-            image: Input image
-            levels: Number of pyramid levels
-            base_threshold: Base confidence threshold
-            aspect_ratios: Different aspect ratios to try (for tall buildings vs wide landscapes)
-        Returns:
-            Dict: Results of pyramid analysis
-        """
-        # Ensure image is PIL format
-        if not isinstance(image, Image.Image):
-            if isinstance(image, np.ndarray):
-                image = Image.fromarray(image)
-            else:
-                raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
-        width, height = image.size
-        pyramid_results = []
-        # 對每個縮放和縱橫比組合進行處理
-        for level in range(levels):
-            # 計算縮放因子
-            scale_factor = 1.0 - (level * 0.2)
-            for aspect_ratio in aspect_ratios:
-                # 計算新尺寸，保持面積近似不變
-                if aspect_ratio != 1.0:
-                    # 保持面積近似不變的情況下調整縱橫比
-                    new_width = int(width * scale_factor * (1/aspect_ratio)**0.5)
-                    new_height = int(height * scale_factor * aspect_ratio**0.5)
                 else:
-                    new_width = int(width * scale_factor)
-                    new_height = int(height * scale_factor)
-                # 調整圖像大小
-                scaled_image = image.resize((new_width, new_height), Image.LANCZOS)
-                # 預處理圖像
-                image_input = self.preprocess(scaled_image).unsqueeze(0).to(self.device)
-                # 獲取圖像特徵
-                with torch.no_grad():
-                    image_features = self.model.encode_image(image_input)
-                    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
-                    # 計算相似度
-                    similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)
-                    similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
-                # 找到最佳匹配
-                best_idx = similarity.argmax().item()
-                best_score = similarity[best_idx]
-                if best_score >= base_threshold:
-                    landmark_id = list(self.landmark_data.keys())[best_idx]
-                    landmark_info = self.landmark_data[landmark_id]
-                    pyramid_results.append({
-                        "landmark_id": landmark_id,
-                        "landmark_name": landmark_info["name"],
-                        "confidence": float(best_score),
-                        "scale_factor": scale_factor,
-                        "aspect_ratio": aspect_ratio,
-                        "location": landmark_info["location"]
-                    })
-        # 按置信度排序
-        pyramid_results.sort(key=lambda x: x["confidence"], reverse=True)
-        return {
-            "is_landmark": len(pyramid_results) > 0,
-            "results": pyramid_results,
-            "best_result": pyramid_results[0] if pyramid_results else None
-        }
-    def _enhance_features(self, image: Union[Image.Image, np.ndarray]) -> Image.Image:
-        """
-        Enhances image features to improve landmark detection.
-        Args:
-            image: Input image
-        Returns:
-            PIL.Image: Enhanced image
-        """
-        # Ensure image is PIL format
-        if not isinstance(image, Image.Image):
-            if isinstance(image, np.ndarray):
-                image = Image.fromarray(image)
-            else:
-                raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
-        # Convert to numpy for processing
-        img_array = np.array(image)
-        # Skip processing for grayscale images
-        if len(img_array.shape) < 3:
-            return image
-        # Apply adaptive contrast enhancement
-        # Convert to LAB color space
-        from skimage import color, exposure
-        try:
-            # Convert to LAB color space
-            if img_array.shape[2] == 4:  # Handle RGBA
-                img_array = img_array[:,:,:3]
-            lab = color.rgb2lab(img_array[:,:,:3] / 255.0)
-            l_channel = lab[:,:,0]
-            # Enhance contrast of L channel
-            p2, p98 = np.percentile(l_channel, (2, 98))
-            l_channel_enhanced = exposure.rescale_intensity(l_channel, in_range=(p2, p98))
-            # Replace L channel and convert back to RGB
-            lab[:,:,0] = l_channel_enhanced
-            enhanced_img = color.lab2rgb(lab) * 255.0
-            enhanced_img = enhanced_img.astype(np.uint8)
-            return Image.fromarray(enhanced_img)
-        except ImportError:
-            print("Warning: skimage not available for feature enhancement")
-            return image
-        except Exception as e:
-            print(f"Error in feature enhancement: {e}")
-            return image
-    def _determine_landmark_type(self, landmark_id):
-        """
-        自動判斷地標類型，基於地標數據和命名
-        Returns:
-            str: 地標類型，用於調整閾值
-        """
-        if not landmark_id:
-            return "building"  # 預設類型
-        # 獲取地標詳細數據
-        landmark_data = self.landmark_data if hasattr(self, 'landmark_data') else {}
-        landmark_info = landmark_data.get(landmark_id, {})
-        # 獲取地標相關文本
-        landmark_id_lower = landmark_id.lower()
-        landmark_name = landmark_info.get("name", "").lower()
-        landmark_location = landmark_info.get("location", "").lower()
-        landmark_aliases = [alias.lower() for alias in landmark_info.get("aliases", [])]
-        # 合併所有文本數據用於特徵判斷
-        combined_text = " ".join([landmark_id_lower, landmark_name] + landmark_aliases)
-        # 地標類型的特色特徵
-        type_features = {
-            "skyscraper": ["skyscraper", "tall", "tower", "高樓", "摩天", "大厦", "タワー"],
-            "tower": ["tower", "bell", "clock", "塔", "鐘樓", "タワー", "campanile"],
-            "monument": ["monument", "memorial", "statue", "紀念", "雕像", "像", "memorial"],
-            "natural": ["mountain", "lake", "canyon", "falls", "beach", "山", "湖", "峽谷", "瀑布", "海灘"],
-            "temple": ["temple", "shrine", "寺", "神社", "廟"],
-            "palace": ["palace", "castle", "宮", "城", "皇宮", "宫殿"],
-            "distinctive": ["unique", "leaning", "slanted", "傾斜", "斜", "獨特", "傾く"]
-        }
-        # 檢查是否位於亞洲地區
-        asian_regions = ["china", "japan", "korea", "taiwan", "singapore", "vietnam", "thailand",
-                        "hong kong", "中國", "日本", "韓國", "台灣", "新加坡", "越南", "泰國", "香港"]
-        is_asian = any(region in landmark_location for region in asian_regions)
-        # 判斷地標類型
-        best_type = None
-        max_matches = 0
-        for type_name, features in type_features.items():
-            # 計算特徵詞匹配數量
-            matches = sum(1 for feature in features if feature in combined_text)
-            if matches > max_matches:
-                max_matches = matches
-                best_type = type_name
-        # 處理亞洲地區特例
-        if is_asian and best_type == "tower":
-            best_type = "skyscraper"  # 亞洲地區的塔型建築閾值較低
-        # 特例處理：檢測傾斜建築
-        if any(term in combined_text for term in ["leaning", "slanted", "tilt", "inclined", "斜", "傾斜"]):
-            return "distinctive"  # 傾斜建築需要特殊處理
-        return best_type if best_type and max_matches > 0 else "building"  # 預設為一般建築
-    def classify_image_region(self,
-                    image: Union[Image.Image, np.ndarray],
-                    box: List[float],
-                    threshold: float = 0.25,
-                    detection_type: str = "close_up") -> Dict[str, Any]:
-        """
-        對圖像的特定區域進行地標分類，具有增強的多尺度和部分識別能力
-        Args:
-            image: 原始圖像 (PIL Image 或 numpy數組)
-            box: 邊界框 [x1, y1, x2, y2]
-            threshold: 基礎分類置信度閾值
-            detection_type: 檢測類型，影響置信度調整
-        Returns:
-            Dict: 地標分類結果
-        """
-        # 確保圖像是PIL格式
-        if not isinstance(image, Image.Image):
-            if isinstance(image, np.ndarray):
-                image = Image.fromarray(image)
-            else:
-                raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
-        # 生成圖像區域的hash用於快取
-        region_key = (self._get_image_hash(image), tuple(box), detection_type)
-        if region_key in self.results_cache:
-            return self.results_cache[region_key]
-        # 裁剪區域
-        x1, y1, x2, y2 = map(int, box)
-        cropped_image = image.crop((x1, y1, x2, y2))
-        enhanced_image = self._enhance_features(cropped_image)
-        # 分析視角信息
-        viewpoint_info = self._analyze_viewpoint(enhanced_image)
-        dominant_viewpoint = viewpoint_info["dominant_viewpoint"]
-        # 計算區域信息
-        region_width = x2 - x1
-        region_height = y2 - y1
-        image_width, image_height = image.size
-        # 根據區域大小判斷可能的檢測類型
-        region_area_ratio = (region_width * region_height) / (image_width * image_height)
-        if detection_type == "auto":
-            if region_area_ratio > 0.5:
-                detection_type = "close_up"
-            elif region_area_ratio > 0.2:
-                detection_type = "partial"
-            else:
-                detection_type = "distant"
-        # 根據視角調整檢測類型
-        if dominant_viewpoint == "close_up" and detection_type != "close_up":
-            detection_type = "close_up"
-        elif dominant_viewpoint == "distant" and detection_type != "distant":
-            detection_type = "distant"
-        elif dominant_viewpoint == "angled_view":
-            detection_type = "partial"  # 角度視圖可能是部分可見
-        # 調整置信度閾值
-        base_multiplier = self.confidence_threshold_multipliers.get(detection_type, 1.0)
-        adjusted_threshold = threshold * base_multiplier
-        # 調整多尺度處理的尺度範圍和縱橫比 - 增強對傾斜建築的支持
-        scales = [1.0]  # 默認尺度
-        # 基於視角選擇合適的尺度和縱橫比
-        if detection_type in ["partial", "distant"]:
-            scales = [0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3]  # 標準範圍
-        # 如果是特殊視角，進一步調整尺度和縱橫比 - 新增
-        if dominant_viewpoint in ["angled_view", "low_angle"]:
-            scales = [0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4]  # 更寬的範圍
-        # 準備縱橫比 - 同時支持水平和垂直地標
-        aspect_ratios = [1.0, 0.8, 1.2]  # 標準縱橫比
-        # 針對可能的傾斜建築增加更多縱橫比 - 新增
-        if dominant_viewpoint in ["angled_view", "unique_feature"]:
-            aspect_ratios = [0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5]  # 更多樣的縱橫比
-        best_result = {
-            "landmark_id": None,
-            "landmark_name": None,
-            "confidence": 0.0,
-            "is_landmark": False
-        }
-        # 多尺度和縱橫比分析
-        for scale in scales:
-            for aspect_ratio in aspect_ratios:
-                # 縮放裁剪區域
-                current_width, current_height = cropped_image.size
-                # 計算新尺寸，保持面積不變但調整縱橫比
-                if aspect_ratio != 1.0:
-                    new_width = int(current_width * scale * (1/aspect_ratio)**0.5)
-                    new_height = int(current_height * scale * aspect_ratio**0.5)
-                else:
-                    new_width = int(current_width * scale)
-                    new_height = int(current_height * scale)
-                # 確保尺寸至少為1像素
-                new_width = max(1, new_width)
-                new_height = max(1, new_height)
-                # 縮放圖像
-                try:
-                    scaled_image = cropped_image.resize((new_width, new_height), Image.LANCZOS)
-                except Exception as e:
-                    print(f"Failed to resize image to {new_width}x{new_height}: {e}")
-                    continue
-                # 預處理裁剪圖像
-                try:
-                    image_input = self.preprocess(scaled_image).unsqueeze(0).to(self.device)
-                except Exception as e:
-                    print(f"Failed to preprocess image: {e}")
-                    continue
-                # 獲取圖像特徵
-                with torch.no_grad():
-                    try:
-                        image_features = self.model.encode_image(image_input)
-                        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
-                        # 計算與地標提示的相似度
-                        similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)
-                        similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
                         # 找到最佳匹配
-                        best_idx = similarity.argmax().item()
-                        best_score = similarity[best_idx]
                         # 如果當前尺度結果更好，則更新
                         if best_score > best_result["confidence"]:
-                            landmark_id = list(self.landmark_data.keys())[best_idx]
-                            landmark_info = self.landmark_data[landmark_id]
-                            best_result = {
-                                "landmark_id": landmark_id,
-                                "landmark_name": landmark_info["name"],
-                                "location": landmark_info["location"],
-                                "confidence": float(best_score),
-                                "is_landmark": best_score >= adjusted_threshold,
-                                "scale_used": scale,
-                                "aspect_ratio_used": aspect_ratio,
-                                "viewpoint": dominant_viewpoint
-                            }
-                            # 添加額外可用信息
-                            for key in ["year_built", "architectural_style", "significance"]:
-                                if key in landmark_info:
-                                    best_result[key] = landmark_info[key]
                     except Exception as e:
-                        print(f"Error in calculating similarity: {e}")
                         continue
-        # 只有在有識別出地標ID且信心度足夠高時才應用地標類型閾值調整
-        if best_result["landmark_id"]:
-            landmark_type = self._determine_landmark_type(best_result["landmark_id"])
-            # 檢測是否為特殊類型的建築如斜塔
-            if landmark_type == "distinctive":
-                # 特殊建築的閾值降低25%
-                type_multiplier = 0.75
-            else:
-                # 使用已有的類型閾值
-                type_multiplier = self.landmark_type_thresholds.get(landmark_type, 1.0) / 0.5
-            # 更新判斷是否為地標的標準
-            final_threshold = adjusted_threshold * type_multiplier
-            best_result["is_landmark"] = best_result["confidence"] >= final_threshold
-            best_result["landmark_type"] = landmark_type  # 添加地標類型信息
-            best_result["threshold_applied"] = final_threshold  # 記錄應用的閾值
-        # 快取結果
-        self.results_cache[region_key] = best_result
-        self._manage_cache()
-        return best_result
     def classify_batch_regions(self,
                               image: Union[Image.Image, np.ndarray],
@@ -556,73 +258,76 @@ class CLIPZeroShotClassifier:
         Returns:
             List[Dict]: 分類結果列表
         """
-        if not self.landmark_text_features is not None:
-            return [{"is_landmark": False, "confidence": 0.0} for _ in boxes]
-        # 確保圖像是PIL格式
-        if not isinstance(image, Image.Image):
-            if isinstance(image, np.ndarray):
-                image = Image.fromarray(image)
-            else:
-                raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
-        # 無框可處理時
-        if not boxes:
-            return []
-        # 裁剪並預處理所有區域
-        cropped_inputs = []
-        for box in boxes:
-            x1, y1, x2, y2 = map(int, box)
-            cropped_image = image.crop((x1, y1, x2, y2))
-            processed_image = self.preprocess(cropped_image).unsqueeze(0)
-            cropped_inputs.append(processed_image)
-        # batch process
-        batch_tensor = torch.cat(cropped_inputs).to(self.device)
-        # batch encoding
-        with torch.no_grad():
-            image_features = self.model.encode_image(batch_tensor)
-            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
             # 計算相似度
-            similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)
-            similarity = similarity.cpu().numpy() if self.device == "cuda" else similarity.numpy()
-        # 處理每個區域的結果
-        results = []
-        for i, sim in enumerate(similarity):
-            best_idx = sim.argmax().item()
-            best_score = sim[best_idx]
-            if best_score >= threshold:
-                landmark_id = list(self.landmark_data.keys())[best_idx]
-                landmark_info = self.landmark_data[landmark_id]
-                results.append({
-                    "landmark_id": landmark_id,
-                    "landmark_name": landmark_info["name"],
-                    "location": landmark_info["location"],
-                    "confidence": float(best_score),
-                    "is_landmark": True,
-                    "box": boxes[i]
-                })
-            else:
-                results.append({
-                    "landmark_id": None,
-                    "landmark_name": None,
-                    "confidence": float(best_score),
-                    "is_landmark": False,
-                    "box": boxes[i]
-                })
-        return results
     def search_entire_image(self,
-                        image: Union[Image.Image, np.ndarray],
-                        threshold: float = 0.35,
-                        detailed_analysis: bool = False) -> Dict[str, Any]:
         """
         檢查整張圖像是否包含地標，具有增強的分析能力
@@ -634,92 +339,103 @@ class CLIPZeroShotClassifier:
         Returns:
             Dict: 地標分類結果
         """
-        # 確保圖像是PIL格式
-        if not isinstance(image, Image.Image):
-            if isinstance(image, np.ndarray):
-                image = Image.fromarray(image)
-            else:
-                raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
-        # 檢查快取
-        image_key = (self._get_image_hash(image), "entire_image", detailed_analysis)
-        if image_key in self.results_cache:
-            return self.results_cache[image_key]
-        # 調整閾值
-        adjusted_threshold = threshold * self.confidence_threshold_multipliers.get("full_image", 1.0)
-        # 預處理圖像
-        image_input = self.preprocess(image).unsqueeze(0).to(self.device)
-        # 獲取圖像特徵
-        with torch.no_grad():
-            image_features = self.model.encode_image(image_input)
-            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
-            # 計算與地標提示的相似度
-            similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)
-            similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
-        # 找到最佳匹配
-        best_idx = similarity.argmax().item()
-        best_score = similarity[best_idx]
-        # top3 landmark
-        top_indices = similarity.argsort()[-3:][::-1]
-        top_landmarks = []
-        for idx in top_indices:
-            score = similarity[idx]
-            landmark_id = list(self.landmark_data.keys())[idx]
-            landmark_info = self.landmark_data[landmark_id]
-            landmark_result = {
-                "landmark_id": landmark_id,
-                "landmark_name": landmark_info["name"],
-                "location": landmark_info["location"],
-                "confidence": float(score)
-            }
-            # 添加額外可用信息
-            if "year_built" in landmark_info:
-                landmark_result["year_built"] = landmark_info["year_built"]
-            if "architectural_style" in landmark_info:
-                landmark_result["architectural_style"] = landmark_info["architectural_style"]
-            if "significance" in landmark_info:
-                landmark_result["significance"] = landmark_info["significance"]
-            top_landmarks.append(landmark_result)
-        # main result
-        result = {}
-        if best_score >= adjusted_threshold:
-            landmark_id = list(self.landmark_data.keys())[best_idx]
-            landmark_info = self.landmark_data[landmark_id]
-            # 應用地標類型特定閾值
-            landmark_type = self._determine_landmark_type(landmark_id)
-            type_multiplier = self.landmark_type_thresholds.get(landmark_type, 1.0) / 0.5
-            final_threshold = adjusted_threshold * type_multiplier
-            if best_score >= final_threshold:
-                result = {
-                    "landmark_id": landmark_id,
-                    "landmark_name": landmark_info["name"],
-                    "location": landmark_info["location"],
-                    "confidence": float(best_score),
-                    "is_landmark": True,
-                    "landmark_type": landmark_type,
-                    "top_landmarks": top_landmarks
-                }
-                # 添加額外可用信息
-                if "year_built" in landmark_info:
-                    result["year_built"] = landmark_info["year_built"]
-                if "architectural_style" in landmark_info:
-                    result["architectural_style"] = landmark_info["architectural_style"]
-                if "significance" in landmark_info:
-                    result["significance"] = landmark_info["significance"]
             else:
                 result = {
                     "landmark_id": None,
@@ -729,266 +445,49 @@ class CLIPZeroShotClassifier:
                     "top_landmarks": top_landmarks
                 }
-        # 如果請求詳細分析且是地標，進一步分析圖像區域
-        if detailed_analysis and result.get("is_landmark", False):
-            # 創建不同區域進行更深入分析
-            width, height = image.size
-            regions = [
-                # 中心區域
-                [width * 0.25, height * 0.25, width * 0.75, height * 0.75],
-                # 左半部
-                [0, 0, width * 0.5, height],
-                # 右半部
-                [width * 0.5, 0, width, height],
-                # 上半部
-                [0, 0, width, height * 0.5],
-                # 下半部
-                [0, height * 0.5, width, height]
-            ]
-            region_results = []
-            for i, box in enumerate(regions):
-                region_result = self.classify_image_region(
-                    image,
-                    box,
-                    threshold=threshold * 0.9,
-                    detection_type="partial"
-                )
-                if region_result["is_landmark"]:
-                    region_result["region_name"] = ["center", "left", "right", "top", "bottom"][i]
-                    region_results.append(region_result)
-            # 添加區域分析結果
-            if region_results:
-                result["region_analyses"] = region_results
-        # 快取結果
-        self.results_cache[image_key] = result
-        self._manage_cache()
-        return result
-    def enhanced_landmark_detection(self,
-                              image: Union[Image.Image, np.ndarray],
-                              threshold: float = 0.3) -> Dict[str, Any]:
-        """
-        Enhanced landmark detection using multiple analysis techniques.
-        Args:
-            image: Input image
-            threshold: Base confidence threshold
-        Returns:
-            Dict: Comprehensive landmark detection results
-        """
-        # Ensure image is PIL format
-        if not isinstance(image, Image.Image):
-            if isinstance(image, np.ndarray):
-                image = Image.fromarray(image)
-            else:
-                raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
-        # Phase 1: Analyze viewpoint to adjust detection parameters
-        viewpoint_info = self._analyze_viewpoint(image)
-        viewpoint = viewpoint_info["dominant_viewpoint"]
-        # Adjust threshold based on viewpoint
-        if viewpoint == "distant":
-            adjusted_threshold = threshold * 0.7  # Lower threshold for distant views
-        elif viewpoint == "close_up":
-            adjusted_threshold = threshold * 1.1  # Higher threshold for close-ups
-        else:
-            adjusted_threshold = threshold
-        # Phase 2: Perform multi-scale pyramid analysis
-        pyramid_results = self._perform_pyramid_analysis(image, levels=3, base_threshold=adjusted_threshold)
-        # Phase 3: Perform grid-based region analysis
-        grid_results = []
-        width, height = image.size
-        # Create adaptive grid based on viewpoint
-        if viewpoint == "distant":
-            grid_size = 3  # Coarser grid for distant views
-        elif viewpoint == "close_up":
-            grid_size = 5  # Finer grid for close-ups
-        else:
-            grid_size = 4  # Default grid size
-        # Generate grid regions
-        for i in range(grid_size):
-            for j in range(grid_size):
-                box = [
-                    width * (j/grid_size),
-                    height * (i/grid_size),
-                    width * ((j+1)/grid_size),
-                    height * ((i+1)/grid_size)
                 ]
-                # Apply feature enhancement
-                region_result = self.classify_image_region(
-                    image,
-                    box,
-                    threshold=adjusted_threshold,
-                    detection_type="auto"
-                )
-                if region_result["is_landmark"]:
-                    region_result["grid_position"] = (i, j)
-                    grid_results.append(region_result)
-        # Phase 4: Cross-validate and combine results
-        all_detections = []
-        # Add pyramid results
-        if pyramid_results["is_landmark"] and pyramid_results["best_result"]:
-            all_detections.append({
-                "source": "pyramid",
-                "landmark_id": pyramid_results["best_result"]["landmark_id"],
-                "landmark_name": pyramid_results["best_result"]["landmark_name"],
-                "confidence": pyramid_results["best_result"]["confidence"],
-                "scale_factor": pyramid_results["best_result"].get("scale_factor", 1.0)
-            })
-        # Add grid results
-        for result in grid_results:
-            all_detections.append({
-                "source": "grid",
-                "landmark_id": result["landmark_id"],
-                "landmark_name": result["landmark_name"],
-                "confidence": result["confidence"],
-                "grid_position": result.get("grid_position", (0, 0))
-            })
-        # Search entire image
-        full_image_result = self.search_entire_image(image, threshold=adjusted_threshold)
-        if full_image_result and full_image_result.get("is_landmark", False):
-            all_detections.append({
-                "source": "full_image",
-                "landmark_id": full_image_result["landmark_id"],
-                "landmark_name": full_image_result["landmark_name"],
-                "confidence": full_image_result["confidence"]
-            })
-        # Group by landmark_id and calculate aggregate confidence
-        landmark_groups = {}
-        for detection in all_detections:
-            landmark_id = detection["landmark_id"]
-            if landmark_id not in landmark_groups:
-                landmark_groups[landmark_id] = {
-                    "landmark_id": landmark_id,
-                    "landmark_name": detection["landmark_name"],
-                    "detections": [],
-                    "sources": set()
-                }
-            landmark_groups[landmark_id]["detections"].append(detection)
-            landmark_groups[landmark_id]["sources"].add(detection["source"])
-        # Calculate aggregate confidence for each landmark
-        for landmark_id, group in landmark_groups.items():
-            detections = group["detections"]
-            # Base confidence is the maximum confidence from any source
-            max_confidence = max(d["confidence"] for d in detections)
-            # Bonus for detection from multiple sources
-            source_count = len(group["sources"])
-            source_bonus = min(0.15, (source_count - 1) * 0.05)  # Up to 15% bonus
-            # Consistency bonus for multiple detections of the same landmark
-            detection_count = len(detections)
-            consistency_bonus = min(0.1, (detection_count - 1) * 0.02)  # Up to 10% bonus
-            # Calculate final confidence
-            aggregate_confidence = min(1.0, max_confidence + source_bonus + consistency_bonus)
-            group["confidence"] = aggregate_confidence
-            group["detection_count"] = detection_count
-            group["source_count"] = source_count
-        # Sort landmarks by confidence
-        sorted_landmarks = sorted(
-            landmark_groups.values(),
-            key=lambda x: x["confidence"],
-            reverse=True
-        )
-        return {
-            "is_landmark_scene": len(sorted_landmarks) > 0,
-            "detected_landmarks": sorted_landmarks,
-            "viewpoint_info": viewpoint_info,
-            "primary_landmark": sorted_landmarks[0] if sorted_landmarks else None
-        }
-    def _analyze_architectural_features(self, image):
-        """
-        Analyzes the architectural features of a structure in the image without hardcoding specific landmarks.
-        Args:
-            image: Input image
-        Returns:
-            Dict: Architectural feature analysis results
-        """
-        # Define universal architectural feature prompts that apply to all types of landmarks
-        architecture_prompts = {
-            "tall_structure": "a tall vertical structure standing alone",
-            "tiered_building": "a building with multiple stacked tiers or segments",
-            "historical_structure": "a building with historical architectural elements",
-            "modern_design": "a modern structure with contemporary architectural design",
-            "segmented_exterior": "a structure with visible segmented or sectioned exterior",
-            "viewing_platform": "a tall structure with observation area at the top",
-            "time_display": "a structure with timepiece features",
-            "glass_facade": "a building with prominent glass exterior surfaces",
-            "memorial_structure": "a monument or memorial structure",
-            "ancient_construction": "ancient constructed elements or archaeological features",
-            "natural_landmark": "a natural geographic formation or landmark",
-            "slanted_design": "a structure with non-vertical or leaning profile"
-        }
-        # Calculate similarity scores against universal architectural patterns
-        context_scores = self.calculate_similarity_scores(image, architecture_prompts)
-        # Determine most relevant architectural features
-        top_features = sorted(context_scores.items(), key=lambda x: x[1], reverse=True)[:3]
-        # Calculate feature confidence
-        context_confidence = sum(score for _, score in top_features) / 3
-        # Determine primary architectural category based on top features
-        architectural_categories = {
-            "tower": ["tall_structure", "viewing_platform", "time_display"],
-            "skyscraper": ["tall_structure", "modern_design", "glass_facade"],
-            "historical": ["historical_structure", "ancient_construction", "memorial_structure"],
-            "natural": ["natural_landmark"],
-            "distinctive": ["tiered_building", "segmented_exterior", "slanted_design"]
-        }
-        # Score each category based on the top features
-        category_scores = {}
-        for category, features in architectural_categories.items():
-            category_score = 0
-            for feature, score in context_scores.items():
-                if feature in features:
-                    category_score += score
-            category_scores[category] = category_score
-        primary_category = max(category_scores.items(), key=lambda x: x[1])[0]
-        return {
-            "architectural_features": top_features,
-            "context_confidence": context_confidence,
-            "primary_category": primary_category,
-            "category_scores": category_scores
-        }
     def intelligent_landmark_search(self,
-                                image: Union[Image.Image, np.ndarray],
-                                yolo_boxes: Optional[List[List[float]]] = None,
-                                base_threshold: float = 0.25) -> Dict[str, Any]:
         """
-        對圖像進行智能地標搜索，綜合整張圖像分析和區域分析
         Args:
             image: 原始圖像
@@ -998,158 +497,121 @@ class CLIPZeroShotClassifier:
         Returns:
             Dict: 包含所有檢測結果的綜合分析
         """
-        # 確保圖像是PIL格式
-        if not isinstance(image, Image.Image):
-            if isinstance(image, np.ndarray):
-                image = Image.fromarray(image)
-            else:
-                raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
-        # No YOLO 框時，可以稍微降低閾值以提高召回率
-        actual_threshold = base_threshold * 0.85 if yolo_boxes is None or len(yolo_boxes) == 0 else base_threshold
-        # 首先對整張圖像進行分析
-        try:
             full_image_result = self.search_entire_image(
                 image,
                 threshold=actual_threshold,
-                detailed_analysis=True  # 確保詳細分析開啟
             )
-            # No YOLO 框，則進行多尺度分析以提高檢測機會
             if (yolo_boxes is None or len(yolo_boxes) == 0) and (not full_image_result or not full_image_result.get("is_landmark", False)):
-                print("No YOLO boxes provided, attempting multi-scale pyramid analysis")
-                try:
-                    if hasattr(self, '_perform_pyramid_analysis'):
-                        pyramid_results = self._perform_pyramid_analysis(
-                            image,
-                            levels=4,  #
-                            base_threshold=actual_threshold,
-                            aspect_ratios=[1.0, 0.75, 1.5, 0.5, 2.0]
                         )
-                        if pyramid_results and pyramid_results.get("is_landmark", False) and pyramid_results.get("best_result", {}).get("confidence", 0) > actual_threshold:
-                            # 使用金字塔分析結果增強或替代全圖結果
-                            if not full_image_result or not full_image_result.get("is_landmark", False):
-                                full_image_result = {
-                                    "is_landmark": True,
-                                    "landmark_id": pyramid_results["best_result"]["landmark_id"],
-                                    "landmark_name": pyramid_results["best_result"]["landmark_name"],
-                                    "confidence": pyramid_results["best_result"]["confidence"],
-                                    "location": pyramid_results["best_result"].get("location", "Unknown Location")
-                                }
-                                print(f"Pyramid analysis detected landmark: {pyramid_results['best_result']['landmark_name']} with confidence {pyramid_results['best_result']['confidence']:.3f}")
-                    else:
-                        print("Pyramid analysis not available, skipping multi-scale detection")
-                except Exception as e:
-                    print(f"Error in pyramid analysis: {e}")
-        except Exception as e:
-            print(f"Error in search_entire_image: {e}")
-            import traceback
-            traceback.print_exc()
-            full_image_result = None
-        # 初始化結果字典
-        result = {
-            "full_image_analysis": full_image_result if full_image_result else {},
-            "is_landmark_scene": False,  # 默認值
-            "detected_landmarks": []
-        }
-        # 上下文感知比較，處理接近的排名結果
-        if full_image_result and "top_landmarks" in full_image_result and len(full_image_result["top_landmarks"]) >= 2:
-            top_landmarks = full_image_result["top_landmarks"]
-            # 檢查前兩個結果是否非常接近（信心度差異小於 0.1）
-            if len(top_landmarks) >= 2 and abs(top_landmarks[0]["confidence"] - top_landmarks[1]["confidence"]) < 0.1:
-                # 對於接近的結果，使用通用建築特徵分析進行區分
-                try:
-                    # 分析建築特徵
-                    if hasattr(self, '_analyze_architectural_features'):
-                        architectural_analysis = self._analyze_architectural_features(image)
-                        top_features = architectural_analysis.get("architectural_features", [])
-                        primary_category = architectural_analysis.get("primary_category", "")
-                        # 根據建築特徵調整地標置信度
-                        for i, landmark in enumerate(top_landmarks[:2]):
-                            if i >= len(top_landmarks):
-                                continue
-                            landmark_id = landmark.get("landmark_id", "").lower()
-                            confidence_boost = 0
-                            # 使用主要建築類別來調整置信度，使用通用條件而非特定地標名稱
-                            if primary_category == "tower" and any(term in landmark_id for term in ["tower", "spire", "needle"]):
-                                confidence_boost += 0.05
-                            elif primary_category == "skyscraper" and any(term in landmark_id for term in ["building", "skyscraper", "tall"]):
-                                confidence_boost += 0.05
-                            elif primary_category == "historical" and any(term in landmark_id for term in ["monument", "castle", "palace", "temple"]):
-                                confidence_boost += 0.05
-                            elif primary_category == "distinctive" and any(term in landmark_id for term in ["unusual", "unique", "special", "famous"]):
-                                confidence_boost += 0.05
-                            # 根據特定特徵進一步微調，使用通用特徵描述而非特定地標
-                            for feature, score in top_features:
-                                if feature == "time_display" and "clock" in landmark_id:
-                                    confidence_boost += 0.03
-                                elif feature == "segmented_exterior" and "segmented" in landmark_id:
-                                    confidence_boost += 0.03
-                                elif feature == "slanted_design" and "leaning" in landmark_id:
-                                    confidence_boost += 0.03
-                            # 應用信心度調整
-                            if confidence_boost > 0 and i < len(top_landmarks):
-                                top_landmarks[i]["confidence"] += confidence_boost
-                                print(f"Boosted {landmark['landmark_name']} confidence by {confidence_boost:.2f} based on architectural features ({primary_category})")
-                        # 重新排序
-                        top_landmarks.sort(key=lambda x: x["confidence"], reverse=True)
-                        full_image_result["top_landmarks"] = top_landmarks
-                        if top_landmarks:
-                            full_image_result["landmark_id"] = top_landmarks[0]["landmark_id"]
-                            full_image_result["landmark_name"] = top_landmarks[0]["landmark_name"]
-                            full_image_result["confidence"] = top_landmarks[0]["confidence"]
-                            full_image_result["location"] = top_landmarks[0].get("location", "Unknown Location")
-                except Exception as e:
-                    print(f"Error in architectural feature analysis: {e}")
-                    import traceback
-                    traceback.print_exc()
-        if full_image_result and full_image_result.get("is_landmark", False):
-            result["is_landmark_scene"] = True
-            landmark_id = full_image_result.get("landmark_id", "unknown")
-            # extract landmark info
-            landmark_specific_info = self._extract_landmark_specific_info(landmark_id)
-            landmark_info = {
-                "landmark_id": landmark_id,
-                "landmark_name": full_image_result.get("landmark_name", "Unknown Landmark"),
-                "confidence": full_image_result.get("confidence", 0.0),
-                "location": full_image_result.get("location", "Unknown Location"),
-                "region_type": "full_image",
-                "box": [0, 0, getattr(image, 'width', 0), getattr(image, 'height', 0)]
-            }
-            # 整合地標特定info，確保正確的名稱被使用
-            landmark_info.update(landmark_specific_info)
-            # 如果特定信息中有更準確的地標名稱，使用它
-            if landmark_specific_info.get("landmark_name"):
-                landmark_info["landmark_name"] = landmark_specific_info["landmark_name"]
-            result["detected_landmarks"].append(landmark_info)
-            # 確保地標特定活動被正確設置為主要結果
-            if landmark_specific_info.get("has_specific_activities", False):
-                result["primary_landmark_activities"] = landmark_specific_info.get("landmark_specific_activities", [])
-                print(f"Set primary landmark activities: {len(result['primary_landmark_activities'])} activities for {landmark_info['landmark_name']}")
-        # 如果提供了YOLO邊界框，分析這些區域
-        if yolo_boxes and len(yolo_boxes) > 0:
-            for box in yolo_boxes:
-                try:
-                    if hasattr(self, 'classify_image_region'):
                         box_result = self.classify_image_region(
                             image,
                             box,
@@ -1157,13 +619,10 @@ class CLIPZeroShotClassifier:
                             detection_type="auto"
                         )
-                        # 如果檢測到地標
                         if box_result and box_result.get("is_landmark", False):
-                            # 檢查是否與已檢測的地標重複
                             is_duplicate = False
                             for existing in result["detected_landmarks"]:
                                 if existing.get("landmark_id") == box_result.get("landmark_id"):
-                                    # 如果新的置信度更高，則更新
                                     if box_result.get("confidence", 0) > existing.get("confidence", 0):
                                         existing.update({
                                             "confidence": box_result.get("confidence", 0),
@@ -1173,7 +632,6 @@ class CLIPZeroShotClassifier:
                                     is_duplicate = True
                                     break
-                            # 如果不是重複的，添加到列表
                             if not is_duplicate:
                                 result["detected_landmarks"].append({
                                     "landmark_id": box_result.get("landmark_id", "unknown"),
@@ -1183,234 +641,250 @@ class CLIPZeroShotClassifier:
                                     "region_type": "yolo_box",
                                     "box": box
                                 })
                 except Exception as e:
-                    print(f"Error in analyzing YOLO box: {e}")
-                    continue
-        # 最後，執行額外的網格搜索以捕獲可能被遺漏的地標
-        # 但只有在尚未發現地標或僅發現低置信度地標時
-        should_do_grid_search = (
-            len(result["detected_landmarks"]) == 0 or
-            max([landmark.get("confidence", 0) for landmark in result["detected_landmarks"]], default=0) < 0.5
-        )
-        if should_do_grid_search and hasattr(self, 'classify_image_region'):
-            try:
-                # 創建5x5網格
-                width, height = getattr(image, 'size', (getattr(image, 'width', 0), getattr(image, 'height', 0)))
-                if not isinstance(width, (int, float)) or width <= 0:
-                    width = getattr(image, 'width', 0)
-                if not isinstance(height, (int, float)) or height <= 0:
-                    height = getattr(image, 'height', 0)
-                if width > 0 and height > 0:
-                    grid_boxes = []
-                    for i in range(5):
-                        for j in range(5):
-                            grid_boxes.append([
-                                width * (j/5), height * (i/5),
-                                width * ((j+1)/5), height * ((i+1)/5)
-                            ])
-                    # 分析每個網格區域
-                    for box in grid_boxes:
-                        try:
-                            grid_result = self.classify_image_region(
-                                image,
-                                box,
-                                threshold=base_threshold * 0.9,  # 稍微降低網格搜索閾值
-                                detection_type="partial"
-                            )
-                            # 如果檢測到地標
-                            if grid_result and grid_result.get("is_landmark", False):
-                                # 檢查是否與已檢測的地標重複
-                                is_duplicate = False
-                                for existing in result["detected_landmarks"]:
-                                    if existing.get("landmark_id") == grid_result.get("landmark_id"):
-                                        is_duplicate = True
-                                        break
-                                # 如果不是重複的，添加到列表
-                                if not is_duplicate:
-                                    result["detected_landmarks"].append({
-                                        "landmark_id": grid_result.get("landmark_id", "unknown"),
-                                        "landmark_name": grid_result.get("landmark_name", "Unknown Landmark"),
-                                        "confidence": grid_result.get("confidence", 0.0),
-                                        "location": grid_result.get("location", "Unknown Location"),
-                                        "region_type": "grid",
-                                        "box": box
-                                    })
-                        except Exception as e:
-                            print(f"Error in analyzing grid region: {e}")
-                            continue
-            except Exception as e:
-                print(f"Error in grid search: {e}")
-                import traceback
-                traceback.print_exc()
-        # 按置信度排序檢測結果
-        result["detected_landmarks"].sort(key=lambda x: x.get("confidence", 0), reverse=True)
-        # 更新整體場景類型判斷
-        if len(result["detected_landmarks"]) > 0:
-            result["is_landmark_scene"] = True
-            result["primary_landmark"] = result["detected_landmarks"][0]
-            # 添加 clip_analysis_on_full_image 結果，以便給 LLM 提供更多上下文
-            if full_image_result and "clip_analysis" in full_image_result:
-                result["clip_analysis_on_full_image"] = full_image_result["clip_analysis"]
-        return result
-    def _extract_landmark_specific_info(self, landmark_id: str) -> Dict[str, Any]:
         """
-        提取特定地標的詳細信息，包括特色模板和活動建議
         Args:
-            landmark_id: 地標ID
         Returns:
-            Dict: 地標特定信息
         """
-        if not landmark_id or landmark_id == "unknown":
-            return {"has_specific_activities": False}
-        specific_info = {"has_specific_activities": False}
-        # 從 ALL_LANDMARKS 或 self.landmark_data 中提取基本信息
-        landmark_data_source = None
-        # 優先嘗試從類屬性獲取
-        if hasattr(self, 'landmark_data') and self.landmark_data and landmark_id in self.landmark_data:
-            landmark_data_source = self.landmark_data[landmark_id]
-            print(f"Using landmark data from class attribute for {landmark_id}")
-        else:
-            try:
-                if landmark_id in ALL_LANDMARKS:
-                    landmark_data_source = ALL_LANDMARKS[landmark_id]
-                    print(f"Using landmark data from ALL_LANDMARKS for {landmark_id}")
-            except ImportError:
-                print("Warning: Could not import ALL_LANDMARKS from landmark_data")
-            except Exception as e:
-                print(f"Error accessing ALL_LANDMARKS: {e}")
-        # 處理地標基本數據
-        if landmark_data_source:
-            # 提取正確的地標名稱
-            if "name" in landmark_data_source:
-                specific_info["landmark_name"] = landmark_data_source["name"]
-            # 提取所有可用的 prompts 作為特色模板
-            if "prompts" in landmark_data_source:
-                specific_info["feature_templates"] = landmark_data_source["prompts"][:5]
-                specific_info["primary_template"] = landmark_data_source["prompts"][0]
-            # 提取別名info
-            if "aliases" in landmark_data_source:
-                specific_info["aliases"] = landmark_data_source["aliases"]
-            # 提取位置信息
-            if "location" in landmark_data_source:
-                specific_info["location"] = landmark_data_source["location"]
-            # 提取其他相關信息
-            for key in ["year_built", "architectural_style", "significance", "description"]:
-                if key in landmark_data_source:
-                    specific_info[key] = landmark_data_source[key]
-        # 嘗試從 LANDMARK_ACTIVITIES 中提取活動建議
         try:
-            if landmark_id in LANDMARK_ACTIVITIES:
-                activities = LANDMARK_ACTIVITIES[landmark_id]
-                specific_info["landmark_specific_activities"] = activities
-                specific_info["has_specific_activities"] = True
-                print(f"Found {len(activities)} specific activities for landmark {landmark_id}")
-            else:
-                print(f"No specific activities found for landmark {landmark_id} in LANDMARK_ACTIVITIES")
-                specific_info["has_specific_activities"] = False
-        except ImportError:
-            print("Warning: Could not import LANDMARK_ACTIVITIES from landmark_activities")
-            specific_info["has_specific_activities"] = False
-        except Exception as e:
-            print(f"Error loading landmark activities for {landmark_id}: {e}")
-            specific_info["has_specific_activities"] = False
-        return specific_info
-    def _analyze_viewpoint(self, image: Union[Image.Image, np.ndarray]) -> Dict[str, float]:
-        """
-        Analyzes the image viewpoint to adjust detection parameters.
-        Args:
-            image: Input image
-        Returns:
-            Dict: Viewpoint analysis results
-        """
-        viewpoint_prompts = {
-            "aerial_view": "an aerial view from above looking down",
-            "street_level": "a street level view looking up at a tall structure",
-            "eye_level": "an eye-level horizontal view of a landmark",
-            "distant": "a distant view of a landmark on the horizon",
-            "close_up": "a close-up detailed view of architectural features",
-            "interior": "an interior view inside a structure"
-        }
-        # Calculate similarity scores
-        viewpoint_scores = self.calculate_similarity_scores(image, viewpoint_prompts)
-        # Find dominant viewpoint
-        dominant_viewpoint = max(viewpoint_scores.items(), key=lambda x: x[1])
-        return {
-            "viewpoint_scores": viewpoint_scores,
-            "dominant_viewpoint": dominant_viewpoint[0],
-            "confidence": dominant_viewpoint[1]
-        }
-    def calculate_similarity_scores(self, image: Union[Image.Image, np.ndarray],
-                                prompts: Dict[str, str]) -> Dict[str, float]:
-        """
-        計算圖像與一組特定提示之間的相似度分數
-        Args:
-            image: 輸入圖像
-            prompts: 提示詞字典 {名稱: 提示文本}
-        Returns:
-            Dict[str, float]: 每個提示的相似度分數
-        """
-        # 確保圖像是PIL格式
-        if not isinstance(image, Image.Image):
-            if isinstance(image, np.ndarray):
-                image = Image.fromarray(image)
             else:
-                raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
-        # 預處理圖像
-        image_input = self.preprocess(image).unsqueeze(0).to(self.device)
-        # 獲取圖像特徵
-        with torch.no_grad():
-            image_features = self.model.encode_image(image_input)
-            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
-        # 計算與每個提示的相似度
-        scores = {}
-        prompt_texts = list(prompts.values())
-        prompt_tokens = clip.tokenize(prompt_texts).to(self.device)
-        with torch.no_grad():
-            prompt_features = self.model.encode_text(prompt_tokens)
-            prompt_features = prompt_features / prompt_features.norm(dim=-1, keepdim=True)
-            # calculate similarity
-            similarity = (100.0 * image_features @ prompt_features.T).softmax(dim=-1)
-            similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
-        # 填充結果字典
-        for i, (name, _) in enumerate(prompts.items()):
-            scores[name] = float(similarity[i])
-        return scores

 import clip
 from PIL import Image
 import numpy as np
+import logging
+import traceback
 from typing import List, Dict, Tuple, Optional, Union, Any
+from clip_model_manager import CLIPModelManager
+from landmark_data_manager import LandmarkDataManager
+from image_analyzer import ImageAnalyzer
+from confidence_manager import ConfidenceManager
+from result_cache_manager import ResultCacheManager
 class CLIPZeroShotClassifier:
     """
+    使用CLIP模型進行zero shot，專注於辨識世界知名地標。
+    作為YOLO的補充，處理YOLO無法辨識到的地標。
+    這是一個總窗口class，協調各個組件的工作以提供統一的對外接口。
     """
     def __init__(self, model_name: str = "ViT-B/16", device: str = None):
         """
         Initialize the CLIP zero-shot classifier.
+        Args:
             model_name: CLIP model name; defaults to "ViT-B/16".
             device: Device to run on; None selects one automatically.
         """
+        self.logger = logging.getLogger(__name__)
+        # Initialize the individual components this facade coordinates
+        self.clip_model_manager = CLIPModelManager(model_name, device)
+        self.landmark_data_manager = LandmarkDataManager()
+        self.image_analyzer = ImageAnalyzer()
+        self.confidence_manager = ConfidenceManager()
+        self.cache_manager = ResultCacheManager()
+        # Precompute the landmark text features; stays None if precomputation is skipped or fails
+        self.landmark_text_features = None
+        self._precompute_landmark_features()
+        # Emitted after construction has finished, despite the "Initializing" wording
+        self.logger.info(f"Initializing CLIP Zero-Shot Landmark Classifier ({model_name}) on {self.clip_model_manager.get_device()}")
+    def _precompute_landmark_features(self):
         """
+        Encode all landmark prompts once and keep the text features for later similarity lookups.
+        Leaves self.landmark_text_features untouched when landmark data is disabled,
+        when there are no prompts, or when encoding fails; failures are logged, not raised.
         """
+        try:
+            data_manager = self.landmark_data_manager
+            # Guard: landmark support switched off
+            if not data_manager.is_landmark_enabled():
+                self.logger.warning("Landmark data not enabled, skipping feature precomputation")
+                return
+            prompts = data_manager.get_landmark_prompts()
+            # Guard: nothing to encode
+            if not prompts:
+                self.logger.warning("No landmark prompts available for precomputation")
+                return
+            self.landmark_text_features = self.clip_model_manager.encode_text_batch(prompts)
+            self.logger.info(f"Precomputed text features for {len(prompts)} landmark prompts")
+        except Exception as e:
+            self.logger.error(f"Error precomputing landmark features: {e}")
+            self.logger.error(traceback.format_exc())
     def set_batch_size(self, batch_size: int):
         """
+        Set the batch size by forwarding it to the confidence manager.
         Args:
             batch_size: The new batch size.
         """
+        self.confidence_manager.set_batch_size(batch_size)
     def adjust_confidence_threshold(self, detection_type: str, multiplier: float):
         """
         Adjust the confidence-threshold multiplier for one detection type.
+        Args:
             detection_type: Detection type ('close_up', 'partial', 'distant', 'full_image').
             multiplier: Confidence-threshold multiplier.
         """
+        # Pure delegation: the confidence manager owns the multipliers
+        self.confidence_manager.adjust_confidence_threshold(detection_type, multiplier)
+    def classify_image_region(self,
+                            image: Union[Image.Image, np.ndarray],
+                            box: List[float],
+                            threshold: float = 0.25,
+                            detection_type: str = "close_up") -> Dict[str, Any]:
         """
+        Classify one region of an image as a landmark using multi-scale, multi-aspect-ratio matching.
         Args:
+            image: Source image (PIL Image or numpy array).
+            box: Bounding box [x1, y1, x2, y2]; coordinates are truncated to int before cropping.
+            threshold: Base classification confidence threshold.
+            detection_type: Detection type that drives the confidence adjustment; "auto" derives it from the region size.
         Returns:
+            Dict: Landmark classification result, always carrying "is_landmark" and "confidence".
+                Returns {"is_landmark": False, "confidence": 0.0} when landmark data is disabled,
+                when no landmark text features were precomputed, or when an error occurs.
         """
+        try:
+            # Nothing to match against when landmark data is disabled or the text
+            # features could not be precomputed; bail out before any CLIP work,
+            # mirroring the guard used by classify_batch_regions and search_entire_image.
+            if not self.landmark_data_manager.is_landmark_enabled() or self.landmark_text_features is None:
+                return {"is_landmark": False, "confidence": 0.0}
+            # Make sure the image is a PIL image
+            if not isinstance(image, Image.Image):
+                if isinstance(image, np.ndarray):
+                    image = Image.fromarray(image)
                 else:
+                    raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
+            # Build the cache key from the image hash, the box and the requested detection type
+            # NOTE(review): threshold is not part of the key, so a cached result may be
+            # returned for a call that passes a different threshold - confirm this is intended
+            image_hash = self.image_analyzer.get_image_hash(image)
+            region_key = self.cache_manager.get_region_cache_key(image_hash, tuple(box), detection_type)
+            # Check the cache
+            cached_result = self.cache_manager.get_cached_result(region_key)
+            if cached_result is not None:
+                return cached_result
+            # Crop the region
+            x1, y1, x2, y2 = map(int, box)
+            cropped_image = image.crop((x1, y1, x2, y2))
+            # The enhanced crop is only used for viewpoint analysis; matching below uses the plain crop
+            enhanced_image = self.image_analyzer.enhance_features(cropped_image)
+            # Analyze the viewpoint
+            viewpoint_info = self.image_analyzer.analyze_viewpoint(enhanced_image, self.clip_model_manager)
+            dominant_viewpoint = viewpoint_info["dominant_viewpoint"]
+            # Region and image dimensions
+            region_width = x2 - x1
+            region_height = y2 - y1
+            image_width, image_height = image.size
+            # Infer the detection type from the region size when the caller asked for "auto"
+            if detection_type == "auto":
+                detection_type = self.confidence_manager.determine_detection_type_from_region(
+                    region_width, region_height, image_width, image_height
+                )
+            # Refine the detection type using the viewpoint
+            detection_type = self.confidence_manager.adjust_detection_type_by_viewpoint(detection_type, dominant_viewpoint)
+            # Adjust the confidence threshold for that detection type
+            adjusted_threshold = self.confidence_manager.calculate_adjusted_threshold(threshold, detection_type)
+            # Choose the scales and aspect ratios to try
+            scales = [1.0]
+            if detection_type in ["partial", "distant"]:
+                scales = [0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3]
+            if dominant_viewpoint in ["angled_view", "low_angle"]:
+                scales = [0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4]
+            aspect_ratios = [1.0, 0.8, 1.2]
+            if dominant_viewpoint in ["angled_view", "unique_feature"]:
+                aspect_ratios = [0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5]
+            best_result = {
+                "landmark_id": None,
+                "landmark_name": None,
+                "confidence": 0.0,
+                "is_landmark": False
+            }
+            # Multi-scale / multi-aspect-ratio analysis
+            for scale in scales:
+                for aspect_ratio in aspect_ratios:
+                    try:
+                        # Rescale the cropped region
+                        current_width, current_height = cropped_image.size
+                        if aspect_ratio != 1.0:
+                            new_width = int(current_width * scale * (1/aspect_ratio)**0.5)
+                            new_height = int(current_height * scale * aspect_ratio**0.5)
+                        else:
+                            new_width = int(current_width * scale)
+                            new_height = int(current_height * scale)
+                        # Clamp to at least one pixel so resize never receives a zero dimension
+                        new_width = max(1, new_width)
+                        new_height = max(1, new_height)
+                        scaled_image = cropped_image.resize((new_width, new_height), Image.LANCZOS)
+                        # Preprocess and extract image features
+                        image_input = self.clip_model_manager.preprocess_image(scaled_image)
+                        image_features = self.clip_model_manager.encode_image(image_input)
+                        # Compute similarity against the precomputed landmark text features
+                        similarity = self.clip_model_manager.calculate_similarity(image_features, self.landmark_text_features)
                         # Find the best match
+                        best_idx = similarity[0].argmax().item()
+                        best_score = similarity[0][best_idx]
                         # Keep this scale's result only if it beats the best so far
                         if best_score > best_result["confidence"]:
+                            landmark_id, landmark_info = self.landmark_data_manager.get_landmark_by_index(best_idx)
+                            if landmark_id:
+                                # Take the location from LandmarkDataManager first
+                                loc = landmark_info.get("location", "")
+                                # Fall back to the global ALL_LANDMARKS when it is empty
+                                # NOTE(review): ALL_LANDMARKS is not among the imports visible here - confirm it is in scope
+                                if not loc and landmark_id in ALL_LANDMARKS:
+                                    loc = ALL_LANDMARKS[landmark_id].get("location", "")
+                                best_result = {
+                                    "landmark_id": landmark_id,
+                                    "landmark_name": landmark_info.get("name", "Unknown"),
+                                    "location": loc or "Unknown Location",
+                                    "confidence": float(best_score),
+                                    # Provisional verdict; re-evaluated against the final threshold below
+                                    "is_landmark": best_score >= adjusted_threshold,
+                                    "scale_used": scale,
+                                    "aspect_ratio_used": aspect_ratio,
+                                    "viewpoint": dominant_viewpoint
+                                }
+                                # Copy any additional metadata that is available
+                                for key in ["year_built", "architectural_style", "significance"]:
+                                    if key in landmark_info:
+                                        best_result[key] = landmark_info[key]
                     except Exception as e:
+                        self.logger.error(f"Error in scale analysis: {e}")
                         continue
+            # Apply the landmark-type-specific threshold to the best candidate
+            if best_result["landmark_id"]:
+                landmark_type = self.landmark_data_manager.determine_landmark_type(best_result["landmark_id"])
+                final_threshold = self.confidence_manager.calculate_final_threshold(adjusted_threshold, detection_type, landmark_type)
+                best_result["is_landmark"] = self.confidence_manager.evaluate_confidence(best_result["confidence"], final_threshold)
+                best_result["landmark_type"] = landmark_type
+                best_result["threshold_applied"] = final_threshold
+            # Cache the result
+            self.cache_manager.set_cached_result(region_key, best_result)
+            return best_result
+        except Exception as e:
+            self.logger.error(f"Error in classify_image_region: {e}")
+            self.logger.error(traceback.format_exc())
+            return {"is_landmark": False, "confidence": 0.0}
     def classify_batch_regions(self,
                               image: Union[Image.Image, np.ndarray],
         Returns:
             List[Dict]: List of classification results, one per box.
         """
+        try:
+            # Without landmark data or precomputed text features, report every box as a non-landmark
+            if not self.landmark_data_manager.is_landmark_enabled() or self.landmark_text_features is None:
+                return [{"is_landmark": False, "confidence": 0.0} for _ in boxes]
+            # Make sure the image is a PIL image
+            if not isinstance(image, Image.Image):
+                if isinstance(image, np.ndarray):
+                    image = Image.fromarray(image)
+                else:
+                    raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
+            if not boxes:
+                return []
+            # Encode all regions in one batch
+            batch_features = self.clip_model_manager.batch_process_regions(image, boxes)
             # Compute similarity
+            similarity = self.clip_model_manager.calculate_similarity(batch_features, self.landmark_text_features)
+            # Build one result entry per region
+            results = []
+            for i, sim in enumerate(similarity):
+                best_idx = sim.argmax().item()
+                best_score = sim[best_idx]
+                # The raw threshold is compared here; this method applies no confidence_manager adjustment
+                if best_score >= threshold:
+                    landmark_id, landmark_info = self.landmark_data_manager.get_landmark_by_index(best_idx)
+                    if landmark_id:
+                        # If landmark_info["location"] is empty, fall back to ALL_LANDMARKS
+                        # NOTE(review): ALL_LANDMARKS is not among the imports visible here - confirm it is in scope
+                        loc = landmark_info.get("location", "")
+                        if not loc and landmark_id in ALL_LANDMARKS:
+                            loc = ALL_LANDMARKS[landmark_id].get("location", "")
+                        results.append({
+                            "landmark_id": landmark_id,
+                            "landmark_name": landmark_info.get("name", "Unknown"),
+                            "location": loc or "Unknown Location",
+                            "confidence": float(best_score),
+                            "is_landmark": True,
+                            "box": boxes[i]
+                        })
+                    else:
+                        # Score passed the threshold but the index did not resolve to a landmark id
+                        results.append({
+                            "landmark_id": None,
+                            "landmark_name": None,
+                            "confidence": float(best_score),
+                            "is_landmark": False,
+                            "box": boxes[i]
+                        })
+                else:
+                    # Best score is below the threshold
+                    results.append({
+                        "landmark_id": None,
+                        "landmark_name": None,
+                        "confidence": float(best_score),
+                        "is_landmark": False,
+                        "box": boxes[i]
+                    })
+            return results
+        except Exception as e:
+            self.logger.error(f"Error in classify_batch_regions: {e}")
+            self.logger.error(traceback.format_exc())
+            # NOTE(review): iterating boxes here raises again if boxes is None - verify callers never pass None
+            return [{"is_landmark": False, "confidence": 0.0} for _ in boxes]
     def search_entire_image(self,
+                           image: Union[Image.Image, np.ndarray],
+                           threshold: float = 0.35,
+                           detailed_analysis: bool = False) -> Dict[str, Any]:
         """
         Check whether the whole image contains a landmark, optionally re-checking five sub-regions.
+        Args:
+            image: Input image (PIL Image or numpy array).
+            threshold: Base confidence threshold; adjusted for the "full_image" detection type.
+            detailed_analysis: When True and the full image matched, also classify the
+                center, left, right, top and bottom regions.
         Returns:
             Dict: Landmark classification result.
         """
+        try:
+            # Without landmark data or precomputed text features there is nothing to match against
+            if not self.landmark_data_manager.is_landmark_enabled() or self.landmark_text_features is None:
+                return {"is_landmark": False, "confidence": 0.0}
+            # Make sure the image is a PIL image
+            if not isinstance(image, Image.Image):
+                if isinstance(image, np.ndarray):
+                    image = Image.fromarray(image)
+                else:
+                    raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
+            # Check the cache
+            image_hash = self.image_analyzer.get_image_hash(image)
+            image_key = self.cache_manager.get_image_cache_key(image_hash, "entire_image", detailed_analysis)
+            cached_result = self.cache_manager.get_cached_result(image_key)
+            if cached_result is not None:
+                return cached_result
+            # Adjust the threshold
+            adjusted_threshold = self.confidence_manager.calculate_adjusted_threshold(threshold, "full_image")
+            # Preprocess and extract image features
+            image_input = self.clip_model_manager.preprocess_image(image)
+            image_features = self.clip_model_manager.encode_image(image_input)
+            # Compute similarity
+            similarity = self.clip_model_manager.calculate_similarity(image_features, self.landmark_text_features)
+            # Find the best match
+            best_idx = similarity[0].argmax().item()
+            best_score = similarity[0][best_idx]
+            # Collect the top-3 landmarks, highest score first
+            # NOTE(review): the [::-1] reversal presumably relies on similarity being a numpy array - confirm
+            top_indices = similarity[0].argsort()[-3:][::-1]
+            top_landmarks = []
+            for idx in top_indices:
+                score = similarity[0][idx]
+                landmark_id, landmark_info = self.landmark_data_manager.get_landmark_by_index(idx)
+                if landmark_id:
+                    # Fill in the location, falling back to ALL_LANDMARKS
+                    # NOTE(review): ALL_LANDMARKS is not among the imports visible here - confirm it is in scope
+                    loc_top = landmark_info.get("location", "")
+                    if not loc_top and landmark_id in ALL_LANDMARKS:
+                        loc_top = ALL_LANDMARKS[landmark_id].get("location", "")
+                    landmark_result = {
+                        "landmark_id": landmark_id,
+                        "landmark_name": landmark_info.get("name", "Unknown"),
+                        "location": loc_top or "Unknown Location",
+                        "confidence": float(score)
+                    }
+                    # Copy any additional metadata that is available
+                    for key in ["year_built", "architectural_style", "significance"]:
+                        if key in landmark_info:
+                            landmark_result[key] = landmark_info[key]
+                    top_landmarks.append(landmark_result)
+            # Main result
+            # NOTE(review): result stays {} when the score passes the threshold but the
+            # best index resolves to no landmark id, so "top_landmarks" is then missing
+            result = {}
+            if best_score >= adjusted_threshold:
+                landmark_id, landmark_info = self.landmark_data_manager.get_landmark_by_index(best_idx)
+                if landmark_id:
+                    # Apply the landmark-type-specific threshold
+                    landmark_type = self.landmark_data_manager.determine_landmark_type(landmark_id)
+                    final_threshold = self.confidence_manager.calculate_final_threshold(adjusted_threshold, "full_image", landmark_type)
+                    if self.confidence_manager.evaluate_confidence(best_score, final_threshold):
+                        # Fill in the location, falling back to ALL_LANDMARKS
+                        loc_main = landmark_info.get("location", "")
+                        if not loc_main and landmark_id in ALL_LANDMARKS:
+                            loc_main = ALL_LANDMARKS[landmark_id].get("location", "")
+                        result = {
+                            "landmark_id": landmark_id,
+                            "landmark_name": landmark_info.get("name", "Unknown"),
+                            "location": loc_main or "Unknown Location",
+                            "confidence": float(best_score),
+                            "is_landmark": True,
+                            "landmark_type": landmark_type,
+                            "top_landmarks": top_landmarks
+                        }
+                        # Copy any additional metadata that is available
+                        for key in ["year_built", "architectural_style", "significance"]:
+                            if key in landmark_info:
+                                result[key] = landmark_info[key]
+                    else:
+                        # Passed the adjusted threshold but not the landmark-type-specific one
+                        result = {
+                            "landmark_id": None,
+                            "landmark_name": None,
+                            "confidence": float(best_score),
+                            "is_landmark": False,
+                            "top_landmarks": top_landmarks
+                        }
             else:
                 result = {
                     "landmark_id": None,
                     "top_landmarks": top_landmarks
                 }
+            # Detailed analysis: only when requested and the full image matched
+            if detailed_analysis and result.get("is_landmark", False):
+                width, height = image.size
+                # Boxes in the same order as the region names used below
+                regions = [
+                    [width * 0.25, height * 0.25, width * 0.75, height * 0.75],
+                    [0, 0, width * 0.5, height],
+                    [width * 0.5, 0, width, height],
+                    [0, 0, width, height * 0.5],
+                    [0, height * 0.5, width, height]
                 ]
+                region_results = []
+                for i, box in enumerate(regions):
+                    region_result = self.classify_image_region(
+                        image,
+                        box,
+                        threshold=threshold * 0.9,
+                        detection_type="partial"
+                    )
+                    if region_result["is_landmark"]:
+                        region_result["region_name"] = ["center", "left", "right", "top", "bottom"][i]
+                        region_results.append(region_result)
+                if region_results:
+                    result["region_analyses"] = region_results
+            # Cache the result
+            self.cache_manager.set_cached_result(image_key, result)
+            return result
+        except Exception as e:
+            self.logger.error(f"Error in search_entire_image: {e}")
+            self.logger.error(traceback.format_exc())
+            return {"is_landmark": False, "confidence": 0.0}
     def intelligent_landmark_search(self,
+                                  image: Union[Image.Image, np.ndarray],
+                                  yolo_boxes: Optional[List[List[float]]] = None,
+                                  base_threshold: float = 0.25) -> Dict[str, Any]:
         """
+        對圖像進行地標搜索，綜合整張圖像分析和區域分析
         Args:
             image: 原始圖像
         Returns:
             Dict: 包含所有檢測結果的綜合分析
         """
+        try:
+            if not self.landmark_data_manager.is_landmark_enabled():
+                return {
+                    "full_image_analysis": {},
+                    "is_landmark_scene": False,
+                    "detected_landmarks": []
+                }
+            # 確保圖像是PIL格式
+            if not isinstance(image, Image.Image):
+                if isinstance(image, np.ndarray):
+                    image = Image.fromarray(image)
+                else:
+                    raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
+            # 調整閾值
+            actual_threshold = base_threshold * 0.85 if yolo_boxes is None or len(yolo_boxes) == 0 else base_threshold
+            # 首先對整張圖像進行分析
             full_image_result = self.search_entire_image(
                 image,
                 threshold=actual_threshold,
+                detailed_analysis=True
             )
+            # 如果沒有YOLO框且全圖分析未發現地標，進行金字塔分析
             if (yolo_boxes is None or len(yolo_boxes) == 0) and (not full_image_result or not full_image_result.get("is_landmark", False)):
+                self.logger.info("No YOLO boxes provided, attempting multi-scale pyramid analysis")
+                pyramid_results = self.image_analyzer.perform_pyramid_analysis(
+                    image,
+                    self.clip_model_manager,
+                    self.landmark_data_manager,
+                    levels=4,
+                    base_threshold=actual_threshold,
+                    aspect_ratios=[1.0, 0.75, 1.5, 0.5, 2.0]
+                )
+                if pyramid_results and pyramid_results.get("is_landmark", False) and pyramid_results.get("best_result", {}).get("confidence", 0) > actual_threshold:
+                    if not full_image_result or not full_image_result.get("is_landmark", False):
+                        full_image_result = {
+                            "is_landmark": True,
+                            "landmark_id": pyramid_results["best_result"]["landmark_id"],
+                            "landmark_name": pyramid_results["best_result"]["landmark_name"],
+                            "confidence": pyramid_results["best_result"]["confidence"],
+                            "location": pyramid_results["best_result"].get("location", "Unknown Location")
+                        }
+                        self.logger.info(f"Pyramid analysis detected landmark: {pyramid_results['best_result']['landmark_name']} with confidence {pyramid_results['best_result']['confidence']:.3f}")
+            # 初始化結果dict
+            result = {
+                "full_image_analysis": full_image_result if full_image_result else {},
+                "is_landmark_scene": False,
+                "detected_landmarks": []
+            }
+            # 處理上下文感知比較
+            if full_image_result and "top_landmarks" in full_image_result and len(full_image_result["top_landmarks"]) >= 2:
+                top_landmarks = full_image_result["top_landmarks"]
+                if len(top_landmarks) >= 2 and abs(top_landmarks[0]["confidence"] - top_landmarks[1]["confidence"]) < 0.1:
+                    architectural_analysis = self.image_analyzer.analyze_architectural_features(image, self.clip_model_manager)
+                    for i, landmark in enumerate(top_landmarks[:2]):
+                        if i >= len(top_landmarks):
+                            continue
+                        adjusted_confidence = self.confidence_manager.apply_architectural_boost(
+                            landmark["confidence"],
+                            architectural_analysis,
+                            landmark.get("landmark_id", "")
                         )
+                        if adjusted_confidence != landmark["confidence"]:
+                            top_landmarks[i]["confidence"] = adjusted_confidence
+                    # 重新排序
+                    top_landmarks.sort(key=lambda x: x["confidence"], reverse=True)
+                    full_image_result["top_landmarks"] = top_landmarks
+                    if top_landmarks:
+                        full_image_result["landmark_id"] = top_landmarks[0]["landmark_id"]
+                        full_image_result["landmark_name"] = top_landmarks[0]["landmark_name"]
+                        full_image_result["confidence"] = top_landmarks[0]["confidence"]
+                        full_image_result["location"] = top_landmarks[0].get("location", "Unknown Location")
+            # 處理全圖結果
+            if full_image_result and full_image_result.get("is_landmark", False):
+                result["is_landmark_scene"] = True
+                landmark_id = full_image_result.get("landmark_id", "unknown")
+                landmark_specific_info = self.landmark_data_manager.extract_landmark_specific_info(landmark_id)
+                landmark_info = {
+                    "landmark_id": landmark_id,
+                    "landmark_name": full_image_result.get("landmark_name", "Unknown Landmark"),
+                    "confidence": full_image_result.get("confidence", 0.0),
+                    "location": full_image_result.get("location", "Unknown Location"),
+                    "region_type": "full_image",
+                    "box": [0, 0, getattr(image, 'width', 0), getattr(image, 'height', 0)]
+                }
+                landmark_info.update(landmark_specific_info)
+                if landmark_specific_info.get("landmark_name"):
+                    landmark_info["landmark_name"] = landmark_specific_info["landmark_name"]
+                result["detected_landmarks"].append(landmark_info)
+                if landmark_specific_info.get("has_specific_activities", False):
+                    result["primary_landmark_activities"] = landmark_specific_info.get("landmark_specific_activities", [])
+                    self.logger.info(f"Set primary landmark activities: {len(result['primary_landmark_activities'])} activities for {landmark_info['landmark_name']}")
+            # 處理YOLO邊界框
+            if yolo_boxes and len(yolo_boxes) > 0:
+                for box in yolo_boxes:
+                    try:
                         box_result = self.classify_image_region(
                             image,
                             box,
                             detection_type="auto"
                         )
                         if box_result and box_result.get("is_landmark", False):
                             is_duplicate = False
                             for existing in result["detected_landmarks"]:
                                 if existing.get("landmark_id") == box_result.get("landmark_id"):
                                     if box_result.get("confidence", 0) > existing.get("confidence", 0):
                                         existing.update({
                                             "confidence": box_result.get("confidence", 0),
                                     is_duplicate = True
                                     break
                             if not is_duplicate:
                                 result["detected_landmarks"].append({
                                     "landmark_id": box_result.get("landmark_id", "unknown"),
                                     "region_type": "yolo_box",
                                     "box": box
                                 })
+                    except Exception as e:
+                        self.logger.error(f"Error in analyzing YOLO box: {e}")
+                        continue
+            # 網格搜索（如果需要）
+            should_do_grid_search = (
+                len(result["detected_landmarks"]) == 0 or
+                max([landmark.get("confidence", 0) for landmark in result["detected_landmarks"]], default=0) < 0.5
+            )
+            if should_do_grid_search:
+                try:
+                    width, height = getattr(image, 'size', (getattr(image, 'width', 0), getattr(image, 'height', 0)))
+                    if not isinstance(width, (int, float)) or width <= 0:
+                        width = getattr(image, 'width', 0)
+                    if not isinstance(height, (int, float)) or height <= 0:
+                        height = getattr(image, 'height', 0)
+                    if width > 0 and height > 0:
+                        grid_boxes = []
+                        for i in range(5):
+                            for j in range(5):
+                                grid_boxes.append([
+                                    width * (j/5), height * (i/5),
+                                    width * ((j+1)/5), height * ((i+1)/5)
+                                ])
+                        for box in grid_boxes:
+                            try:
+                                grid_result = self.classify_image_region(
+                                    image,
+                                    box,
+                                    threshold=base_threshold * 0.9,
+                                    detection_type="partial"
+                                )
+                                if grid_result and grid_result.get("is_landmark", False):
+                                    is_duplicate = False
+                                    for existing in result["detected_landmarks"]:
+                                        if existing.get("landmark_id") == grid_result.get("landmark_id"):
+                                            is_duplicate = True
+                                            break
+                                    if not is_duplicate:
+                                        result["detected_landmarks"].append({
+                                            "landmark_id": grid_result.get("landmark_id", "unknown"),
+                                            "landmark_name": grid_result.get("landmark_name", "Unknown Landmark"),
+                                            "confidence": grid_result.get("confidence", 0.0),
+                                            "location": grid_result.get("location", "Unknown Location"),
+                                            "region_type": "grid",
+                                            "box": box
+                                        })
+                            except Exception as e:
+                                self.logger.error(f"Error in analyzing grid region: {e}")
+                                continue
                 except Exception as e:
+                    self.logger.error(f"Error in grid search: {e}")
+                    self.logger.error(traceback.format_exc())
+            # 按置信度排序檢測結果
+            result["detected_landmarks"].sort(key=lambda x: x.get("confidence", 0), reverse=True)
+            # 更新整體場景類型判斷
+            if len(result["detected_landmarks"]) > 0:
+                result["is_landmark_scene"] = True
+                result["primary_landmark"] = result["detected_landmarks"][0]
+                if full_image_result and "clip_analysis" in full_image_result:
+                    result["clip_analysis_on_full_image"] = full_image_result["clip_analysis"]
+            return result
+        except Exception as e:
+            self.logger.error(f"Error in intelligent_landmark_search: {e}")
+            self.logger.error(traceback.format_exc())
+            return {
+                "full_image_analysis": {},
+                "is_landmark_scene": False,
+                "detected_landmarks": []
+            }
+    def enhanced_landmark_detection(self,
+                                  image: Union[Image.Image, np.ndarray],
+                                  threshold: float = 0.3) -> Dict[str, Any]:
         """
+        使用多種分析技術進行增強地標檢測
         Args:
+            image: 輸入圖像
+            threshold: 基礎置信度閾值
         Returns:
+            Dict: 綜合地標檢測結果
         """
         try:
+            if not self.landmark_data_manager.is_landmark_enabled():
+                return {"is_landmark_scene": False, "detected_landmarks": []}
+            # 確保圖像是PIL格式
+            if not isinstance(image, Image.Image):
+                if isinstance(image, np.ndarray):
+                    image = Image.fromarray(image)
+                else:
+                    raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
+            # 1: 分析視角以調整檢測參數
+            viewpoint_info = self.image_analyzer.analyze_viewpoint(image, self.clip_model_manager)
+            viewpoint = viewpoint_info["dominant_viewpoint"]
+            # 根據視角調整閾值
+            if viewpoint == "distant":
+                adjusted_threshold = threshold * 0.7
+            elif viewpoint == "close_up":
+                adjusted_threshold = threshold * 1.1
+            else:
+                adjusted_threshold = threshold
+            # 2: 執行多尺度金字塔分析
+            pyramid_results = self.image_analyzer.perform_pyramid_analysis(
+                image,
+                self.clip_model_manager,
+                self.landmark_data_manager,
+                levels=3,
+                base_threshold=adjusted_threshold
+            )
+            # 3: 執行基於網格的區域分析
+            grid_results = []
+            width, height = image.size
+            # 根據視角創建自適應網格
+            if viewpoint == "distant":
+                grid_size = 3
+            elif viewpoint == "close_up":
+                grid_size = 5
             else:
+                grid_size = 4
+            # 生成網格區域
+            for i in range(grid_size):
+                for j in range(grid_size):
+                    box = [
+                        width * (j/grid_size),
+                        height * (i/grid_size),
+                        width * ((j+1)/grid_size),
+                        height * ((i+1)/grid_size)
+                    ]
+                    region_result = self.classify_image_region(
+                        image,
+                        box,
+                        threshold=adjusted_threshold,
+                        detection_type="auto"
+                    )
+                    if region_result["is_landmark"]:
+                        region_result["grid_position"] = (i, j)
+                        grid_results.append(region_result)
+            # 4: 交叉驗證並合併結果
+            all_detections = []
+            # 添加金字塔結果
+            if pyramid_results["is_landmark"] and pyramid_results["best_result"]:
+                all_detections.append({
+                    "source": "pyramid",
+                    "landmark_id": pyramid_results["best_result"]["landmark_id"],
+                    "landmark_name": pyramid_results["best_result"]["landmark_name"],
+                    "confidence": pyramid_results["best_result"]["confidence"],
+                    "scale_factor": pyramid_results["best_result"].get("scale_factor", 1.0)
+                })
+            # 添加網格結果
+            for result in grid_results:
+                all_detections.append({
+                    "source": "grid",
+                    "landmark_id": result["landmark_id"],
+                    "landmark_name": result["landmark_name"],
+                    "confidence": result["confidence"],
+                    "grid_position": result.get("grid_position", (0, 0))
+                })
+            # 搜索整張圖像
+            full_image_result = self.search_entire_image(image, threshold=adjusted_threshold)
+            if full_image_result and full_image_result.get("is_landmark", False):
+                all_detections.append({
+                    "source": "full_image",
+                    "landmark_id": full_image_result["landmark_id"],
+                    "landmark_name": full_image_result["landmark_name"],
+                    "confidence": full_image_result["confidence"]
+                })
+            # 按地標ID分組並計算總體置信度
+            landmark_groups = {}
+            for detection in all_detections:
+                landmark_id = detection["landmark_id"]
+                if landmark_id not in landmark_groups:
+                    landmark_groups[landmark_id] = {
+                        "landmark_id": landmark_id,
+                        "landmark_name": detection["landmark_name"],
+                        "detections": [],
+                        "sources": set()
+                    }
+                landmark_groups[landmark_id]["detections"].append(detection)
+                landmark_groups[landmark_id]["sources"].add(detection["source"])
+            # 計算每���地標的總體置信度
+            for landmark_id, group in landmark_groups.items():
+                detections = group["detections"]
+                # 基礎置信度是任何來源的最大置信度
+                max_confidence = max(d["confidence"] for d in detections)
+                # 多來源檢測獎勵
+                source_count = len(group["sources"])
+                source_bonus = min(0.15, (source_count - 1) * 0.05)
+                # 一致性獎勵
+                detection_count = len(detections)
+                consistency_bonus = min(0.1, (detection_count - 1) * 0.02)
+                # 計算最終置信度
+                aggregate_confidence = min(1.0, max_confidence + source_bonus + consistency_bonus)
+                group["confidence"] = aggregate_confidence
+                group["detection_count"] = detection_count
+                group["source_count"] = source_count
+            # 照信心度排序地標
+            sorted_landmarks = sorted(
+                landmark_groups.values(),
+                key=lambda x: x["confidence"],
+                reverse=True
+            )
+            return {
+                "is_landmark_scene": len(sorted_landmarks) > 0,
+                "detected_landmarks": sorted_landmarks,
+                "viewpoint_info": viewpoint_info,
+                "primary_landmark": sorted_landmarks[0] if sorted_landmarks else None
+            }
+        except Exception as e:
+            self.logger.error(f"Error in enhanced_landmark_detection: {e}")
+            self.logger.error(traceback.format_exc())
+            return {"is_landmark_scene": False, "detected_landmarks": []}

component_initializer.py ADDED Viewed

	@@ -0,0 +1,319 @@

+import os
+import traceback
+import logging
+from typing import Dict, Optional, Any, Tuple
+from spatial_analyzer import SpatialAnalyzer
+from scene_description import SceneDescriptor
+from enhance_scene_describer import EnhancedSceneDescriber
+from clip_analyzer import CLIPAnalyzer
+from clip_zero_shot_classifier import CLIPZeroShotClassifier
+from llm_enhancer import LLMEnhancer
+from landmark_activities import LANDMARK_ACTIVITIES
+from scene_type import SCENE_TYPES
+from object_categories import OBJECT_CATEGORIES
class ComponentInitializer:
    """
    Create and hold every sub-component used by SceneAnalyzer.

    Each component is built inside its own error boundary. When a component
    cannot be created, its entry in ``components`` is set to ``None`` and its
    entry in ``initialization_status`` is set to ``False``, so the rest of the
    pipeline can degrade gracefully instead of crashing.
    """

    def __init__(self, class_names: Optional[Dict[int, str]] = None, use_llm: bool = True,
                 use_clip: bool = True, enable_landmark: bool = True,
                 llm_model_path: Optional[str] = None):
        """
        Store the configuration and immediately initialize all components.

        Args:
            class_names: Mapping from YOLO class id to class name.
            use_llm: Whether to initialize the LLM enhancer.
            use_clip: Whether to initialize the CLIP-based components.
            enable_landmark: Whether to initialize landmark detection
                (only takes effect when CLIP is enabled).
            llm_model_path: Optional path of the LLM model.
        """
        self.logger = logging.getLogger(__name__)

        # Keep the initialization parameters.
        self.class_names = class_names
        self.use_llm = use_llm
        self.use_clip = use_clip
        self.enable_landmark = enable_landmark
        self.llm_model_path = llm_model_path

        # Containers filled by the initialization steps.
        self.components = {}
        self.data_structures = {}
        self.initialization_status = {}

        self._initialize_all_components()

    def _initialize_all_components(self):
        """
        Run every initialization step in dependency order.

        Individual steps swallow their own failures, so the final log line
        reports which entries failed rather than claiming blanket success.

        Raises:
            Exception: Any unexpected error escaping a step is logged and
                re-raised.
        """
        try:
            # 1. Data must be loaded first; the analyzers consume it.
            self._load_data_structures()

            # 2. Core analyzers.
            self._initialize_core_analyzers()

            # 3. CLIP related components.
            if self.use_clip:
                self._initialize_clip_components()

            # 4. LLM component.
            if self.use_llm:
                self._initialize_llm_components()

            failed = [name for name, ok in self.initialization_status.items() if not ok]
            if failed:
                self.logger.warning(
                    f"Component initialization finished with failures: {', '.join(failed)}"
                )
            else:
                self.logger.info("All components initialized successfully")
        except Exception as e:
            self.logger.error(f"Error during component initialization: {e}")
            # Route the traceback through the logger instead of raw stderr.
            self.logger.error(traceback.format_exc())
            raise

    def _load_data_structures(self):
        """
        Load the static data tables into ``data_structures``.

        A table that fails to load is replaced by an empty dict and marked
        as unavailable in ``initialization_status``.
        """
        data_loaders = {
            'LANDMARK_ACTIVITIES': self._load_landmark_activities,
            'SCENE_TYPES': self._load_scene_types,
            'OBJECT_CATEGORIES': self._load_object_categories
        }

        for data_name, loader_func in data_loaders.items():
            try:
                self.data_structures[data_name] = loader_func()
                self.initialization_status[data_name] = True
                self.logger.info(f"Loaded {data_name} successfully")
            except Exception as e:
                self.logger.warning(f"Failed to load {data_name}: {e}")
                self.data_structures[data_name] = {}
                self.initialization_status[data_name] = False

    def _load_landmark_activities(self) -> Dict:
        """Return the landmark activities table imported at module level."""
        # Failures are handled by the caller, _load_data_structures.
        return LANDMARK_ACTIVITIES

    def _load_scene_types(self) -> Dict:
        """Return the scene types table imported at module level."""
        return SCENE_TYPES

    def _load_object_categories(self) -> Dict:
        """Return the object categories table imported at module level."""
        return OBJECT_CATEGORIES

    def _build_component(self, key: str, label: str, factory) -> bool:
        """
        Build one component and record the outcome.

        Args:
            key: Name under which the component is stored.
            label: Human readable class name used in log messages.
            factory: Zero-argument callable returning the component.

        Returns:
            True when the component was created, False otherwise.
        """
        try:
            self.components[key] = factory()
            self.initialization_status[key] = True
            self.logger.info(f"Initialized {label} successfully")
            return True
        except Exception as e:
            self.logger.error(f"Error initializing {label}: {e}")
            self.logger.error(traceback.format_exc())
            self.initialization_status[key] = False
            self.components[key] = None
            return False

    def _initialize_core_analyzers(self):
        """Initialize SpatialAnalyzer, SceneDescriptor and EnhancedSceneDescriber."""
        scene_types = self.data_structures.get('SCENE_TYPES', {})
        object_categories = self.data_structures.get('OBJECT_CATEGORIES', {})

        self._build_component(
            'spatial_analyzer', 'SpatialAnalyzer',
            lambda: SpatialAnalyzer(
                class_names=self.class_names,
                object_categories=object_categories
            )
        )

        self._build_component(
            'descriptor', 'SceneDescriptor',
            lambda: SceneDescriptor(
                scene_types=scene_types,
                object_categories=object_categories
            )
        )

        # EnhancedSceneDescriber depends on a working SpatialAnalyzer.
        spatial_analyzer = self.components.get('spatial_analyzer')
        if spatial_analyzer:
            self._build_component(
                'scene_describer', 'EnhancedSceneDescriber',
                lambda: EnhancedSceneDescriber(
                    scene_types=scene_types,
                    spatial_analyzer_instance=spatial_analyzer
                )
            )
        else:
            self.logger.warning("Cannot initialize EnhancedSceneDescriber without SpatialAnalyzer")
            self.initialization_status['scene_describer'] = False
            self.components['scene_describer'] = None

    def _initialize_clip_components(self):
        """
        Initialize CLIPAnalyzer and, when enabled, the landmark classifier.

        If CLIPAnalyzer cannot be created, ``use_clip`` is switched off.
        """
        try:
            self.components['clip_analyzer'] = CLIPAnalyzer()
            self.initialization_status['clip_analyzer'] = True
            self.logger.info("Initialized CLIPAnalyzer successfully")
        except Exception as e:
            self.logger.warning(f"Could not initialize CLIP analyzer: {e}")
            self.logger.info("Scene analysis will proceed without CLIP. Install CLIP with 'pip install clip' for enhanced scene understanding.")
            self.use_clip = False
            self.initialization_status['clip_analyzer'] = False
            self.components['clip_analyzer'] = None
        else:
            # The landmark classifier is only attempted on top of a working
            # CLIP analyzer; it handles its own failures.
            if self.enable_landmark:
                self._initialize_landmark_classifier()

    def _initialize_landmark_classifier(self):
        """
        Initialize the CLIP zero-shot landmark classifier.

        When the CLIP analyzer exposes ``get_clip_instance``, the device it
        reports is passed to the classifier; otherwise the classifier is
        created with its own defaults.
        """
        try:
            clip_analyzer = self.components.get('clip_analyzer')
            if clip_analyzer and hasattr(clip_analyzer, 'get_clip_instance'):
                # Only the device is needed from the (model, preprocess, device) triple.
                _, _, device = clip_analyzer.get_clip_instance()
                self.components['landmark_classifier'] = CLIPZeroShotClassifier(device=device)
                self.logger.info("Initialized landmark classifier with shared CLIP model")
            else:
                self.components['landmark_classifier'] = CLIPZeroShotClassifier()
                self.logger.info("Initialized landmark classifier with independent CLIP model")

            self._configure_landmark_classifier()
            self.initialization_status['landmark_classifier'] = True
        except Exception as e:
            self.logger.warning(f"Could not initialize landmark classifier: {e}")
            self.initialization_status['landmark_classifier'] = False
            self.components['landmark_classifier'] = None
            # enable_landmark is left untouched on purpose so that
            # initialization can be retried at runtime.

    def _configure_landmark_classifier(self):
        """Apply batch size and threshold multipliers to the landmark classifier."""
        classifier = self.components.get('landmark_classifier')
        if not classifier:
            return
        try:
            classifier.set_batch_size(8)
            classifier.adjust_confidence_threshold("full_image", 0.8)
            classifier.adjust_confidence_threshold("distant", 0.65)
            self.logger.info("Landmark detection enabled with optimized settings")
        except Exception as e:
            self.logger.warning(f"Error configuring landmark classifier: {e}")

    def _initialize_llm_components(self):
        """
        Initialize the LLM enhancer.

        If it cannot be created, ``use_llm`` is switched off.
        """
        try:
            self.components['llm_enhancer'] = LLMEnhancer(model_path=self.llm_model_path)
            self.initialization_status['llm_enhancer'] = True
            self.logger.info("LLM enhancer initialized successfully")
        except Exception as e:
            self.logger.warning(f"Could not initialize LLM enhancer: {e}")
            self.logger.info("Scene analysis will proceed without LLM. Make sure required packages are installed.")
            self.use_llm = False
            self.initialization_status['llm_enhancer'] = False
            self.components['llm_enhancer'] = None

    def get_component(self, component_name: str) -> Optional[Any]:
        """
        Return a component instance by name.

        Args:
            component_name: Name of the component.

        Returns:
            The component, or None when it is unknown or failed to initialize.
        """
        return self.components.get(component_name)

    def get_data_structure(self, data_name: str) -> Dict:
        """
        Return a loaded data table by name.

        Args:
            data_name: Name of the data table.

        Returns:
            The table, or an empty dict when it is unknown.
        """
        return self.data_structures.get(data_name, {})

    def is_component_available(self, component_name: str) -> bool:
        """
        Tell whether a component (or data table) initialized successfully.

        Args:
            component_name: Name of the component.

        Returns:
            True only when the entry was recorded as successfully initialized.
        """
        return self.initialization_status.get(component_name, False)

    def get_initialization_summary(self) -> Dict[str, bool]:
        """
        Return a copy of the initialization status of every entry.

        Returns:
            Mapping from entry name to its initialization status.
        """
        return self.initialization_status.copy()

    def reinitialize_component(self, component_name: str) -> bool:
        """
        Try to initialize a component again.

        Only ``'landmark_classifier'`` is supported, and only while both CLIP
        and landmark detection are enabled.

        Args:
            component_name: Name of the component to reinitialize.

        Returns:
            True when the component is available afterwards.
        """
        try:
            if component_name == 'landmark_classifier' and self.use_clip and self.enable_landmark:
                self._initialize_landmark_classifier()
                return self.initialization_status.get('landmark_classifier', False)

            self.logger.warning(f"Reinitializing {component_name} is not supported")
            return False
        except Exception as e:
            self.logger.error(f"Error reinitializing {component_name}: {e}")
            return False

    def update_landmark_enable_status(self, enable_landmark: bool):
        """
        Switch landmark detection on or off and propagate the flag.

        Args:
            enable_landmark: Whether landmark detection should be enabled.
        """
        self.enable_landmark = enable_landmark

        # When turning detection on without a usable classifier, retry its
        # initialization (requires CLIP).
        if enable_landmark and not self.is_component_available('landmark_classifier'):
            if self.use_clip:
                self.reinitialize_component('landmark_classifier')

        # Propagate the flag to every component that exposes it.
        for component_name in ['scene_describer', 'clip_analyzer', 'landmark_classifier']:
            component = self.get_component(component_name)
            if component and hasattr(component, 'enable_landmark'):
                component.enable_landmark = enable_landmark

confidence_manager.py ADDED Viewed

	@@ -0,0 +1,283 @@

+import logging
+import traceback
+from typing import Dict, Any, Optional
+class ConfidenceManager:
+    """
+    專門管理信心度相關邏輯，包括動態閾值調整、信心度乘數管理和地標類型特定的閾值處理
+    """
+    def __init__(self):
+        """
+        初始化置信度管理器
+        """
+        self.logger = logging.getLogger(__name__)
+        # 初始化批處理參數
+        self.batch_size = 16  # 默認批處理大小
+        # 置信度閾值乘數配置
+        self.confidence_threshold_multipliers = {
+            "close_up": 0.9,     # 近景標準閾值
+            "partial": 0.6,      # 部分可見降低閾值要求
+            "distant": 0.5,      # 遠景更低閾值要求
+            "full_image": 0.7    # 整張圖像需要更高閾值
+        }
+        # 地標類型閾值配置
+        self.landmark_type_thresholds = {
+            "tower": 0.5,         # 塔型建築需要更高閾值
+            "skyscraper": 0.4,    # 摩天大樓使用較低閾值
+            "building": 0.55,     # 一般的建築物閾值略微降低
+            "monument": 0.5,      # 紀念碑閾值
+            "natural": 0.6        # 自然景觀可以使用較低閾值
+        }
+    def set_batch_size(self, batch_size: int):
+        """
+        設置批處理大小
+        Args:
+            batch_size: 新的批處理大小
+        """
+        self.batch_size = max(1, batch_size)
+        self.logger.info(f"Batch size set to {self.batch_size}")
+    def adjust_confidence_threshold(self, detection_type: str, multiplier: float):
+        """
+        調整特定檢測類型的信心度的threshold
+        Args:
+            detection_type: 檢測類型 ('close_up', 'partial', 'distant', 'full_image')
+            multiplier: 置信度閾值乘數
+        """
+        if detection_type in self.confidence_threshold_multipliers:
+            self.confidence_threshold_multipliers[detection_type] = max(0.1, min(1.5, multiplier))
+            self.logger.info(f"Adjusted confidence threshold multiplier for {detection_type} to {multiplier}")
+        else:
+            self.logger.warning(f"Unknown detection type: {detection_type}")
+    def get_detection_type_multiplier(self, detection_type: str) -> float:
+        """
+        獲取檢測類型的置信度乘數
+        Args:
+            detection_type: 檢測類型
+        Returns:
+            float: 置信度乘數
+        """
+        return self.confidence_threshold_multipliers.get(detection_type, 1.0)
+    def get_landmark_type_threshold(self, landmark_type: str) -> float:
+        """
+        獲取地標類型的閾值
+        Args:
+            landmark_type: 地標類型
+        Returns:
+            float: 地標類型閾值
+        """
+        return self.landmark_type_thresholds.get(landmark_type, 0.5)
+    def calculate_adjusted_threshold(self, base_threshold: float, detection_type: str) -> float:
+        """
+        根據檢測類型計算調整後的閾值
+        Args:
+            base_threshold: 基礎閾值
+            detection_type: 檢測type
+        Returns:
+            float: 調整後的閾值
+        """
+        try:
+            base_multiplier = self.get_detection_type_multiplier(detection_type)
+            adjusted_threshold = base_threshold * base_multiplier
+            return adjusted_threshold
+        except Exception as e:
+            self.logger.error(f"Error calculating adjusted threshold: {e}")
+            self.logger.error(traceback.format_exc())
+            return base_threshold
+    def calculate_final_threshold(self, base_threshold: float, detection_type: str,
+                                landmark_type: str) -> float:
+        """
+        Compute the final threshold from the detection type and the landmark type.
+        Args:
+            base_threshold: Threshold before any adjustment.
+            detection_type: Detection type used for the first adjustment.
+            landmark_type: Landmark type used for the second adjustment.
+        Returns:
+            float: The fully adjusted threshold, or base_threshold unchanged
+                when the computation raises.
+        """
+        try:
+            # Step 1: adjust for the detection type.
+            threshold = self.calculate_adjusted_threshold(base_threshold, detection_type)
+            # Step 2: adjust for the landmark type. "distinctive" landmarks get a
+            # fixed 25% reduction; every other type is scaled by its configured
+            # threshold divided by 0.5.
+            type_multiplier = (
+                0.75
+                if landmark_type == "distinctive"
+                else self.get_landmark_type_threshold(landmark_type) / 0.5
+            )
+            return threshold * type_multiplier
+        except Exception as exc:
+            self.logger.error(f"Error calculating final threshold: {exc}")
+            self.logger.error(traceback.format_exc())
+            return base_threshold
+    def evaluate_confidence(self, confidence: float, threshold: float) -> bool:
+        """
+        Tell whether a confidence score reaches a threshold.
+        Args:
+            confidence: Confidence score to test.
+            threshold: Threshold to test against.
+        Returns:
+            bool: True when confidence is greater than or equal to threshold.
+        """
+        meets_threshold = confidence >= threshold
+        return meets_threshold
+    def apply_architectural_boost(self, confidence: float, architectural_analysis: Dict[str, Any],
+                                landmark_id: str) -> float:
+        """
+        Raise a confidence score when the architectural analysis agrees with the landmark ID.
+        Args:
+            confidence: Original confidence score.
+            architectural_analysis: Analysis result; the keys "architectural_features"
+                (iterated as (feature, score) pairs) and "primary_category" are read.
+            landmark_id: Landmark identifier, matched case-insensitively.
+        Returns:
+            float: confidence plus the accumulated boost, or confidence unchanged
+                when nothing matched or an error occurred.
+        """
+        try:
+            landmark_key = landmark_id.lower()
+            features = architectural_analysis.get("architectural_features", [])
+            category = architectural_analysis.get("primary_category", "")
+            boost = 0
+            # Generic category -> ID-term rules (no landmark-specific names).
+            # At most one rule applies: +0.05 when the primary category matches
+            # and the ID contains one of that category's terms.
+            category_terms = (
+                ("tower", ["tower", "spire", "needle"]),
+                ("skyscraper", ["building", "skyscraper", "tall"]),
+                ("historical", ["monument", "castle", "palace", "temple"]),
+                ("distinctive", ["unusual", "unique", "special", "famous"]),
+            )
+            for category_name, terms in category_terms:
+                if category == category_name and any(term in landmark_key for term in terms):
+                    boost += 0.05
+                    break
+            # Generic feature -> ID-keyword rules: +0.03 for every listed
+            # feature whose keyword appears in the ID.
+            feature_keywords = (
+                ("time_display", "clock"),
+                ("segmented_exterior", "segmented"),
+                ("slanted_design", "leaning"),
+            )
+            for feature, _score in features:
+                if any(feature == name and keyword in landmark_key
+                       for name, keyword in feature_keywords):
+                    boost += 0.03
+            if boost <= 0:
+                return confidence
+            boosted = confidence + boost
+            self.logger.info(f"Boosted confidence by {boost:.2f} based on architectural features ({category})")
+            return boosted
+        except Exception as exc:
+            self.logger.error(f"Error applying architectural boost: {exc}")
+            self.logger.error(traceback.format_exc())
+            return confidence
+    def determine_detection_type_from_region(self, region_width: int, region_height: int,
+                                           image_width: int, image_height: int) -> str:
+        """
+        Derive the detection type from how much of the image a region covers.
+        Args:
+            region_width: Width of the region.
+            region_height: Height of the region.
+            image_width: Width of the image.
+            image_height: Height of the image.
+        Returns:
+            str: "close_up" when the area ratio exceeds 0.5, "partial" when it
+                exceeds 0.2, otherwise "distant". "partial" is also returned
+                when the ratio cannot be computed (e.g. a zero-sized image).
+        """
+        try:
+            coverage = (region_width * region_height) / (image_width * image_height)
+            # Ordered from the largest cut-off down; the first one exceeded wins.
+            for label, min_ratio in (("close_up", 0.5), ("partial", 0.2)):
+                if coverage > min_ratio:
+                    return label
+            return "distant"
+        except Exception as exc:
+            self.logger.error(f"Error determining detection type from region: {exc}")
+            self.logger.error(traceback.format_exc())
+            return "partial"
+    def adjust_detection_type_by_viewpoint(self, detection_type: str, dominant_viewpoint: str) -> str:
+        """
+        Override the detection type according to the dominant viewpoint.
+        Args:
+            detection_type: Detection type determined so far.
+            dominant_viewpoint: Dominant viewpoint of the scene.
+        Returns:
+            str: "close_up" or "distant" when the viewpoint says so, "partial"
+                for an "angled_view", otherwise detection_type unchanged.
+        """
+        try:
+            if dominant_viewpoint == "close_up":
+                return "close_up"
+            if dominant_viewpoint == "distant":
+                return "distant"
+            if dominant_viewpoint == "angled_view":
+                # An angled view may show the landmark only partially.
+                return "partial"
+            return detection_type
+        except Exception as exc:
+            self.logger.error(f"Error adjusting detection type by viewpoint: {exc}")
+            self.logger.error(traceback.format_exc())
+            return detection_type
+    def get_batch_size(self) -> int:
+        """
+        Return the batch size currently in use.
+        Returns:
+            int: The value of self.batch_size.
+        """
+        current_batch_size = self.batch_size
+        return current_batch_size
+    def get_all_threshold_multipliers(self) -> Dict[str, float]:
+        """
+        Return every confidence-threshold multiplier.
+        Returns:
+            Dict[str, float]: A shallow copy of the multiplier dictionary, so
+                changes to the result do not affect the stored values.
+        """
+        snapshot = self.confidence_threshold_multipliers.copy()
+        return snapshot
+    def get_all_landmark_type_thresholds(self) -> Dict[str, float]:
+        """
+        Return every landmark-type threshold.
+        Returns:
+            Dict[str, float]: A shallow copy of the landmark-type threshold
+                dictionary, so changes to the result do not affect the stored values.
+        """
+        snapshot = self.landmark_type_thresholds.copy()
+        return snapshot

configuration_manager.py ADDED Viewed

	@@ -0,0 +1,418 @@

+from typing import Dict, Any, List, Tuple, Optional, Union
+import json
+import os
+from dataclasses import dataclass, field
+from pathlib import Path
+@dataclass
+class FeatureThresholds:
+    """
+    Configuration class for feature extraction thresholds.
+    ConfigurationManager.get_legacy_config_dict() exposes every field as a
+    top-level key, and validate_configuration() checks the orderings noted below.
+    """
+    # Dark/bright pixel cut-offs; validation requires dark < bright.
+    # NOTE(review): presumably on a 0-255 intensity scale - confirm against the consumer.
+    dark_pixel_threshold: float = 50.0
+    bright_pixel_threshold: float = 220.0
+    # Sky-blue hue window (validation requires min < max) plus saturation/value floors.
+    # NOTE(review): the hue/saturation/value scales are not visible in this file - confirm.
+    sky_blue_hue_min: float = 95.0
+    sky_blue_hue_max: float = 135.0
+    sky_blue_sat_min: float = 40.0
+    sky_blue_val_min: float = 90.0
+    # Gray detection: saturation ceiling and value window (validation requires min < max).
+    gray_sat_max: float = 70.0
+    gray_val_min: float = 60.0
+    gray_val_max: float = 220.0
+    # Absolute threshold for light sources; exact semantics are defined by the consumer - TODO confirm.
+    light_source_abs_thresh: float = 220.0
+@dataclass
+class IndoorOutdoorThresholds:
+    """
+    Configuration class for indoor/outdoor classification thresholds.
+    ConfigurationManager.get_legacy_config_dict() exposes every field as a
+    top-level key. None of these fields is checked by validate_configuration().
+    """
+    # Sky / openness cut-offs.
+    # NOTE(review): units and ranges are defined by the classifier, not by this file - confirm there.
+    sky_blue_dominance_thresh: float = 0.18
+    sky_brightness_ratio_thresh: float = 1.25
+    openness_top_thresh: float = 0.68
+    sky_texture_complexity_thresh: float = 0.35
+    # Ceiling / enclosure cut-offs.
+    ceiling_likelihood_thresh: float = 0.4
+    boundary_clarity_thresh: float = 0.38
+    # Brightness-uniformity cut-offs, kept separately for the indoor and outdoor case.
+    brightness_uniformity_thresh_indoor: float = 0.6
+    brightness_uniformity_thresh_outdoor: float = 0.40
+    # Bright-spot cut-offs; the spot threshold is an integer count.
+    many_bright_spots_thresh: int = 6
+    dim_scene_for_spots_thresh: float = 115.0
+    # Home-environment pattern cut-offs (strong vs. moderate evidence).
+    home_pattern_thresh_strong: float = 2.0
+    home_pattern_thresh_moderate: float = 1.0
+    warm_indoor_max_brightness_thresh: float = 135.0
+    # Aerial-view cut-offs.
+    aerial_top_dark_ratio_thresh: float = 0.9
+    aerial_top_complex_thresh: float = 0.60
+    aerial_min_avg_brightness_thresh: float = 65.0
+@dataclass
+class LightingThresholds:
+    """
+    Configuration class for lighting condition analysis thresholds.
+    ConfigurationManager.get_legacy_config_dict() exposes every field as a
+    top-level key. None of these fields is checked by validate_configuration().
+    """
+    # Outdoor night cut-offs; the lights threshold is an integer count.
+    # NOTE(review): brightness values are presumably on the same scale as the
+    # pixel thresholds in FeatureThresholds - confirm against the analyzer.
+    outdoor_night_thresh_brightness: float = 80.0
+    outdoor_night_lights_thresh: int = 2
+    # Outdoor dusk/dawn cut-offs.
+    outdoor_dusk_dawn_thresh_brightness: float = 130.0
+    outdoor_dusk_dawn_color_thresh: float = 0.10
+    # Outdoor daytime cut-offs (bright/blue vs. cloudy/gray).
+    outdoor_day_bright_thresh: float = 140.0
+    outdoor_day_blue_thresh: float = 0.05
+    outdoor_day_cloudy_thresh: float = 120.0
+    outdoor_day_gray_thresh: float = 0.18
+    # Indoor brightness cut-offs.
+    indoor_bright_thresh: float = 130.0
+    indoor_moderate_thresh: float = 95.0
+    # Commercial and stadium lighting cut-offs; the spot thresholds are integer counts.
+    commercial_min_brightness_thresh: float = 105.0
+    commercial_min_spots_thresh: int = 3
+    stadium_min_spots_thresh: int = 6
+    # Neon lighting cut-offs.
+    neon_yellow_orange_thresh: float = 0.12
+    neon_bright_spots_thresh: int = 4
+    neon_avg_saturation_thresh: float = 60.0
+@dataclass
+class WeightingFactors:
+    """
+    Configuration class for feature weighting factors.
+    Unlike the other sections, ConfigurationManager stores these fields in the
+    legacy dictionary under the nested key "indoor_outdoor_weights".
+    """
+    # Sky/Openness weights (push towards outdoor).
+    # NOTE(review): the original comment said "negative values push towards
+    # outdoor", but every default here is positive - presumably the consumer
+    # applies the sign; confirm.
+    sky_blue_dominance_w: float = 3.5
+    sky_brightness_ratio_w: float = 3.0
+    openness_top_w: float = 2.8
+    sky_texture_w: float = 2.0
+    # Ceiling/Enclosure weights (positive values push towards indoor)
+    ceiling_likelihood_w: float = 1.5
+    boundary_clarity_w: float = 1.2
+    # Brightness weights
+    brightness_uniformity_w: float = 0.6
+    brightness_non_uniformity_outdoor_w: float = 1.0
+    brightness_non_uniformity_indoor_penalty_w: float = 0.1
+    # Light source weights
+    circular_lights_w: float = 1.2
+    indoor_light_score_w: float = 0.8
+    many_bright_spots_indoor_w: float = 0.3
+    # Color atmosphere weights
+    warm_atmosphere_indoor_w: float = 0.15
+    # Environment pattern weights
+    home_env_strong_w: float = 1.5
+    home_env_moderate_w: float = 0.7
+    # Structural pattern and Places365 weights
+    aerial_street_w: float = 2.5
+    places365_outdoor_scene_w: float = 4.0
+    places365_indoor_scene_w: float = 3.0
+    places365_attribute_w: float = 1.5
+@dataclass
+class OverrideFactors:
+    """
+    Configuration class for override and reduction factors.
+    ConfigurationManager.get_legacy_config_dict() exposes every field as a
+    top-level key. None of these fields is checked by validate_configuration().
+    """
+    # Factors applied when a sky signal overrides other evidence.
+    # NOTE(review): presumably multiplicative scale factors - confirm in the classifier.
+    sky_override_factor_ceiling: float = 0.1
+    sky_override_factor_boundary: float = 0.2
+    sky_override_factor_uniformity: float = 0.15
+    sky_override_factor_lights: float = 0.05
+    sky_override_factor_p365_indoor_decision: float = 0.3
+    # Reduction factors for enclosure/ceiling evidence.
+    aerial_enclosure_reduction_factor: float = 0.75
+    ceiling_sky_override_factor: float = 0.1
+    # Places365-driven adjustments (one reduces enclosure, one boosts ceiling, per the names).
+    p365_outdoor_reduces_enclosure_factor: float = 0.3
+    p365_indoor_boosts_ceiling_factor: float = 1.5
+@dataclass
+class ColorRanges:
+    """
+    Configuration class for color range definitions.
+    Each range is a (min, max) pair; ConfigurationManager.validate_configuration()
+    reports any range whose min is not strictly below its max.
+    """
+    # default_factory gives every instance its own list instead of a shared mutable default.
+    # NOTE(review): the values look like hue degrees on a 0-360 scale - confirm against the consumer.
+    warm_hue_ranges: List[Tuple[float, float]] = field(
+        default_factory=lambda: [(0, 50), (330, 360)]
+    )
+    cool_hue_ranges: List[Tuple[float, float]] = field(
+        default_factory=lambda: [(90, 270)]
+    )
+@dataclass
+class AlgorithmParameters:
+    """
+    Configuration class for algorithm-specific parameters.
+    ConfigurationManager.validate_configuration() requires
+    indoor_decision_threshold and places365_high_confidence_thresh to lie in [0, 1].
+    """
+    # Indoor-score parameters.
+    # NOTE(review): the scale is presumably the steepness of a sigmoid applied to the indoor score - confirm.
+    indoor_score_sigmoid_scale: float = 0.3
+    indoor_decision_threshold: float = 0.5
+    # Places365 confidence cut-offs; only the "high" one is range-checked by validate_configuration().
+    places365_high_confidence_thresh: float = 0.75
+    places365_moderate_confidence_thresh: float = 0.5
+    places365_attribute_confidence_thresh: float = 0.6
+    # Flag for diagnostics output; how it is used is decided by the consumer.
+    include_diagnostics: bool = True
+class ConfigurationManager:
+    """
+    Manage the parameters used by the lighting analysis.
+    The analysis has to cope with many different situations, so its parameters
+    are kept configurable here. This class provides type-safe access to all
+    configuration parameters, supports loading from external files, and
+    includes validation mechanisms.
+    """
+    def __init__(self, config_path: Optional[Union[str, Path]] = None):
+        """
+        Initialize the configuration manager.
+        Args:
+            config_path: Optional path to external configuration file.
+                        If None, uses default configuration.
+        Raises:
+            FileNotFoundError: If config_path is given but does not exist.
+            ValueError: If the file cannot be parsed or applied.
+        """
+        self._feature_thresholds = FeatureThresholds()
+        self._indoor_outdoor_thresholds = IndoorOutdoorThresholds()
+        self._lighting_thresholds = LightingThresholds()
+        self._weighting_factors = WeightingFactors()
+        self._override_factors = OverrideFactors()
+        self._color_ranges = ColorRanges()
+        self._algorithm_parameters = AlgorithmParameters()
+        if config_path is not None:
+            self.load_from_file(config_path)
+    @property
+    def feature_thresholds(self) -> FeatureThresholds:
+        """Get feature extraction thresholds."""
+        return self._feature_thresholds
+    @property
+    def indoor_outdoor_thresholds(self) -> IndoorOutdoorThresholds:
+        """Get indoor/outdoor classification thresholds."""
+        return self._indoor_outdoor_thresholds
+    @property
+    def lighting_thresholds(self) -> LightingThresholds:
+        """Get lighting condition analysis thresholds."""
+        return self._lighting_thresholds
+    @property
+    def weighting_factors(self) -> WeightingFactors:
+        """Get feature weighting factors."""
+        return self._weighting_factors
+    @property
+    def override_factors(self) -> OverrideFactors:
+        """Get override and reduction factors."""
+        return self._override_factors
+    @property
+    def color_ranges(self) -> ColorRanges:
+        """Get color range definitions."""
+        return self._color_ranges
+    @property
+    def algorithm_parameters(self) -> AlgorithmParameters:
+        """Get algorithm-specific parameters."""
+        return self._algorithm_parameters
+    def _flat_sections(self) -> Tuple[object, ...]:
+        """Return the sections whose fields are top-level keys of the legacy dictionary."""
+        return (
+            self._feature_thresholds,
+            self._indoor_outdoor_thresholds,
+            self._lighting_thresholds,
+            self._override_factors,
+            self._color_ranges,
+            self._algorithm_parameters,
+        )
+    def _all_sections(self) -> Tuple[object, ...]:
+        """Return every configuration section, weighting factors last."""
+        return self._flat_sections() + (self._weighting_factors,)
+    @staticmethod
+    def _field_names(section: object) -> List[str]:
+        """
+        Return the names of the configuration fields declared on a section.
+        Only declared fields are reported, so methods and dunder attributes
+        (which hasattr() would also match) can never be read or overwritten
+        through a configuration key.
+        """
+        # Local import: the module's import block only provides dataclass and field.
+        from dataclasses import fields, is_dataclass
+        if is_dataclass(section):
+            return [declared.name for declared in fields(section)]
+        return list(vars(section))
+    def get_legacy_config_dict(self) -> Dict[str, Any]:
+        """
+        Generate legacy configuration dictionary for backward compatibility.
+        Returns:
+            Dictionary containing all configuration parameters in the original
+            format: every section is flattened into top-level keys, except the
+            weighting factors, which are nested under "indoor_outdoor_weights".
+            List values are copied, so editing the result does not change the
+            live configuration.
+        """
+        config_dict: Dict[str, Any] = {}
+        for section in self._flat_sections():
+            for field_name in self._field_names(section):
+                field_value = getattr(section, field_name)
+                if isinstance(field_value, list):
+                    # Hand out a copy instead of the live list object.
+                    field_value = list(field_value)
+                config_dict[field_name] = field_value
+        # Weighting factors - stored under 'indoor_outdoor_weights' key
+        config_dict["indoor_outdoor_weights"] = {
+            field_name: getattr(self._weighting_factors, field_name)
+            for field_name in self._field_names(self._weighting_factors)
+        }
+        return config_dict
+    def load_from_file(self, config_path: Union[str, Path]) -> None:
+        """
+        Load configuration from external JSON file.
+        Args:
+            config_path: Path to the configuration file.
+        Raises:
+            FileNotFoundError: If the configuration file doesn't exist.
+            ValueError: If the configuration file contains invalid data. The
+                underlying error is attached as the exception's cause.
+        """
+        config_path = Path(config_path)
+        if not config_path.exists():
+            raise FileNotFoundError(f"Configuration file not found: {config_path}")
+        try:
+            with open(config_path, 'r', encoding='utf-8') as file:
+                config_data = json.load(file)
+            self._update_from_dict(config_data)
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Invalid JSON in configuration file: {e}") from e
+        except Exception as e:
+            raise ValueError(f"Error loading configuration: {e}") from e
+    def save_to_file(self, config_path: Union[str, Path]) -> None:
+        """
+        Save current configuration to JSON file.
+        Missing parent directories are created. The file uses the legacy
+        dictionary layout, which load_from_file() reads back.
+        Args:
+            config_path: Path where to save the configuration file.
+        """
+        config_path = Path(config_path)
+        config_path.parent.mkdir(parents=True, exist_ok=True)
+        config_dict = self.get_legacy_config_dict()
+        with open(config_path, 'w', encoding='utf-8') as file:
+            json.dump(config_dict, file, indent=2, ensure_ascii=False)
+    def _update_from_dict(self, config_data: Dict[str, Any]) -> None:
+        """
+        Update configuration from dictionary data.
+        Args:
+            config_data: Dictionary containing configuration parameters in the
+                legacy layout (flat keys plus nested "indoor_outdoor_weights").
+                Keys that match no declared field are ignored.
+        """
+        for section in self._flat_sections():
+            self._update_dataclass_from_dict(section, config_data)
+        # JSON has no tuple type, so ranges come back as lists after a save/load round trip.
+        self._normalize_color_ranges()
+        # Update weighting factors from nested dictionary
+        if "indoor_outdoor_weights" in config_data:
+            self._update_dataclass_from_dict(
+                self._weighting_factors,
+                config_data["indoor_outdoor_weights"]
+            )
+    def _normalize_color_ranges(self) -> None:
+        """Convert list-style [min, max] entries of the color ranges back to tuples."""
+        for field_name in self._field_names(self._color_ranges):
+            ranges = getattr(self._color_ranges, field_name)
+            if isinstance(ranges, list):
+                setattr(
+                    self._color_ranges,
+                    field_name,
+                    [tuple(entry) if isinstance(entry, list) else entry for entry in ranges],
+                )
+    def _update_dataclass_from_dict(self, dataclass_instance: object, data_dict: Dict[str, Any]) -> None:
+        """
+        Update dataclass instance fields from dictionary.
+        Args:
+            dataclass_instance: The dataclass instance to update.
+            data_dict: Dictionary containing the update values. Only keys that
+                name a declared field are applied; values are not type-checked.
+        """
+        declared = set(self._field_names(dataclass_instance))
+        for field_name, field_value in data_dict.items():
+            if field_name in declared:
+                setattr(dataclass_instance, field_name, field_value)
+    def validate_configuration(self) -> List[str]:
+        """
+        Validate the current configuration for logical consistency.
+        Returns:
+            List of validation error messages. Empty list if configuration is valid.
+        """
+        errors = []
+        # Validate threshold ranges
+        ft = self._feature_thresholds
+        if ft.dark_pixel_threshold >= ft.bright_pixel_threshold:
+            errors.append("Dark pixel threshold must be less than bright pixel threshold")
+        if ft.sky_blue_hue_min >= ft.sky_blue_hue_max:
+            errors.append("Sky blue hue min must be less than sky blue hue max")
+        if ft.gray_val_min >= ft.gray_val_max:
+            errors.append("Gray value min must be less than gray value max")
+        # Validate probability thresholds
+        ap = self._algorithm_parameters
+        if not (0.0 <= ap.indoor_decision_threshold <= 1.0):
+            errors.append("Indoor decision threshold must be between 0 and 1")
+        if not (0.0 <= ap.places365_high_confidence_thresh <= 1.0):
+            errors.append("Places365 high confidence threshold must be between 0 and 1")
+        # Validate color ranges
+        labelled_ranges = (
+            ("warm", self._color_ranges.warm_hue_ranges),
+            ("cool", self._color_ranges.cool_hue_ranges),
+        )
+        for label, hue_ranges in labelled_ranges:
+            for hue_range in hue_ranges:
+                try:
+                    is_invalid = hue_range[0] >= hue_range[1]
+                except (TypeError, IndexError):
+                    # A malformed entry (wrong length or type) is reported, not raised.
+                    is_invalid = True
+                if is_invalid:
+                    errors.append(f"Invalid {label} hue range: {hue_range}")
+        return errors
+    def get_threshold_value(self, threshold_name: str) -> Any:
+        """
+        Get a specific threshold value by name.
+        Args:
+            threshold_name: Name of the threshold parameter.
+        Returns:
+            The threshold value.
+        Raises:
+            AttributeError: If the threshold name doesn't exist.
+        """
+        # Search through all configuration sections
+        for config_section in self._all_sections():
+            if threshold_name in self._field_names(config_section):
+                return getattr(config_section, threshold_name)
+        raise AttributeError(f"Threshold '{threshold_name}' not found")
+    def update_threshold(self, threshold_name: str, value: Any) -> None:
+        """
+        Update a specific threshold value.
+        Args:
+            threshold_name: Name of the threshold parameter.
+            value: New value for the threshold.
+        Raises:
+            AttributeError: If the threshold name doesn't exist.
+        """
+        # Search through all configuration sections
+        for config_section in self._all_sections():
+            if threshold_name in self._field_names(config_section):
+                setattr(config_section, threshold_name, value)
+                return
+        raise AttributeError(f"Threshold '{threshold_name}' not found")

cultural_context_analyzer.py ADDED Viewed

	@@ -0,0 +1,637 @@

+import logging
+import traceback
+import random
+from typing import Dict, List, Optional, Any
+from cultural_templates import CULTURAL_TEMPLATES
+class CulturalContextError(Exception):
+    """Custom exception raised while analyzing cultural context."""
+class CulturalContextAnalyzer:
+    """
+    文化語境分析器 - 檢測場景中的文化特徵並生成相關的描述
+    該類別負責識別場景中的文化語境線索，包括建築風格、標誌特徵
+    和物件配置，然後生成適當的文化描述元素。
+    """
+    def __init__(self, cultural_templates: Optional[Dict] = None):
+        """
+        Set up the analyzer: load the templates, merge any custom ones and build
+        the scene-type mapping.
+        Args:
+            cultural_templates: Optional custom templates; when given (and
+                non-empty) they are merged into the loaded templates.
+        Raises:
+            CulturalContextError: If any initialization step fails.
+        """
+        self.logger = logging.getLogger(type(self).__name__)
+        try:
+            # Templates must be stored on the instance before merging, because
+            # the merge step works on self.cultural_templates.
+            self.cultural_templates = self._load_cultural_templates()
+            if cultural_templates:
+                self._merge_custom_templates(cultural_templates)
+            # Mapping from scene type to cultural context.
+            self.scene_cultural_mapping = self._initialize_scene_cultural_mapping()
+            template_count = len(self.cultural_templates)
+            self.logger.info("CulturalContextAnalyzer initialized with %d cultural templates",
+                           template_count)
+        except Exception as exc:
+            error_msg = f"Failed to initialize CulturalContextAnalyzer: {str(exc)}"
+            self.logger.error(f"{error_msg}\n{traceback.format_exc()}")
+            raise CulturalContextError(error_msg) from exc
+    def _load_cultural_templates(self) -> Dict:
+        """
+        Load the cultural templates from the imported CULTURAL_TEMPLATES table.
+        The table is deep-copied: a shallow copy would share the nested
+        per-culture dictionaries with the module-level table, so merging custom
+        templates into one analyzer would leak into every later instance.
+        Returns:
+            Dict: Cultural templates keyed by culture name; the built-in
+                defaults when the imported table is empty.
+        Raises:
+            CulturalContextError: When loading the templates fails.
+        """
+        import copy  # local import: not provided by this file's import block
+        try:
+            self.logger.debug("Loading cultural templates")
+            # Private deep copy of the templates from the configuration module.
+            templates = copy.deepcopy(CULTURAL_TEMPLATES)
+            # Structural problems are only logged by the validator.
+            self._validate_cultural_templates(templates)
+            # Fall back to the built-in defaults when nothing was loaded.
+            if not templates:
+                self.logger.warning("No cultural templates loaded, using defaults")
+                templates = self._get_default_cultural_templates()
+            self.logger.debug("Successfully loaded %d cultural template categories", len(templates))
+            return templates
+        except ImportError as e:
+            # Kept for backward compatibility; the template import itself runs
+            # at module import time, outside this try block.
+            self.logger.warning(f"Failed to import cultural templates: {str(e)}, using defaults")
+            return self._get_default_cultural_templates()
+        except Exception as e:
+            error_msg = f"Error loading cultural templates: {str(e)}"
+            self.logger.error(f"{error_msg}\n{traceback.format_exc()}")
+            raise CulturalContextError(error_msg) from e
+    def _get_default_cultural_templates(self) -> Dict:
+        """
+        Build the built-in fallback templates.
+        Returns:
+            Dict: One entry per culture ("asian", "european", "american",
+                "mediterranean"), each holding an "elements" list and a
+                "description" string with an {elements} placeholder.
+        """
+        # (culture, description, elements) rows; the result is rebuilt on every
+        # call, so callers receive fresh dictionaries and lists.
+        default_rows = (
+            (
+                "asian",
+                "The scene displays distinctive Asian cultural characteristics with {elements}.",
+                (
+                    "traditional architectural elements",
+                    "cultural signage",
+                    "Asian design features",
+                    "oriental decorative patterns",
+                    "traditional building materials",
+                    "characteristic roofline styles",
+                    "cultural landscaping elements",
+                ),
+            ),
+            (
+                "european",
+                "The scene exhibits European architectural and cultural elements including {elements}.",
+                (
+                    "classical architecture",
+                    "European design elements",
+                    "historic features",
+                    "traditional stonework",
+                    "characteristic window styles",
+                    "ornamental facades",
+                    "heritage building elements",
+                ),
+            ),
+            (
+                "american",
+                "The scene shows American urban characteristics featuring {elements}.",
+                (
+                    "modern architectural styles",
+                    "contemporary design features",
+                    "commercial signage",
+                    "urban planning elements",
+                    "standardized building designs",
+                ),
+            ),
+            (
+                "mediterranean",
+                "The scene reflects Mediterranean cultural influences with {elements}.",
+                (
+                    "coastal architectural styles",
+                    "warm climate adaptations",
+                    "traditional building colors",
+                    "characteristic outdoor spaces",
+                ),
+            ),
+        )
+        return {
+            culture: {"elements": list(elements), "description": description}
+            for culture, description, elements in default_rows
+        }
+    def _validate_cultural_templates(self, templates: Dict):
+        """
+        Check the structure of the cultural templates and log every problem found.
+        Nothing is raised and nothing is modified: each problem, and any error
+        hit while validating, is reported as a warning.
+        Args:
+            templates: Template dictionary to check.
+        """
+        try:
+            for culture, template_data in templates.items():
+                if not isinstance(template_data, dict):
+                    self.logger.warning(f"Invalid cultural template structure for '{culture}': not a dictionary")
+                    continue
+                # Collect this template's problems in reporting order: missing
+                # keys first, then the element list, then the description.
+                issues = [
+                    f"Missing required key '{key}' in cultural template '{culture}'"
+                    for key in ("elements", "description")
+                    if key not in template_data
+                ]
+                if "elements" in template_data:
+                    elements = template_data["elements"]
+                    if not isinstance(elements, list):
+                        issues.append(f"Cultural template '{culture}' elements should be a list")
+                    elif not elements:
+                        issues.append(f"Cultural template '{culture}' has empty elements list")
+                if "description" in template_data:
+                    description = template_data["description"]
+                    if not isinstance(description, str):
+                        issues.append(f"Cultural template '{culture}' description should be a string")
+                    elif "{elements}" not in description:
+                        issues.append(f"Cultural template '{culture}' description missing {{elements}} placeholder")
+                for issue in issues:
+                    self.logger.warning(issue)
+            self.logger.debug("Cultural templates validation completed")
+        except Exception as exc:
+            self.logger.warning(f"Error validating cultural templates: {str(exc)}")
+    def _merge_custom_templates(self, custom_templates: Dict):
+        """
+        Merge custom cultural templates into self.cultural_templates.
+        For a culture that already exists (and both sides are dictionaries) the
+        element lists are concatenated and every other key is overwritten by
+        the custom value. Custom elements are adopted as-is when the existing
+        template has no "elements" key. Any other case stores the custom
+        template as given. The merged entry is a new dictionary, so a template
+        dictionary shared with another owner is never mutated.
+        Errors are logged as warnings and abort the remaining merges.
+        Args:
+            custom_templates: Custom templates keyed by culture name.
+        """
+        try:
+            for culture, template_data in custom_templates.items():
+                existing = self.cultural_templates.get(culture)
+                if isinstance(existing, dict) and isinstance(template_data, dict):
+                    # Work on a copy instead of editing the stored dictionary in place.
+                    merged = dict(existing)
+                    for key, value in template_data.items():
+                        if key != "elements":
+                            merged[key] = value
+                    if "elements" in template_data:
+                        new_elements = template_data["elements"]
+                        if "elements" not in existing:
+                            # Previously these custom elements were silently dropped.
+                            merged["elements"] = (
+                                list(new_elements) if isinstance(new_elements, list) else new_elements
+                            )
+                        elif isinstance(existing["elements"], list) and isinstance(new_elements, list):
+                            merged["elements"] = existing["elements"] + new_elements
+                    self.cultural_templates[culture] = merged
+                else:
+                    # New culture, or a non-dictionary on either side: store the custom value.
+                    self.cultural_templates[culture] = template_data
+                self.logger.debug(f"Merged custom template for culture: {culture}")
+            self.logger.info("Successfully merged custom cultural templates")
+        except Exception as e:
+            self.logger.warning(f"Error merging custom cultural templates: {str(e)}")
+    def _initialize_scene_cultural_mapping(self) -> Dict[str, str]:
+        """
+        Build the mapping from scene type to cultural context.
+        Returns:
+            Dict[str, str]: Scene type name -> cultural context name.
+        """
+        # Scene types grouped by the cultural context they indicate.
+        scenes_by_culture = (
+            ("asian", (
+                "asian_commercial_street",
+                "asian_night_market",
+                "asian_temple_area",
+                "chinese_restaurant",
+                "japanese_restaurant",
+                "korean_restaurant",
+            )),
+            ("european", ("european_plaza", "european_cafe")),
+            ("mediterranean", ("mediterranean_restaurant",)),
+            ("american", ("american_diner", "american_fast_food")),
+        )
+        mapping: Dict[str, str] = {}
+        for culture, scene_types in scenes_by_culture:
+            for scene_type in scene_types:
+                mapping[scene_type] = culture
+        return mapping
+    def detect_cultural_context(self, scene_type: str, detected_objects: List[Dict]) -> Optional[str]:
+        """
+        Determine the cultural context of a scene.
+        Three sources are tried in order and the first hit is returned: the
+        direct scene-type mapping, keyword patterns in the scene-type name, and
+        analysis of the detected objects.
+        Args:
+            scene_type: Identified scene type.
+            detected_objects: Detected objects of the scene.
+        Returns:
+            Optional[str]: The cultural context (e.g. "asian", "european"), or
+                None when none is found or an error occurs.
+        """
+        try:
+            self.logger.debug(f"Detecting cultural context for scene_type: {scene_type}")
+            # 1) The scene type itself may name the culture.
+            if scene_type in self.scene_cultural_mapping:
+                mapped_context = self.scene_cultural_mapping[scene_type]
+                self.logger.debug(f"Direct cultural mapping found: {scene_type} -> {mapped_context}")
+                return mapped_context
+            # 2) Keyword patterns inside the scene-type name.
+            pattern_context = self._detect_from_scene_name_patterns(scene_type)
+            if pattern_context:
+                self.logger.debug(f"Cultural context detected from name patterns: {pattern_context}")
+                return pattern_context
+            # 3) Cultural cues among the detected objects.
+            object_context = self._detect_from_object_analysis(detected_objects)
+            if object_context:
+                self.logger.debug(f"Cultural context detected from object analysis: {object_context}")
+                return object_context
+            # Nothing pointed to a specific culture.
+            self.logger.debug("No specific cultural context detected")
+            return None
+        except Exception as exc:
+            self.logger.warning(f"Error detecting cultural context: {str(exc)}")
+            return None
+    def _detect_from_scene_name_patterns(self, scene_type: str) -> Optional[str]:
+        """
+        Infer the cultural context from keywords contained in the scene-type name.
+        Matching is a case-insensitive substring test. Cultures are checked in
+        the order asian, european, mediterranean, american; the first culture
+        with a matching keyword wins.
+        Args:
+            scene_type: Scene type name.
+        Returns:
+            Optional[str]: The matched cultural context, or None when no keyword
+                matches or an error occurs.
+        """
+        try:
+            normalized_name = scene_type.lower()
+            keyword_table = (
+                # Asia
+                ("asian", (
+                    "asian", "chinese", "japanese", "korean", "thai", "vietnamese",
+                    "temple", "pagoda", "zen", "oriental", "bamboo", "tatami",
+                )),
+                # Europe
+                ("european", (
+                    "european", "french", "italian", "spanish", "german", "british",
+                    "plaza", "piazza", "cathedral", "gothic", "baroque", "renaissance",
+                    "cafe", "bistro", "pub",
+                )),
+                # Mediterranean
+                ("mediterranean", (
+                    "mediterranean", "greek", "turkish", "coastal", "terrace",
+                    "villa", "courtyard",
+                )),
+                # America
+                ("american", (
+                    "american", "diner", "fast_food", "mall", "suburban",
+                    "downtown", "strip_mall",
+                )),
+            )
+            for culture, keywords in keyword_table:
+                if any(keyword in normalized_name for keyword in keywords):
+                    return culture
+            return None
+        except Exception as exc:
+            self.logger.warning(f"Error detecting cultural context from scene name patterns: {str(exc)}")
+            return None
+    def _detect_from_object_analysis(self, detected_objects: List[Dict]) -> Optional[str]:
+        """
+        基於檢測物件分析文化特徵
+        Args:
+            detected_objects: 檢測到的物件列表
+        Returns:
+            Optional[str]: 檢測到的文化語境或None
+        """
+        try:
+            if not detected_objects:
+                return None
+            # 統計文化相關物件
+            cultural_indicators = {
+                "asian": 0,
+                "european": 0,
+                "american": 0,
+                "mediterranean": 0
+            }
+            for obj in detected_objects:
+                class_name = obj.get("class_name", "").lower()
+                # Asia 特色
+                if any(indicator in class_name for indicator in [
+                    "lantern", "chopsticks", "rice", "noodles", "tea",
+                    "bamboo", "pagoda", "shrine", "torii"
+                ]):
+                    cultural_indicators["asian"] += 1
+                # 歐洲的特色
+                elif any(indicator in class_name for indicator in [
+                    "wine", "cheese", "bread", "fountain", "column",
+                    "statue", "cathedral", "clock_tower"
+                ]):
+                    cultural_indicators["european"] += 1
+                # 地中海的特色
+                elif any(indicator in class_name for indicator in [
+                    "olive", "terracotta", "pergola", "villa",
+                    "coastal", "maritime"
+                ]):
+                    cultural_indicators["mediterranean"] += 1
+                # 美國的特色
+                elif any(indicator in class_name for indicator in [
+                    "burger", "pizza", "hotdog", "soda",
+                    "drive_through", "parking_lot"
+                ]):
+                    cultural_indicators["american"] += 1
+            # 找出得分最高的文化語境
+            if max(cultural_indicators.values()) > 0:
+                dominant_culture = max(cultural_indicators.items(), key=lambda x: x[1])[0]
+                max_score = cultural_indicators[dominant_culture]
+                # 需要至少2個指標物件才算有效檢測
+                if max_score >= 2:
+                    return dominant_culture
+            return None
+        except Exception as e:
+            self.logger.warning(f"Error detecting cultural context from object analysis: {str(e)}")
+            return None
+    def generate_cultural_elements(self, cultural_context: str) -> str:
+        """
+        為檢測到的文化語境生成描述元素
+        Args:
+            cultural_context: 檢測到的文化語境
+        Returns:
+            str: 文化元素描述
+        Raises:
+            CulturalContextError: 當文化元素生成失敗時
+        """
+        try:
+            if not cultural_context:
+                return ""
+            self.logger.debug(f"Generating cultural elements for context: {cultural_context}")
+            # 獲取該文化語境的模板
+            if cultural_context not in self.cultural_templates:
+                self.logger.warning(f"No template found for cultural context: {cultural_context}")
+                return ""
+            template = self.cultural_templates[cultural_context]
+            elements = template.get("elements", [])
+            if not elements:
+                self.logger.warning(f"No elements found for cultural context: {cultural_context}")
+                return ""
+            # 選擇1-2個隨機元素
+            num_elements = min(len(elements), random.randint(1, 2))
+            selected_elements = random.sample(elements, num_elements)
+            # 格式化元素列表
+            if len(selected_elements) == 1:
+                elements_text = selected_elements[0]
+            else:
+                elements_text = " and ".join(selected_elements)
+            # 填充模板
+            description_template = template.get("description", "")
+            if not description_template:
+                return f"The scene displays {cultural_context} cultural characteristics."
+            # 替換佔位符
+            cultural_description = description_template.format(elements=elements_text)
+            self.logger.debug(f"Generated cultural description: {cultural_description}")
+            return cultural_description
+        except Exception as e:
+            error_msg = f"Error generating cultural elements for context '{cultural_context}': {str(e)}"
+            self.logger.error(f"{error_msg}\n{traceback.format_exc()}")
+            raise CulturalContextError(error_msg) from e
+    def get_cultural_template(self, cultural_context: str) -> Dict[str, Any]:
+        """
+        獲取指定文化語境的模板
+        Args:
+            cultural_context: 文化語境名稱
+        Returns:
+            Dict[str, Any]: 文化模板字典
+        """
+        try:
+            if cultural_context in self.cultural_templates:
+                return self.cultural_templates[cultural_context].copy()
+            # 返回備用模板
+            self.logger.warning(f"Cultural template not found for '{cultural_context}', using fallback")
+            return {
+                "elements": ["various cultural elements"],
+                "description": f"The scene displays {cultural_context} cultural characteristics."
+            }
+        except Exception as e:
+            self.logger.warning(f"Error getting cultural template for '{cultural_context}': {str(e)}")
+            return {
+                "elements": ["various elements"],
+                "description": "The scene displays cultural characteristics."
+            }
+    def add_cultural_template(self, cultural_context: str, template: Dict[str, Any]):
+        """
+        添加或更新文化模板
+        Args:
+            cultural_context: 文化語境名稱
+            template: 文化模板字典
+        Raises:
+            CulturalContextError: 當模板格式無效時
+        """
+        try:
+            # 驗證模板格式
+            if not isinstance(template, dict):
+                raise CulturalContextError("Template must be a dictionary")
+            required_keys = ["elements", "description"]
+            for key in required_keys:
+                if key not in template:
+                    raise CulturalContextError(f"Template missing required key: {key}")
+            if not isinstance(template["elements"], list):
+                raise CulturalContextError("Template 'elements' must be a list")
+            if not isinstance(template["description"], str):
+                raise CulturalContextError("Template 'description' must be a string")
+            # 添加模板
+            self.cultural_templates[cultural_context] = template.copy()
+            self.logger.info(f"Added cultural template for context: {cultural_context}")
+        except CulturalContextError:
+            raise
+        except Exception as e:
+            error_msg = f"Error adding cultural template for '{cultural_context}': {str(e)}"
+            self.logger.error(f"{error_msg}\n{traceback.format_exc()}")
+            raise CulturalContextError(error_msg) from e
+    def get_supported_cultures(self) -> List[str]:
+        """
+        獲取所有支援的文化語境列表
+        Returns:
+            List[str]: 支援的文化語境名稱列表
+        """
+        return list(self.cultural_templates.keys())
+    def has_cultural_context(self, cultural_context: str) -> bool:
+        """
+        檢查是否支援指定的文化語境
+        Args:
+            cultural_context: 文化語境名稱
+        Returns:
+            bool: 是否支援該文化語境
+        """
+        return cultural_context in self.cultural_templates
+    def analyze_cultural_diversity(self, detected_objects: List[Dict]) -> Dict[str, int]:
+        """
+        分析場景中的文化多樣性
+        Args:
+            detected_objects: 檢測到的物件列表
+        Returns:
+            Dict[str, int]: 各文化語境的指標物件計數
+        """
+        try:
+            cultural_scores = {culture: 0 for culture in self.cultural_templates.keys()}
+            if not detected_objects:
+                return cultural_scores
+            for obj in detected_objects:
+                class_name = obj.get("class_name", "").lower()
+                # 為每個文化語境計算指標分數
+                for culture in cultural_scores:
+                    if self._is_cultural_indicator(class_name, culture):
+                        cultural_scores[culture] += 1
+            self.logger.debug(f"Cultural diversity analysis: {cultural_scores}")
+            return cultural_scores
+        except Exception as e:
+            self.logger.warning(f"Error analyzing cultural diversity: {str(e)}")
+            return {culture: 0 for culture in self.cultural_templates.keys()}
+    def _is_cultural_indicator(self, object_name: str, culture: str) -> bool:
+        """
+        檢查物件名稱是否為特定文化的指標
+        Args:
+            object_name: 物件名稱
+            culture: 文化語境
+        Returns:
+            bool: 是否為該文化的指標物件
+        """
+        try:
+            cultural_keywords = {
+                "asian": [
+                    "lantern", "chopsticks", "rice", "noodles", "tea",
+                    "bamboo", "pagoda", "shrine", "torii", "kimono",
+                    "sushi", "ramen", "dim_sum"
+                ],
+                "european": [
+                    "wine", "cheese", "bread", "fountain", "column",
+                    "statue", "cathedral", "clock_tower", "baguette",
+                    "croissant", "espresso", "gelato"
+                ],
+                "mediterranean": [
+                    "olive", "terracotta", "pergola", "villa",
+                    "coastal", "maritime", "cypress", "vineyard"
+                ],
+                "american": [
+                    "burger", "pizza", "hotdog", "soda",
+                    "drive_through", "parking_lot", "diner",
+                    "strip_mall", "suburb"
+                ]
+            }
+            if culture not in cultural_keywords:
+                return False
+            keywords = cultural_keywords[culture]
+            return any(keyword in object_name for keyword in keywords)
+        except Exception as e:
+            self.logger.warning(f"Error checking cultural indicator for {object_name}, {culture}: {str(e)}")
+            return False
+    def get_template_summary(self) -> Dict[str, Dict[str, Any]]:
+        """
+        獲取所有文化模板的摘要信息
+        Returns:
+            Dict[str, Dict[str, Any]]: 文化模板摘要
+        """
+        try:
+            summary = {}
+            for culture, template in self.cultural_templates.items():
+                summary[culture] = {
+                    "element_count": len(template.get("elements", [])),
+                    "has_description": bool(template.get("description", "")),
+                    "sample_elements": template.get("elements", [])[:3]  # 前3個元素作為樣本
+                }
+            return summary
+        except Exception as e:
+            self.logger.warning(f"Error generating template summary: {str(e)}")
+            return {}

enhanced_scene_describer.py ADDED Viewed

	@@ -0,0 +1,1254 @@

+import os
+import re
+import json
+import logging
+import random
+import numpy as np
+from typing import Dict, List, Tuple, Any, Optional
+from scene_type import SCENE_TYPES
+from scene_detail_templates import SCENE_DETAIL_TEMPLATES
+from object_template_fillers import OBJECT_TEMPLATE_FILLERS
+from lighting_conditions import LIGHTING_CONDITIONS
+from viewpoint_templates import VIEWPOINT_TEMPLATES
+from cultural_templates import CULTURAL_TEMPLATES
+from confidence_templates import CONFIDENCE_TEMPLATES
+from landmark_data import ALL_LANDMARKS
+from region_analyzer import RegionAnalyzer
+from viewpoint_detector import ViewpointDetector, ViewpointDetectionError
+from template_manager import TemplateManager, TemplateLoadingError, TemplateFillError
+from object_description_generator import ObjectDescriptionGenerator, ObjectDescriptionError
+from cultural_context_analyzer import CulturalContextAnalyzer, CulturalContextError
+from text_formatter import TextFormatter, TextFormattingError
+class EnhancedSceneDescriberError(Exception):
+    """場景描述生成過程中的自定義異常"""
+    pass
+class EnhancedSceneDescriber:
+    """
+    增強場景描述器 - 提供詳細自然語言場景描述的主要窗口，其他相關class匯集於此
+    此class會協調多個專門組件來生成高質量的場景描述，包括視角檢測、
+    模板管理、物件描述、文化語境分析和文本格式化。
+    """
+    def __init__(self, templates_db: Optional[Dict] = None, scene_types: Optional[Dict] = None, spatial_analyzer_instance: Optional[Any] = None):
+        """
+        初始化增強場景描述器
+        Args:
+            templates_db: 可選的自定義模板數據庫
+            scene_types: 場景類型定義字典
+            spatial_analyzer_instance: 空間分析器實例（保持兼容性）
+        """
+        self.logger = logging.getLogger(self.__class__.__name__)
+        self.logger.setLevel(logging.INFO)
+        # 如果沒有logger，就加一個
+        if not self.logger.hasHandlers():
+            handler = logging.StreamHandler()
+            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+            handler.setFormatter(formatter)
+            self.logger.addHandler(handler)
+        try:
+            # 載入場景類型定義
+            self.scene_types = scene_types or self._load_default_scene_types()
+            # 初始化子組件
+            self._initialize_components(templates_db)
+            # 保存空間分析器實例以保持兼容性
+            self.spatial_analyzer_instance = spatial_analyzer_instance
+            self.logger.info("EnhancedSceneDescriber initialized successfully with %d scene types",
+                           len(self.scene_types))
+        except Exception as e:
+            error_msg = f"Failed to initialize EnhancedSceneDescriber: {str(e)}"
+            self.logger.error(f"{error_msg}\n{e.__class__.__name__}: {str(e)}")
+            raise EnhancedSceneDescriberError(error_msg) from e
+    def _load_default_scene_types(self) -> Dict:
+        """
+        載入默認場景類型
+        Returns:
+            Dict: 場景類型定義
+        """
+        try:
+            return SCENE_TYPES
+        except Exception as e:
+            self.logger.error(f"Failed to import SCENE_TYPES: {str(e)}")
+            return {}  # 返回空字典
+    def _initialize_components(self, templates_db: Optional[Dict]):
+        """
+        Initialize all sub-components used by the describer.
+        The components are created in a fixed order; the object description
+        generator receives the region analyzer created just before it. If any
+        constructor raises, the error is logged and
+        ``self._initialize_fallback_components()`` is called instead of
+        propagating the exception.
+        Args:
+            templates_db: Optional template database, passed to the template
+                manager as ``custom_templates_db``.
+        """
+        try:
+            # Initialize the viewpoint detector
+            self.viewpoint_detector = ViewpointDetector()
+            # Initialize the region analyzer
+            self.region_analyzer = RegionAnalyzer()
+            # Initialize the template manager
+            self.template_manager = TemplateManager(custom_templates_db=templates_db)
+            # Initialize the object description generator, passing in the region analyzer
+            self.object_description_generator = ObjectDescriptionGenerator(
+                region_analyzer=self.region_analyzer
+            )
+            # Initialize the cultural context analyzer
+            self.cultural_context_analyzer = CulturalContextAnalyzer()
+            # Initialize the text formatter
+            self.text_formatter = TextFormatter()
+            self.logger.debug("All components initialized successfully")
+        except Exception as e:
+            error_msg = f"Component initialization failed: {str(e)}"
+            self.logger.error(error_msg)
+            # Initialize basic components instead of raising.
+            # NOTE(review): _initialize_fallback_components is not visible in this
+            # part of the file - confirm it exists and sets every attribute used later.
+            self._initialize_fallback_components()
+    def generate_description(self, scene_type: str, detected_objects: List[Dict], confidence: float,
+                           lighting_info: Dict, functional_zones: List[str], enable_landmark: bool = True,
+                           scene_scores: Optional[Dict] = None, spatial_analysis: Optional[Dict] = None,
+                           image_dimensions: Optional[Tuple[int, int]] = None, # changed to Tuple
+                           places365_info: Optional[Dict] = None,
+                           object_statistics: Optional[Dict] = None) -> str:
+        """
+        Generate a natural-language description of a scene.
+        The flow is: return a generic description for an "unknown" scene type
+        or a confidence below 0.4; return a landmark description when landmarks
+        are enabled and the scene type or the detected objects indicate one;
+        otherwise detect the viewpoint, pick and apply a template, and append
+        secondary, people, cultural, lighting, viewpoint and functional-zone
+        sentences before deduplicating and formatting the text.
+        Args:
+            scene_type: Identified scene type.
+            detected_objects: List of detected object dictionaries.
+            confidence: Scene classification confidence.
+            lighting_info: Lighting information; its "time_of_day" entry selects
+                the lighting template.
+            functional_zones: Functional zones; annotated as a list of strings,
+                but a dictionary is also handled.
+            enable_landmark: When False, objects flagged "is_landmark" are
+                dropped and landmark references are filtered from the result.
+            scene_scores: Not used in this method.
+            spatial_analysis: Passed through to the template in the scene data.
+            image_dimensions: Not used in this method.
+            places365_info: Optional Places365 result; passed through to the
+                template in the scene data.
+            object_statistics: Optional object statistics for the template.
+        Returns:
+            str: The formatted description. On any exception the error is
+            logged and a generic description (or, if that also fails, a fixed
+            sentence) is returned instead.
+        """
+        try:
+            traffic_list = [obj for obj in detected_objects if obj.get("class_name", "") == "traffic light"]
+            # (the original debug print of the initial traffic light count was replaced by the logger call below)
+            self.logger.debug(f"Initial traffic light count in generate_description: {len(traffic_list)}") # switched to logger
+            # for idx, tl in enumerate(traffic_list): # this log may be too detailed, commented out for now
+            #     self.logger.debug(f"    idx={idx}, confidence={tl.get('confidence', 0):.4f}, bbox={tl.get('bbox')}, region={tl.get('region')}")
+            if scene_type == "unknown" or confidence < 0.4:
+                generic_desc = self._generate_generic_description(detected_objects, lighting_info)
+                return self.text_formatter.format_final_description(generic_desc)
+            current_detected_objects = detected_objects
+            if not enable_landmark:
+                current_detected_objects = [obj for obj in detected_objects if not obj.get("is_landmark", False)]
+            # NOTE(review): places365_context is only written to the debug log below;
+            # it is never added to the returned description.
+            places365_context = ""
+            if places365_info and places365_info.get('confidence', 0) > 0.3:
+                scene_label = places365_info.get('scene_label', '')
+                attributes = places365_info.get('attributes', [])
+                is_indoor = places365_info.get('is_indoor', None)
+                if scene_label:
+                    places365_context = f"Scene context: {scene_label}"
+                    if attributes:
+                        places365_context += f" with characteristics: {', '.join(attributes[:3])}"
+                    if is_indoor is not None:
+                        indoor_outdoor = "indoor" if is_indoor else "outdoor"
+                        places365_context += f" ({indoor_outdoor} environment)"
+                self.logger.debug(f"Enhanced description incorporating Places365 context: {places365_context}")
+            landmark_objects_in_scene = [obj for obj in current_detected_objects if obj.get("is_landmark", False)]
+            has_landmark_in_scene = len(landmark_objects_in_scene) > 0
+            if enable_landmark and (scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"] or has_landmark_in_scene):
+                landmark_desc = self._generate_landmark_description(
+                    scene_type, current_detected_objects, confidence,
+                    lighting_info, functional_zones, landmark_objects_in_scene
+                )
+                return self.text_formatter.format_final_description(landmark_desc)
+            viewpoint = self.viewpoint_detector.detect_viewpoint(current_detected_objects)
+            current_scene_type = scene_type
+            # An aerial viewpoint remaps the scene type to one of the aerial_view_* types.
+            if viewpoint == "aerial":
+                if "intersection" in current_scene_type.lower() or self._is_intersection(current_detected_objects):
+                    current_scene_type = "aerial_view_intersection"
+                elif any(keyword in current_scene_type.lower() for keyword in ["commercial", "shopping", "retail"]):
+                    current_scene_type = "aerial_view_commercial_area"
+                elif any(keyword in current_scene_type.lower() for keyword in ["plaza", "square"]):
+                    current_scene_type = "aerial_view_plaza"
+                else:
+                    current_scene_type = "aerial_view_general"
+            current_scene_type = self._sanitize_scene_type_for_description(current_scene_type)
+            # Detect cultural context information (skipped for aerial views)
+            cultural_context = None
+            if viewpoint != "aerial":
+                cultural_context = self.cultural_context_analyzer.detect_cultural_context(current_scene_type, current_detected_objects)
+             # Set the base description
+            base_description = "A scene"
+            if viewpoint == "aerial":
+                if current_scene_type in self.scene_types: # make sure self.scene_types has this entry
+                    base_description = self.scene_types.get(current_scene_type, {}).get("description", "An aerial view showing the layout and movement patterns from above")
+                else:
+                    base_description = "An aerial view showing the layout and movement patterns from above"
+            elif current_scene_type in self.scene_types: # make sure self.scene_types has this entry
+                 base_description = self.scene_types.get(current_scene_type, {}).get("description", "A scene")
+            # Assumes template_manager can handle functional_zones given as List[str]
+            selected_template = self.template_manager.get_template_by_scene_type(
+                scene_type=current_scene_type,
+                detected_objects=current_detected_objects,
+                functional_zones=functional_zones or [] # pass a List[str]
+            )
+            # Used for some placeholders in fill_template
+            processed_functional_zones = {}
+            if functional_zones:
+                if isinstance(functional_zones, dict): # the caller already passed a dict
+                     processed_functional_zones = functional_zones
+                elif isinstance(functional_zones, list): # a list of strings
+                     processed_functional_zones = {f"zone_{i}": {"description": zone_desc} for i, zone_desc in enumerate(functional_zones)}
+            # Assemble the scene data
+            scene_data = {
+                "detected_objects": current_detected_objects,
+                "functional_zones": processed_functional_zones, # pass the processed dictionary
+                "scene_type": current_scene_type,
+                "object_statistics": object_statistics or {},
+                "lighting_info": lighting_info,
+                "spatial_analysis": spatial_analysis,
+                "places365_info": places365_info
+            }
+            # Apply the template to produce the core scene description
+            core_scene_details = self.template_manager.apply_template(selected_template, scene_data)
+            # Combine the base description with the core scene details
+            description = base_description
+            if core_scene_details and core_scene_details.strip():
+                cleaned_scene_details = self._validate_and_clean_scene_details(core_scene_details)
+                if base_description.lower() == "a scene" and len(cleaned_scene_details) > len(base_description):
+                    description = cleaned_scene_details
+                else:
+                    description = self.text_formatter.smart_append(description, cleaned_scene_details)
+            # NOTE(review): description starts as base_description, which defaults to
+            # "A scene"; this branch only runs if a scene type supplies an empty description.
+            elif not core_scene_details and not description: # both are empty
+                description = self._generate_generic_description(current_detected_objects, lighting_info)
+            # Add secondary description information
+            if current_scene_type in self.scene_types and "secondary_description" in self.scene_types[current_scene_type]:
+                secondary_desc = self.scene_types[current_scene_type]["secondary_description"]
+                if secondary_desc:
+                    description = self.text_formatter.smart_append(description, secondary_desc)
+            # Handle people-related description (objects with class_id 0 are counted as people)
+            people_objs = [obj for obj in current_detected_objects if obj.get("class_id") == 0]
+            if people_objs:
+                people_count = len(people_objs)
+                if people_count == 1: people_phrase = "a single person"
+                elif 1 < people_count <= 3: people_phrase = f"{people_count} people"
+                elif 3 < people_count <= 7: people_phrase = "several people"
+                else: people_phrase = "multiple people"
+                # Only mention people when the text does not already do so.
+                if not any(p_word in description.lower() for p_word in ["person", "people", "pedestrian"]):
+                    description = self.text_formatter.smart_append(description, f"The scene includes {people_phrase}.")
+            # Add cultural context elements (non-aerial viewpoints only)
+            if cultural_context and viewpoint != "aerial":
+                cultural_elements = self.cultural_context_analyzer.generate_cultural_elements(cultural_context)
+                if cultural_elements:
+                    description = self.text_formatter.smart_append(description, cultural_elements)
+            # Handle the lighting condition description
+            lighting_description_text = ""
+            if lighting_info and "time_of_day" in lighting_info:
+                lighting_type = lighting_info["time_of_day"]
+                lighting_desc_template = self.template_manager.get_lighting_template(lighting_type)
+                if lighting_desc_template: lighting_description_text = lighting_desc_template
+            if lighting_description_text and lighting_description_text.lower() not in description.lower():
+                description = self.text_formatter.smart_append(description, lighting_description_text)
+             # Add a viewpoint-specific observation
+            if viewpoint != "eye_level":
+                viewpoint_template = self.template_manager.get_viewpoint_template(viewpoint)
+                prefix = viewpoint_template.get('prefix', '')
+                observation_template = viewpoint_template.get("observation", "")
+                scene_elements_for_vp = "the overall layout and objects"
+                if viewpoint == "aerial": scene_elements_for_vp = "crossing patterns and general layout"
+                viewpoint_observation_text = observation_template.format(scene_elements=scene_elements_for_vp)
+                full_viewpoint_text = ""
+                if prefix:
+                    full_viewpoint_text = prefix.strip() + " "
+                    if viewpoint_observation_text and viewpoint_observation_text[0].islower():
+                        full_viewpoint_text += viewpoint_observation_text
+                    elif viewpoint_observation_text:
+                        # NOTE(review): this branch rebuilds the text from the unstripped prefix
+                        # without the separating space added above - confirm this is intended.
+                        full_viewpoint_text = prefix + (viewpoint_observation_text[0].lower() + viewpoint_observation_text[1:] if description else viewpoint_observation_text)
+                elif viewpoint_observation_text:
+                    full_viewpoint_text = viewpoint_observation_text[0].upper() + viewpoint_observation_text[1:]
+                if full_viewpoint_text and full_viewpoint_text.lower() not in description.lower():
+                    description = self.text_formatter.smart_append(description, full_viewpoint_text)
+            # describe_functional_zones needs a dictionary, so convert or adapt the input
+            if functional_zones and len(functional_zones) > 0:
+                if isinstance(functional_zones, dict):
+                     zones_desc_text = self.object_description_generator.describe_functional_zones(functional_zones)
+                else: # a list of strings
+                     temp_zones_dict = {f"area_{i}": {"description": desc} for i, desc in enumerate(functional_zones)}
+                     zones_desc_text = self.object_description_generator.describe_functional_zones(temp_zones_dict)
+                if zones_desc_text:
+                    description = self.text_formatter.smart_append(description, zones_desc_text)
+            # Avoid repeated mentions
+            # NOTE(review): a method named deduplicate_sentences_in_description is defined on
+            # this class right after generate_description, but this check looks for it on
+            # self.text_formatter - confirm TextFormatter provides it.
+            if hasattr(self.text_formatter, 'deduplicate_sentences_in_description'):
+                deduplicated_description = self.text_formatter.deduplicate_sentences_in_description(description)
+                self.logger.info(f"Description before pre-LLM deduplication (len {len(description)}): '{description[:150]}...'")
+                self.logger.info(f"Description after pre-LLM deduplication (len {len(deduplicated_description)}): '{deduplicated_description[:150]}...'")
+                description = deduplicated_description # replace description with the deduplicated version
+            else:
+                self.logger.warning("TextFormatter does not have 'deduplicate_sentences_in_description'. Skipping pre-LLM deduplication of the internally generated description.")
+            # Format the final description
+            final_formatted_description = self.text_formatter.format_final_description(description)
+            # When landmarks are disabled, filter out landmark references
+            if not enable_landmark:
+                final_formatted_description = self.text_formatter.filter_landmark_references(final_formatted_description, enable_landmark=False)
+            # If the description ended up empty, use a fallback description
+            if not final_formatted_description.strip() or final_formatted_description.strip() == ".":
+                self.logger.warning(f"Description for scene_type '{current_scene_type}' became empty after processing. Falling back.")
+                final_formatted_description = self.text_formatter.format_final_description(
+                    self._generate_generic_description(current_detected_objects, lighting_info)
+                )
+            return final_formatted_description
+        except Exception as e:
+            error_msg = f"Error generating scene description: {str(e)}"
+            self.logger.error(f"{error_msg}\n{e.__class__.__name__}: {str(e)}")
+            try:
+                fallback_desc = self._generate_generic_description(detected_objects, lighting_info)
+                return self.text_formatter.format_final_description(fallback_desc)
+            # NOTE(review): a bare except also catches KeyboardInterrupt and SystemExit;
+            # consider narrowing it to `except Exception`.
+            except:
+                return "A scene with various elements is visible."
+    def deduplicate_sentences_in_description(self, description: str, similarity_threshold: float = 0.80) -> str:
+        """
+        從一段描述文本中移除重複或高度相似的句子。
+        此方法會嘗試保留更長、資訊更豐富的句子版本。
+        Args:
+            description (str): 原始描述文本。
+            similarity_threshold (float): 判斷句子是否相似的 Jaccard 相似度閾值 (0 到 1)。
+                                         預設為 0.8，表示詞彙重疊度達到80%即視為相似。
+        Returns:
+            str: 移除了重複或高度相似句子後的文本。
+        """
+        try:
+            if not description or not description.strip():
+                self.logger.debug("deduplicate_sentences_in_description: Received empty or blank description.")
+                return ""
+            # 使用正則表達式分割句子，保留句尾標點符號
+            sentences = re.split(r'(?<=[.!?])\s+', description.strip())
+            if not sentences:
+                self.logger.debug("deduplicate_sentences_in_description: No sentences found after splitting.")
+                return ""
+            unique_sentences_data = []  # 存儲 (原始句子文本, 該句子的詞彙集合)
+            for current_sentence_text in sentences:
+                current_sentence_text = current_sentence_text.strip()
+                if not current_sentence_text:
+                    continue
+                # 預處理當前句子以進行比較：轉小寫、移除標點、分割成詞彙集合
+                simplified_current_text = re.sub(r'[^\w\s\d]', '', current_sentence_text.lower()) # 保留數字
+                current_sentence_words = set(simplified_current_text.split())
+                if not current_sentence_words: # 如果處理後是空集合 (例如句子只包含標點)
+                    # 如果原始句子有內容（例如只有一個標點），就保留它
+                    if current_sentence_text and not unique_sentences_data: # 避免在開頭加入孤立標點
+                         unique_sentences_data.append((current_sentence_text, current_sentence_words))
+                    continue
+                is_subsumed_or_highly_similar = False
+                index_to_replace = -1
+                for i, (kept_sentence_text, kept_sentence_words) in enumerate(unique_sentences_data):
+                    if not kept_sentence_words: # 跳過已保留的空詞彙集合
+                        continue
+                    # 計算 Jaccard 相似度
+                    intersection_len = len(current_sentence_words.intersection(kept_sentence_words))
+                    union_len = len(current_sentence_words.union(kept_sentence_words))
+                    jaccard_similarity = 0.0
+                    if union_len > 0:
+                        jaccard_similarity = intersection_len / union_len
+                    elif not current_sentence_words and not kept_sentence_words: # 兩個都是空的
+                        jaccard_similarity = 1.0
+                    if jaccard_similarity >= similarity_threshold:
+                        # 如果當前句子比已保留的句子長，則標記替換舊的
+                        if len(current_sentence_words) > len(kept_sentence_words):
+                            self.logger.debug(f"Deduplication: Replacing shorter \"{kept_sentence_text[:50]}...\" "
+                                              f"with longer similar \"{current_sentence_text[:50]}...\" (Jaccard: {jaccard_similarity:.2f})")
+                            index_to_replace = i
+                            break # 找到一個可以被替換的，就跳出內層循環
+                        # 如果當前句子比已保留的句子短，或者長度相近但內容高度相似，則標記當前句子為重複
+                        else: # current_sentence_words is shorter or of similar length
+                            is_subsumed_or_highly_similar = True
+                            self.logger.debug(f"Deduplication: Current sentence \"{current_sentence_text[:50]}...\" "
+                                              f"is subsumed by or highly similar to \"{kept_sentence_text[:50]}...\" (Jaccard: {jaccard_similarity:.2f}). Skipping.")
+                            break
+                if index_to_replace != -1:
+                    unique_sentences_data[index_to_replace] = (current_sentence_text, current_sentence_words)
+                elif not is_subsumed_or_highly_similar:
+                    unique_sentences_data.append((current_sentence_text, current_sentence_words))
+            # 從 unique_sentences_data 中提取最終的句子文本
+            final_sentences = [s_data[0] for s_data in unique_sentences_data]
+            # 重組句子，確保每個句子以標點符號結尾，並且句子間有空格
+            reconstructed_response = ""
+            for i, s_text in enumerate(final_sentences):
+                s_text = s_text.strip()
+                if not s_text:
+                    continue
+                # 確保句子以標點結尾
+                if not re.search(r'[.!?]$', s_text):
+                    s_text += "."
+                reconstructed_response += s_text
+                if i < len(final_sentences) - 1: # 如果不是最後一句，添加空格
+                    reconstructed_response += " "
+            self.logger.debug(f"Deduplicated description (len {len(reconstructed_response.strip())}): '{reconstructed_response.strip()[:150]}...'")
+            return reconstructed_response.strip()
+        except Exception as e:
+            self.logger.error(f"Error in deduplicate_sentences_in_description: {str(e)}")
+            self.logger.error(traceback.format_exc())
+            return description # 發生錯誤時返回原始描述
+    def _extract_placeholders(self, template: str) -> List[str]:
+        """提取模板中的佔位符"""
+        import re
+        return re.findall(r'\{([^}]+)\}', template)
+    def _generate_placeholder_content(self, placeholder: str, detected_objects: List[Dict],
+                                    functional_zones: List, scene_type: str,
+                                    object_statistics: Dict) -> str:
+        """生成佔位符內容"""
+        all_replacements = self._generate_default_replacements()
+        return self._get_placeholder_replacement(
+            placeholder, {}, all_replacements, detected_objects, scene_type
+        )
+    def _preprocess_functional_zones(self, functional_zones: List) -> Dict:
+        """預處理功能區域數據"""
+        if isinstance(functional_zones, list):
+            # 將列表轉換為字典格式
+            zones_dict = {}
+            for i, zone in enumerate(functional_zones):
+                if isinstance(zone, str):
+                    zones_dict[f"area {i+1}"] = {"description": zone}
+                elif isinstance(zone, dict):
+                    zones_dict[f"area {i+1}"] = zone
+            return zones_dict
+        elif isinstance(functional_zones, dict):
+            return functional_zones
+        else:
+            return {}
+    def _standardize_placeholder_content(self, content: str, placeholder_type: str) -> str:
+        """標準化佔位符內容"""
+        if not content:
+            return "various elements"
+        return content.strip()
+    def _finalize_description_output(self, description: str) -> str:
+        """最終化描述輸出"""
+        if not description:
+            return "A scene featuring various elements and organized areas of activity."
+        # 基本清理
+        import re
+        finalized = re.sub(r'\s+', ' ', description).strip()
+        # 確保適當結尾
+        if finalized and not finalized.endswith(('.', '!', '?')):
+            finalized += '.'
+        # 首字母大寫
+        if finalized:
+            finalized = finalized[0].upper() + finalized[1:] if len(finalized) > 1 else finalized.upper()
+        return finalized
+    def _sanitize_scene_type_for_description(self, scene_type: str) -> str:
+        """
+        清理場景類型名稱，確保不包含內部標識符格式
+        Args:
+            scene_type: 原始場景類型名稱
+        Returns:
+            str: 清理後的場景類型名稱
+        """
+        try:
+            # 移除下劃線並轉換為空格分隔的自然語言
+            cleaned_type = scene_type.replace('_', ' ')
+            # 確保不直接在描述中使用技術性場景類型名稱
+            return cleaned_type
+        except Exception as e:
+            self.logger.warning(f"Error sanitizing scene type '{scene_type}': {str(e)}")
+            return "general scene"
+    def _validate_and_clean_scene_details(self, scene_details: str) -> str:
+        """
+        驗證並清理場景詳細信息，移除可能的模板填充錯誤
+        Args:
+            scene_details: 原始場景詳細信息
+        Returns:
+            str: 清理後的場景詳細信息
+        """
+        try:
+            if not scene_details or not scene_details.strip():
+                return ""
+            cleaned = scene_details.strip()
+            # 移除常見的模板填充錯誤模式
+            import re
+            # 修復 "In ," 類型的錯誤
+            cleaned = re.sub(r'\bIn\s*,\s*', 'In this scene, ', cleaned)
+            cleaned = re.sub(r'\bAt\s*,\s*', 'At this location, ', cleaned)
+            cleaned = re.sub(r'\bWithin\s*,\s*', 'Within this area, ', cleaned)
+            # 移除內部標識符格式
+            cleaned = re.sub(r'\b\w+_\w+(?:_\w+)*\b(?!\s+(area|zone|region))',
+                            lambda m: m.group(0).replace('_', ' '), cleaned)
+            # 確保句子完整性
+            if cleaned and not cleaned.endswith(('.', '!', '?')):
+                cleaned += '.'
+            return cleaned
+        except Exception as e:
+            self.logger.warning(f"Error validating scene details: {str(e)}")
+            return scene_details if scene_details else ""
+    def _generate_landmark_description(self,
+                                     scene_type: str,
+                                     detected_objects: List[Dict],
+                                     confidence: float,
+                                     lighting_info: Optional[Dict] = None,
+                                     functional_zones: Optional[Dict] = None,
+                                     landmark_objects: Optional[List[Dict]] = None) -> str:
+        """
+        生成包含地標信息的場景描述
+        Args:
+            scene_type: 識別的場景類型
+            detected_objects: 檢測到的物件列表
+            confidence: 場景分類置信度
+            lighting_info: 照明條件信息
+            functional_zones: 功能區域信息
+            landmark_objects: 識別為地標的物件列表
+        Returns:
+            str: 包含地標信息的自然語言場景描述
+        """
+        try:
+            # 如果沒有提供地標物件，從檢測物件中篩選
+            if landmark_objects is None:
+                landmark_objects = [obj for obj in detected_objects if obj.get("is_landmark", False)]
+            # 如果沒有地標，退回到標準描述
+            if not landmark_objects:
+                if scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"]:
+                    base_description = "A scenic area that appears to be a tourist destination, though specific landmarks are not clearly identifiable."
+                else:
+                    return self.text_formatter.format_final_description(self._generate_scene_details(
+                        scene_type,
+                        detected_objects,
+                        lighting_info,
+                        self.viewpoint_detector.detect_viewpoint(detected_objects)
+                    ))
+            else:
+                # 獲取主要地標
+                primary_landmark = max(landmark_objects, key=lambda x: x.get("confidence", 0))
+                landmark_name = primary_landmark.get("class_name", "landmark")
+                # 先取原生 location
+                landmark_location = primary_landmark.get("location", "")
+                # 如果 location 為空，就從全域 ALL_LANDMARKS 補上
+                lm_id = primary_landmark.get("landmark_id")
+                if not landmark_location and lm_id and lm_id in ALL_LANDMARKS:
+                    landmark_location = ALL_LANDMARKS[lm_id].get("location", "")
+                # 根據地標類型選擇適當的描述模板，並插入 location
+                if scene_type == "natural_landmark" or primary_landmark.get("landmark_type") == "natural":
+                    base_description = f"A natural landmark scene featuring {landmark_name} in {landmark_location}."
+                elif scene_type == "historical_monument" or primary_landmark.get("landmark_type") == "monument":
+                    base_description = f"A historical monument scene showcasing {landmark_name}, a significant landmark in {landmark_location}."
+                else:
+                    base_description = f"A tourist landmark scene centered around {landmark_name}, an iconic structure in {landmark_location}."
+            # 添加地標的額外信息
+            landmark_details = []
+            for landmark in landmark_objects:
+                details = []
+                if "year_built" in landmark:
+                    details.append(f"built in {landmark['year_built']}")
+                if "architectural_style" in landmark:
+                    details.append(f"featuring {landmark['architectural_style']} architectural style")
+                if "significance" in landmark:
+                    details.append(landmark["significance"])
+                # 補 location（如果該物件沒有 location，就再從 ALL_LANDMARKS 撈一次）
+                loc = landmark.get("location", "")
+                lm_id_iter = landmark.get("landmark_id")
+                if not loc and lm_id_iter and lm_id_iter in ALL_LANDMARKS:
+                    loc = ALL_LANDMARKS[lm_id_iter].get("location", "")
+                if loc:
+                    details.append(f"located in {loc}")
+                if details:
+                    landmark_details.append(f"{landmark['class_name']} ({', '.join(details)})")
+            # 將詳細信息添加到基本描述中
+            if landmark_details:
+                description = base_description + " The scene features " + ", ".join(landmark_details) + "."
+            else:
+                description = base_description
+            # 獲取視角
+            viewpoint = self.viewpoint_detector.detect_viewpoint(detected_objects)
+            # 生成人員活動描述
+            people_count = len([obj for obj in detected_objects if obj["class_id"] == 0])
+            if people_count > 0:
+                if people_count == 1:
+                    people_description = "There is one person in the scene, likely a tourist or visitor."
+                elif people_count < 5:
+                    people_description = f"There are {people_count} people in the scene, possibly tourists visiting the landmark."
+                else:
+                    people_description = f"The scene includes a group of {people_count} people, indicating this is a popular tourist destination."
+                description = self.text_formatter.smart_append(description, people_description)
+            # 添加照明信息
+            if lighting_info and "time_of_day" in lighting_info:
+                lighting_type = lighting_info["time_of_day"]
+                lighting_description = self.template_manager.get_lighting_template(lighting_type)
+                description = self.text_formatter.smart_append(description, lighting_description)
+            # 添加視角描述
+            if viewpoint != "eye_level":
+                viewpoint_template = self.template_manager.get_viewpoint_template(viewpoint)
+                prefix = viewpoint_template.get('prefix', '')
+                if prefix and not description.startswith(prefix):
+                    if description and description[0].isupper():
+                        description = prefix + description[0].lower() + description[1:]
+                    else:
+                        description = prefix + description
+                viewpoint_desc = viewpoint_template.get("observation", "").format(
+                    scene_elements="the landmark and surrounding area"
+                )
+                if viewpoint_desc and viewpoint_desc not in description:
+                    description = self.text_formatter.smart_append(description, viewpoint_desc)
+            # 添加功能區域描述
+            if functional_zones and len(functional_zones) > 0:
+                zones_desc = self.object_description_generator.describe_functional_zones(functional_zones)
+                if zones_desc:
+                    description = self.text_formatter.smart_append(description, zones_desc)
+            # 描述可能的活動
+            landmark_activities = []
+            if scene_type == "natural_landmark" or any(obj.get("landmark_type") == "natural" for obj in landmark_objects):
+                landmark_activities = [
+                    "nature photography",
+                    "scenic viewing",
+                    "hiking or walking",
+                    "guided nature tours",
+                    "outdoor appreciation"
+                ]
+            elif scene_type == "historical_monument" or any(obj.get("landmark_type") == "monument" for obj in landmark_objects):
+                landmark_activities = [
+                    "historical sightseeing",
+                    "educational tours",
+                    "cultural appreciation",
+                    "photography of historical architecture",
+                    "learning about historical significance"
+                ]
+            else:
+                landmark_activities = [
+                    "sightseeing",
+                    "taking photographs",
+                    "guided tours",
+                    "cultural tourism",
+                    "souvenir shopping"
+                ]
+            # 添加活動描述
+            if landmark_activities:
+                activities_text = "Common activities at this location include " + ", ".join(landmark_activities[:3]) + "."
+                description = self.text_formatter.smart_append(description, activities_text)
+            return self.text_formatter.format_final_description(description)
+        except Exception as e:
+            self.logger.warning(f"Error generating landmark description: {str(e)}")
+            # 備用處理
+            return self.text_formatter.format_final_description(
+                "A landmark scene with notable architectural or natural features."
+            )
+    def _is_intersection(self, detected_objects: List[Dict]) -> bool:
+        """
+        通過分析物件分布來判斷場景是否為十字路口
+        Args:
+            detected_objects: 檢測到的物件列表
+        Returns:
+            bool: 是否為十字路口
+        """
+        try:
+            pedestrians = [obj for obj in detected_objects if obj.get("class_id") == 0]
+            if len(pedestrians) >= 8:
+                positions = [obj.get("normalized_center", (0, 0)) for obj in pedestrians]
+                x_coords = [pos[0] for pos in positions]
+                y_coords = [pos[1] for pos in positions]
+                x_variance = np.var(x_coords) if len(x_coords) > 1 else 0
+                y_variance = np.var(y_coords) if len(y_coords) > 1 else 0
+                x_range = max(x_coords) - min(x_coords)
+                y_range = max(y_coords) - min(y_coords)
+                if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3:
+                    return True
+            return False
+        except Exception as e:
+            self.logger.warning(f"Error detecting intersection: {str(e)}")
+            return False
+    def _generate_generic_description(self, detected_objects: List[Dict], lighting_info: Optional[Dict] = None) -> str:
+        """
+        當場景類型未知或置信度極低時生成通用描述
+        Args:
+            detected_objects: 檢測到的物件列表
+            lighting_info: 可選的照明條件信息
+        Returns:
+            str: 基於檢測物件的通用描述
+        """
+        try:
+            obj_counts = {}
+            for obj in detected_objects:
+                class_name = obj.get("class_name", "unknown object")
+                if class_name not in obj_counts:
+                    obj_counts[class_name] = 0
+                obj_counts[class_name] += 1
+            top_objects = sorted(obj_counts.items(), key=lambda x: x[1], reverse=True)[:5]
+            if not top_objects:
+                base_desc = "This scene displays various elements, though specific objects are not clearly identifiable."
+            else:
+                objects_text = []
+                for name, count in top_objects:
+                    # 確保物件名稱不包含技術性格式
+                    clean_name = name.replace('_', ' ') if isinstance(name, str) else str(name)
+                    if count > 1:
+                        objects_text.append(f"{count} {clean_name}s")
+                    else:
+                        objects_text.append(f"a {clean_name}" if clean_name[0].lower() not in 'aeiou' else f"an {clean_name}")
+                if len(objects_text) == 1:
+                    objects_list = objects_text[0]
+                elif len(objects_text) == 2:
+                    objects_list = f"{objects_text[0]} and {objects_text[1]}"
+                else:
+                    objects_list = ", ".join(objects_text[:-1]) + f", and {objects_text[-1]}"
+                base_desc = f"This scene features {objects_list}."
+            # 添加照明信息
+            if lighting_info and "time_of_day" in lighting_info:
+                lighting_type = lighting_info["time_of_day"]
+                lighting_desc = self.template_manager.get_lighting_template(lighting_type)
+                base_desc += f" {lighting_desc}"
+            return base_desc
+        except Exception as e:
+            self.logger.warning(f"Error generating generic description: {str(e)}")
+            return "A general scene is visible with various elements."
+    def _generate_scene_details(self,
+                              scene_type: str,
+                              detected_objects: List[Dict],
+                              lighting_info: Optional[Dict] = None,
+                              viewpoint: str = "eye_level",
+                              spatial_analysis: Optional[Dict] = None,
+                              image_dimensions: Optional[Tuple[int, int]] = None,
+                              places365_info: Optional[Dict] = None,
+                              object_statistics: Optional[Dict] = None) -> str:
+        """
+        基於場景類型和檢測物件生成詳細描述
+        Args:
+            scene_type: 識別的場景類型
+            detected_objects: 檢測到的物件列表
+            lighting_info: 可選的照明條件信息
+            viewpoint: 檢測到的視角
+            spatial_analysis: 可選的空間分析結果
+            image_dimensions: 可選的圖像尺寸
+            places365_info: 可選的 Places365 場景分類結果
+            object_statistics: 可選的詳細物件統計信息
+        Returns:
+            str: 詳細場景描述
+        """
+        try:
+            scene_details = ""
+            # 日常場景類型列表
+            everyday_scene_types = [
+                "general_indoor_space", "generic_street_view",
+                "desk_area_workspace", "outdoor_gathering_spot",
+                "kitchen_counter_or_utility_area", "unknown"
+            ]
+            # 預處理場景類型以避免內部格式洩漏
+            processed_scene_type = self._sanitize_scene_type_for_description(scene_type)
+            # 確定場景描述方法
+            is_confident_specific_scene = scene_type not in everyday_scene_types and scene_type in self.template_manager.get_scene_detail_templates(scene_type)
+            treat_as_everyday = scene_type in everyday_scene_types
+            if hasattr(self, 'enable_landmark') and not self.enable_landmark:
+                if scene_type not in ["kitchen", "bedroom", "living_room", "office_workspace", "dining_area", "professional_kitchen"]:
+                    treat_as_everyday = True
+            if treat_as_everyday or not is_confident_specific_scene:
+                self.logger.debug(f"Generating dynamic description for scene_type: {scene_type}")
+                scene_details = self.object_description_generator.generate_dynamic_everyday_description(
+                    detected_objects,
+                    lighting_info,
+                    viewpoint,
+                    spatial_analysis,
+                    image_dimensions,
+                    places365_info,
+                    object_statistics
+                )
+            else:
+                self.logger.debug(f"Using template for scene_type: {scene_type}")
+                templates_list = self.template_manager.get_scene_detail_templates(scene_type, viewpoint)
+                if templates_list:
+                    detail_template = random.choice(templates_list)
+                    scene_details = self.template_manager.fill_template(
+                        detail_template,
+                        detected_objects,
+                        scene_type,
+                        places365_info,
+                        object_statistics
+                    )
+                else:
+                    scene_details = self.object_description_generator.generate_dynamic_everyday_description(
+                        detected_objects, lighting_info, viewpoint, spatial_analysis,
+                        image_dimensions, places365_info, object_statistics
+                    )
+            # 如果禁用地標檢測，過濾地標引用
+            if hasattr(self, 'enable_landmark') and not self.enable_landmark:
+                scene_details = self.text_formatter.filter_landmark_references(scene_details, enable_landmark=False)
+            return scene_details if scene_details else "A scene with some visual elements."
+        except Exception as e:
+            self.logger.warning(f"Error generating scene details: {str(e)}")
+            return "A scene with various elements."
+    def filter_landmark_references(self, text, enable_landmark=True):
+        """
+        動態過濾文本中的地標引用
+        Args:
+            text: 需要過濾的文本
+            enable_landmark: 是否啟用地標功能
+        Returns:
+            str: 過濾後的文本
+        """
+        return self.text_formatter.filter_landmark_references(text, enable_landmark)
+    def get_prominent_objects(self, detected_objects: List[Dict],
+                          min_prominence_score: float = 0.5,
+                          max_categories_to_return: Optional[int] = None,
+                          max_total_objects: Optional[int] = None) -> List[Dict]:
+        """
+        獲取最重要的物件
+        Args:
+            detected_objects: 檢測到的物件列表
+            min_prominence_score: 最小重要性分數閾值，預設為0.5
+            max_categories_to_return: 可選的最大返回類別數量限制
+            max_total_objects: 可選的最大返回物件總數限制
+        Returns:
+            List[Dict]: 重要物件列表
+        """
+        try:
+            # 傳遞所有參數
+            prominent_objects = self.object_description_generator.get_prominent_objects(
+                detected_objects,
+                min_prominence_score,
+                max_categories_to_return
+            )
+            # 如果指定了最大物件總數限制，進行額外過濾
+            if max_total_objects is not None and max_total_objects > 0:
+                # 限制總物件數量，保持重要性排序
+                prominent_objects = prominent_objects[:max_total_objects]
+            # 如果指定了最大類別數量限制，則進行額外過濾
+            if max_categories_to_return is not None and max_categories_to_return > 0:
+                # 按類別分組物件
+                categories_seen = set()
+                filtered_objects = []
+                for obj in prominent_objects:
+                    class_name = obj.get("class_name", "unknown")
+                    if class_name not in categories_seen:
+                        categories_seen.add(class_name)
+                        filtered_objects.append(obj)
+                        # 如果已達到最大類別數量，停止添加新類別
+                        if len(categories_seen) >= max_categories_to_return:
+                            break
+                    elif class_name in categories_seen:
+                        # 如果是已見過的類別，仍然添加該物件
+                        filtered_objects.append(obj)
+                return filtered_objects
+            return prominent_objects
+        except Exception as e:
+            self.logger.warning(f"Error getting prominent objects: {str(e)}")
+            return []
+    def detect_viewpoint(self, detected_objects: List[Dict]) -> str:
+        """
+        檢測圖像視角類型
+        Args:
+            detected_objects: 檢測到的物件列表
+        Returns:
+            str: 檢測到的視角類型
+        """
+        try:
+            return self.viewpoint_detector.detect_viewpoint(detected_objects)
+        except Exception as e:
+            self.logger.warning(f"Error detecting viewpoint: {str(e)}")
+            return "eye_level"
+    def detect_cultural_context(self, scene_type: str, detected_objects: List[Dict]) -> Optional[str]:
+        """
+        檢測場景的文化語境
+        Args:
+            scene_type: 識別的場景類型
+            detected_objects: 檢測到的物件列表
+        Returns:
+            Optional[str]: 檢測到的文化語境或None
+        """
+        try:
+            return self.cultural_context_analyzer.detect_cultural_context(scene_type, detected_objects)
+        except CulturalContextError as e:
+            self.logger.warning(f"Error detecting cultural context: {str(e)}")
+            return None
+    def generate_cultural_elements(self, cultural_context: str) -> str:
+        """
+        為檢測到的文化語境生成描述元素
+        Args:
+            cultural_context: 檢測到的文化語境
+        Returns:
+            str: 文化元素描述
+        """
+        try:
+            return self.cultural_context_analyzer.generate_cultural_elements(cultural_context)
+        except CulturalContextError as e:
+            self.logger.warning(f"Error generating cultural elements: {str(e)}")
+            return ""
+    def format_object_list_for_description(self, objects: List[Dict],
+                                         use_indefinite_article_for_one: bool = False,
+                                         count_threshold_for_generalization: int = -1,
+                                         max_types_to_list: int = 5) -> str:
+        """
+        將物件列表格式化為人類可讀的字符串
+        Args:
+            objects: 物件字典列表
+            use_indefinite_article_for_one: 單個物件是否使用 "a/an"
+            count_threshold_for_generalization: 計數閾值
+            max_types_to_list: 最大物件類型數���
+        Returns:
+            str: 格式化的物件描述字符串
+        """
+        try:
+            return self.object_description_generator.format_object_list_for_description(
+                objects, use_indefinite_article_for_one, count_threshold_for_generalization, max_types_to_list
+            )
+        except ObjectDescriptionError as e:
+            self.logger.warning(f"Error formatting object list: {str(e)}")
+            return "various objects"
+    def get_spatial_description(self, obj: Dict, image_width: Optional[int] = None,
+                              image_height: Optional[int] = None) -> str:
+        """
+        為物件生成空間位置描述
+        Args:
+            obj: 物件字典
+            image_width: 可選的圖像寬度
+            image_height: 可選的圖像高度
+        Returns:
+            str: 空間描述字符串
+        """
+        try:
+            return self.object_description_generator.get_spatial_description(obj, image_width, image_height)
+        except ObjectDescriptionError as e:
+            self.logger.warning(f"Error generating spatial description: {str(e)}")
+            return "in the scene"
+    def optimize_object_description(self, description: str) -> str:
+        """
+        優化物件描述，避免重複列舉相同物件
+        Args:
+            description: 原始描述文本
+        Returns:
+            str: 優化後的描述文本
+        """
+        try:
+            return self.object_description_generator.optimize_object_description(description)
+        except ObjectDescriptionError as e:
+            self.logger.warning(f"Error optimizing object description: {str(e)}")
+            return description
+    def describe_functional_zones(self, functional_zones: Dict) -> str:
+        """
+        生成場景功能區域的描述
+        Args:
+            functional_zones: 識別出的功能區域字典
+        Returns:
+            str: 功能區域描述
+        """
+        try:
+            return self.object_description_generator.describe_functional_zones(functional_zones)
+        except ObjectDescriptionError as e:
+            self.logger.warning(f"Error describing functional zones: {str(e)}")
+            return ""
+    def smart_append(self, current_text: str, new_fragment: str) -> str:
+        """
+        智能地將新文本片段附加到現有文本
+        Args:
+            current_text: 要附加到的現有文本
+            new_fragment: 要附加的新文本片段
+        Returns:
+            str: 合併後的文本
+        """
+        try:
+            return self.text_formatter.smart_append(current_text, new_fragment)
+        except TextFormattingError as e:
+            self.logger.warning(f"Error in smart append: {str(e)}")
+            return f"{current_text} {new_fragment}" if current_text else new_fragment
+    def format_final_description(self, text: str) -> str:
+        """
+        格式化最終描述文本
+        Args:
+            text: 要格式化的文本
+        Returns:
+            str: 格式化後的文本
+        """
+        try:
+            return self.text_formatter.format_final_description(text)
+        except TextFormattingError as e:
+            self.logger.warning(f"Error formatting final description: {str(e)}")
+            return text
+    def get_template(self, category: str, key: Optional[str] = None):
+        """
+        獲取指定類別的模板
+        Args:
+            category: 模板類別名稱
+            key: 可選的具體模板鍵值
+        Returns:
+            模板內容
+        """
+        try:
+            return self.template_manager.get_template(category, key)
+        except (TemplateLoadingError, TemplateFillError) as e:
+            self.logger.warning(f"Error getting template: {str(e)}")
+            return None
+    def get_viewpoint_confidence(self, detected_objects: List[Dict]) -> Tuple[str, float]:
+        """
+        獲取視角檢測結果及其信心度
+        Args:
+            detected_objects: 檢測到的物件列表
+        Returns:
+            Tuple[str, float]: (視角類型, 信心度)
+        """
+        try:
+            return self.viewpoint_detector.get_viewpoint_confidence(detected_objects)
+        except ViewpointDetectionError as e:
+            self.logger.warning(f"Error getting viewpoint confidence: {str(e)}")
+            return "eye_level", 0.5
+    def get_supported_cultures(self) -> List[str]:
+        """
+        獲取所有支援的文化語境列表
+        Returns:
+            List[str]: 支援的文化語境名稱列表
+        """
+        return self.cultural_context_analyzer.get_supported_cultures()
+    def has_cultural_context(self, cultural_context: str) -> bool:
+        """
+        檢查是否支援指定的文化語境
+        Args:
+            cultural_context: 文化語境名稱
+        Returns:
+            bool: 是否支援該文化語境
+        """
+        return self.cultural_context_analyzer.has_cultural_context(cultural_context)
+    def validate_text_quality(self, text: str) -> Dict[str, bool]:
+        """
+        Run the text formatter's quality checks on a piece of text.
+        Args:
+            text: Text to validate.
+        Returns:
+            Dict[str, bool]: Quality-check results from the text formatter, or
+            {"error": True} when the formatter raises TextFormattingError.
+        """
+        try:
+            report = self.text_formatter.validate_text_quality(text)
+        except TextFormattingError as formatting_error:
+            self.logger.warning(f"Error validating text quality: {str(formatting_error)}")
+            return {"error": True}
+        return report
+    def get_text_statistics(self, text: str) -> Dict[str, int]:
+        """
+        Collect basic statistics about a piece of text.
+        Args:
+            text: Text to analyse.
+        Returns:
+            Dict[str, int]: Statistics from the text formatter, or zeroed
+            "characters" / "words" / "sentences" counters when the formatter
+            raises TextFormattingError.
+        """
+        try:
+            statistics = self.text_formatter.get_text_statistics(text)
+        except TextFormattingError as formatting_error:
+            self.logger.warning(f"Error getting text statistics: {str(formatting_error)}")
+            return {"characters": 0, "words": 0, "sentences": 0}
+        return statistics
+    def reload_templates(self):
+        """
+        Reload every template through the template manager.
+        Raises:
+            EnhancedSceneDescriberError: When the template manager raises
+                TemplateLoadingError or TemplateFillError; the original
+                exception is chained as the cause.
+        """
+        manager = self.template_manager
+        try:
+            manager.reload_templates()
+            self.logger.info("Templates reloaded successfully")
+        except (TemplateLoadingError, TemplateFillError) as reload_error:
+            reason = str(reload_error)
+            self.logger.error(f"Error reloading templates: {reason}")
+            raise EnhancedSceneDescriberError(f"Failed to reload templates: {reason}") from reload_error
+    def get_configuration(self) -> Dict[str, Any]:
+        """
+        Summarise the current configuration of the describer's components.
+        Returns:
+            Dict[str, Any]: Scene-type count, viewpoint detector parameters,
+            object generator configuration, supported cultures and template
+            categories. If gathering any of them raises, a dictionary with a
+            single "error" key holding the exception text is returned instead.
+        """
+        try:
+            # Entries are gathered in the same order the keys appear in the result.
+            summary = {}
+            summary["scene_types_count"] = len(self.scene_types)
+            summary["viewpoint_detector_config"] = self.viewpoint_detector.viewpoint_params
+            summary["object_generator_config"] = self.object_description_generator.get_configuration()
+            summary["supported_cultures"] = self.cultural_context_analyzer.get_supported_cultures()
+            summary["template_categories"] = self.template_manager.get_template_categories()
+            return summary
+        except Exception as config_error:
+            self.logger.warning(f"Error getting configuration: {str(config_error)}")
+            return {"error": str(config_error)}
+    def _initialize_fallback_components(self):
+        """
+        Build the fallback region analyzer and object description generator.
+        Any exception raised while constructing them is logged and swallowed,
+        so attributes not yet assigned at that point are left untouched.
+        """
+        try:
+            self.region_analyzer = RegionAnalyzer()
+            # The generator shares the analyzer instance that was just stored.
+            generator = ObjectDescriptionGenerator(region_analyzer=self.region_analyzer)
+            self.object_description_generator = generator
+        except Exception as init_error:
+            self.logger.error(f"Fallback component initialization failed: {str(init_error)}")

feature_extractor.py ADDED Viewed

	@@ -0,0 +1,822 @@

+import numpy as np
+import cv2
+import logging
+import traceback
+from typing import Dict, Any, Optional
+from configuration_manager import ConfigurationManager
+class FeatureExtractor:
+    """
+    Extracts comprehensive lighting and scene features from images.（主要從圖片提取光線資訊)
+    This class handles all basic feature computation including brightness analysis,
+    color characteristics, texture complexity, and structural features for
+    lighting analysis and scene understanding.
+    """
+    def __init__(self, config_manager: ConfigurationManager):
+        """
+        Initialize the feature extractor.
+        Args:
+            config_manager: Configuration manager instance; the methods of this
+                class read their thresholds from it.
+        """
+        # Keep a reference to the configuration; it is consulted on each call.
+        self.config_manager = config_manager
+        # The logger is built by the instance helper _setup_logger().
+        self.logger = self._setup_logger()
+    def _setup_logger(self) -> logging.Logger:
+        """
+        Return the logger used for feature extraction.
+        The logger is named "<module>.FeatureExtractor". A stream handler and
+        the INFO level are installed only when the logger has no handlers yet,
+        so repeated calls do not stack handlers.
+        """
+        feature_logger = logging.getLogger(f"{__name__}.FeatureExtractor")
+        if feature_logger.handlers:
+            # Already configured earlier; reuse as-is.
+            return feature_logger
+        stream_handler = logging.StreamHandler()
+        stream_handler.setFormatter(
+            logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        )
+        feature_logger.addHandler(stream_handler)
+        feature_logger.setLevel(logging.INFO)
+        return feature_logger
+    def extract_features(self, image_rgb: np.ndarray) -> Dict[str, Any]:
+        """
+        Extract all features from an RGB image.
+        The image is converted to HSV and grayscale, a downscaled copy is made
+        for the gradient-based measurements, and the brightness, color, texture,
+        structure and legacy-compatibility feature groups are merged into one
+        dictionary.
+        Args:
+            image_rgb: Input image as numpy array in RGB format.
+        Returns:
+            Dictionary containing all extracted features. The default feature
+            set is returned when validation fails or any step raises.
+        """
+        try:
+            # Validate input image
+            if not self._validate_image(image_rgb):
+                return self._get_default_features()
+            # Get image dimensions and prepare processing parameters
+            height, width = image_rgb.shape[:2]
+            scale_factor = self._calculate_scale_factor(height, width)
+            # Floor division yields 0 for images smaller than the scale factor,
+            # and cv2.resize rejects an empty target size. Clamp to 1x1 so tiny
+            # images are still analysed instead of falling back to defaults.
+            small_size = (
+                max(1, width // scale_factor),
+                max(1, height // scale_factor),
+            )
+            # Create processed image versions
+            small_rgb = cv2.resize(image_rgb, small_size, interpolation=cv2.INTER_AREA)
+            hsv_img = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2HSV)
+            gray_img = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2GRAY)
+            small_gray = cv2.cvtColor(small_rgb, cv2.COLOR_RGB2GRAY)
+            # Extract features by category
+            brightness_features = self.compute_brightness_features(hsv_img, height, width)
+            color_features = self.compute_color_features(hsv_img, height, width)
+            texture_features = self.compute_texture_features(small_gray, gray_img, height, width)
+            structure_features = self.compute_structure_features(
+                small_gray, gray_img, hsv_img, height, width, scale_factor
+            )
+            # Combine all features; later groups win on duplicate keys
+            features = {**brightness_features, **color_features, **texture_features, **structure_features}
+            # Add compatibility features for legacy code
+            legacy_features = self._compute_legacy_compatibility_features(
+                hsv_img, small_gray, features, scale_factor
+            )
+            features.update(legacy_features)
+            self.logger.debug(f"Successfully extracted {len(features)} features from image")
+            return features
+        except Exception as e:
+            # Top-level boundary: log with traceback and degrade to defaults.
+            self.logger.error(f"Error in feature extraction: {str(e)}")
+            self.logger.error(f"Traceback: {traceback.format_exc()}")
+            return self._get_default_features()
+    def compute_brightness_features(self, hsv_img: np.ndarray, height: int, width: int) -> Dict[str, float]:
+        """
+        Compute brightness-related features from HSV image.
+        Args:
+            hsv_img: Image in HSV color space; channel index 2 is read as brightness.
+            height: Image height.
+            width: Image width.
+        Returns:
+            Dictionary containing brightness features: "avg_brightness",
+            "brightness_std", "dark_pixel_ratio", "bright_pixel_ratio" and
+            "brightness_uniformity". Fixed default values are returned for an
+            empty image, a non-positive pixel count, or any computation error.
+        """
+        fallback = {
+            "avg_brightness": 100.0,
+            "brightness_std": 50.0,
+            "dark_pixel_ratio": 0.0,
+            "bright_pixel_ratio": 0.0,
+            "brightness_uniformity": 0.5
+        }
+        try:
+            v_channel = hsv_img[:, :, 2]  # Value channel represents brightness
+            total_pixels = height * width
+            # Without this guard an empty image produces NaN statistics and a
+            # division by zero below instead of usable default values.
+            if v_channel.size == 0 or total_pixels <= 0:
+                self.logger.warning("Empty image passed to brightness analysis; using default features")
+                return fallback
+            # Basic brightness statistics
+            avg_brightness = float(np.mean(v_channel))
+            brightness_std = float(np.std(v_channel))
+            # Pixel ratio calculations
+            thresholds = self.config_manager.feature_thresholds
+            dark_pixel_ratio = float(np.sum(v_channel < thresholds.dark_pixel_threshold) / total_pixels)
+            bright_pixel_ratio = float(np.sum(v_channel > thresholds.bright_pixel_threshold) / total_pixels)
+            # Uniformity is one minus the std/mean ratio, clamped to [0, 1]
+            brightness_uniformity = 1.0 - min(1.0, brightness_std / max(avg_brightness, 1e-5))
+            return {
+                "avg_brightness": avg_brightness,
+                "brightness_std": brightness_std,
+                "dark_pixel_ratio": dark_pixel_ratio,
+                "bright_pixel_ratio": bright_pixel_ratio,
+                "brightness_uniformity": brightness_uniformity
+            }
+        except Exception as e:
+            self.logger.error(f"Error computing brightness features: {str(e)}")
+            return fallback
+    def compute_color_features(self, hsv_img: np.ndarray, height: int, width: int) -> Dict[str, Any]:
+        """
+        Compute color-related features from HSV image.
+        Args:
+            hsv_img: Image in HSV color space.
+            height: Image height.
+            width: Image width.
+        Returns:
+            Dictionary containing color ratios, the average saturation, the
+            sky-region features and the warm/cool atmosphere features. The
+            default color features are returned when any step raises.
+        """
+        try:
+            hue, sat, val = cv2.split(hsv_img)
+            pixel_count = height * width
+            def share(mask) -> float:
+                # Fraction of the nominal pixel count selected by a boolean mask.
+                return float(np.sum(mask) / pixel_count)
+            # General blue uses a fixed hue band
+            blue_ratio = share((hue >= 90) & (hue <= 140))
+            # Sky-like blue additionally requires configured saturation and value
+            thresholds = self.config_manager.feature_thresholds
+            sky_like_blue_ratio = share(
+                (hue >= thresholds.sky_blue_hue_min) & (hue <= thresholds.sky_blue_hue_max) &
+                (sat > thresholds.sky_blue_sat_min) & (val > thresholds.sky_blue_val_min)
+            )
+            yellow_orange_ratio = share((hue >= 15) & (hue <= 45))
+            # Gray: low saturation with brightness inside the configured band
+            gray_ratio = share(
+                (sat < thresholds.gray_sat_max) &
+                (val > thresholds.gray_val_min) &
+                (val < thresholds.gray_val_max)
+            )
+            result = {
+                "blue_ratio": blue_ratio,
+                "sky_like_blue_ratio": sky_like_blue_ratio,
+                "yellow_orange_ratio": yellow_orange_ratio,
+                "gray_ratio": gray_ratio,
+                "avg_saturation": float(np.mean(sat)),
+            }
+            # Top-of-image statistics, then warm/cool balance
+            result.update(self._analyze_sky_region(hue, sat, val, height))
+            result.update(self._analyze_color_atmosphere(hue, sat, pixel_count))
+            return result
+        except Exception as color_error:
+            self.logger.error(f"Error computing color features: {str(color_error)}")
+            return self._get_default_color_features()
+    def compute_texture_features(self, small_gray: np.ndarray, gray_img: np.ndarray,
+                                height: int, width: int) -> Dict[str, float]:
+        """
+        Compute texture and gradient features.
+        Args:
+            small_gray: Downscaled grayscale image; gradients are taken on it.
+            gray_img: Full-resolution grayscale image; used for the shadow estimate.
+            height: Original image height.
+            width: Original image width.
+        Returns:
+            Dictionary containing the gradient ratio, top-region texture
+            complexity, shadow clarity score, the two gradient strengths and
+            the edge density. Neutral defaults are returned on any error.
+        """
+        try:
+            # Sobel derivatives along x and y on the small image
+            grad_x = cv2.Sobel(small_gray, cv2.CV_32F, 1, 0, ksize=3)
+            grad_y = cv2.Sobel(small_gray, cv2.CV_32F, 0, 1, ksize=3)
+            horizontal_strength = float(np.mean(np.abs(grad_x)))
+            vertical_strength = float(np.mean(np.abs(grad_y)))
+            gradient_ratio = vertical_strength / max(horizontal_strength, 1e-5)
+            # Laplacian variance of the top third, scaled into [0, 1]
+            top_strip = small_gray[:small_gray.shape[0] // 3, :]
+            if top_strip.size > 0:
+                top_complexity = min(1.0, cv2.Laplacian(top_strip, cv2.CV_64F).var() / 1000.0)
+            else:
+                top_complexity = 0.5
+            # Shadow clarity is a three-level heuristic on full-resolution stats
+            gray_std = float(np.std(gray_img))
+            gray_mean = float(np.mean(gray_img))
+            dark_ratio = float(np.sum(gray_img < 50) / (height * width))
+            shadow_score = 0.5
+            if gray_std > 60 and dark_ratio < 0.15 and gray_mean > 100:
+                shadow_score = 0.7
+            elif gray_std < 30 and dark_ratio > 0.1:
+                shadow_score = 0.3
+            return {
+                "gradient_ratio_vertical_horizontal": gradient_ratio,
+                "top_region_texture_complexity": top_complexity,
+                "shadow_clarity_score": shadow_score,
+                "vertical_strength": vertical_strength,
+                "horizontal_strength": horizontal_strength,
+                "edges_density": min(1.0, (horizontal_strength + vertical_strength) / 100.0)
+            }
+        except Exception as texture_error:
+            self.logger.error(f"Error computing texture features: {str(texture_error)}")
+            return {
+                "gradient_ratio_vertical_horizontal": 1.0,
+                "top_region_texture_complexity": 0.5,
+                "shadow_clarity_score": 0.5,
+                "vertical_strength": 0.0,
+                "horizontal_strength": 0.0,
+                "edges_density": 0.0
+            }
+    def compute_structure_features(self, small_gray: np.ndarray, gray_img: np.ndarray,
+                                  hsv_img: np.ndarray, height: int, width: int,
+                                  scale_factor: int) -> Dict[str, float]:
+        """
+        Compute structural features including ceiling likelihood and boundary clarity.
+        Args:
+            small_gray: Downscaled grayscale image.
+            gray_img: Full-resolution grayscale image.
+            hsv_img: HSV image for brightness analysis.
+            height: Original image height.
+            width: Original image width (not read by this method).
+            scale_factor: Downscaling factor used.
+        Returns:
+            Dictionary containing "ceiling_likelihood", "boundary_clarity",
+            "openness_top_edge" and the legacy structure features. The error
+            fallback carries the same keys as a successful result.
+        """
+        try:
+            # Compute gradients
+            gx = cv2.Sobel(small_gray, cv2.CV_32F, 1, 0, ksize=3)
+            gy = cv2.Sobel(small_gray, cv2.CV_32F, 0, 1, ksize=3)
+            avg_abs_gx = float(np.mean(np.abs(gx)))
+            avg_abs_gy = float(np.mean(np.abs(gy)))
+            # Ceiling likelihood analysis
+            ceiling_features = self._analyze_ceiling_likelihood(
+                small_gray, hsv_img, gx, avg_abs_gx, height, scale_factor
+            )
+            # Boundary clarity analysis
+            boundary_clarity = self._compute_boundary_clarity(small_gray, avg_abs_gx, avg_abs_gy)
+            # Openness analysis
+            openness_top_edge = self._compute_openness_top_edge(gy, avg_abs_gy)
+            # Legacy compatibility features
+            legacy_structure = self._compute_legacy_structure_features(gray_img, height)
+            return {
+                "ceiling_likelihood": ceiling_features["ceiling_likelihood"],
+                "boundary_clarity": boundary_clarity,
+                "openness_top_edge": openness_top_edge,
+                **legacy_structure
+            }
+        except Exception as e:
+            self.logger.error(f"Error computing structure features: {str(e)}")
+            # Mirror every key of the success path, including the legacy
+            # structure keys, so consumers indexing the result do not hit a
+            # KeyError only on the failure path.
+            return {
+                "ceiling_likelihood": 0.0,
+                "boundary_clarity": 0.0,
+                "openness_top_edge": 0.5,
+                "ceiling_uniformity": 0.5,
+                "horizontal_line_ratio": 0.0,
+                "top_region_std": 0.0,
+                "boundary_edge_score": 0.0
+            }
+    def _analyze_sky_region(self, h_channel: np.ndarray, s_channel: np.ndarray,
+                           v_channel: np.ndarray, height: int) -> Dict[str, float]:
+        """
+        Measure brightness, saturation and blue dominance in the top third of the image.
+        Args:
+            h_channel: Hue channel.
+            s_channel: Saturation channel.
+            v_channel: Value (brightness) channel.
+            height: Image height; the first height // 3 rows form the sky region.
+        Returns:
+            Dictionary with "sky_region_brightness_ratio", "sky_region_saturation",
+            "sky_region_blue_dominance" and "sky_brightness". The default sky
+            features are returned for an empty region or on any error.
+        """
+        try:
+            top_rows = slice(None, height // 3)
+            top_v = v_channel[top_rows, :]
+            top_s = s_channel[top_rows, :]
+            top_h = h_channel[top_rows, :]
+            if top_v.size == 0:
+                return self._get_default_sky_features()
+            # Brightness of the region, also relative to the whole image
+            region_brightness = float(np.mean(top_v))
+            image_brightness = float(np.mean(v_channel))
+            region_saturation = float(np.mean(top_s))
+            # Share of region pixels matching the configured sky-blue band
+            thresholds = self.config_manager.feature_thresholds
+            sky_blue_mask = (
+                (top_h >= thresholds.sky_blue_hue_min) & (top_h <= thresholds.sky_blue_hue_max) &
+                (top_s > thresholds.sky_blue_sat_min) & (top_v > thresholds.sky_blue_val_min)
+            )
+            blue_dominance = float(np.sum(sky_blue_mask) / max(1, top_v.size))
+            return {
+                "sky_region_brightness_ratio": region_brightness / max(image_brightness, 1e-5),
+                "sky_region_saturation": region_saturation,
+                "sky_region_blue_dominance": blue_dominance,
+                "sky_brightness": region_brightness
+            }
+        except Exception as sky_error:
+            self.logger.error(f"Error analyzing sky region: {str(sky_error)}")
+            return self._get_default_sky_features()
+    def _analyze_color_atmosphere(self, h_channel: np.ndarray, s_channel: np.ndarray,
+                                 total_pixels: int) -> Dict[str, Any]:
+        """
+        Classify the image's color atmosphere as warm, cool or neutral.
+        Args:
+            h_channel: Hue channel.
+            s_channel: Saturation channel; only pixels with saturation above 30 count.
+            total_pixels: Pixel count used as the ratio denominator.
+        Returns:
+            Dictionary with "warm_ratio", "cool_ratio" and "color_atmosphere".
+            A side wins only when its ratio exceeds both the other side and 0.3.
+            Zero ratios with a "neutral" atmosphere are returned on any error.
+        """
+        try:
+            ranges = self.config_manager.color_ranges
+            saturated = s_channel > 30
+            def saturated_share(hue_ranges) -> float:
+                # Union of the inclusive hue bands, restricted to saturated pixels.
+                band_mask = np.zeros_like(h_channel, dtype=bool)
+                for low, high in hue_ranges:
+                    band_mask |= ((h_channel >= low) & (h_channel <= high))
+                return float(np.sum(band_mask & saturated) / total_pixels)
+            warm_ratio = saturated_share(ranges.warm_hue_ranges)
+            cool_ratio = saturated_share(ranges.cool_hue_ranges)
+            atmosphere = "neutral"
+            if warm_ratio > cool_ratio and warm_ratio > 0.3:
+                atmosphere = "warm"
+            elif cool_ratio > warm_ratio and cool_ratio > 0.3:
+                atmosphere = "cool"
+            return {
+                "warm_ratio": warm_ratio,
+                "cool_ratio": cool_ratio,
+                "color_atmosphere": atmosphere
+            }
+        except Exception as atmosphere_error:
+            self.logger.error(f"Error analyzing color atmosphere: {str(atmosphere_error)}")
+            return {
+                "warm_ratio": 0.0,
+                "cool_ratio": 0.0,
+                "color_atmosphere": "neutral"
+            }
+    def _analyze_ceiling_likelihood(self, small_gray: np.ndarray, hsv_img: np.ndarray,
+                                   gx: np.ndarray, avg_abs_gx: float, height: int,
+                                   scale_factor: int) -> Dict[str, float]:
+        """
+        Analyze likelihood of ceiling presence.
+        The score is built additively from four independent cues (+0.45, +0.35,
+        +0.30, +0.15), then multiplied by an override factor when the top of the
+        image looks strongly like sky, and finally capped at 1.0. Every threshold
+        is read from ``config_manager.indoor_outdoor_thresholds`` with the
+        literal shown in the code as the fallback when the attribute is missing.
+        Args:
+            small_gray: Downscaled grayscale image.
+            hsv_img: HSV image; channel index 2 is read as brightness.
+            gx: Sobel x-derivative of ``small_gray``.
+            avg_abs_gx: Mean absolute value of ``gx`` over the whole image.
+            height: Height used to take the top third of ``hsv_img``.
+            scale_factor: Downscaling factor (not read by this method).
+        Returns:
+            Dictionary with the single key "ceiling_likelihood"; 0.0 on any error.
+        """
+        try:
+            ceiling_likelihood = 0.0
+            config = self.config_manager.indoor_outdoor_thresholds
+            # Mean brightness of the top third of the full-resolution image
+            v_channel = hsv_img[:, :, 2]
+            top_third_height = height // 3
+            sky_region_v = v_channel[:top_third_height, :]
+            sky_region_avg_brightness = float(np.mean(sky_region_v)) if sky_region_v.size > 0 else 0
+            # Texture complexity of the top third of the small image
+            # (Laplacian variance scaled into [0, 1]; 0.5 when the strip is empty)
+            small_top_third_height = small_gray.shape[0] // 3
+            small_sky_region_gray = small_gray[:small_top_third_height, :]
+            if small_sky_region_gray.size > 0:
+                laplacian_var = cv2.Laplacian(small_sky_region_gray, cv2.CV_64F).var()
+                top_region_texture_complexity = min(1.0, laplacian_var / 1000.0)
+            else:
+                top_region_texture_complexity = 0.5
+            # Condition 1: Simple texture and moderate brightness
+            ceiling_texture_thresh = getattr(config, 'ceiling_texture_thresh', 0.4)
+            ceiling_brightness_min = getattr(config, 'ceiling_brightness_min', 60)
+            ceiling_brightness_max = getattr(config, 'ceiling_brightness_max', 230)
+            if (top_region_texture_complexity < ceiling_texture_thresh and
+                ceiling_brightness_min < sky_region_avg_brightness < ceiling_brightness_max):
+                ceiling_likelihood += 0.45
+            # Condition 2: x-gradient in the top third is stronger than the
+            # image-wide average by the configured factor
+            top_horizontal_lines_strength = float(np.mean(np.abs(gx[:small_gray.shape[0]//3, :])))
+            ceiling_horizontal_line_factor = getattr(config, 'ceiling_horizontal_line_factor', 1.15)
+            if top_horizontal_lines_strength > avg_abs_gx * ceiling_horizontal_line_factor:
+                ceiling_likelihood += 0.35
+            # Condition 3: Central bright spot (lamp detection) - a window of
+            # +/- one eighth of each dimension around the image centre
+            center_y_sm, center_x_sm = small_gray.shape[0]//2, small_gray.shape[1]//2
+            lamp_check_radius_y = small_gray.shape[0] // 8
+            lamp_check_radius_x = small_gray.shape[1] // 8
+            center_region = small_gray[
+                max(0, center_y_sm - lamp_check_radius_y):min(small_gray.shape[0], center_y_sm + lamp_check_radius_y),
+                max(0, center_x_sm - lamp_check_radius_x):min(small_gray.shape[1], center_x_sm + lamp_check_radius_x)
+            ]
+            if center_region.size > 0:
+                avg_brightness = float(np.mean(small_gray))
+                center_brightness = float(np.mean(center_region))
+                ceiling_center_bright_factor = getattr(config, 'ceiling_center_bright_factor', 1.25)
+                if center_brightness > avg_brightness * ceiling_center_bright_factor:
+                    ceiling_likelihood += 0.30
+            # Sky signals measured on the top third: blue dominance and
+            # brightness relative to the whole image
+            sky_region_blue_dominance = self._compute_sky_blue_dominance(hsv_img, height)
+            sky_region_brightness_ratio = sky_region_avg_brightness / max(float(np.mean(v_channel)), 1e-5)
+            # Condition 4: bonus (not a penalty) when both sky signals are weak
+            ceiling_max_sky_blue_thresh = getattr(config, 'ceiling_max_sky_blue_thresh', 0.08)
+            ceiling_max_sky_brightness_ratio = getattr(config, 'ceiling_max_sky_brightness_ratio', 1.15)
+            if (sky_region_blue_dominance < ceiling_max_sky_blue_thresh and
+                sky_region_brightness_ratio < ceiling_max_sky_brightness_ratio):
+                ceiling_likelihood += 0.15
+            # Strong sky override: scale the accumulated score down when both
+            # sky signals are strong
+            sky_blue_dominance_strong_thresh = getattr(config, 'sky_blue_dominance_strong_thresh', 0.25)
+            sky_brightness_strong_thresh = getattr(config, 'sky_brightness_strong_thresh', 1.25)
+            ceiling_sky_override_factor = getattr(config, 'ceiling_sky_override_factor', 0.1)
+            if (sky_region_blue_dominance > sky_blue_dominance_strong_thresh and
+                sky_region_brightness_ratio > sky_brightness_strong_thresh):
+                ceiling_likelihood *= ceiling_sky_override_factor
+            # The four bonuses can sum to 1.25, so cap the result
+            ceiling_likelihood = min(1.0, ceiling_likelihood)
+            return {"ceiling_likelihood": ceiling_likelihood}
+        except Exception as e:
+            self.logger.error(f"Error analyzing ceiling likelihood: {str(e)}")
+            return {"ceiling_likelihood": 0.0}
+    def _compute_sky_blue_dominance(self, hsv_img: np.ndarray, height: int) -> float:
+        """
+        Return the share of sky-blue pixels in the top third of the image.
+        Args:
+            hsv_img: Image in HSV color space.
+            height: Image height; the first height // 3 rows are examined.
+        Returns:
+            Fraction of the region's pixels inside the configured sky-blue hue,
+            saturation and value limits; 0.0 for an empty region or on error.
+        """
+        try:
+            hue, sat, val = cv2.split(hsv_img)
+            top_rows = slice(None, height // 3)
+            top_hue = hue[top_rows, :]
+            top_sat = sat[top_rows, :]
+            top_val = val[top_rows, :]
+            if top_hue.size == 0:
+                return 0.0
+            thresholds = self.config_manager.feature_thresholds
+            sky_blue_mask = (
+                (top_hue >= thresholds.sky_blue_hue_min) & (top_hue <= thresholds.sky_blue_hue_max) &
+                (top_sat > thresholds.sky_blue_sat_min) & (top_val > thresholds.sky_blue_val_min)
+            )
+            return float(np.sum(sky_blue_mask) / max(1, top_hue.size))
+        except Exception as dominance_error:
+            self.logger.error(f"Error computing sky blue dominance: {str(dominance_error)}")
+            return 0.0
+    def _compute_boundary_clarity(self, small_gray: np.ndarray, avg_abs_gx: float,
+                                 avg_abs_gy: float) -> float:
+        """
+        Score how strong the gradients along the left, right and top borders are.
+        Args:
+            small_gray: Downscaled grayscale image.
+            avg_abs_gx: Image-wide mean absolute x-gradient.
+            avg_abs_gy: Image-wide mean absolute y-gradient.
+        Returns:
+            Border gradient strength relative to the larger image-wide average,
+            divided by 1.5 and capped at 1.0; 0.0 on any error.
+        """
+        try:
+            cols = small_gray.shape[1]
+            rows = small_gray.shape[0]
+            # Border strips are one tenth of each dimension, at least one pixel
+            strip_w = max(1, cols // 10)
+            strip_h = max(1, rows // 10)
+            def mean_abs_sobel(strip, dx, dy) -> float:
+                # Mean absolute Sobel response of a strip along the given axis.
+                return float(np.mean(np.abs(cv2.Sobel(strip, cv2.CV_32F, dx, dy, ksize=3))))
+            left_grad = right_grad = top_grad = 0.0
+            if cols > strip_w:
+                left_grad = mean_abs_sobel(small_gray[:, :strip_w], 1, 0)
+                right_grad = mean_abs_sobel(small_gray[:, -strip_w:], 1, 0)
+            if rows > strip_h:
+                top_grad = mean_abs_sobel(small_gray[:strip_h, :], 0, 1)
+            # Normalize against average gradients
+            relative_strength = (left_grad + right_grad + top_grad) / (
+                3 * max(avg_abs_gx, avg_abs_gy, 1e-5)
+            )
+            return min(1.0, relative_strength / 1.5)
+        except Exception as clarity_error:
+            self.logger.error(f"Error computing boundary clarity: {str(clarity_error)}")
+            return 0.0
+    def _compute_openness_top_edge(self, gy: np.ndarray, avg_abs_gy: float) -> float:
+        """
+        Score how free of y-gradient the topmost strip of the image is.
+        Args:
+            gy: Sobel y-derivative of the image.
+            avg_abs_gy: Image-wide mean absolute y-gradient.
+        Returns:
+            1.0 when the top strip has no gradient, falling to 0.0 once the
+            strip reaches half of the image-wide average; 0.5 on any error.
+        """
+        try:
+            # The strip is the top twentieth of the rows, at least one row
+            strip_rows = max(1, gy.shape[0] // 20)
+            strip_strength = float(np.mean(np.abs(gy[:strip_rows, :])))
+            relative_strength = strip_strength / max(avg_abs_gy, 1e-5) / 0.5
+            return 1.0 - min(1.0, relative_strength)
+        except Exception as openness_error:
+            self.logger.error(f"Error computing top edge openness: {str(openness_error)}")
+            return 0.5
+    def _compute_legacy_compatibility_features(self, hsv_img: np.ndarray, small_gray: np.ndarray,
+                                             features: Dict[str, Any], scale_factor: int) -> Dict[str, Any]:
+        """
+        Compute additional features kept for backward compatibility.
+        Args:
+            hsv_img: HSV image; channel index 2 is passed to light-source detection.
+            small_gray: Downscaled grayscale image used for the street-line score.
+            features: Features computed so far; must contain "avg_brightness"
+                and "brightness_std".
+            scale_factor: Downscaling factor forwarded to light-source detection.
+        Returns:
+            Light-source features plus "street_line_score" and the aliases
+            "sky_blue_ratio" and "gradient_ratio"; an empty dict on any error.
+        """
+        try:
+            value_channel = hsv_img[:, :, 2]
+            legacy = dict(
+                self._detect_light_sources(
+                    value_channel,
+                    features["avg_brightness"],
+                    features["brightness_std"],
+                    scale_factor,
+                )
+            )
+            legacy["street_line_score"] = self._compute_street_line_score(small_gray)
+            # Aliases of features that already exist under newer names
+            legacy["sky_blue_ratio"] = features.get("sky_like_blue_ratio", 0.0)
+            legacy["gradient_ratio"] = features.get("gradient_ratio_vertical_horizontal", 1.0)
+            return legacy
+        except Exception as legacy_error:
+            self.logger.error(f"Error computing legacy compatibility features: {str(legacy_error)}")
+            return {}
+    def _detect_light_sources(self, v_channel: np.ndarray, avg_brightness: float,
+                             brightness_std: float, scale_factor: int) -> Dict[str, float]:
+        """
+        Look for a small number of bright spots and describe their layout.
+        Args:
+            v_channel: Brightness channel of the image.
+            avg_brightness: Mean brightness of the image.
+            brightness_std: Standard deviation of the brightness.
+            scale_factor: Downscaling factor; every (2 * scale_factor)-th pixel
+                along each axis is sampled.
+        Returns:
+            Dictionary with "bright_spot_count", "circular_light_count",
+            "indoor_light_score" and "light_distribution_uniformity". The layout
+            values keep their neutral defaults unless between 2 and 19 bright
+            samples are found. All-default values are returned on any error.
+        """
+        try:
+            step = scale_factor * 2
+            sampled = v_channel[::step, ::step]
+            # Threshold is the smaller of the configured absolute limit and
+            # two standard deviations above the mean
+            threshold = min(
+                self.config_manager.feature_thresholds.light_source_abs_thresh,
+                avg_brightness + 2 * brightness_std
+            )
+            bright_mask = sampled > threshold
+            spot_count = int(np.sum(bright_mask))
+            result = {
+                "bright_spot_count": spot_count,
+                "circular_light_count": 0,
+                "indoor_light_score": 0.0,
+                "light_distribution_uniformity": 0.5
+            }
+            if 1 < spot_count < 20:
+                spot_rows, spot_cols = np.where(bright_mask)
+                if len(spot_rows) > 1:
+                    centroid_x, centroid_y = np.mean(spot_cols), np.mean(spot_rows)
+                    spread = np.sqrt((spot_cols - centroid_x)**2 + (spot_rows - centroid_y)**2)
+                    # Distances to the centroid vary less than their mean
+                    if np.std(spread) < np.mean(spread):
+                        result["circular_light_count"] = min(3, len(spot_rows) // 2)
+                        result["light_distribution_uniformity"] = 0.7
+                    # Spots centred in the upper half score higher
+                    in_upper_half = np.mean(spot_rows) < sampled.shape[0] / 2
+                    result["indoor_light_score"] = 0.6 if in_upper_half else 0.3
+            return result
+        except Exception as light_error:
+            self.logger.error(f"Error detecting light sources: {str(light_error)}")
+            return {
+                "bright_spot_count": 0,
+                "circular_light_count": 0,
+                "indoor_light_score": 0.0,
+                "light_distribution_uniformity": 0.5
+            }
+    def _compute_street_line_score(self, small_gray: np.ndarray) -> float:
+        """
+        Score the presence of strong y-gradients in the bottom half of the image.
+        Args:
+            small_gray: Downscaled grayscale image.
+        Returns:
+            0.7 when more than 5% of the bottom-half pixels have an absolute
+            Sobel y-response above 50, otherwise 0.0 (also on any error).
+        """
+        try:
+            lower_half = small_gray[small_gray.shape[0]//2:, :]
+            if lower_half.size > 0:
+                y_gradient = cv2.Sobel(lower_half, cv2.CV_32F, 0, 1, ksize=3)
+                strong_pixel_count = np.sum(np.abs(y_gradient) > 50)
+                if strong_pixel_count > (lower_half.size * 0.05):
+                    return 0.7
+            return 0.0
+        except Exception as street_error:
+            self.logger.error(f"Error computing street line score: {str(street_error)}")
+            return 0.0
+    def _compute_legacy_structure_features(self, gray_img: np.ndarray, height: int) -> Dict[str, float]:
+        """
+        Compute the legacy structure features kept for backward compatibility.
+        Args:
+            gray_img: Full-resolution grayscale image.
+            height: Image height; the first height // 4 rows form the top region.
+        Returns:
+            Dictionary with "ceiling_uniformity", "horizontal_line_ratio",
+            "top_region_std" and "boundary_edge_score"; neutral defaults on
+            any error.
+        """
+        try:
+            top_region = gray_img[:height//4, :]
+            if top_region.size > 0:
+                top_region_std = float(np.std(top_region))
+                top_region_mean = float(np.mean(top_region))
+            else:
+                # Empty region: zero spread over the minimum denominator
+                top_region_std = 0.0
+                top_region_mean = 1e-5
+            # Uniformity is one minus the std/mean ratio, clamped to [0, 1]
+            ceiling_uniformity = 1.0 - min(1.0, top_region_std / max(top_region_mean, 1e-5))
+            # Mean absolute Sobel y-response of the top region, scaled by 40
+            horizontal_line_ratio = 0.0
+            if top_region.size > 0:
+                top_response = np.abs(cv2.Sobel(top_region, cv2.CV_32F, 0, 1, ksize=3))
+                horizontal_line_ratio = min(1.0, float(np.mean(top_response)) / 40.0)
+            return {
+                "ceiling_uniformity": ceiling_uniformity,
+                "horizontal_line_ratio": horizontal_line_ratio,
+                "top_region_std": top_region_std,
+                "boundary_edge_score": self._compute_legacy_boundary_score(gray_img)
+            }
+        except Exception as structure_error:
+            self.logger.error(f"Error computing legacy structure features: {str(structure_error)}")
+            return {
+                "ceiling_uniformity": 0.5,
+                "horizontal_line_ratio": 0.0,
+                "top_region_std": 0.0,
+                "boundary_edge_score": 0.0
+            }
+    def _compute_legacy_boundary_score(self, gray_img: np.ndarray) -> float:
+        """Compute legacy boundary edge score."""
+        try:
+            height, width = gray_img.shape
+            # Create small version for boundary analysis
+            small_height, small_width = height // 4, width // 4
+            small_gray = cv2.resize(gray_img, (small_width, small_height), interpolation=cv2.INTER_AREA)
+            # Edge regions
+            left_edge_sm = small_gray[:, :small_width//6] if small_width > 6 else small_gray
+            right_edge_sm = small_gray[:, 5*small_width//6:] if small_width > 6 else small_gray
+            top_edge_sm = small_gray[:small_height//6, :] if small_height > 6 else small_gray
+            # Compute gradients for each edge
+            left_gradient = float(np.mean(np.abs(cv2.Sobel(left_edge_sm, cv2.CV_32F, 1, 0, ksize=3)))) if left_edge_sm.size > 0 else 0
+            right_gradient = float(np.mean(np.abs(cv2.Sobel(right_edge_sm, cv2.CV_32F, 1, 0, ksize=3)))) if right_edge_sm.size > 0 else 0
+            top_gradient = float(np.mean(np.abs(cv2.Sobel(top_edge_sm, cv2.CV_32F, 0, 1, ksize=3)))) if top_edge_sm.size > 0 else 0
+            # Combine and normalize
+            boundary_edge_score = (min(1.0, left_gradient/50) + min(1.0, right_gradient/50) + min(1.0, top_gradient/50)) / 3
+            return boundary_edge_score
+        except Exception as e:
+            self.logger.error(f"Error computing legacy boundary score: {str(e)}")
+            return 0.0
+    def _validate_image(self, image_rgb: np.ndarray) -> bool:
+        """Validate input image format and dimensions."""
+        try:
+            if not isinstance(image_rgb, np.ndarray):
+                self.logger.error("Input is not a numpy array")
+                return False
+            if len(image_rgb.shape) != 3 or image_rgb.shape[2] != 3:
+                self.logger.error(f"Invalid image shape: {image_rgb.shape}. Expected (H, W, 3)")
+                return False
+            height, width = image_rgb.shape[:2]
+            if height == 0 or width == 0:
+                self.logger.error(f"Invalid image dimensions: {height}x{width}")
+                return False
+            return True
+        except Exception as e:
+            self.logger.error(f"Error validating image: {str(e)}")
+            return False
+    def _calculate_scale_factor(self, height: int, width: int) -> int:
+        """Calculate appropriate scale factor for image processing efficiency."""
+        try:
+            base_scale = 4
+            scale_factor = base_scale + min(8, max(0, int((height * width) / (1000 * 1000)) if height * width > 0 else 0))
+            return max(1, scale_factor)
+        except Exception as e:
+            self.logger.error(f"Error calculating scale factor: {str(e)}")
+            return 4
+    def _get_default_features(self) -> Dict[str, Any]:
+        """Return default feature values in case of processing errors."""
+        return {
+            "avg_brightness": 100.0,
+            "brightness_std": 50.0,
+            "dark_pixel_ratio": 0.0,
+            "bright_pixel_ratio": 0.0,
+            "brightness_uniformity": 0.5,
+            "blue_ratio": 0.0,
+            "sky_like_blue_ratio": 0.0,
+            "yellow_orange_ratio": 0.0,
+            "gray_ratio": 0.0,
+            "avg_saturation": 100.0,
+            "sky_region_brightness_ratio": 1.0,
+            "sky_region_saturation": 0.0,
+            "sky_region_blue_dominance": 0.0,
+            "sky_brightness": 100.0,
+            "warm_ratio": 0.0,
+            "cool_ratio": 0.0,
+            "color_atmosphere": "neutral",
+            "gradient_ratio_vertical_horizontal": 1.0,
+            "top_region_texture_complexity": 0.5,
+            "shadow_clarity_score": 0.5,
+            "vertical_strength": 0.0,
+            "horizontal_strength": 0.0,
+            "edges_density": 0.0,
+            "ceiling_likelihood": 0.0,
+            "boundary_clarity": 0.0,
+            "openness_top_edge": 0.5,
+            "ceiling_uniformity": 0.5,
+            "horizontal_line_ratio": 0.0,
+            "top_region_std": 0.0,
+            "boundary_edge_score": 0.0,
+            "bright_spot_count": 0,
+            "circular_light_count": 0,
+            "indoor_light_score": 0.0,
+            "light_distribution_uniformity": 0.5,
+            "street_line_score": 0.0,
+            "sky_blue_ratio": 0.0,
+            "gradient_ratio": 1.0
+        }
+    def _get_default_color_features(self) -> Dict[str, Any]:
+        """Return default color feature values."""
+        return {
+            "blue_ratio": 0.0,
+            "sky_like_blue_ratio": 0.0,
+            "yellow_orange_ratio": 0.0,
+            "gray_ratio": 0.0,
+            "avg_saturation": 100.0,
+            "sky_region_brightness_ratio": 1.0,
+            "sky_region_saturation": 0.0,
+            "sky_region_blue_dominance": 0.0,
+            "sky_brightness": 100.0,
+            "warm_ratio": 0.0,
+            "cool_ratio": 0.0,
+            "color_atmosphere": "neutral"
+        }
+    def _get_default_sky_features(self) -> Dict[str, float]:
+        """Return default sky region feature values."""
+        return {
+            "sky_region_brightness_ratio": 1.0,
+            "sky_region_saturation": 0.0,
+            "sky_region_blue_dominance": 0.0,
+            "sky_brightness": 100.0
+        }

functional_zone_identifier.py ADDED Viewed

	@@ -0,0 +1,938 @@

+import logging
+import traceback
+from typing import Dict, List, Any, Optional
+logger = logging.getLogger(__name__)
+class FunctionalZoneIdentifier:
+    """
+    Main entry point for functional zone identification.
+    Integrates zone evaluation with scene-specific zone identification logic and provides a unified interface for identifying functional zones.
+    """
+    def __init__(self, zone_evaluator=None, scene_zone_identifier=None, scene_viewpoint_analyzer=None):
+        """
+        初始化功能區域識別器
+        Args:
+            zone_evaluator: 區域評估器實例
+            scene_zone_identifier: 場景區域辨識器實例
+            scene_viewpoint_analyzer: 場景視角分析器
+        """
+        try:
+            self.zone_evaluator = zone_evaluator
+            self.scene_zone_identifier = scene_zone_identifier
+            self.scene_viewpoint_analyzer = scene_viewpoint_analyzer
+            self.viewpoint_detector = scene_viewpoint_analyzer
+            logger.info("FunctionalZoneIdentifier initialized successfully with SceneViewpointAnalyzer")
+        except Exception as e:
+            logger.error(f"Failed to initialize FunctionalZoneIdentifier: {str(e)}")
+            logger.error(traceback.format_exc())
+            raise
+    def identify_functional_zones(self, detected_objects: List[Dict], scene_type: str) -> Dict:
+        """
+        識別場景內的功能區域，具有針對不同視角和文化背景的改進檢測能力。
+        如果偵測到 is_landmark=True 的物件，則優先直接呼叫 identify_landmark_zones 並回傳結果。
+        """
+        try:
+            # 1. 如果沒有啟用地標功能，就先把所有有 is_landmark=True 的物件過濾掉
+            if not getattr(self, 'enable_landmark', True):
+                detected_objects = [obj for obj in detected_objects if not obj.get("is_landmark", False)]
+            # 2. 只要檢測到任何 is_landmark=True 的物件，立即優先使用 identify_landmark_zones
+            landmark_objects = [obj for obj in detected_objects if obj.get("is_landmark", False)]
+            if landmark_objects and self.scene_zone_identifier:
+                lm_zones = self.scene_zone_identifier.identify_landmark_zones(landmark_objects)
+                return self._standardize_zone_keys_and_descriptions(lm_zones)
+            # 3. city_street
+            if scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"]:
+                scene_type = "city_street"
+            # 4.  判斷與物件數量檢查
+            if self.zone_evaluator:
+                should_identify = self.zone_evaluator.evaluate_zone_identification_feasibility(
+                    detected_objects, scene_type
+                )
+                if not should_identify:
+                    logger.info(f"Zone identification not feasible for scene type '{scene_type}'")
+                    return {}
+            else:
+                if len(detected_objects) < 2:
+                    logger.info("Insufficient objects for zone identification")
+                    return {}
+            # 5. 建立 category_regions
+            category_regions = self._build_category_regions_mapping(detected_objects)
+            zones = {}
+            # 6. 檢測場景視角
+            viewpoint_info = {"viewpoint": "eye_level"}
+            if self.scene_viewpoint_analyzer:
+                viewpoint_info = self.scene_viewpoint_analyzer.detect_scene_viewpoint(detected_objects)
+            # 7. 根據不同 scene_type 使用各種自己的區域辨識
+            if scene_type in ["living_room", "bedroom", "dining_area", "kitchen", "office_workspace", "meeting_room"]:
+                if self.scene_zone_identifier:
+                    raw_zones = self.scene_zone_identifier.identify_indoor_zones(
+                        category_regions, detected_objects, scene_type
+                    )
+                    zones.update(self._standardize_zone_keys_and_descriptions(raw_zones))
+            elif scene_type in ["city_street", "parking_lot", "park_area"]:
+                if self.scene_zone_identifier:
+                    raw_zones = self.scene_zone_identifier.identify_outdoor_general_zones(
+                        category_regions, detected_objects, scene_type
+                    )
+                    zones.update(self._standardize_zone_keys_and_descriptions(raw_zones))
+            elif "aerial" in scene_type or viewpoint_info.get("viewpoint") == "aerial":
+                if self.scene_zone_identifier:
+                    raw_zones = self.scene_zone_identifier.identify_aerial_view_zones(
+                        category_regions, detected_objects, scene_type
+                    )
+                    zones.update(self._standardize_zone_keys_and_descriptions(raw_zones))
+            elif "asian" in scene_type:
+                if self.scene_zone_identifier:
+                    asian_zones = self.scene_zone_identifier.identify_asian_cultural_zones(
+                        category_regions, detected_objects, scene_type
+                    )
+                    zones.update(self._standardize_zone_keys_and_descriptions(asian_zones))
+            elif scene_type == "urban_intersection":
+                if self.scene_zone_identifier:
+                    raw_zones = self.scene_zone_identifier.identify_intersection_zones(
+                        category_regions, detected_objects, viewpoint_info.get("viewpoint")
+                    )
+                    zones.update(self._standardize_zone_keys_and_descriptions(raw_zones))
+                    used_tl_count_per_region = {}
+                    for zone_info in raw_zones.values():
+                        obj_list = zone_info.get("objects", [])
+                        if "traffic light" in obj_list:
+                            rg = zone_info.get("region", "")
+                            count_in_zone = obj_list.count("traffic light")
+                            used_tl_count_per_region[rg] = used_tl_count_per_region.get(rg, 0) + count_in_zone
+                    signal_regions = {}
+                    for t in [obj for obj in detected_objects if obj.get("class_id") == 9]:
+                        region = t.get("region", "")
+                        signal_regions.setdefault(region, []).append(t)
+                    for idx, (region, signals) in enumerate(signal_regions.items()):
+                        total_in_region = len(signals)
+                        used_in_region = used_tl_count_per_region.get(region, 0)
+                        remaining_in_region = total_in_region - used_in_region
+                        if remaining_in_region > 0:
+                            direction = self._get_directional_description(region)
+                            if direction and direction != "central":
+                                zone_key = f"{direction} traffic control area"
+                            else:
+                                zone_key = "primary traffic control area" if idx == 0 else "auxiliary traffic control area"
+                            if zone_key in zones:
+                                suffix = 1
+                                new_key = f"{zone_key} ({suffix})"
+                                while new_key in zones:
+                                    suffix += 1
+                                    new_key = f"{zone_key} ({suffix})"
+                                zone_key = new_key
+                            zones[zone_key] = {
+                                "region": region,
+                                "objects": ["traffic light"] * remaining_in_region,
+                                "description": f"Traffic control area with {remaining_in_region} traffic lights in {region}"
+                            }
+                    for region, signals in signal_regions.items():
+                        used = used_tl_count_per_region.get(region, 0)
+                        total = len(signals)
+                        remaining = total - used
+                        # print(f"[DEBUG] Region '{region}': Total TL = {total}, Used in crossing = {used}, Remaining = {remaining}")
+            elif scene_type == "financial_district":
+                if self.scene_zone_identifier:
+                    fd_zones = self.scene_zone_identifier.identify_financial_district_zones(
+                        category_regions, detected_objects
+                    )
+                    zones.update(self._standardize_zone_keys_and_descriptions(fd_zones))
+            elif scene_type == "upscale_dining":
+                if self.scene_zone_identifier:
+                    ud_zones = self.scene_zone_identifier.identify_upscale_dining_zones(
+                        category_regions, detected_objects
+                    )
+                    zones.update(self._standardize_zone_keys_and_descriptions(ud_zones))
+            else:
+                # 如果不是上述任何一種場景，就用「預設功能區」
+                default_zones = self._identify_default_zones(category_regions, detected_objects)
+                zones.update(self._standardize_zone_keys_and_descriptions(default_zones))
+            # 8. 如果此時 zones 仍為空，就會變成 default → basic → fallback
+            if not zones:
+                default_zones = self._identify_default_zones(category_regions, detected_objects)
+                if default_zones:
+                    zones.update(self._standardize_zone_keys_and_descriptions(default_zones))
+                else:
+                    basic_zones = self._create_basic_zones_from_objects(detected_objects, scene_type)
+                    zones.update(self._standardize_zone_keys_and_descriptions(basic_zones))
+            # 通用 fallback：把所有還沒被列出的 (class_name, region) 通通補進去
+            fallback_zones = self._generate_category_fallback_zones(detected_objects, zones)
+            zones.update(fallback_zones)
+            # Debug: 列印出各功能區的 traffic light 統計
+            total_tl_in_zones = 0
+            for zone_key, zone_info in zones.items():
+                if isinstance(zone_info, dict):
+                    sub_objs = zone_info.get("objects", [])
+                else:
+                    sub_objs = []
+                t_in_zone = [obj for obj in sub_objs if obj == "traffic light"]
+                # print(f"[DEBUG] identify_functional_zones - Zone '{zone_key}' has {len(t_in_zone)} traffic light(s).")
+                total_tl_in_zones += len(t_in_zone)
+            # print(f"[DEBUG] identify_functional_zones - Total traffic lights in zones: {total_tl_in_zones}")
+            logger.info(f"Identified {len(zones)} functional zones for scene type '{scene_type}'")
+            return zones
+        except Exception as e:
+            logger.error(f"Error identifying functional zones: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}
+    def _standardize_zone_keys_and_descriptions(self, raw_zones: Dict) -> Dict:
+        """
+        標準化區域鍵名和描述，將內部標識符轉換為描述性名稱
+        Args:
+            raw_zones: 原始區域識別結果
+        Returns:
+            Dict: 標準化後的區域字典
+        """
+        try:
+            standardized_zones = {}
+            for zone_key, zone_data in raw_zones.items():
+                # 生成描述性的區域鍵名
+                descriptive_key = self._generate_descriptive_zone_key(zone_key, zone_data)
+                # 確保區域描述也經過標準化
+                if isinstance(zone_data, dict) and "description" in zone_data:
+                    zone_data["description"] = self._enhance_zone_description(zone_data["description"], zone_data)
+                standardized_zones[descriptive_key] = zone_data
+            return standardized_zones
+        except Exception as e:
+            logger.error(f"Error standardizing zone keys and descriptions: {str(e)}")
+            return raw_zones
+    def _generate_descriptive_zone_key(self, original_key: str, zone_data: Dict) -> str:
+        """
+        基於區域內容生成描述性的鍵名
+        核心修改：只要該區域內有任一個 'traffic light'，就優先回傳 'traffic control zone'，
+        """
+        try:
+            objects = zone_data.get("objects", [])
+            region = zone_data.get("region", "")
+            # 優先檢查是否含有 traffic light
+            if any(obj == "traffic light" or "traffic light" in obj for obj in objects):
+                return "traffic control zone"
+            # 如果沒有 traffic light，才繼續分析「主要物件」順序
+            primary_objects = self._analyze_primary_objects(objects)
+            # 依序檢查人、車、家具、紅綠燈等
+            if "person" in primary_objects:
+                if len([o for o in objects if o == "person"]) > 1:
+                    return "pedestrian activity area"
+                else:
+                    return "individual activity zone"
+            elif any(vehicle in primary_objects for vehicle in ["car", "truck", "bus", "motorcycle"]):
+                return "vehicle movement area"
+            elif any(furniture in primary_objects for furniture in ["chair", "table", "sofa", "bed"]):
+                return "furniture arrangement area"
+            # 若上述都不符合，改用「基於位置」做 fallback
+            position_descriptions = {
+                "top_left": "upper left area",
+                "top_center": "upper central area",
+                "top_right": "upper right area",
+                "middle_left": "left side area",
+                "middle_center": "main crossing area",
+                "middle_right": "right side area",
+                "bottom_left": "lower left area",
+                "bottom_center": "lower central area",
+                "bottom_right": "lower right area"
+            }
+            if region in position_descriptions:
+                return position_descriptions[region]
+            # 再次檢查主要物件，給出另一種 fallback 命名
+            if primary_objects:
+                if "traffic light" in primary_objects:
+                    return "traffic control zone"
+                elif any(vehicle in primary_objects for vehicle in ["car", "truck", "bus"]):
+                    return "vehicle movement area"
+                elif "person" in primary_objects:
+                    return "pedestrian activity area"
+            # 最後最後的備用名稱
+            return "activity area"
+        except Exception as e:
+            logger.warning(f"Error generating descriptive key for '{original_key}': {str(e)}")
+            return "activity area"
+    def _analyze_primary_objects(self, objects: List[str]) -> List[str]:
+        """
+        分析區域中的主要物件類型
+        Args:
+            objects: 物件名稱列表
+        Returns:
+            List[str]: 主要物件類型列表
+        """
+        try:
+            # 計算物件出現頻率
+            object_counts = {}
+            for obj in objects:
+                normalized_obj = obj.replace('_', ' ').lower().strip()
+                object_counts[normalized_obj] = object_counts.get(normalized_obj, 0) + 1
+            # 按出現頻率排序，返回前三個主要物件
+            sorted_objects = sorted(object_counts.items(), key=lambda x: x[1], reverse=True)
+            return [obj[0] for obj in sorted_objects[:3]]
+        except Exception as e:
+            logger.warning(f"Error analyzing primary objects: {str(e)}")
+            return []
+    def _enhance_zone_description(self, original_description: str, zone_data: Dict) -> str:
+        """
+        增強區域描述的自然性和完整性
+        """
+        try:
+            if not original_description or not original_description.strip():
+                return self._generate_fallback_description(zone_data)
+            import re
+            enhanced = original_description.strip()
+            # 改善技術性表達為自然語言
+            enhanced = re.sub(r'\bin central direction\b', 'in the center', enhanced)
+            enhanced = re.sub(r'\bin west area\b', 'on the left side', enhanced)
+            enhanced = re.sub(r'\bin east direction\b', 'on the right side', enhanced)
+            enhanced = re.sub(r'\bnear traffic signals\b', 'near the traffic lights', enhanced)
+            enhanced = re.sub(r'\bwith (\d+) (\w+)\b', r'where \1 \2 can be seen', enhanced)
+            # 移除重複和冗餘表達
+            enhanced = re.sub(r'\barea with.*?in.*?area\b', lambda m: m.group(0).split(' in ')[0], enhanced)
+            enhanced = enhanced.replace('traffic area', 'area').replace('crossing area', 'crossing')
+            # 標準化描述結構
+            if enhanced.startswith('Pedestrian'):
+                enhanced = re.sub(r'^Pedestrian crossing area', 'The main pedestrian crossing', enhanced)
+            elif enhanced.startswith('Vehicle'):
+                enhanced = re.sub(r'^Vehicle traffic area', 'The vehicle movement area', enhanced)
+            elif enhanced.startswith('Traffic control'):
+                enhanced = re.sub(r'^Traffic control area', 'Traffic management elements', enhanced)
+            # 移除內部標識符格式
+            enhanced = re.sub(r'\b\w+_\w+(?:_\w+)*\b', lambda m: m.group(0).replace('_', ' '), enhanced)
+            # 確保描述的完整性
+            if not enhanced.endswith('.'):
+                enhanced += '.'
+            # 改善描述的自然性
+            enhanced = enhanced.replace('with with', 'with')
+            enhanced = re.sub(r'\s{2,}', ' ', enhanced)
+            return enhanced
+        except Exception as e:
+            logger.warning(f"Error enhancing zone description: {str(e)}")
+            return original_description if original_description else "A functional area within the scene."
+    def _generate_fallback_description(self, zone_data: Dict) -> str:
+        """
+        為缺少描述的區域生成備用描述
+        Args:
+            zone_data: 區域數據
+        Returns:
+            str: 備用描述
+        """
+        try:
+            objects = zone_data.get("objects", [])
+            region = zone_data.get("region", "")
+            if objects:
+                object_count = len(objects)
+                unique_objects = list(set(objects))
+                if object_count == 1:
+                    return f"Area containing {unique_objects[0].replace('_', ' ')}."
+                elif len(unique_objects) <= 3:
+                    obj_list = ", ".join([obj.replace('_', ' ') for obj in unique_objects])
+                    return f"Area featuring {obj_list}."
+                else:
+                    return f"Multi-functional area with {object_count} elements including various objects."
+            return "Functional area within the scene."
+        except Exception as e:
+            logger.warning(f"Error generating fallback description: {str(e)}")
+            return "Activity area."
+    def _build_category_regions_mapping(self, detected_objects: List[Dict]) -> Dict:
+        """
+        建立物件按類別和區域的分組映射
+        Args:
+            detected_objects: 檢測到的物件列表
+        Returns:
+            按類別和區域分組的物件字典
+        """
+        try:
+            category_regions = {}
+            for obj in detected_objects:
+                category = self._categorize_object(obj)
+                if not category:
+                    continue
+                if category not in category_regions:
+                    category_regions[category] = {}
+                region = obj.get("region", "center")
+                if region not in category_regions[category]:
+                    category_regions[category][region] = []
+                category_regions[category][region].append(obj)
+            logger.debug(f"Built category regions mapping with {len(category_regions)} categories")
+            return category_regions
+        except Exception as e:
+            logger.error(f"Error building category regions mapping: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}
+    def _categorize_object(self, obj: Dict) -> str:
+        """
+        將檢測到的物件分類到功能類別中，用於區域識別
+        Args:
+            obj: 物件字典
+        Returns:
+            物件功能類別字串
+        """
+        try:
+            class_id = obj.get("class_id", -1)
+            class_name = obj.get("class_name", "").lower()
+            # 使用現有的類別映射（如果可用）
+            if hasattr(self, 'OBJECT_CATEGORIES') and self.OBJECT_CATEGORIES:
+                for category, ids in self.OBJECT_CATEGORIES.items():
+                    if class_id in ids:
+                        return category
+            # 基於COCO類別名稱的後備分類
+            furniture_items = ["chair", "couch", "bed", "dining table", "toilet"]
+            plant_items = ["potted plant"]
+            electronic_items = ["tv", "laptop", "mouse", "remote", "keyboard", "cell phone"]
+            vehicle_items = ["bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat"]
+            person_items = ["person"]
+            kitchen_items = ["bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
+                            "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog",
+                            "pizza", "donut", "cake", "refrigerator", "oven", "toaster", "sink", "microwave"]
+            sports_items = ["frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
+                        "baseball glove", "skateboard", "surfboard", "tennis racket"]
+            personal_items = ["handbag", "tie", "suitcase", "umbrella", "backpack"]
+            if any(item in class_name for item in furniture_items):
+                return "furniture"
+            elif any(item in class_name for item in plant_items):
+                return "plant"
+            elif any(item in class_name for item in electronic_items):
+                return "electronics"
+            elif any(item in class_name for item in vehicle_items):
+                return "vehicle"
+            elif any(item in class_name for item in person_items):
+                return "person"
+            elif any(item in class_name for item in kitchen_items):
+                return "kitchen_items"
+            elif any(item in class_name for item in sports_items):
+                return "sports"
+            elif any(item in class_name for item in personal_items):
+                return "personal_items"
+            else:
+                return "misc"
+        except Exception as e:
+            logger.error(f"Error categorizing object: {str(e)}")
+            logger.error(traceback.format_exc())
+            return "misc"
+    def _identify_default_zones(self, category_regions: Dict, detected_objects: List[Dict]) -> Dict:
+        """
+        當沒有匹配到特定場景類型時的一般功能區域識別
+        Args:
+            category_regions: 按類別和區域分組的物件字典
+            detected_objects: 檢測到的物件列表
+        Returns:
+            預設功能區域字典
+        """
+        try:
+            zones = {}
+            # 按類別分組物件並找到主要集中區域
+            for category, regions in category_regions.items():
+                if not regions:
+                    continue
+                # 找到此類別中物件最多的區域
+                main_region = max(regions.items(),
+                            key=lambda x: len(x[1]),
+                            default=(None, []))
+                if main_region[0] is None or len(main_region[1]) < 2:
+                    continue
+                # 創建基於物件類別的區域
+                zone_objects = [obj["class_name"] for obj in main_region[1]]
+                # 如果物件太少，跳過
+                if len(zone_objects) < 2:
+                    continue
+                # 根據類別創建區域名稱和描述
+                if category == "furniture":
+                    zones["furniture arrangement area"] = {
+                        "region": main_region[0],
+                        "objects": zone_objects,
+                        "description": f"Furniture arrangement area featuring {self._format_object_list_naturally(zone_objects[:3])}"
+                    }
+                elif category == "electronics":
+                    zones["electronics area"] = {
+                        "region": main_region[0],
+                        "objects": zone_objects,
+                        "description": f"Electronics area containing {self._format_object_list_naturally(zone_objects[:3])}"
+                    }
+                elif category == "kitchen_items":
+                    zones["dining_zone"] = {
+                        "region": main_region[0],
+                        "objects": zone_objects,
+                        "description": f"Dining or food area with {', '.join(zone_objects[:3])}"
+                    }
+                elif category == "vehicle":
+                    zones["vehicle_zone"] = {
+                        "region": main_region[0],
+                        "objects": zone_objects,
+                        "description": f"Area with vehicles including {', '.join(zone_objects[:3])}"
+                    }
+                elif category == "personal_items":
+                    zones["personal_items_zone"] = {
+                        "region": main_region[0],
+                        "objects": zone_objects,
+                        "description": f"Area with personal items including {', '.join(zone_objects[:3])}"
+                    }
+            # 檢查人群聚集
+            people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
+            if len(people_objs) >= 2:
+                people_regions = {}
+                for obj in people_objs:
+                    region = obj["region"]
+                    if region not in people_regions:
+                        people_regions[region] = []
+                    people_regions[region].append(obj)
+                if people_regions:
+                    main_people_region = max(people_regions.items(),
+                                        key=lambda x: len(x[1]),
+                                        default=(None, []))
+                    if main_people_region[0] is not None:
+                        zones["people_zone"] = {
+                            "region": main_people_region[0],
+                            "objects": ["person"] * len(main_people_region[1]),
+                            "description": f"Area with {len(main_people_region[1])} people"
+                        }
+            logger.debug(f"Identified {len(zones)} default zones")
+            return zones
+        except Exception as e:
+            logger.error(f"Error identifying default zones: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}
+    def _format_object_list_naturally(self, object_list: List[str]) -> str:
+        """
+        將物件列表格式化為自然語言表達
+        Args:
+            object_list: 物件名稱列表
+        Returns:
+            str: 自然語言格式的物件列表
+        """
+        try:
+            if not object_list:
+                return "various items"
+            # 標準化物件名稱
+            normalized_objects = []
+            for obj in object_list:
+                normalized = obj.replace('_', ' ').strip()
+                if normalized:
+                    normalized_objects.append(normalized)
+            if not normalized_objects:
+                return "various items"
+            # 格式化列表
+            if len(normalized_objects) == 1:
+                return normalized_objects[0]
+            elif len(normalized_objects) == 2:
+                return f"{normalized_objects[0]} and {normalized_objects[1]}"
+            else:
+                return ", ".join(normalized_objects[:-1]) + f", and {normalized_objects[-1]}"
+        except Exception as e:
+            logger.warning(f"Error formatting object list naturally: {str(e)}")
+            return "various items"
+    def _create_basic_zones_from_objects(self, detected_objects: List[Dict], scene_type: str) -> Dict:
+        """
+        從個別高置信度物件創建基本功能區域
+        這是標準區域識別失敗時的後備方案
+        Args:
+            detected_objects: 檢測到的物件列表
+            scene_type: 場景類型
+        Returns:
+            基本區域字典
+        """
+        try:
+            zones = {}
+            # 專注於高置信度物件
+            high_conf_objects = [obj for obj in detected_objects if obj.get("confidence", 0) >= 0.6]
+            if not high_conf_objects:
+                high_conf_objects = detected_objects  # 後備到所有物件
+            # 基於個別重要物件創建區域
+            processed_objects = set()  # 避免重複處理相同類型的物件
+            for obj in high_conf_objects[:3]:  # 限制為前3個物件
+                class_name = obj["class_name"]
+                region = obj.get("region", "center")
+                # 避免為同一類型物件創建多個區域
+                if class_name in processed_objects:
+                    continue
+                processed_objects.add(class_name)
+                # 基於物件類型創建描述性區域
+                zone_description = self._get_basic_zone_description(class_name, scene_type)
+                descriptive_key = self._generate_object_based_zone_key(class_name, region)
+                if zone_description and descriptive_key:
+                    zones[descriptive_key] = {
+                        "region": region,
+                        "objects": [class_name],
+                        "description": zone_description
+                    }
+            logger.debug(f"Created {len(zones)} basic zones from high confidence objects")
+            return zones
+        except Exception as e:
+            logger.error(f"Error creating basic zones from objects: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}
+    def _generate_object_based_zone_key(self, class_name: str, region: str) -> str:
+        """
+        基於物件類型和位置生成描述性的區域鍵名
+        Args:
+            class_name: 物件類別名稱
+            region: 區域位置
+        Returns:
+            str: 描述性區域鍵名
+        """
+        try:
+            # 標準化物件名稱
+            normalized_class = class_name.replace('_', ' ').lower().strip()
+            # 物件類型對應的區域描述
+            object_zone_mapping = {
+                'person': 'activity area',
+                'car': 'vehicle area',
+                'truck': 'vehicle area',
+                'bus': 'vehicle area',
+                'motorcycle': 'vehicle area',
+                'bicycle': 'cycling area',
+                'traffic light': 'traffic control area',
+                'chair': 'seating area',
+                'sofa': 'seating area',
+                'bed': 'rest area',
+                'dining table': 'dining area',
+                'tv': 'entertainment area',
+                'laptop': 'workspace area',
+                'potted plant': 'decorative area'
+            }
+            base_description = object_zone_mapping.get(normalized_class, f"{normalized_class} area")
+            # 添加位置信息以提供更具體的描述
+            position_modifiers = {
+                'top_left': 'upper left',
+                'top_center': 'upper central',
+                'top_right': 'upper right',
+                'middle_left': 'left side',
+                'middle_center': 'central',
+                'middle_right': 'right side',
+                'bottom_left': 'lower left',
+                'bottom_center': 'lower central',
+                'bottom_right': 'lower right'
+            }
+            if region in position_modifiers:
+                return f"{position_modifiers[region]} {base_description}"
+            return base_description
+        except Exception as e:
+            logger.warning(f"Error generating object-based zone key for '{class_name}': {str(e)}")
+            return "activity area"
+    def _get_basic_zone_description(self, class_name: str, scene_type: str) -> str:
+        """
+        基於物件和場景類型生成基本區域描述
+        Args:
+            class_name: 物件類別名稱
+            scene_type: 場景類型
+        Returns:
+            區域描述字串
+        """
+        try:
+            # 物件特定描述
+            descriptions = {
+                "bed": "Sleeping and rest area",
+                "sofa": "Seating and relaxation area",
+                "chair": "Seating area",
+                "dining table": "Dining and meal area",
+                "tv": "Entertainment and media area",
+                "laptop": "Work and computing area",
+                "potted plant": "Decorative and green space area",
+                "refrigerator": "Food storage and kitchen area",
+                "car": "Vehicle and transportation area",
+                "person": "Activity and social area"
+            }
+            return descriptions.get(class_name, f"Functional area with {class_name}")
+        except Exception as e:
+            logger.error(f"Error getting basic zone description for '{class_name}': {str(e)}")
+            return f"Functional area with {class_name}"
+    def _generate_category_fallback_zones(self, all_detected_objects: List[Dict], current_zones: Dict) -> Dict:
+        """
+        通用 fallback：針對 all_detected_objects 裡，每一個 (class_name, region) 組合是否已經
+        在 current_zones 裡出現過。如果還沒，就為它們產生一個 fallback zone。
+        """
+        general_fallback = {
+                0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus',
+                6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant',
+                11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat',
+                16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant', 21: 'bear',
+                22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag',
+                27: 'tie', 28: 'suitcase', 29: 'frisbee', 30: 'skis', 31: 'snowboard',
+                32: 'sports ball', 33: 'kite', 34: 'baseball bat', 35: 'baseball glove',
+                36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle',
+                40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl',
+                46: 'banana', 47: 'apple', 48: 'sandwich', 49: 'orange', 50: 'broccoli',
+                51: 'carrot', 52: 'hot dog', 53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair',
+                57: 'couch', 58: 'potted plant', 59: 'bed', 60: 'dining table', 61: 'toilet',
+                62: 'tv', 63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard',
+                67: 'cell phone', 68: 'microwave', 69: 'oven', 70: 'toaster', 71: 'sink',
+                72: 'refrigerator', 73: 'book', 74: 'clock', 75: 'vase', 76: 'scissors',
+                77: 'teddy bear', 78: 'hair drier', 79: 'toothbrush'
+        }
+        # 1. 統計 current_zones 裡，已使用掉的 (class_name, region) 次數
+        used_count = {}
+        for zone_info in current_zones.values():
+            rg = zone_info.get("region", "")
+            for obj_name in zone_info.get("objects", []):
+                key = (obj_name, rg)
+                used_count[key] = used_count.get(key, 0) + 1
+        # 2. 統計 all_detected_objects 裡的 (class_name, region) 總次數
+        total_count = {}
+        for obj in all_detected_objects:
+            cname = obj.get("class_name", "")
+            rg = obj.get("region", "")
+            key = (cname, rg)
+            total_count[key] = total_count.get(key, 0) + 1
+        # 3. 把 default_classes 轉換成「class_name → fallback 區域 type」的對照表
+        category_to_fallback = {
+            # 行人與交通工具
+            "person":        "pedestrian area",
+            "bicycle":       "vehicle movement area",
+            "car":           "vehicle movement area",
+            "motorcycle":    "vehicle movement area",
+            "airplane":      "vehicle movement area",
+            "bus":           "vehicle movement area",
+            "train":         "vehicle movement area",
+            "truck":         "vehicle movement area",
+            "boat":          "vehicle movement area",
+            "traffic light": "traffic control area",
+            "fire hydrant":  "traffic control area",
+            "stop sign":     "traffic control area",
+            "parking meter": "traffic control area",
+            "bench":         "public furniture area",
+            # 動物類、鳥類
+            "bird":          "animal area",
+            "cat":           "animal area",
+            "dog":           "animal area",
+            "horse":         "animal area",
+            "sheep":         "animal area",
+            "cow":           "animal area",
+            "elephant":      "animal area",
+            "bear":          "animal area",
+            "zebra":         "animal area",
+            "giraffe":       "animal area",
+            # 托運與行李
+            "backpack":      "personal items area",
+            "umbrella":      "personal items area",
+            "handbag":       "personal items area",
+            "tie":           "personal items area",
+            "suitcase":      "personal items area",
+            # 運動器材
+            "frisbee":       "sports area",
+            "skis":          "sports area",
+            "snowboard":     "sports area",
+            "sports ball":   "sports area",
+            "kite":          "sports area",
+            "baseball bat":  "sports area",
+            "baseball glove":"sports area",
+            "skateboard":    "sports area",
+            "surfboard":     "sports area",
+            "tennis racket": "sports area",
+            # 廚房與食品（Kitchen）
+            "bottle":        "kitchen area",
+            "wine glass":    "kitchen area",
+            "cup":           "kitchen area",
+            "fork":          "kitchen area",
+            "knife":         "kitchen area",
+            "spoon":         "kitchen area",
+            "bowl":          "kitchen area",
+            "banana":        "kitchen area",
+            "apple":         "kitchen area",
+            "sandwich":      "kitchen area",
+            "orange":        "kitchen area",
+            "broccoli":      "kitchen area",
+            "carrot":        "kitchen area",
+            "hot dog":       "kitchen area",
+            "pizza":         "kitchen area",
+            "donut":         "kitchen area",
+            "cake":          "kitchen area",
+            "dining table":  "furniture arrangement area",
+            "refrigerator":  "kitchen area",
+            "oven":          "kitchen area",
+            "microwave":     "kitchen area",
+            "toaster":       "kitchen area",
+            "sink":          "kitchen area",
+            "book":          "miscellaneous area",
+            "clock":         "miscellaneous area",
+            "vase":          "decorative area",
+            "scissors":      "miscellaneous area",
+            "teddy bear":    "miscellaneous area",
+            "hair drier":    "miscellaneous area",
+            "toothbrush":    "miscellaneous area",
+            # 電子產品
+            "tv":            "electronics area",
+            "laptop":        "electronics area",
+            "mouse":         "electronics area",
+            "remote":        "electronics area",
+            "keyboard":      "electronics area",
+            "cell phone":    "electronics area",
+            # 家具類
+            "chair":         "furniture arrangement area",
+            "couch":         "furniture arrangement area",
+            "bed":           "furniture arrangement area",
+            "toilet":        "furniture arrangement area",
+            # 植物（室內植物或戶外綠化）
+            "potted plant":  "decorative area",
+        }
+        # 4. 計算缺少的 (class_name, region) 並建立 fallback zone
+        for (cname, rg), total in total_count.items():
+            used = used_count.get((cname, rg), 0)
+            missing = total - used
+            if missing <= 0:
+                continue
+            # (A) 決定這個 cname 在 fallback 裡屬於哪個大 class（zone_type）
+            zone_type = category_to_fallback.get(cname, "miscellaneous area")
+            # (B) 根據 region 與 zone_type 組合成 fallback_key
+            fallback_key = f"{rg} {zone_type}"
+            # (C) 如果名稱重複，就在後面加 (1),(2),… 避免掉衝突
+            if fallback_key in current_zones or fallback_key in general_fallback:
+                suffix = 1
+                new_key = f"{fallback_key} ({suffix})"
+                while new_key in current_zones or new_key in general_fallback:
+                    suffix += 1
+                    new_key = f"{fallback_key} ({suffix})"
+                fallback_key = new_key
+            # (D) 建立這支 fallback zone，objects 裡放 missing 個 cname
+            general_fallback[fallback_key] = {
+                "region": rg,
+                "objects": [cname] * missing,
+                "description": f"{missing} {cname}(s) placed in fallback {zone_type} for region {rg}"
+            }
+        return general_fallback

image_analyzer.py ADDED Viewed

	@@ -0,0 +1,365 @@

+import numpy as np
+import logging
+import traceback
+from typing import List, Dict, Tuple, Optional, Union, Any
+from PIL import Image
+class ImageAnalyzer:
+    """
+    專注於圖像分析和預處理，包括多尺度金字塔分析、視角分析、建築特徵識別和圖像增強等功能
+    """
+    def __init__(self):
+        """
+        初始化圖像分析器
+        """
+        self.logger = logging.getLogger(__name__)
+    def get_image_hash(self, image: Union[Image.Image, np.ndarray]) -> int:
+        """
+        為圖像生成簡單的 hash 值用於快取
+        Args:
+            image: PIL Image 或 numpy 數組
+        Returns:
+            int: 圖像的 hash 值
+        """
+        try:
+            if isinstance(image, np.ndarray):
+                # 對於 numpy 數組，降採樣並計算簡單 hash
+                small_img = image[::10, ::10] if image.ndim == 3 else image
+                return hash(small_img.tobytes())
+            else:
+                # 對於 PIL 圖像，調整大小後轉換為 bytes
+                small_img = image.resize((32, 32))
+                return hash(small_img.tobytes())
+        except Exception as e:
+            self.logger.error(f"Error generating image hash: {e}")
+            self.logger.error(traceback.format_exc())
+            return 0
+    def enhance_features(self, image: Union[Image.Image, np.ndarray]) -> Image.Image:
+        """
+        增強圖像特徵以改善地標檢測
+        Args:
+            image: 輸入圖像
+        Returns:
+            PIL.Image: 增強後的圖像
+        """
+        try:
+            # ensure PIL format
+            if not isinstance(image, Image.Image):
+                if isinstance(image, np.ndarray):
+                    image = Image.fromarray(image)
+                else:
+                    raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
+            # 轉換為numpy進行處理
+            img_array = np.array(image)
+            # 跳過灰度圖像的處理
+            if len(img_array.shape) < 3:
+                return image
+            # 應用自適應對比度增強
+            try:
+                from skimage import color, exposure
+                # 轉換到LAB色彩空間
+                if img_array.shape[2] == 4:  # 處理RGBA
+                    img_array = img_array[:,:,:3]
+                lab = color.rgb2lab(img_array[:,:,:3] / 255.0)
+                l_channel = lab[:,:,0]
+                # 增強L通道的對比度
+                p2, p98 = np.percentile(l_channel, (2, 98))
+                l_channel_enhanced = exposure.rescale_intensity(l_channel, in_range=(p2, p98))
+                # 替換L通道並轉換回RGB
+                lab[:,:,0] = l_channel_enhanced
+                enhanced_img = color.lab2rgb(lab) * 255.0
+                enhanced_img = enhanced_img.astype(np.uint8)
+                return Image.fromarray(enhanced_img)
+            except ImportError:
+                self.logger.warning("skimage not available for feature enhancement")
+                return image
+        except Exception as e:
+            self.logger.error(f"Error in feature enhancement: {e}")
+            self.logger.error(traceback.format_exc())
+            return image
+    def analyze_viewpoint(self, image: Union[Image.Image, np.ndarray],
+                         clip_model_manager) -> Dict[str, Any]:
+        """
+        分析圖像視角以調整檢測參數
+        Args:
+            image: 輸入圖像
+            clip_model_manager: CLIP模型管理器實例
+        Returns:
+            Dict: 視角分析結果
+        """
+        try:
+            viewpoint_prompts = {
+                "aerial_view": "an aerial view from above looking down",
+                "street_level": "a street level view looking up at a tall structure",
+                "eye_level": "an eye-level horizontal view of a landmark",
+                "distant": "a distant view of a landmark on the horizon",
+                "close_up": "a close-up detailed view of architectural features",
+                "interior": "an interior view inside a structure",
+                "angled_view": "an angled view of a structure",
+                "low_angle": "a low angle view looking up at a building"
+            }
+            # 計算相似度分數
+            viewpoint_scores = self.calculate_similarity_scores(image, viewpoint_prompts, clip_model_manager)
+            # 找到主要視角
+            dominant_viewpoint = max(viewpoint_scores.items(), key=lambda x: x[1])
+            return {
+                "viewpoint_scores": viewpoint_scores,
+                "dominant_viewpoint": dominant_viewpoint[0],
+                "confidence": dominant_viewpoint[1]
+            }
+        except Exception as e:
+            self.logger.error(f"Error in viewpoint analysis: {e}")
+            self.logger.error(traceback.format_exc())
+            return {
+                "viewpoint_scores": {},
+                "dominant_viewpoint": "eye_level",
+                "confidence": 0.0
+            }
+    def calculate_similarity_scores(self, image: Union[Image.Image, np.ndarray],
+                                  prompts: Dict[str, str],
+                                  clip_model_manager) -> Dict[str, float]:
+        """
+        計算圖像與一組特定提示之間的相似度分數
+        Args:
+            image: 輸入圖像
+            prompts: 提示詞字典 {名稱: 提示文本}
+            clip_model_manager: CLIP模型管理器實例
+        Returns:
+            Dict[str, float]: 每個提示的相似度分數
+        """
+        try:
+            # ensure PIL format
+            if not isinstance(image, Image.Image):
+                if isinstance(image, np.ndarray):
+                    image = Image.fromarray(image)
+                else:
+                    raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
+            # preprocess image
+            image_input = clip_model_manager.preprocess_image(image)
+            # get image features
+            image_features = clip_model_manager.encode_image(image_input)
+            # 計算與每個提示的similarity
+            scores = {}
+            prompt_texts = list(prompts.values())
+            prompt_features = clip_model_manager.encode_single_text(prompt_texts)
+            # 計算相似度
+            similarity = clip_model_manager.calculate_similarity(image_features, prompt_features)
+            # result
+            for i, (name, _) in enumerate(prompts.items()):
+                scores[name] = float(similarity[0][i])
+            return scores
+        except Exception as e:
+            self.logger.error(f"Error calculating similarity scores: {e}")
+            self.logger.error(traceback.format_exc())
+            return {}
+    def analyze_architectural_features(self, image: Union[Image.Image, np.ndarray],
+                                     clip_model_manager) -> Dict[str, Any]:
+        """
+        分析圖像中結構的建築特徵，不硬編碼特定地標
+        Args:
+            image: 輸入圖像
+            clip_model_manager: CLIP模型管理器實例
+        Returns:
+            Dict: 建築特徵分析結果
+        """
+        try:
+            # 定義通用建築特徵提示，適用於所有類型的地標
+            architecture_prompts = {
+                "tall_structure": "a tall vertical structure standing alone",
+                "tiered_building": "a building with multiple stacked tiers or segments",
+                "historical_structure": "a building with historical architectural elements",
+                "modern_design": "a modern structure with contemporary architectural design",
+                "segmented_exterior": "a structure with visible segmented or sectioned exterior",
+                "viewing_platform": "a tall structure with observation area at the top",
+                "time_display": "a structure with timepiece features",
+                "glass_facade": "a building with prominent glass exterior surfaces",
+                "memorial_structure": "a monument or memorial structure",
+                "ancient_construction": "ancient constructed elements or archaeological features",
+                "natural_landmark": "a natural geographic formation or landmark",
+                "slanted_design": "a structure with non-vertical or leaning profile"
+            }
+            # 計算與通用建築模式的相似度分數
+            context_scores = self.calculate_similarity_scores(image, architecture_prompts, clip_model_manager)
+            # 確定最相關的建築特徵
+            top_features = sorted(context_scores.items(), key=lambda x: x[1], reverse=True)[:3]
+            # 計算特徵置信度
+            context_confidence = sum(score for _, score in top_features) / 3
+            # 根據頂級特徵確定主要建築類別
+            architectural_categories = {
+                "tower": ["tall_structure", "viewing_platform", "time_display"],
+                "skyscraper": ["tall_structure", "modern_design", "glass_facade"],
+                "historical": ["historical_structure", "ancient_construction", "memorial_structure"],
+                "natural": ["natural_landmark"],
+                "distinctive": ["tiered_building", "segmented_exterior", "slanted_design"]
+            }
+            # 根據頂級特徵為每個類別評分
+            category_scores = {}
+            for category, features in architectural_categories.items():
+                category_score = 0
+                for feature, score in context_scores.items():
+                    if feature in features:
+                        category_score += score
+                category_scores[category] = category_score
+            primary_category = max(category_scores.items(), key=lambda x: x[1])[0]
+            return {
+                "architectural_features": top_features,
+                "context_confidence": context_confidence,
+                "primary_category": primary_category,
+                "category_scores": category_scores
+            }
+        except Exception as e:
+            self.logger.error(f"Error in architectural feature analysis: {e}")
+            self.logger.error(traceback.format_exc())
+            return {
+                "architectural_features": [],
+                "context_confidence": 0.0,
+                "primary_category": "building",
+                "category_scores": {}
+            }
+    def perform_pyramid_analysis(self, image: Union[Image.Image, np.ndarray],
+                               clip_model_manager, landmark_data_manager,
+                               levels: int = 4, base_threshold: float = 0.25,
+                               aspect_ratios: List[float] = [1.0, 0.75, 1.5]) -> Dict[str, Any]:
+        """
+        對圖像執行多尺度金字塔分析以改善地標檢測
+        Args:
+            image: 輸入圖像
+            clip_model_manager: CLIP模型管理器實例
+            landmark_data_manager: 地標數據管理器實例
+            levels: 金字塔層級數
+            base_threshold: 基礎置信度閾值
+            aspect_ratios: 不同縱橫比列表
+        Returns:
+            Dict: 金字塔分析結果
+        """
+        try:
+            # 確保圖像是PIL格式
+            if not isinstance(image, Image.Image):
+                if isinstance(image, np.ndarray):
+                    image = Image.fromarray(image)
+                else:
+                    raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
+            width, height = image.size
+            pyramid_results = []
+            # 獲取預計算的地標文本特徵
+            landmark_prompts = landmark_data_manager.get_landmark_prompts()
+            if not landmark_prompts:
+                return {
+                    "is_landmark": False,
+                    "results": [],
+                    "best_result": None
+                }
+            landmark_text_features = clip_model_manager.encode_text_batch(landmark_prompts)
+            # 對每個縮放和縱橫比組合進行處理
+            for level in range(levels):
+                # 計算縮放因子
+                scale_factor = 1.0 - (level * 0.2)
+                for aspect_ratio in aspect_ratios:
+                    # 計算新尺寸，保持面積近似不變
+                    if aspect_ratio != 1.0:
+                        # 保持面積近似不變的情況下調整縱橫比
+                        new_width = int(width * scale_factor * (1/aspect_ratio)**0.5)
+                        new_height = int(height * scale_factor * aspect_ratio**0.5)
+                    else:
+                        new_width = int(width * scale_factor)
+                        new_height = int(height * scale_factor)
+                    # 調整圖像大小
+                    scaled_image = image.resize((new_width, new_height), Image.LANCZOS)
+                    # 預處理圖像
+                    image_input = clip_model_manager.preprocess_image(scaled_image)
+                    # 獲取圖像特徵
+                    image_features = clip_model_manager.encode_image(image_input)
+                    # 計算相似度
+                    similarity = clip_model_manager.calculate_similarity(image_features, landmark_text_features)
+                    # 找到最佳匹配
+                    best_idx = similarity[0].argmax().item()
+                    best_score = similarity[0][best_idx]
+                    if best_score >= base_threshold:
+                        landmark_id, landmark_info = landmark_data_manager.get_landmark_by_index(best_idx)
+                        if landmark_id:
+                            pyramid_results.append({
+                                "landmark_id": landmark_id,
+                                "landmark_name": landmark_info.get("name", "Unknown"),
+                                "confidence": float(best_score),
+                                "scale_factor": scale_factor,
+                                "aspect_ratio": aspect_ratio,
+                                "location": landmark_info.get("location", "Unknown Location")
+                            })
+            # 按置信度排序
+            pyramid_results.sort(key=lambda x: x["confidence"], reverse=True)
+            return {
+                "is_landmark": len(pyramid_results) > 0,
+                "results": pyramid_results,
+                "best_result": pyramid_results[0] if pyramid_results else None
+            }
+        except Exception as e:
+            self.logger.error(f"Error in pyramid analysis: {e}")
+            self.logger.error(traceback.format_exc())
+            return {
+                "is_landmark": False,
+                "results": [],
+                "best_result": None
+            }

image_processor.py CHANGED Viewed

@@ -32,6 +32,26 @@ class ImageProcessor:
             self.enable_places365 = enable_places365
             self.model_instances = {}
             # Initialize ColorMapper
             self.color_mapper = ColorMapper()
             print("ColorMapper initialized successfully")
@@ -57,12 +77,12 @@ class ImageProcessor:
             # Initialize SceneAnalyzer with error handling
             self.scene_analyzer = None
-            self.class_names = None  # Will be set when first model is loaded
             try:
                 # Initialize SceneAnalyzer without class_names (will be set later)
                 self.scene_analyzer = SceneAnalyzer(
-                    class_names=None,
                     use_llm=self.use_llm,
                     use_clip=True,
                     enable_landmark=True,
@@ -365,9 +385,14 @@ class ImageProcessor:
             else:
                 # Update existing scene analyzer with current settings
                 if result and hasattr(result, 'names'):
-                    self.scene_analyzer.class_names = result.names
                     if hasattr(self.scene_analyzer, 'spatial_analyzer') and self.scene_analyzer.spatial_analyzer:
-                        self.scene_analyzer.spatial_analyzer.class_names = result.names
                 self.scene_analyzer.enable_landmark = enable_landmark
                 if hasattr(self.scene_analyzer, 'spatial_analyzer') and self.scene_analyzer.spatial_analyzer:

             self.enable_places365 = enable_places365
             self.model_instances = {}
+            self.coco_class_names = {
+                0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane',
+                5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light',
+                10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench',
+                14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow',
+                20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack',
+                25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee',
+                30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite', 34: 'baseball bat',
+                35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket',
+                39: 'bottle', 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife',
+                44: 'spoon', 45: 'bowl', 46: 'banana', 47: 'apple', 48: 'sandwich',
+                49: 'orange', 50: 'broccoli', 51: 'carrot', 52: 'hot dog', 53: 'pizza',
+                54: 'donut', 55: 'cake', 56: 'chair', 57: 'couch', 58: 'potted plant',
+                59: 'bed', 60: 'dining table', 61: 'toilet', 62: 'tv', 63: 'laptop',
+                64: 'mouse', 65: 'remote', 66: 'keyboard', 67: 'cell phone',
+                68: 'microwave', 69: 'oven', 70: 'toaster', 71: 'sink', 72: 'refrigerator',
+                73: 'book', 74: 'clock', 75: 'vase', 76: 'scissors', 77: 'teddy bear',
+                78: 'hair drier', 79: 'toothbrush'
+            }
             # Initialize ColorMapper
             self.color_mapper = ColorMapper()
             print("ColorMapper initialized successfully")
             # Initialize SceneAnalyzer with error handling
             self.scene_analyzer = None
+            self.class_names = self.coco_class_names
             try:
                 # Initialize SceneAnalyzer without class_names (will be set later)
                 self.scene_analyzer = SceneAnalyzer(
+                    class_names=self.coco_class_names,
                     use_llm=self.use_llm,
                     use_clip=True,
                     enable_landmark=True,
             else:
                 # Update existing scene analyzer with current settings
                 if result and hasattr(result, 'names'):
+                    # 使用檢測結果的類別名稱或回退到預定義映射
+                    current_class_names = result.names if result.names else self.coco_class_names
+                    self.scene_analyzer.class_names = current_class_names
                     if hasattr(self.scene_analyzer, 'spatial_analyzer') and self.scene_analyzer.spatial_analyzer:
+                        self.scene_analyzer.spatial_analyzer.update_class_names(current_class_names)
+                    logger.info(f"Updated class names in scene analyzer: {list(current_class_names.keys())}")
                 self.scene_analyzer.enable_landmark = enable_landmark
                 if hasattr(self.scene_analyzer, 'spatial_analyzer') and self.scene_analyzer.spatial_analyzer:

indoor_outdoor_classifier.py ADDED Viewed

	@@ -0,0 +1,755 @@

+import numpy as np
+import logging
+import traceback
+from typing import Dict, Any, Optional, List
+from configuration_manager import ConfigurationManager
+class IndoorOutdoorClassifier:
+    """
+    Classifies scenes as indoor or outdoor based on visual features and Places365 context.
+    Places365 results are folded into the decision to make the judgement more accurate.
+    This class implements sophisticated decision logic that combines multiple evidence sources
+    including visual scene analysis, structural features, and external scene classification
+    data to determine whether a scene is indoor or outdoor.
+    """
+    def __init__(self, config_manager: ConfigurationManager):
+        """
+        Initialize the indoor/outdoor classifier.
+        Args:
+            config_manager: Configuration manager instance for accessing thresholds and weights.
+        """
+        self.config_manager = config_manager
+        self.logger = self._setup_logger()
+        # Internal threshold constants for Places365 confidence levels
+        self.P365_HIGH_CONF_THRESHOLD = 0.65
+        self.P365_MODERATE_CONF_THRESHOLD = 0.4
+        # 以下是絕對室內/室外的基本情況
+        self.DEFINITELY_OUTDOOR_KEYWORDS_P365 = [
+            "street", "road", "highway", "park", "beach", "mountain", "forest", "field",
+            "outdoor", "sky", "coast", "courtyard", "square", "plaza", "bridge",
+            "parking_lot", "playground", "stadium", "construction_site", "river", "ocean",
+            "desert", "garden", "trail", "intersection", "crosswalk", "sidewalk", "pathway",
+            "avenue", "boulevard", "downtown", "city_center", "market_outdoor"
+        ]
+        self.DEFINITELY_INDOOR_KEYWORDS_P365 = [
+            "bedroom", "office", "kitchen", "library", "classroom", "conference_room", "living_room",
+            "bathroom", "hospital", "hotel_room", "cabin", "interior", "museum", "gallery",
+            "mall", "market_indoor", "basement", "corridor", "lobby", "restaurant_indoor",
+            "bar_indoor", "shop_indoor", "gym_indoor"
+        ]
+    def _setup_logger(self) -> logging.Logger:
+        """Set up logger for classification operations."""
+        logger = logging.getLogger(f"{__name__}.IndoorOutdoorClassifier")
+        if not logger.handlers:
+            handler = logging.StreamHandler()
+            formatter = logging.Formatter(
+                '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+            )
+            handler.setFormatter(formatter)
+            logger.addHandler(handler)
+            logger.setLevel(logging.INFO)
+        return logger
+    def classify(self, features: Dict[str, Any], places365_info: Optional[Dict] = None) -> Dict[str, Any]:
+        """
+        Classify scene as indoor or outdoor based on features and Places365 context.
+        Args:
+            features: Dictionary containing extracted image features.
+            places365_info: Optional Places365 classification information.
+        Returns:
+            Dictionary containing classification results including decision, probability,
+            feature contributions, and diagnostic information.
+        """
+        try:
+            self.logger.debug("Starting indoor/outdoor classification")
+            # Initialize classification components
+            visual_score = 0.0
+            feature_contributions = {}
+            diagnostics = {}
+            # Extract Places365 information
+            p365_context = self._extract_places365_context(places365_info, diagnostics)
+            # Compute visual evidence score
+            visual_analysis = self._analyze_visual_evidence(features, diagnostics)
+            visual_score = visual_analysis["visual_score"]
+            feature_contributions.update(visual_analysis["contributions"])
+            # Incorporate Places365 influence
+            p365_analysis = self._analyze_places365_influence(
+                p365_context, visual_analysis.get("strong_sky_signal", False), diagnostics
+            )
+            p365_influence_score = p365_analysis["influence_score"]
+            if abs(p365_influence_score) > 0.01:
+                feature_contributions["places365_influence_score"] = round(p365_influence_score, 2)
+            # Calculate final score and probability
+            final_indoor_score = visual_score + p365_influence_score
+            classification_result = self._compute_final_classification(
+                final_indoor_score, visual_score, p365_influence_score, diagnostics
+            )
+            # Apply Places365 override if conditions are met
+            override_result = self._apply_places365_override(
+                classification_result, p365_context, diagnostics
+            )
+            # Ensure default values for missing contributions
+            self._ensure_default_contributions(feature_contributions)
+            # 最終結果
+            result = {
+                "is_indoor": override_result["is_indoor"],
+                "indoor_probability": override_result["indoor_probability"],
+                "indoor_score_raw": override_result["final_score"],
+                "feature_contributions": feature_contributions,
+                "diagnostics": diagnostics
+            }
+            self.logger.debug(f"Classification complete: indoor={result['is_indoor']}, "
+                            f"probability={result['indoor_probability']:.3f}")
+            return result
+        except Exception as e:
+            self.logger.error(f"Error in indoor/outdoor classification: {str(e)}")
+            self.logger.error(f"Traceback: {traceback.format_exc()}")
+            return self._get_default_classification_result()
+    def _extract_places365_context(self, places365_info: Optional[Dict],
+                                  diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Extract and validate Places365 context information."""
+        context = {
+            "mapped_scene": "unknown",
+            "is_indoor_from_classification": None,
+            "attributes": [],
+            "confidence": 0.0,
+            "is_indoor": None
+        }
+        if places365_info:
+            context["mapped_scene"] = places365_info.get('mapped_scene_type', 'unknown').lower()
+            context["attributes"] = [attr.lower() for attr in places365_info.get('attributes', [])]
+            context["confidence"] = places365_info.get('confidence', 0.0)
+            context["is_indoor_from_classification"] = places365_info.get('is_indoor_from_classification', None)
+            context["is_indoor"] = places365_info.get('is_indoor', None)
+            diagnostics["p365_context_received"] = (
+                f"P365 Scene: {context['mapped_scene']}, P365 SceneConf: {context['confidence']:.2f}, "
+                f"P365 DirectIndoor: {context['is_indoor_from_classification']}, "
+                f"P365 Attrs: {context['attributes']}"
+            )
+        return context
+    def _analyze_visual_evidence(self, features: Dict[str, Any],
+                                diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Analyze visual evidence for indoor/outdoor classification."""
+        visual_score = 0.0
+        contributions = {}
+        strong_sky_signal = False
+        # Sky and openness analysis
+        sky_analysis = self._analyze_sky_evidence(features, diagnostics)
+        visual_score += sky_analysis["score"]
+        if sky_analysis["score"] != 0:
+            contributions["sky_openness_features_visual"] = round(sky_analysis["score"], 2)
+        strong_sky_signal = sky_analysis["strong_signal"]
+        # Enclosure and structural analysis
+        enclosure_analysis = self._analyze_enclosure_evidence(features, strong_sky_signal, diagnostics)
+        visual_score += enclosure_analysis["score"]
+        if enclosure_analysis["score"] != 0:
+            contributions["enclosure_features"] = round(enclosure_analysis["score"], 2)
+        # Brightness uniformity analysis
+        uniformity_analysis = self._analyze_brightness_uniformity(features, strong_sky_signal, diagnostics)
+        visual_score += uniformity_analysis["score"]
+        if uniformity_analysis["score"] != 0:
+            contributions["brightness_uniformity_contribution"] = round(uniformity_analysis["score"], 2)
+        # Light source analysis
+        light_analysis = self._analyze_light_sources(features, strong_sky_signal, diagnostics)
+        visual_score += light_analysis["score"]
+        if light_analysis["score"] != 0:
+            contributions["light_source_features"] = round(light_analysis["score"], 2)
+        # Color atmosphere analysis
+        atmosphere_analysis = self._analyze_color_atmosphere(features, strong_sky_signal, diagnostics)
+        visual_score += atmosphere_analysis["score"]
+        if atmosphere_analysis["score"] != 0:
+            contributions["warm_atmosphere_indoor_visual_contrib"] = round(atmosphere_analysis["score"], 2)
+        # Home environment pattern analysis
+        home_analysis = self._analyze_home_environment_pattern(features, strong_sky_signal, diagnostics)
+        visual_score += home_analysis["score"]
+        if home_analysis["score"] != 0:
+            contributions["home_environment_pattern_visual"] = round(home_analysis["score"], 2)
+        # Aerial street pattern analysis
+        aerial_analysis = self._analyze_aerial_street_pattern(features, strong_sky_signal, contributions, diagnostics)
+        visual_score += aerial_analysis["score"]
+        if aerial_analysis["score"] != 0:
+            contributions["aerial_street_pattern_visual"] = round(aerial_analysis["score"], 2)
+        diagnostics["visual_indoor_score_subtotal"] = round(visual_score, 3)
+        return {
+            "visual_score": visual_score,
+            "contributions": contributions,
+            "strong_sky_signal": strong_sky_signal
+        }
+    def _analyze_sky_evidence(self, features: Dict[str, Any],
+                             diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Analyze sky-related evidence for outdoor classification."""
+        sky_evidence_score = 0.0
+        strong_sky_signal = False
+        # Extract relevant features
+        sky_blue_dominance = features.get("sky_region_blue_dominance", 0.0)
+        sky_brightness_ratio = features.get("sky_region_brightness_ratio", 1.0)
+        texture_complexity = features.get("top_region_texture_complexity", 0.5)
+        openness_top_edge = features.get("openness_top_edge", 0.5)
+        # Get thresholds
+        thresholds = self.config_manager.indoor_outdoor_thresholds
+        weights = self.config_manager.weighting_factors
+        # Strong blue sky signal
+        if sky_blue_dominance > thresholds.sky_blue_dominance_thresh:
+            sky_evidence_score -= weights.sky_blue_dominance_w * sky_blue_dominance
+            diagnostics["sky_detection_reason_visual"] = f"Visual: Strong sky-like blue ({sky_blue_dominance:.2f})"
+            strong_sky_signal = True
+        # Bright top region with low texture
+        elif (sky_brightness_ratio > getattr(thresholds, 'sky_brightness_ratio_strong_thresh', 1.35) and
+              texture_complexity < getattr(thresholds, 'sky_texture_complexity_clear_thresh', 0.25)):
+            outdoor_push = weights.sky_brightness_ratio_w * (sky_brightness_ratio - 1.0)
+            sky_evidence_score -= outdoor_push
+            sky_evidence_score -= weights.sky_texture_w
+            diagnostics["sky_detection_reason_visual"] = (
+                f"Visual: Top brighter (ratio:{sky_brightness_ratio:.2f}) & low texture."
+            )
+            strong_sky_signal = True
+        # High top edge openness
+        elif openness_top_edge > getattr(thresholds, 'openness_top_strong_thresh', 0.80):
+            sky_evidence_score -= weights.openness_top_w * openness_top_edge
+            diagnostics["sky_detection_reason_visual"] = (
+                f"Visual: Very high top edge openness ({openness_top_edge:.2f})."
+            )
+            strong_sky_signal = True
+        # Weak sky signal (cloudy conditions)
+        elif (not strong_sky_signal and
+              texture_complexity < getattr(thresholds, 'sky_texture_complexity_cloudy_thresh', 0.20) and
+              sky_brightness_ratio > getattr(thresholds, 'sky_brightness_ratio_cloudy_thresh', 0.95)):
+            sky_evidence_score -= weights.sky_texture_w * (1.0 - texture_complexity) * 0.5
+            diagnostics["sky_detection_reason_visual"] = (
+                f"Visual: Weak sky signal (low texture, brightish top: {texture_complexity:.2f}), less weight."
+            )
+        if strong_sky_signal:
+            diagnostics["strong_sky_signal_visual_detected"] = True
+        return {
+            "score": sky_evidence_score,
+            "strong_signal": strong_sky_signal
+        }
+    def _analyze_enclosure_evidence(self, features: Dict[str, Any], strong_sky_signal: bool,
+                                   diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Analyze enclosure evidence for indoor classification."""
+        enclosure_score = 0.0
+        # Extract features
+        ceiling_likelihood = features.get("ceiling_likelihood", 0.0)
+        boundary_clarity = features.get("boundary_clarity", 0.0)
+        texture_complexity = features.get("top_region_texture_complexity", 0.5)
+        openness_top_edge = features.get("openness_top_edge", 0.5)
+        # Get configuration
+        thresholds = self.config_manager.indoor_outdoor_thresholds
+        weights = self.config_manager.weighting_factors
+        override_factors = self.config_manager.override_factors
+        # Ceiling likelihood analysis
+        if ceiling_likelihood > thresholds.ceiling_likelihood_thresh:
+            current_ceiling_score = weights.ceiling_likelihood_w * ceiling_likelihood
+            if strong_sky_signal:
+                current_ceiling_score *= override_factors.sky_override_factor_ceiling
+            enclosure_score += current_ceiling_score
+            diagnostics["indoor_reason_ceiling_visual"] = (
+                f"Visual Ceiling: {ceiling_likelihood:.2f}, ScoreCont: {current_ceiling_score:.2f}"
+            )
+        # Boundary clarity analysis
+        if boundary_clarity > thresholds.boundary_clarity_thresh:
+            current_boundary_score = weights.boundary_clarity_w * boundary_clarity
+            if strong_sky_signal:
+                current_boundary_score *= override_factors.sky_override_factor_boundary
+            enclosure_score += current_boundary_score
+            diagnostics["indoor_reason_boundary_visual"] = (
+                f"Visual Boundary: {boundary_clarity:.2f}, ScoreCont: {current_boundary_score:.2f}"
+            )
+        # Complex urban top detection
+        if (not strong_sky_signal and texture_complexity > 0.7 and
+            openness_top_edge < 0.3 and ceiling_likelihood < 0.35):
+            diagnostics["complex_urban_top_visual"] = True
+            if boundary_clarity > 0.5:
+                enclosure_score *= 0.5
+                diagnostics["reduced_enclosure_for_urban_top_visual"] = True
+        return {"score": enclosure_score}
+    def _analyze_brightness_uniformity(self, features: Dict[str, Any], strong_sky_signal: bool,
+                                      diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Analyze brightness uniformity patterns."""
+        uniformity_score = 0.0
+        # Calculate brightness uniformity
+        brightness_std = features.get("brightness_std", 50.0)
+        avg_brightness = features.get("avg_brightness", 100.0)
+        brightness_uniformity = 1.0 - min(1.0, brightness_std / max(avg_brightness, 1e-5))
+        shadow_clarity = features.get("shadow_clarity_score", 0.5)
+        # Get configuration
+        thresholds = self.config_manager.indoor_outdoor_thresholds
+        weights = self.config_manager.weighting_factors
+        override_factors = self.config_manager.override_factors
+        # High uniformity (indoor indicator)
+        if brightness_uniformity > thresholds.brightness_uniformity_thresh_indoor:
+            uniformity_score = weights.brightness_uniformity_w * brightness_uniformity
+            if strong_sky_signal:
+                uniformity_score *= override_factors.sky_override_factor_uniformity
+        # Low uniformity (potential outdoor indicator)
+        elif brightness_uniformity < thresholds.brightness_uniformity_thresh_outdoor:
+            if shadow_clarity > 0.65:
+                uniformity_score = -weights.brightness_non_uniformity_outdoor_w * (1.0 - brightness_uniformity)
+            elif not strong_sky_signal:
+                uniformity_score = weights.brightness_non_uniformity_indoor_penalty_w * (1.0 - brightness_uniformity)
+        return {"score": uniformity_score}
+    def _analyze_light_sources(self, features: Dict[str, Any], strong_sky_signal: bool,
+                              diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Analyze artificial light source patterns."""
+        light_score = 0.0
+        # Extract light features
+        indoor_light_score = features.get("indoor_light_score", 0.0)
+        circular_light_count = features.get("circular_light_count", 0)
+        bright_spot_count = features.get("bright_spot_count", 0)
+        avg_brightness = features.get("avg_brightness", 100.0)
+        gradient_ratio = features.get("gradient_ratio_vertical_horizontal", 1.0)
+        edges_density = features.get("edges_density", 0.0)
+        # Get configuration
+        thresholds = self.config_manager.indoor_outdoor_thresholds
+        weights = self.config_manager.weighting_factors
+        override_factors = self.config_manager.override_factors
+        # Circular lights detection
+        if circular_light_count >= 1 and not strong_sky_signal:
+            light_score += weights.circular_lights_w * circular_light_count
+        # Indoor light score
+        elif indoor_light_score > 0.55 and not strong_sky_signal:
+            light_score += weights.indoor_light_score_w * indoor_light_score
+        # Many bright spots in dim scenes
+        elif (bright_spot_count > thresholds.many_bright_spots_thresh and
+              avg_brightness < thresholds.dim_scene_for_spots_thresh and
+              not strong_sky_signal):
+            light_score += weights.many_bright_spots_indoor_w * min(bright_spot_count / 10.0, 1.5)
+        # Street structure detection
+        is_likely_street_structure = (0.7 < gradient_ratio < 1.5) and edges_density > 0.15
+        if is_likely_street_structure and bright_spot_count > 3 and not strong_sky_signal:
+            light_score *= 0.2
+            diagnostics["street_lights_heuristic_visual"] = True
+        elif strong_sky_signal:
+            light_score *= override_factors.sky_override_factor_lights
+        return {"score": light_score}
+    def _analyze_color_atmosphere(self, features: Dict[str, Any], strong_sky_signal: bool,
+                                 diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Analyze color atmosphere patterns."""
+        atmosphere_score = 0.0
+        # Extract features
+        color_atmosphere = features.get("color_atmosphere", "neutral")
+        avg_brightness = features.get("avg_brightness", 100.0)
+        avg_saturation = features.get("avg_saturation", 100.0)
+        gradient_ratio = features.get("gradient_ratio_vertical_horizontal", 1.0)
+        edges_density = features.get("edges_density", 0.0)
+        indoor_light_score = features.get("indoor_light_score", 0.0)
+        # Get configuration
+        thresholds = self.config_manager.indoor_outdoor_thresholds
+        weights = self.config_manager.weighting_factors
+        # Warm atmosphere analysis
+        if (color_atmosphere == "warm" and
+            avg_brightness < thresholds.warm_indoor_max_brightness_thresh):
+            # Check exclusion conditions
+            is_likely_street_structure = (0.7 < gradient_ratio < 1.5) and edges_density > 0.15
+            is_complex_urban_top = diagnostics.get("complex_urban_top_visual", False)
+            if (not strong_sky_signal and not is_complex_urban_top and
+                not (is_likely_street_structure and avg_brightness > 80) and
+                avg_saturation < 160):
+                if indoor_light_score > 0.05:
+                    atmosphere_score = weights.warm_atmosphere_indoor_w
+        return {"score": atmosphere_score}
+    def _analyze_home_environment_pattern(self, features: Dict[str, Any], strong_sky_signal: bool,
+                                         diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Analyze home/residential environment patterns."""
+        home_score = 0.0
+        if strong_sky_signal:
+            diagnostics["skipped_home_env_visual_due_to_sky"] = True
+            return {"score": 0.0}
+        # Calculate bedroom/home indicators
+        bedroom_indicators = 0.0
+        brightness_uniformity = features.get("brightness_uniformity", 0.0)
+        boundary_clarity = features.get("boundary_clarity", 0.0)
+        ceiling_likelihood = features.get("ceiling_likelihood", 0.0)
+        bright_spot_count = features.get("bright_spot_count", 0)
+        circular_light_count = features.get("circular_light_count", 0)
+        warm_ratio = features.get("warm_ratio", 0.0)
+        avg_saturation = features.get("avg_saturation", 100.0)
+        # Accumulate indicators
+        if brightness_uniformity > 0.65 and boundary_clarity > 0.40:
+            bedroom_indicators += 1.1
+        if ceiling_likelihood > 0.35 and (bright_spot_count > 0 or circular_light_count > 0):
+            bedroom_indicators += 1.1
+        if warm_ratio > 0.55 and brightness_uniformity > 0.65:
+            bedroom_indicators += 1.0
+        if brightness_uniformity > 0.70 and avg_saturation < 60:
+            bedroom_indicators += 0.7
+        # Get configuration
+        thresholds = self.config_manager.indoor_outdoor_thresholds
+        weights = self.config_manager.weighting_factors
+        # Apply scoring based on indicator strength
+        if bedroom_indicators >= thresholds.home_pattern_thresh_strong:
+            home_score = weights.home_env_strong_w
+        elif bedroom_indicators >= thresholds.home_pattern_thresh_moderate:
+            home_score = weights.home_env_moderate_w
+        if bedroom_indicators > 0:
+            diagnostics["home_environment_pattern_visual_indicators"] = round(bedroom_indicators, 1)
+        return {"score": home_score}
+    def _analyze_aerial_street_pattern(self, features: Dict[str, Any], strong_sky_signal: bool,
+                                      contributions: Dict[str, float],
+                                      diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Analyze aerial view street patterns."""
+        aerial_score = 0.0
+        # Extract features
+        sky_brightness_ratio = features.get("sky_region_brightness_ratio", 1.0)
+        texture_complexity = features.get("top_region_texture_complexity", 0.5)
+        avg_brightness = features.get("avg_brightness", 100.0)
+        # Get configuration
+        thresholds = self.config_manager.indoor_outdoor_thresholds
+        weights = self.config_manager.weighting_factors
+        # Aerial street pattern detection
+        if (sky_brightness_ratio < thresholds.aerial_top_dark_ratio_thresh and
+            texture_complexity > thresholds.aerial_top_complex_thresh and
+            avg_brightness > thresholds.aerial_min_avg_brightness_thresh and
+            not strong_sky_signal):
+            aerial_score = -weights.aerial_street_w
+            diagnostics["aerial_street_pattern_visual_detected"] = True
+            # Reduce enclosure features if aerial pattern detected
+            if ("enclosure_features" in contributions and
+                contributions["enclosure_features"] > 0):
+                reduction_factor = self.config_manager.override_factors.aerial_enclosure_reduction_factor
+                positive_enclosure_score = max(0, contributions["enclosure_features"])
+                reduction_amount = positive_enclosure_score * reduction_factor
+                contributions["enclosure_features_reduced_by_aerial"] = round(-reduction_amount, 2)
+                contributions["enclosure_features"] = round(
+                    contributions["enclosure_features"] - reduction_amount, 2
+                )
+        return {"score": aerial_score}
+    def _analyze_places365_influence(self, p365_context: Dict[str, Any],
+                                    strong_sky_signal: bool,
+                                    diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Analyze Places365 influence on classification."""
+        p365_influence_score = 0.0
+        if not p365_context or p365_context["confidence"] < self.P365_MODERATE_CONF_THRESHOLD:
+            return {"influence_score": 0.0}
+        # Places365 direct classification influence
+        if p365_context["is_indoor_from_classification"] is not None:
+            p365_influence_score += self._compute_direct_classification_influence(
+                p365_context, strong_sky_signal, diagnostics
+            )
+        # Places365 scene context influence
+        elif p365_context["confidence"] >= self.P365_MODERATE_CONF_THRESHOLD:
+            p365_influence_score += self._compute_scene_context_influence(
+                p365_context, strong_sky_signal, diagnostics
+            )
+        # Places365 attributes influence
+        if p365_context["attributes"] and p365_context["confidence"] > 0.5:
+            p365_influence_score += self._compute_attributes_influence(
+                p365_context, strong_sky_signal, diagnostics
+            )
+        # High confidence street scene boost
+        if (p365_context["confidence"] >= 0.85 and
+            any(kw in p365_context["mapped_scene"] for kw in ["intersection", "crosswalk", "street", "road"])):
+            additional_outdoor_push = -3.0 * p365_context["confidence"]
+            p365_influence_score += additional_outdoor_push
+            diagnostics["p365_street_scene_boost"] = (
+                f"Additional outdoor push: {additional_outdoor_push:.2f} for street scene: "
+                f"{p365_context['mapped_scene']}"
+            )
+            self.logger.debug(f"High confidence street scene detected - "
+                            f"{p365_context['mapped_scene']} with confidence {p365_context['confidence']:.3f}")
+        return {"influence_score": p365_influence_score}
+    def _compute_direct_classification_influence(self, p365_context: Dict[str, Any],
+                                               strong_sky_signal: bool,
+                                               diagnostics: Dict[str, Any]) -> float:
+        """Compute influence from Places365 direct indoor/outdoor classification."""
+        P365_DIRECT_INDOOR_WEIGHT = 3.5
+        P365_DIRECT_OUTDOOR_WEIGHT = 4.0
+        confidence = p365_context["confidence"]
+        is_indoor = p365_context["is_indoor_from_classification"]
+        mapped_scene = p365_context["mapped_scene"]
+        if is_indoor is True:
+            current_contrib = P365_DIRECT_INDOOR_WEIGHT * confidence
+            diagnostics["p365_influence_source"] = (
+                f"P365_DirectIndoor(True,Conf:{confidence:.2f},Scene:{mapped_scene})"
+            )
+        else:
+            current_contrib = -P365_DIRECT_OUTDOOR_WEIGHT * confidence
+            diagnostics["p365_influence_source"] = (
+                f"P365_DirectIndoor(False,Conf:{confidence:.2f},Scene:{mapped_scene})"
+            )
+        # Apply sky override for indoor predictions
+        if strong_sky_signal and current_contrib > 0:
+            sky_override_factor = self.config_manager.override_factors.sky_override_factor_p365_indoor_decision
+            current_contrib *= sky_override_factor
+            diagnostics["p365_indoor_push_reduced_by_visual_sky"] = f"Reduced to {current_contrib:.2f}"
+        return current_contrib
+    def _compute_scene_context_influence(self, p365_context: Dict[str, Any],
+                                        strong_sky_signal: bool,
+                                        diagnostics: Dict[str, Any]) -> float:
+        """Compute influence from Places365 scene context."""
+        P365_SCENE_CONTEXT_INDOOR_WEIGHT = 2.0
+        P365_SCENE_CONTEXT_OUTDOOR_WEIGHT = 2.5
+        confidence = p365_context["confidence"]
+        mapped_scene = p365_context["mapped_scene"]
+        is_def_indoor = any(kw in mapped_scene for kw in self.DEFINITELY_INDOOR_KEYWORDS_P365)
+        is_def_outdoor = any(kw in mapped_scene for kw in self.DEFINITELY_OUTDOOR_KEYWORDS_P365)
+        current_contrib = 0.0
+        if is_def_indoor and not is_def_outdoor:
+            current_contrib = P365_SCENE_CONTEXT_INDOOR_WEIGHT * confidence
+            diagnostics["p365_influence_source"] = (
+                f"P365_SceneContext(Indoor: {mapped_scene}, Conf:{confidence:.2f})"
+            )
+        elif is_def_outdoor and not is_def_indoor:
+            current_contrib = -P365_SCENE_CONTEXT_OUTDOOR_WEIGHT * confidence
+            diagnostics["p365_influence_source"] = (
+                f"P365_SceneContext(Outdoor: {mapped_scene}, Conf:{confidence:.2f})"
+            )
+        # Apply sky override for indoor predictions
+        if strong_sky_signal and current_contrib > 0:
+            sky_override_factor = self.config_manager.override_factors.sky_override_factor_p365_indoor_decision
+            current_contrib *= sky_override_factor
+            diagnostics["p365_context_indoor_push_reduced_by_visual_sky"] = f"Reduced to {current_contrib:.2f}"
+        return current_contrib
+    def _compute_attributes_influence(self, p365_context: Dict[str, Any],
+                                     strong_sky_signal: bool,
+                                     diagnostics: Dict[str, Any]) -> float:
+        """Compute influence from Places365 attributes."""
+        P365_ATTRIBUTE_INDOOR_WEIGHT = 1.0
+        P365_ATTRIBUTE_OUTDOOR_WEIGHT = 1.5
+        confidence = p365_context["confidence"]
+        attributes = p365_context["attributes"]
+        attr_contrib = 0.0
+        if "indoor" in attributes and "outdoor" not in attributes:
+            attr_contrib += P365_ATTRIBUTE_INDOOR_WEIGHT * (confidence * 0.5)
+            diagnostics["p365_attr_influence"] = f"+{attr_contrib:.2f} (indoor attr)"
+        elif "outdoor" in attributes and "indoor" not in attributes:
+            attr_contrib -= P365_ATTRIBUTE_OUTDOOR_WEIGHT * (confidence * 0.5)
+            diagnostics["p365_attr_influence"] = f"{attr_contrib:.2f} (outdoor attr)"
+        # Apply sky override for indoor attributes
+        if strong_sky_signal and attr_contrib > 0:
+            sky_override_factor = self.config_manager.override_factors.sky_override_factor_p365_indoor_decision
+            attr_contrib *= sky_override_factor
+        return attr_contrib
+    def _compute_final_classification(self, final_indoor_score: float, visual_score: float,
+                                     p365_influence_score: float, diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Compute final classification probability and decision."""
+        # Record score breakdown
+        diagnostics["final_indoor_score_value"] = round(final_indoor_score, 3)
+        diagnostics["final_score_breakdown"] = (
+            f"VisualScore: {visual_score:.2f}, P365Influence: {p365_influence_score:.2f}"
+        )
+        # Apply sigmoid transformation
+        sigmoid_scale = self.config_manager.algorithm_parameters.indoor_score_sigmoid_scale
+        indoor_probability = 1 / (1 + np.exp(-final_indoor_score * sigmoid_scale))
+        # Make decision
+        decision_threshold = self.config_manager.algorithm_parameters.indoor_decision_threshold
+        is_indoor = indoor_probability > decision_threshold
+        return {
+            "is_indoor": is_indoor,
+            "indoor_probability": indoor_probability,
+            "final_score": final_indoor_score
+        }
+    def _apply_places365_override(self, classification_result: Dict[str, Any],
+                                 p365_context: Dict[str, Any],
+                                 diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Apply Places365 high-confidence override if conditions are met."""
+        is_indoor = classification_result["is_indoor"]
+        indoor_probability = classification_result["indoor_probability"]
+        final_score = classification_result["final_score"]
+        # Check for override conditions
+        if not p365_context or p365_context["confidence"] < 0.5:
+            diagnostics["final_indoor_probability_calculated"] = round(indoor_probability, 3)
+            diagnostics["final_is_indoor_decision"] = bool(is_indoor)
+            return classification_result
+        p365_is_indoor_decision = p365_context.get("is_indoor", None)
+        confidence = p365_context["confidence"]
+        self.logger.debug(f"Override check: is_indoor={is_indoor}, p365_conf={confidence}, "
+                         f"p365_raw_is_indoor={p365_is_indoor_decision}")
+        # Apply override for high confidence Places365 decisions
+        if p365_is_indoor_decision is not None:
+            if p365_is_indoor_decision == False:
+                self.logger.debug(f"Applying outdoor override. Original: {is_indoor}")
+                original_decision = f"Indoor:{is_indoor}, Prob:{indoor_probability:.3f}, Score:{final_score:.2f}"
+                is_indoor = False
+                indoor_probability = 0.02
+                final_score = -8.0
+                diagnostics["p365_force_override_applied"] = (
+                    f"P365 FORCED OUTDOOR (is_indoor: {p365_is_indoor_decision}, Conf: {confidence:.3f})"
+                )
+                diagnostics["p365_override_original_decision"] = original_decision
+                self.logger.info(f"Places365 FORCED OUTDOOR override applied. New is_indoor: {is_indoor}")
+            elif p365_is_indoor_decision == True:
+                self.logger.debug(f"Applying indoor override. Original: {is_indoor}")
+                original_decision = f"Indoor:{is_indoor}, Prob:{indoor_probability:.3f}, Score:{final_score:.2f}"
+                is_indoor = True
+                indoor_probability = 0.98
+                final_score = 8.0
+                diagnostics["p365_force_override_applied"] = (
+                    f"P365 FORCED INDOOR (is_indoor: {p365_is_indoor_decision}, Conf: {confidence:.3f})"
+                )
+                diagnostics["p365_override_original_decision"] = original_decision
+                self.logger.info(f"Places365 FORCED INDOOR override applied. New is_indoor: {is_indoor}")
+        # Record final values
+        diagnostics["final_indoor_probability_calculated"] = round(indoor_probability, 3)
+        diagnostics["final_is_indoor_decision"] = bool(is_indoor)
+        self.logger.debug(f"Final classification: is_indoor={is_indoor}, score={final_score}, prob={indoor_probability}")
+        return {
+            "is_indoor": is_indoor,
+            "indoor_probability": indoor_probability,
+            "final_score": final_score
+        }
+    def _ensure_default_contributions(self, feature_contributions: Dict[str, float]) -> None:
+        """Ensure all expected feature contribution keys have default values."""
+        default_keys = [
+            "sky_openness_features", "enclosure_features",
+            "brightness_uniformity_contribution", "light_source_features"
+        ]
+        for key in default_keys:
+            if key not in feature_contributions:
+                feature_contributions[key] = 0.0
+    def _get_default_classification_result(self) -> Dict[str, Any]:
+        """Return default classification result in case of errors."""
+        return {
+            "is_indoor": False,
+            "indoor_probability": 0.5,
+            "indoor_score_raw": 0.0,
+            "feature_contributions": {
+                "sky_openness_features": 0.0,
+                "enclosure_features": 0.0,
+                "brightness_uniformity_contribution": 0.0,
+                "light_source_features": 0.0
+            },
+            "diagnostics": {
+                "error": "Classification failed, using default values"
+            }
+        }

landmark_data_manager.py ADDED Viewed

	@@ -0,0 +1,283 @@

+import logging
+import traceback
+from typing import List, Dict, Tuple, Optional, Union, Any
+from landmark_data import ALL_LANDMARKS, get_all_landmark_prompts
+from landmark_activities import LANDMARK_ACTIVITIES
+class LandmarkDataManager:
+    """
+    專門處理地標數據的載入、管理和查詢功能，包括地標信息、提示詞和活動建議
+    """
+    def __init__(self):
+        """
+        initialize landmark related
+        """
+        self.logger = logging.getLogger(__name__)
+        self.landmark_data = {}
+        self.landmark_prompts = []
+        self.landmark_id_to_index = {}
+        self.is_enabled = False
+        self._load_landmark_data()
+    def _load_landmark_data(self):
+        """
+        載入地標數據和相關資訊
+        """
+        try:
+            self.landmark_data = ALL_LANDMARKS
+            self.landmark_prompts = get_all_landmark_prompts()
+            self.logger.info(f"Loaded {len(self.landmark_prompts)} landmark prompts for classification")
+            # 創建地標ID到索引的映射，可快速查找
+            self.landmark_id_to_index = {landmark_id: i for i, landmark_id in enumerate(ALL_LANDMARKS.keys())}
+            self.is_enabled = True
+            self.logger.info(f"Successfully loaded landmark data with {len(self.landmark_data)} landmarks")
+        except ImportError:
+            self.logger.warning("landmark_data.py not found. Landmark classification will be limited")
+            self.landmark_data = {}
+            self.landmark_prompts = []
+            self.landmark_id_to_index = {}
+            self.is_enabled = False
+        except Exception as e:
+            self.logger.error(f"Error loading landmark data: {e}")
+            self.logger.error(traceback.format_exc())
+            self.landmark_data = {}
+            self.landmark_prompts = []
+            self.landmark_id_to_index = {}
+            self.is_enabled = False
+    def get_landmark_prompts(self) -> List[str]:
+        """
+        獲取所有地標提示詞
+        Returns:
+            List[str]: 地標提示詞列表
+        """
+        return self.landmark_prompts
+    def get_landmark_by_id(self, landmark_id: str) -> Dict[str, Any]:
+        """
+        根據地標ID獲取地標信息
+        Args:
+            landmark_id: Landmark ID
+        Returns:
+            Dict[str, Any]: 地標詳細信息
+        """
+        return self.landmark_data.get(landmark_id, {})
+    def get_landmark_by_index(self, index: int) -> Tuple[str, Dict[str, Any]]:
+        """
+        根據索引獲取地標信息
+        Args:
+            index: 地標在列表中的索引
+        Returns:
+            Tuple[str, Dict[str, Any]]: (地標ID, 地標info)
+        """
+        try:
+            landmark_ids = list(self.landmark_data.keys())
+            if 0 <= index < len(landmark_ids):
+                landmark_id = landmark_ids[index]
+                return landmark_id, self.landmark_data[landmark_id]
+            else:
+                self.logger.warning(f"Index {index} out of range for landmark data")
+                return None, {}
+        except Exception as e:
+            self.logger.error(f"Error getting landmark by index {index}: {e}")
+            self.logger.error(traceback.format_exc())
+            return None, {}
+    def get_landmark_index(self, landmark_id: str) -> Optional[int]:
+        """
+        獲取地標ID對應的index
+        Args:
+            landmark_id: 地標ID
+        Returns:
+            Optional[int]: 索引，如果不存在則返回None
+        """
+        return self.landmark_id_to_index.get(landmark_id)
+    def determine_landmark_type(self, landmark_id: str) -> str:
+        """
+        自動判斷地標類型，基於地標數據和命名
+        Args:
+            landmark_id: 地標ID
+        Returns:
+            str: 地標類型，用於調整閾值
+        """
+        if not landmark_id:
+            return "building"  # 預設類型
+        try:
+            # 獲取地標詳細數據
+            landmark_info = self.landmark_data.get(landmark_id, {})
+            # 獲取地標相關文本
+            landmark_id_lower = landmark_id.lower()
+            landmark_name = landmark_info.get("name", "").lower()
+            landmark_location = landmark_info.get("location", "").lower()
+            landmark_aliases = [alias.lower() for alias in landmark_info.get("aliases", [])]
+            # 合併所有文本數據用於特徵判斷
+            combined_text = " ".join([landmark_id_lower, landmark_name] + landmark_aliases)
+            # 地標類型的特色特徵
+            type_features = {
+                "skyscraper": ["skyscraper", "tall", "tower", "高樓", "摩天", "大厦", "タワー"],
+                "tower": ["tower", "bell", "clock", "塔", "鐘樓", "タワー", "campanile"],
+                "monument": ["monument", "memorial", "statue", "紀念", "雕像", "像", "memorial"],
+                "natural": ["mountain", "lake", "canyon", "falls", "beach", "山", "湖", "峽谷", "瀑布", "海灘"],
+                "temple": ["temple", "shrine", "寺", "神社", "廟"],
+                "palace": ["palace", "castle", "宮", "城", "皇宮", "宫殿"],
+                "distinctive": ["unique", "leaning", "slanted", "傾斜", "斜", "獨特", "傾く"]
+            }
+            # 檢查是否位於亞洲地區
+            asian_regions = ["china", "japan", "korea", "taiwan", "singapore", "vietnam", "thailand",
+                            "hong kong", "中國", "日本", "韓國", "台灣", "新加坡", "越南", "泰國", "香港"]
+            is_asian = any(region in landmark_location for region in asian_regions)
+            # 判斷地標類型
+            best_type = None
+            max_matches = 0
+            for type_name, features in type_features.items():
+                # 計算特徵詞匹配數量
+                matches = sum(1 for feature in features if feature in combined_text)
+                if matches > max_matches:
+                    max_matches = matches
+                    best_type = type_name
+            # 處理亞洲地區特例
+            if is_asian and best_type == "tower":
+                best_type = "skyscraper"  # 亞洲地區的塔型建築閾值較低
+            # 特例處理：檢測傾斜建築
+            if any(term in combined_text for term in ["leaning", "slanted", "tilt", "inclined", "斜", "傾斜"]):
+                return "distinctive"  # 傾斜建築需要特殊處理
+            return best_type if best_type and max_matches > 0 else "building"  # 預設為一般建築
+        except Exception as e:
+            self.logger.error(f"Error determining landmark type for {landmark_id}: {e}")
+            self.logger.error(traceback.format_exc())
+            return "building"
+    def extract_landmark_specific_info(self, landmark_id: str) -> Dict[str, Any]:
+        """
+        提取特定地標的詳細信息，包括特色模板和活動建議
+        Args:
+            landmark_id: 地標ID
+        Returns:
+            Dict[str, Any]: 地標特定信息
+        """
+        if not landmark_id or landmark_id == "unknown":
+            return {"has_specific_activities": False}
+        specific_info = {"has_specific_activities": False}
+        try:
+            # 從 landmark_data 中提取基本信息
+            landmark_data_source = self.landmark_data.get(landmark_id)
+            # 處理地標基本數據
+            if landmark_data_source:
+                # 提取正確的地標名稱
+                if "name" in landmark_data_source:
+                    specific_info["landmark_name"] = landmark_data_source["name"]
+                # 提取所有可用的 prompts 作為特色模板
+                if "prompts" in landmark_data_source:
+                    specific_info["feature_templates"] = landmark_data_source["prompts"][:5]
+                    specific_info["primary_template"] = landmark_data_source["prompts"][0]
+                # 提取別名info
+                if "aliases" in landmark_data_source:
+                    specific_info["aliases"] = landmark_data_source["aliases"]
+                # 提取位置信息
+                if "location" in landmark_data_source:
+                    specific_info["location"] = landmark_data_source["location"]
+                # 提取其他相關信息
+                for key in ["year_built", "architectural_style", "significance", "description"]:
+                    if key in landmark_data_source:
+                        specific_info[key] = landmark_data_source[key]
+            # 嘗試從 LANDMARK_ACTIVITIES 中提取活動建議
+            try:
+                if landmark_id in LANDMARK_ACTIVITIES:
+                    activities = LANDMARK_ACTIVITIES[landmark_id]
+                    specific_info["landmark_specific_activities"] = activities
+                    specific_info["has_specific_activities"] = True
+                    self.logger.info(f"Found {len(activities)} specific activities for landmark {landmark_id}")
+                else:
+                    self.logger.info(f"No specific activities found for landmark {landmark_id} in LANDMARK_ACTIVITIES")
+                    specific_info["has_specific_activities"] = False
+            except ImportError:
+                self.logger.warning("Could not import LANDMARK_ACTIVITIES from landmark_activities")
+                specific_info["has_specific_activities"] = False
+            except Exception as e:
+                self.logger.error(f"Error loading landmark activities for {landmark_id}: {e}")
+                self.logger.error(traceback.format_exc())
+                specific_info["has_specific_activities"] = False
+        except Exception as e:
+            self.logger.error(f"Error extracting landmark specific info for {landmark_id}: {e}")
+            self.logger.error(traceback.format_exc())
+        return specific_info
+    def get_landmark_count(self) -> int:
+        """
+        獲取地標總數
+        Returns:
+            int: 地標數量
+        """
+        return len(self.landmark_data)
+    def is_landmark_enabled(self) -> bool:
+        """
+        檢查地標功能是否啟用
+        Returns:
+            bool: 地標功能狀態
+        """
+        return self.is_enabled
+    def get_all_landmark_ids(self) -> List[str]:
+        """
+        獲取所有地標ID列表
+        Returns:
+            List[str]: 地標ID列表
+        """
+        return list(self.landmark_data.keys())
+    def validate_landmark_id(self, landmark_id: str) -> bool:
+        """
+        驗證地標ID是否有效
+        Args:
+            landmark_id: 要驗證的地標ID
+        Returns:
+            bool: ID是否有效
+        """
+        return landmark_id in self.landmark_data

landmark_processing_manager.py ADDED Viewed

	@@ -0,0 +1,512 @@

+import re
+import logging
+import traceback
+import numpy as np
+from typing import Dict, List, Tuple, Optional, Any
+from PIL import Image
+from clip_zero_shot_classifier import CLIPZeroShotClassifier
+from landmark_activities import LANDMARK_ACTIVITIES
+from landmark_data import ALL_LANDMARKS
+class LandmarkProcessingManager:
+    """
+    負責處理所有地標相關的檢測和處理邏輯，包括未知物體的地標識別、
+    地標物體的創建和驗證，以及地標引用的清理。
+    """
+    def __init__(self, enable_landmark: bool = True, use_clip: bool = True):
+        """
+        初始化地標處理管理器。
+        Args:
+            enable_landmark: 是否啟用地標檢測功能
+            use_clip: 是否啟用 CLIP 分析功能
+        """
+        self.logger = logging.getLogger(__name__)
+        self.enable_landmark = enable_landmark
+        self.use_clip = use_clip
+        # 載入地標相關數據
+        self.landmark_activities = {}
+        self.all_landmarks = {}
+        self._load_landmark_data()
+        # 地標分類器將按需初始化
+        self.landmark_classifier = None
+    def _load_landmark_data(self):
+        """載入地標相關的數據結構。"""
+        try:
+            self.landmark_activities = LANDMARK_ACTIVITIES
+            self.logger.info("Loaded LANDMARK_ACTIVITIES successfully")
+        except ImportError as e:
+            self.logger.warning(f"Failed to load LANDMARK_ACTIVITIES: {e}")
+            self.landmark_activities = {}
+        try:
+            self.all_landmarks = ALL_LANDMARKS
+            self.logger.info("Loaded ALL_LANDMARKS successfully")
+        except ImportError as e:
+            self.logger.warning(f"Failed to load ALL_LANDMARKS: {e}")
+            self.all_landmarks = {}
+    def set_landmark_classifier(self, landmark_classifier):
+        """
+        設置地標分類器實例。
+        Args:
+            landmark_classifier: CLIPZeroShotClassifier 實例
+        """
+        self.landmark_classifier = landmark_classifier
+    def process_unknown_objects(self, detection_result, detected_objects, clip_analyzer=None):
+        """
+        對 YOLO 未能識別或信心度低的物體進行地標檢測。
+        Args:
+            detection_result: YOLO 檢測結果
+            detected_objects: 已識別的物體列表
+            clip_analyzer: CLIP 分析器實例（用於按需初始化地標分類器）
+        Returns:
+            tuple: (更新後的物體列表, 地標物體列表)
+        """
+        if (not self.enable_landmark or not self.use_clip or
+            not hasattr(self, 'use_landmark_detection') or not self.use_landmark_detection):
+            # 未啟用地標識別時，確保返回的物體列表中不包含任何地標物體
+            cleaned_objects = [obj for obj in detected_objects if not obj.get("is_landmark", False)]
+            return cleaned_objects, []
+        try:
+            # 獲取原始圖像
+            original_image = None
+            if detection_result is not None and hasattr(detection_result, 'orig_img'):
+                original_image = detection_result.orig_img
+            # 檢查原始圖像是否存在
+            if original_image is None:
+                self.logger.warning("Original image not available for landmark detection")
+                return detected_objects, []
+            # 確保原始圖像為 PIL 格式或可轉換為 PIL 格式
+            if not isinstance(original_image, Image.Image):
+                if isinstance(original_image, np.ndarray):
+                    try:
+                        if original_image.ndim == 3 and original_image.shape[2] == 4:  # RGBA
+                            original_image = original_image[:, :, :3]  # 轉換為 RGB
+                        if original_image.ndim == 2:  # 灰度圖
+                            original_image = Image.fromarray(original_image).convert("RGB")
+                        else:  # 假設為 RGB 或 BGR
+                            original_image = Image.fromarray(original_image)
+                        if hasattr(original_image, 'mode') and original_image.mode == 'BGR':  # 從 OpenCV 明確將 BGR 轉換為 RGB
+                            original_image = original_image.convert('RGB')
+                    except Exception as e:
+                        self.logger.warning(f"Error converting image for landmark detection: {e}")
+                        return detected_objects, []
+                else:
+                    self.logger.warning(f"Cannot process image of type {type(original_image)}")
+                    return detected_objects, []
+            # 獲取圖像維度
+            if isinstance(original_image, np.ndarray):
+                h, w = original_image.shape[:2]
+            elif isinstance(original_image, Image.Image):
+                w, h = original_image.size
+            else:
+                self.logger.warning(f"Unable to determine image dimensions for type {type(original_image)}")
+                return detected_objects, []
+            # 收集可能含有地標的區域
+            candidate_boxes = []
+            low_conf_boxes = []
+            # 即使沒有 YOLO 檢測到的物體，也嘗試進行更詳細的地標分析
+            if len(detected_objects) == 0:
+                # 創建一個包含整個圖像的框
+                full_image_box = [0, 0, w, h]
+                low_conf_boxes.append(full_image_box)
+                candidate_boxes.append((full_image_box, "full_image"))
+                # 加入網格分析以增加檢測成功率
+                grid_size = 2  # 2x2 網格
+                for i in range(grid_size):
+                    for j in range(grid_size):
+                        # 創建網格框
+                        grid_box = [
+                            j * w / grid_size,
+                            i * h / grid_size,
+                            (j + 1) * w / grid_size,
+                            (i + 1) * h / grid_size
+                        ]
+                        low_conf_boxes.append(grid_box)
+                        candidate_boxes.append((grid_box, "grid"))
+                # 創建更大的中心框（覆蓋中心 70% 區域）
+                center_box = [
+                    w * 0.15, h * 0.15,
+                    w * 0.85, h * 0.85
+                ]
+                low_conf_boxes.append(center_box)
+                candidate_boxes.append((center_box, "center"))
+                self.logger.info("No YOLO detections, attempting detailed landmark analysis with multiple regions")
+            else:
+                try:
+                    # 獲取原始 YOLO 檢測結果中的低置信度物體
+                    if (hasattr(detection_result, 'boxes') and
+                        hasattr(detection_result.boxes, 'xyxy') and
+                        hasattr(detection_result.boxes, 'conf') and
+                        hasattr(detection_result.boxes, 'cls')):
+                        all_boxes = (detection_result.boxes.xyxy.cpu().numpy()
+                                   if hasattr(detection_result.boxes.xyxy, 'cpu')
+                                   else detection_result.boxes.xyxy)
+                        all_confs = (detection_result.boxes.conf.cpu().numpy()
+                                   if hasattr(detection_result.boxes.conf, 'cpu')
+                                   else detection_result.boxes.conf)
+                        all_cls = (detection_result.boxes.cls.cpu().numpy()
+                                 if hasattr(detection_result.boxes.cls, 'cpu')
+                                 else detection_result.boxes.cls)
+                        # 收集低置信度區域和可能含有地標的區域（如建築物）
+                        for i, (box, conf, cls) in enumerate(zip(all_boxes, all_confs, all_cls)):
+                            is_low_conf = conf < 0.4 and conf > 0.1
+                            # 根據物體類別 ID 識別建築物 - 使用通用分類
+                            common_building_classes = [11, 12, 13, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65]  # 常見建築類別 ID
+                            is_building = int(cls) in common_building_classes
+                            # 計算相對面積 - 大物體
+                            is_large_object = (box[2] - box[0]) * (box[3] - box[1]) > (0.1 * w * h)
+                            if is_low_conf or is_building:
+                                # 確保 box 是一個有效的數組或列表
+                                if isinstance(box, (list, tuple, np.ndarray)) and len(box) >= 4:
+                                    low_conf_boxes.append(box)
+                                    if is_large_object:
+                                        candidate_boxes.append((box, "building" if is_building else "low_conf"))
+                except Exception as e:
+                    self.logger.error(f"Error processing YOLO detections: {e}")
+                    traceback.print_exc()
+            # 按需初始化地標分類器
+            if not self.landmark_classifier:
+                if clip_analyzer and hasattr(clip_analyzer, 'get_clip_instance'):
+                    try:
+                        self.logger.info("Initializing landmark classifier for process_unknown_objects")
+                        model, preprocess, device = clip_analyzer.get_clip_instance()
+                        self.landmark_classifier = CLIPZeroShotClassifier(device=device)
+                    except Exception as e:
+                        self.logger.error(f"Error initializing landmark classifier: {e}")
+                        return detected_objects, []
+                else:
+                    self.logger.warning("landmark_classifier not available and cannot be initialized")
+                    return detected_objects, []
+            # 使用智能地標搜索
+            landmark_results = None
+            try:
+                # 確保有有效的框
+                if not low_conf_boxes:
+                    # 如果沒有低置信度框，添加全圖
+                    low_conf_boxes.append([0, 0, w, h])
+                landmark_results = self.landmark_classifier.intelligent_landmark_search(
+                    original_image,
+                    yolo_boxes=low_conf_boxes,
+                    base_threshold=0.25
+                )
+            except Exception as e:
+                self.logger.error(f"Error in intelligent_landmark_search: {e}")
+                traceback.print_exc()
+                return detected_objects, []
+            # 處理識別結果
+            landmark_objects = []
+            # 如果有效的地標結果
+            if landmark_results and landmark_results.get("is_landmark_scene", False):
+                for landmark_info in landmark_results.get("detected_landmarks", []):
+                    try:
+                        # 使用 landmark_classifier 的閾值判斷
+                        base_threshold = 0.25  # 基礎閾值
+                        # 獲取地標類型並設定閾值
+                        landmark_type = "architectural"  # 預設類型
+                        type_threshold = 0.5  # 預設閾值
+                        # 優先使用 landmark_classifier
+                        if (hasattr(self.landmark_classifier, '_determine_landmark_type') and
+                            landmark_info.get("landmark_id")):
+                            landmark_type = self.landmark_classifier._determine_landmark_type(landmark_info.get("landmark_id"))
+                            type_threshold = getattr(self.landmark_classifier, 'landmark_type_thresholds', {}).get(landmark_type, 0.5)
+                        # 否則使用本地方法
+                        elif hasattr(self, '_determine_landmark_type'):
+                            landmark_type = self._determine_landmark_type(landmark_info.get("landmark_id", ""))
+                            # 依據地標類型調整閾值
+                            if landmark_type == "skyscraper":
+                                type_threshold = 0.4
+                            elif landmark_type == "natural":
+                                type_threshold = 0.6
+                        # 或者直接從地標 ID 推斷
+                        else:
+                            landmark_id = landmark_info.get("landmark_id", "").lower()
+                            if any(term in landmark_id for term in ["mountain", "canyon", "waterfall", "lake", "river", "natural"]):
+                                landmark_type = "natural"
+                                type_threshold = 0.6
+                            elif any(term in landmark_id for term in ["skyscraper", "building", "tower", "tall"]):
+                                landmark_type = "skyscraper"
+                                type_threshold = 0.4
+                            elif any(term in landmark_id for term in ["monument", "memorial", "statue", "historical"]):
+                                landmark_type = "monument"
+                                type_threshold = 0.5
+                        effective_threshold = base_threshold * (type_threshold / 0.5)
+                        # 如果置信度足夠高
+                        if landmark_info.get("confidence", 0) > effective_threshold:
+                            # 獲取邊界框
+                            if "box" in landmark_info:
+                                box = landmark_info["box"]
+                            else:
+                                # 如果沒有邊界框，使用整個圖像的 90% 區域
+                                margin_x, margin_y = w * 0.05, h * 0.05
+                                box = [margin_x, margin_y, w - margin_x, h - margin_y]
+                            # 計算中心點和其他必要信息
+                            center_x = (box[0] + box[2]) / 2
+                            center_y = (box[1] + box[3]) / 2
+                            norm_center_x = center_x / w if w > 0 else 0.5
+                            norm_center_y = center_y / h if h > 0 else 0.5
+                            # 獲取區域位置（需要 spatial_analyzer 的支持）
+                            region = "center"  # 預設
+                            # 創建地標物體
+                            landmark_obj = {
+                                "class_id": (landmark_info.get("landmark_id", "")[:15]
+                                           if isinstance(landmark_info.get("landmark_id", ""), str)
+                                           else "-100"),  # 截斷過長的 ID
+                                "class_name": landmark_info.get("landmark_name", "Unknown Landmark"),
+                                "confidence": landmark_info.get("confidence", 0.0),
+                                "box": box,
+                                "center": (center_x, center_y),
+                                "normalized_center": (norm_center_x, norm_center_y),
+                                "size": (box[2] - box[0], box[3] - box[1]),
+                                "normalized_size": (
+                                    (box[2] - box[0]) / w if w > 0 else 0,
+                                    (box[3] - box[1]) / h if h > 0 else 0
+                                ),
+                                "area": (box[2] - box[0]) * (box[3] - box[1]),
+                                "normalized_area": (
+                                    (box[2] - box[0]) * (box[3] - box[1]) / (w * h) if w * h > 0 else 0
+                                ),
+                                "region": region,
+                                "is_landmark": True,
+                                "landmark_id": landmark_info.get("landmark_id", ""),
+                                "location": landmark_info.get("location", "Unknown Location")
+                            }
+                            # 添加額外信息
+                            for key in ["year_built", "architectural_style", "significance"]:
+                                if key in landmark_info:
+                                    landmark_obj[key] = landmark_info[key]
+                            # 添加地標類型
+                            landmark_obj["landmark_type"] = landmark_type
+                            # 添加到檢測物體列表
+                            detected_objects.append(landmark_obj)
+                            landmark_objects.append(landmark_obj)
+                            self.logger.info(f"Detected landmark: {landmark_info.get('landmark_name', 'Unknown')} with confidence {landmark_info.get('confidence', 0.0):.2f}")
+                    except Exception as e:
+                        self.logger.error(f"Error processing landmark: {e}")
+                        continue
+                return detected_objects, landmark_objects
+            return detected_objects, []
+        except Exception as e:
+            self.logger.error(f"Error in landmark detection: {e}")
+            traceback.print_exc()
+            return detected_objects, []
+    def remove_landmark_references(self, text):
+        """
+        從文本中移除所有地標引用。
+        Args:
+            text: 輸入文本
+        Returns:
+            str: 清除地標引用後的文本
+        """
+        if not text:
+            return text
+        try:
+            # 動態收集所有地標名稱和位置
+            landmark_names = []
+            locations = []
+            for landmark_id, info in self.all_landmarks.items():
+                # 收集地標名稱及其別名
+                landmark_names.append(info["name"])
+                landmark_names.extend(info.get("aliases", []))
+                # 收集地理位置
+                if "location" in info:
+                    location = info["location"]
+                    locations.append(location)
+                    # 處理分離的城市和國家名稱
+                    parts = location.split(",")
+                    if len(parts) >= 1:
+                        locations.append(parts[0].strip())
+                    if len(parts) >= 2:
+                        locations.append(parts[1].strip())
+            # 使用正則表達式動態替換所有地標名稱
+            for name in landmark_names:
+                if name and len(name) > 2:  # 避免過短的名稱
+                    text = re.sub(r'\b' + re.escape(name) + r'\b', "tall structure", text, flags=re.IGNORECASE)
+            # 動態替換所有位置引用
+            for location in locations:
+                if location and len(location) > 2:
+                    # 替換常見位置表述模式
+                    text = re.sub(r'in ' + re.escape(location), "in the urban area", text, flags=re.IGNORECASE)
+                    text = re.sub(r'of ' + re.escape(location), "of the urban area", text, flags=re.IGNORECASE)
+                    text = re.sub(r'\b' + re.escape(location) + r'\b', "the urban area", text, flags=re.IGNORECASE)
+        except Exception as e:
+            self.logger.warning(f"Error in dynamic landmark reference removal, using generic patterns: {e}")
+            # 通用地標描述模式
+            landmark_patterns = [
+                # 地標地點模式
+                (r'an iconic structure in ([A-Z][a-zA-Z\s,]+)', r'an urban structure'),
+                (r'a famous (monument|tower|landmark) in ([A-Z][a-zA-Z\s,]+)', r'an urban structure'),
+                (r'(the [A-Z][a-zA-Z\s]+ Tower)', r'the tower'),
+                (r'(the [A-Z][a-zA-Z\s]+ Building)', r'the building'),
+                (r'(the CN Tower)', r'the tower'),
+                (r'([A-Z][a-zA-Z\s]+) Tower', r'tall structure'),
+                # 地標位置關係模式
+                (r'(centered|built|located|positioned) around the ([A-Z][a-zA-Z\s]+? (Tower|Monument|Landmark))', r'located in this area'),
+                # 地標活動模式
+                (r'(sightseeing|guided tours|cultural tourism) (at|around|near) (this landmark|the [A-Z][a-zA-Z\s]+)', r'\1 in this area'),
+                # 一般性地標形容模式
+                (r'this (famous|iconic|historic|well-known) (landmark|monument|tower|structure)', r'this urban structure'),
+                (r'landmark scene', r'urban scene'),
+                (r'tourist destination', r'urban area'),
+                (r'tourist attraction', r'urban area')
+            ]
+            for pattern, replacement in landmark_patterns:
+                text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
+        return text
+    def get_alternative_scene_type(self, landmark_scene_type, detected_objects, scene_scores):
+        """
+        為地標場景類型選擇適合的替代類型。
+        Args:
+            landmark_scene_type: 原始地標場景類型
+            detected_objects: 檢測到的物體列表
+            scene_scores: 所有場景類型的分數
+        Returns:
+            str: 適合的替代場景類型
+        """
+        # 1. 嘗試從現有場景分數中找出第二高的非地標場景
+        landmark_types = {"tourist_landmark", "natural_landmark", "historical_monument"}
+        alternative_scores = {k: v for k, v in scene_scores.items() if k not in landmark_types and v > 0.2}
+        if alternative_scores:
+            # 返回分數最高的非地標場景類型
+            return max(alternative_scores.items(), key=lambda x: x[1])[0]
+        # 2. 基於物體組合推斷場景類型
+        object_counts = {}
+        for obj in detected_objects:
+            class_name = obj.get("class_name", "")
+            if class_name not in object_counts:
+                object_counts[class_name] = 0
+            object_counts[class_name] += 1
+        # 根據物體組合決定場景類型
+        if "car" in object_counts or "truck" in object_counts or "bus" in object_counts:
+            # 有車輛，可能是街道或交叉路口
+            if "traffic light" in object_counts or "stop sign" in object_counts:
+                return "intersection"
+            else:
+                return "city_street"
+        if "building" in object_counts and object_counts.get("person", 0) > 0:
+            # 有建築物和人，可能是商業區
+            return "commercial_district"
+        if object_counts.get("person", 0) > 3:
+            # 多個行人，可能是行人區
+            return "pedestrian_area"
+        if "bench" in object_counts or "potted plant" in object_counts:
+            # 有長椅或盆栽，可能是公園區域
+            return "park_area"
+        # 3. 根據原始地標場景類型選擇合適的替代場景
+        if landmark_scene_type == "natural_landmark":
+            return "outdoor_natural_area"
+        elif landmark_scene_type == "historical_monument":
+            return "urban_architecture"
+        # 默認回退到城市街道
+        return "city_street"
+    def extract_landmark_specific_activities(self, landmark_objects):
+        """
+        從識別的地標中提取特定活動。
+        Args:
+            landmark_objects: 地標物體列表
+        Returns:
+            List[str]: 地標特定活動列表
+        """
+        landmark_specific_activities = []
+        # 優先收集來自識別地標的特定活動
+        for lm_obj in landmark_objects:
+            lm_id = lm_obj.get("landmark_id")
+            if lm_id and lm_id in self.landmark_activities:
+                landmark_specific_activities.extend(self.landmark_activities[lm_id])
+        if landmark_specific_activities:
+            landmark_names = [lm.get('landmark_name', 'unknown') for lm in landmark_objects if lm.get('is_landmark', False)]
+            self.logger.info(f"Added {len(landmark_specific_activities)} landmark-specific activities for {', '.join(landmark_names)}")
+        return landmark_specific_activities
+    def update_enable_landmark_status(self, enable_landmark: bool):
+        """
+        更新地標檢測的啟用狀態。
+        Args:
+            enable_landmark: 是否啟用地標檢測
+        """
+        self.enable_landmark = enable_landmark
+    def update_use_landmark_detection_status(self, use_landmark_detection: bool):
+        """
+        更新地標檢測使用狀態。
+        Args:
+            use_landmark_detection: 是否使用地標檢測
+        """
+        self.use_landmark_detection = use_landmark_detection

lighting_analyzer.py CHANGED Viewed

The diff for this file is too large to render. See raw diff

lighting_condition_analyzer.py ADDED Viewed

	@@ -0,0 +1,854 @@

+import numpy as np
+import logging
+import traceback
+from typing import Dict, Any, Optional, List, Tuple
+from configuration_manager import ConfigurationManager
+class LightingConditionAnalyzer:
+    """
+    Determines specific lighting conditions and time of day based on scene analysis.
+    此class 會判斷一些光線的特定場景
+    This class analyzes lighting characteristics including natural and artificial illumination,
+    color temperature patterns, and temporal indicators to classify scenes into specific
+    lighting categories such as day clear, night with lights, indoor artificial, etc.
+    """
+    def __init__(self, config_manager: ConfigurationManager):
+        """
+        Initialize the lighting condition analyzer.
+        Args:
+            config_manager: Configuration manager instance for accessing thresholds and parameters.
+        """
+        self.config_manager = config_manager
+        self.logger = self._setup_logger()
+        # Internal threshold constants for Places365 analysis
+        self.P365_ATTRIBUTE_CONF_THRESHOLD = 0.60
+        self.P365_SCENE_MODERATE_CONF_THRESHOLD = 0.45
+        self.P365_SCENE_HIGH_CONF_THRESHOLD = 0.70
+        # Scene type keyword definitions
+        self.P365_OUTDOOR_SCENE_KEYWORDS = [
+            "street", "road", "highway", "park", "beach", "mountain", "forest", "field",
+            "outdoor", "sky", "coast", "courtyard", "square", "plaza", "bridge",
+            "parking", "playground", "stadium", "construction", "river", "ocean", "desert",
+            "garden", "trail", "natural_landmark", "airport_outdoor", "train_station_outdoor",
+            "bus_station_outdoor", "intersection", "crosswalk", "sidewalk", "pathway"
+        ]
+        self.P365_INDOOR_RESTAURANT_KEYWORDS = [
+            "restaurant", "bar", "cafe", "dining_room", "pub", "bistro", "eatery"
+        ]
+    def _setup_logger(self) -> logging.Logger:
+        """Set up logger for lighting condition analysis operations."""
+        logger = logging.getLogger(f"{__name__}.LightingConditionAnalyzer")
+        if not logger.handlers:
+            handler = logging.StreamHandler()
+            formatter = logging.Formatter(
+                '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+            )
+            handler.setFormatter(formatter)
+            logger.addHandler(handler)
+            logger.setLevel(logging.INFO)
+        return logger
+    def analyze_lighting_conditions(self, features: Dict[str, Any], is_indoor: bool,
+                                   places365_info: Optional[Dict] = None) -> Dict[str, Any]:
+        """
+        Determine specific lighting conditions based on features and scene context.
+        Args:
+            features: Dictionary containing extracted image features.
+            is_indoor: Boolean indicating whether the scene is indoor (from previous classification).
+            places365_info: Optional Places365 classification information.
+        Returns:
+            Dictionary containing lighting analysis results including time_of_day, confidence,
+            and diagnostic information.
+        """
+        try:
+            self.logger.debug(f"Starting lighting analysis for {'indoor' if is_indoor else 'outdoor'} scene")
+            # Initialize analysis results
+            time_of_day = "unknown"
+            confidence = 0.5
+            diagnostics = {}
+            # Extract Places365 context
+            p365_context = self._extract_places365_context(places365_info, diagnostics)
+            # Priority 1: Use Places365 attributes if highly confident
+            attribute_result = self._analyze_places365_attributes(
+                p365_context, is_indoor, features, diagnostics
+            )
+            if attribute_result["determined"] and attribute_result["confidence"] >= 0.75:
+                self.logger.debug(f"High-confidence Places365 attribute determination: {attribute_result['time_of_day']}")
+                return {
+                    "time_of_day": attribute_result["time_of_day"],
+                    "confidence": attribute_result["confidence"],
+                    "diagnostics": diagnostics
+                }
+            # Priority 2: Visual feature analysis with Places365 scene context
+            visual_result = self._analyze_visual_features(
+                features, is_indoor, p365_context, diagnostics
+            )
+            time_of_day = visual_result["time_of_day"]
+            confidence = visual_result["confidence"]
+            # Combine with attribute result if it exists but wasn't decisive
+            if attribute_result["determined"]:
+                combined_result = self._combine_attribute_and_visual_results(
+                    attribute_result, visual_result, diagnostics
+                )
+                time_of_day = combined_result["time_of_day"]
+                confidence = combined_result["confidence"]
+            # Priority 3: Special lighting refinement (neon, sodium vapor)
+            refined_result = self._apply_special_lighting_refinement(
+                time_of_day, confidence, features, is_indoor, p365_context, diagnostics
+            )
+            time_of_day = refined_result["time_of_day"]
+            confidence = refined_result["confidence"]
+            # Final confidence clamping
+            confidence = min(0.95, max(0.50, confidence))
+            # Record final results
+            diagnostics["final_lighting_time_of_day"] = time_of_day
+            diagnostics["final_lighting_confidence"] = round(confidence, 3)
+            self.logger.debug(f"Lighting analysis complete: {time_of_day} (confidence: {confidence:.3f})")
+            return {
+                "time_of_day": time_of_day,
+                "confidence": confidence,
+                "diagnostics": diagnostics
+            }
+        except Exception as e:
+            self.logger.error(f"Error in lighting condition analysis: {str(e)}")
+            self.logger.error(f"Traceback: {traceback.format_exc()}")
+            return self._get_default_lighting_result()
+    def _extract_places365_context(self, places365_info: Optional[Dict],
+                                  diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Extract and validate Places365 context information for lighting analysis."""
+        context = {
+            "mapped_scene": "unknown",
+            "attributes": [],
+            "confidence": 0.0
+        }
+        if places365_info:
+            context["mapped_scene"] = places365_info.get('mapped_scene_type', 'unknown').lower()
+            context["attributes"] = [attr.lower() for attr in places365_info.get('attributes', [])]
+            context["confidence"] = places365_info.get('confidence', 0.0)
+            diagnostics["p365_context_for_lighting"] = (
+                f"P365 Scene: {context['mapped_scene']}, Attrs: {context['attributes']}, "
+                f"Conf: {context['confidence']:.2f}"
+            )
+        return context
+    def _analyze_places365_attributes(self, p365_context: Dict[str, Any], is_indoor: bool,
+                                     features: Dict[str, Any], diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Analyze Places365 attributes for lighting condition determination."""
+        if (not p365_context["attributes"] or
+            p365_context["confidence"] <= self.P365_ATTRIBUTE_CONF_THRESHOLD):
+            return {"determined": False, "time_of_day": "unknown", "confidence": 0.5}
+        confidence = p365_context["confidence"]
+        attributes = p365_context["attributes"]
+        mapped_scene = p365_context["mapped_scene"]
+        # Outdoor attribute analysis
+        if not is_indoor:
+            outdoor_result = self._analyze_outdoor_attributes(
+                attributes, mapped_scene, confidence, diagnostics
+            )
+            if outdoor_result["determined"]:
+                return outdoor_result
+        # Indoor attribute analysis
+        if is_indoor:
+            indoor_result = self._analyze_indoor_attributes(
+                attributes, mapped_scene, features, confidence, diagnostics
+            )
+            if indoor_result["determined"]:
+                return indoor_result
+        return {"determined": False, "time_of_day": "unknown", "confidence": 0.5}
+    def _analyze_outdoor_attributes(self, attributes: List[str], mapped_scene: str,
+                                   confidence: float, diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Analyze Places365 attributes for outdoor lighting conditions."""
+        base_confidence_boost = (confidence - self.P365_ATTRIBUTE_CONF_THRESHOLD) * 0.25
+        if "sunny" in attributes or "clear sky" in attributes:
+            final_confidence = 0.85 + base_confidence_boost
+            diagnostics["reason"] = "P365 attribute: sunny/clear sky (Outdoor)."
+            return {
+                "determined": True,
+                "time_of_day": "day_clear",
+                "confidence": final_confidence
+            }
+        elif "nighttime" in attributes or "night" in attributes:
+            if ("artificial lighting" in attributes or "man-made lighting" in attributes or
+                any(kw in mapped_scene for kw in ["street", "city", "road", "urban", "downtown"])):
+                final_confidence = 0.82 + base_confidence_boost * 0.8
+                diagnostics["reason"] = "P365 attribute: nighttime with artificial/street lights (Outdoor)."
+                return {
+                    "determined": True,
+                    "time_of_day": "night_with_lights",
+                    "confidence": final_confidence
+                }
+            else:
+                final_confidence = 0.78 + base_confidence_boost * 0.8
+                diagnostics["reason"] = "P365 attribute: nighttime, dark (Outdoor)."
+                return {
+                    "determined": True,
+                    "time_of_day": "night_dark",
+                    "confidence": final_confidence
+                }
+        elif "cloudy" in attributes or "overcast" in attributes:
+            final_confidence = 0.80 + base_confidence_boost
+            diagnostics["reason"] = "P365 attribute: cloudy/overcast (Outdoor)."
+            return {
+                "determined": True,
+                "time_of_day": "day_cloudy_overcast",
+                "confidence": final_confidence
+            }
+        return {"determined": False, "time_of_day": "unknown", "confidence": 0.5}
+    def _analyze_indoor_attributes(self, attributes: List[str], mapped_scene: str,
+                                  features: Dict[str, Any], confidence: float,
+                                  diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Analyze Places365 attributes for indoor lighting conditions."""
+        base_confidence_boost = (confidence - self.P365_ATTRIBUTE_CONF_THRESHOLD) * 0.20
+        avg_brightness = features.get("avg_brightness", 128.0)
+        if "artificial lighting" in attributes or "man-made lighting" in attributes:
+            base_indoor_conf = 0.70 + base_confidence_boost
+            thresholds = self.config_manager.lighting_thresholds
+            if avg_brightness > thresholds.indoor_bright_thresh:
+                time_of_day = "indoor_bright_artificial"
+                final_confidence = base_indoor_conf + 0.10
+            elif avg_brightness > thresholds.indoor_moderate_thresh:
+                time_of_day = "indoor_moderate_artificial"
+                final_confidence = base_indoor_conf
+            else:
+                time_of_day = "indoor_dim_artificial"
+                final_confidence = base_indoor_conf - 0.05
+            diagnostics["reason"] = (
+                f"P365 attribute: artificial lighting (Indoor), "
+                f"brightness based category: {time_of_day}."
+            )
+            return {
+                "determined": True,
+                "time_of_day": time_of_day,
+                "confidence": final_confidence
+            }
+        elif "natural lighting" in attributes:
+            is_applicable_scene = (
+                self._check_home_environment_pattern(features) or
+                any(kw in mapped_scene for kw in ["living_room", "bedroom", "sunroom"])
+            )
+            if is_applicable_scene:
+                final_confidence = 0.80 + base_confidence_boost
+                diagnostics["reason"] = "P365 attribute: natural lighting in residential/applicable indoor scene."
+                return {
+                    "determined": True,
+                    "time_of_day": "indoor_residential_natural",
+                    "confidence": final_confidence
+                }
+        return {"determined": False, "time_of_day": "unknown", "confidence": 0.5}
+    def _analyze_visual_features(self, features: Dict[str, Any], is_indoor: bool,
+                                p365_context: Dict[str, Any], diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Analyze visual features for lighting condition determination."""
+        if is_indoor:
+            return self._analyze_indoor_visual_features(features, p365_context, diagnostics)
+        else:
+            return self._analyze_outdoor_visual_features(features, p365_context, diagnostics)
+    def _analyze_indoor_visual_features(self, features: Dict[str, Any], p365_context: Dict[str, Any],
+                                       diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Analyze visual features for indoor lighting conditions."""
+        avg_brightness = features.get("avg_brightness", 128.0)
+        thresholds = self.config_manager.lighting_thresholds
+        # Extract relevant features
+        sky_blue_in_sky_region = features.get("sky_region_blue_dominance", 0.0)
+        sky_region_is_brighter = features.get("sky_region_brightness_ratio", 1.0) > 1.05
+        is_likely_home_environment = self._check_home_environment_pattern(features)
+        # Lighting and structural features
+        circular_lights = features.get("circular_light_count", 0)
+        bright_spots_overall = features.get("bright_spot_count", 0)
+        brightness_uniformity = features.get("brightness_uniformity", 0.0)
+        warm_ratio = features.get("warm_ratio", 0.0)
+        # Natural light hints calculation
+        natural_light_hints = 0.0
+        if sky_blue_in_sky_region > 0.05 and sky_region_is_brighter:
+            natural_light_hints += 1.0
+        if brightness_uniformity > 0.65 and features.get("brightness_std", 100.0) < 70:
+            natural_light_hints += 1.0
+        if warm_ratio > 0.15 and avg_brightness > 110:
+            natural_light_hints += 0.5
+        # Designer lighting detection
+        is_designer_lit = (
+            (circular_lights > 0 or bright_spots_overall > 2) and
+            brightness_uniformity > 0.6 and warm_ratio > 0.2 and avg_brightness > 90
+        )
+        # Brightness-based classification
+        if avg_brightness > thresholds.indoor_bright_thresh:
+            return self._classify_bright_indoor(
+                features, natural_light_hints, is_designer_lit, is_likely_home_environment,
+                p365_context, diagnostics
+            )
+        elif avg_brightness > thresholds.indoor_moderate_thresh:
+            return self._classify_moderate_indoor(
+                features, is_designer_lit, is_likely_home_environment, p365_context, diagnostics
+            )
+        else:
+            return self._classify_dim_indoor(features, diagnostics)
+    def _classify_bright_indoor(self, features: Dict[str, Any], natural_light_hints: float,
+                               is_designer_lit: bool, is_likely_home_environment: bool,
+                               p365_context: Dict[str, Any], diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Classify bright indoor lighting conditions."""
+        mapped_scene = p365_context["mapped_scene"]
+        sky_blue_in_sky_region = features.get("sky_region_blue_dominance", 0.0)
+        sky_region_is_brighter = features.get("sky_region_brightness_ratio", 1.0) > 1.05
+        # Natural residential lighting
+        if (natural_light_hints >= 1.5 and
+            (is_likely_home_environment or any(kw in mapped_scene for kw in ["home", "residential", "living", "bedroom"]))):
+            return {
+                "time_of_day": "indoor_residential_natural",
+                "confidence": 0.82
+            }
+        # Designer residential lighting
+        elif (is_designer_lit and
+              (is_likely_home_environment or any(kw in mapped_scene for kw in ["home", "designer", "modern_interior"]))):
+            return {
+                "time_of_day": "indoor_designer_residential",
+                "confidence": 0.85
+            }
+        # Mixed natural/artificial lighting
+        elif sky_blue_in_sky_region > 0.03 and sky_region_is_brighter:
+            return {
+                "time_of_day": "indoor_bright_natural_mix",
+                "confidence": 0.78
+            }
+        # Pure artificial lighting
+        else:
+            return {
+                "time_of_day": "indoor_bright_artificial",
+                "confidence": 0.75
+            }
+    def _classify_moderate_indoor(self, features: Dict[str, Any], is_designer_lit: bool,
+                                 is_likely_home_environment: bool, p365_context: Dict[str, Any],
+                                 diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Classify moderate brightness indoor lighting conditions."""
+        mapped_scene = p365_context["mapped_scene"]
+        confidence = p365_context["confidence"]
+        warm_ratio = features.get("warm_ratio", 0.0)
+        yellow_orange_ratio = features.get("yellow_orange_ratio", 0.0)
+        # Designer residential lighting
+        if (is_designer_lit and
+            (is_likely_home_environment or any(kw in mapped_scene for kw in ["home", "designer"]))):
+            return {
+                "time_of_day": "indoor_designer_residential",
+                "confidence": 0.78
+            }
+        # Restaurant/bar lighting
+        elif warm_ratio > 0.35 and yellow_orange_ratio > 0.1:
+            return self._classify_restaurant_bar_lighting(
+                p365_context, features, diagnostics
+            )
+        # Standard moderate artificial
+        else:
+            return {
+                "time_of_day": "indoor_moderate_artificial",
+                "confidence": 0.70
+            }
+    def _classify_restaurant_bar_lighting(self, p365_context: Dict[str, Any],
+                                         features: Dict[str, Any],
+                                         diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Classify restaurant/bar specific lighting conditions."""
+        mapped_scene = p365_context["mapped_scene"]
+        confidence = p365_context["confidence"]
+        # Strong P365 restaurant/bar confirmation
+        if (any(kw in mapped_scene for kw in self.P365_INDOOR_RESTAURANT_KEYWORDS) and
+            confidence > self.P365_SCENE_MODERATE_CONF_THRESHOLD):
+            diagnostics["visual_analysis_reason"] = (
+                "Visual: Moderate warm tones. P365 context confirms restaurant/bar."
+            )
+            return {
+                "time_of_day": "indoor_restaurant_bar",
+                "confidence": 0.80 + confidence * 0.15
+            }
+        # P365 outdoor conflict detection
+        elif (any(kw in mapped_scene for kw in self.P365_OUTDOOR_SCENE_KEYWORDS) and
+              confidence > self.P365_SCENE_MODERATE_CONF_THRESHOLD):
+            diagnostics["visual_analysis_reason"] = (
+                "Visual: Moderate warm. CONFLICT: LA says indoor but P365 scene is outdoor. "
+                "Defaulting to general indoor artificial."
+            )
+            diagnostics["conflict_is_indoor_vs_p365_scene_for_restaurant_bar"] = True
+            return {
+                "time_of_day": "indoor_moderate_artificial",
+                "confidence": 0.55
+            }
+        # Neutral P365 context
+        else:
+            diagnostics["visual_analysis_reason"] = (
+                "Visual: Moderate warm tones, typical of restaurant/bar. P365 context neutral or weak."
+            )
+            return {
+                "time_of_day": "indoor_restaurant_bar",
+                "confidence": 0.70
+            }
+    def _classify_dim_indoor(self, features: Dict[str, Any],
+                            diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Classify dim indoor lighting conditions."""
+        warm_ratio = features.get("warm_ratio", 0.0)
+        yellow_orange_ratio = features.get("yellow_orange_ratio", 0.0)
+        if warm_ratio > 0.45 and yellow_orange_ratio > 0.15:
+            return {
+                "time_of_day": "indoor_dim_warm",
+                "confidence": 0.75
+            }
+        else:
+            return {
+                "time_of_day": "indoor_dim_general",
+                "confidence": 0.70
+            }
+    def _analyze_outdoor_visual_features(self, features: Dict[str, Any], p365_context: Dict[str, Any],
+                                        diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Analyze visual features for outdoor lighting conditions."""
+        avg_brightness = features.get("avg_brightness", 128.0)
+        thresholds = self.config_manager.lighting_thresholds
+        # P365 enhanced street scene analysis
+        street_result = self._analyze_p365_enhanced_street_scenes(
+            features, p365_context, diagnostics
+        )
+        if street_result["determined"]:
+            return street_result
+        # Brightness-based outdoor classification
+        if avg_brightness < thresholds.outdoor_night_thresh_brightness:
+            return self._classify_night_outdoor(features, diagnostics)
+        elif (avg_brightness < thresholds.outdoor_dusk_dawn_thresh_brightness and
+              self._check_warm_sunset_conditions(features)):
+            return self._classify_sunset_sunrise(features, p365_context, diagnostics)
+        elif avg_brightness > thresholds.outdoor_day_bright_thresh:
+            return self._classify_bright_day_outdoor(features, diagnostics)
+        elif avg_brightness > thresholds.outdoor_day_cloudy_thresh:
+            return self._classify_cloudy_day_outdoor(features, diagnostics)
+        else:
+            return self._classify_general_outdoor(features, diagnostics)
+    def _analyze_p365_enhanced_street_scenes(self, features: Dict[str, Any], p365_context: Dict[str, Any],
+                                            diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Classify warm-lit street scenes using Places365 context.
+
+        The check only applies when ``mapped_scene`` contains a street-type
+        keyword, the Places365 confidence exceeds
+        ``self.P365_SCENE_MODERATE_CONF_THRESHOLD`` and
+        ``features["color_atmosphere"]`` equals ``"warm"``.
+
+        Args:
+            features: Visual feature dict; reads ``color_atmosphere``,
+                ``avg_brightness`` (default 128.0) and ``bright_spot_count``
+                (default 0).
+            p365_context: Must contain ``mapped_scene`` and ``confidence``;
+                both are read by subscript, so a missing key raises
+                ``KeyError``.
+            diagnostics: Mutated in place; ``visual_analysis_reason`` is set
+                whenever a classification is made.
+
+        Returns:
+            Dict with ``determined``, ``time_of_day`` and ``confidence``.
+            ``determined`` is False (``time_of_day`` ``"unknown"``,
+            confidence 0.5) when the street/warm precondition fails.
+            Otherwise it is True and ``time_of_day`` is ``night_with_lights``,
+            ``sunset_sunrise`` or ``night_dark``.
+        """
+        mapped_scene = p365_context["mapped_scene"]
+        confidence = p365_context["confidence"]
+        thresholds = self.config_manager.lighting_thresholds
+        # Street scene = keyword match (substring) + sufficient P365 confidence + warm colour atmosphere
+        is_street_scene = (
+            any(kw in mapped_scene for kw in ["street", "city", "road", "urban", "downtown", "intersection"]) and
+            confidence > self.P365_SCENE_MODERATE_CONF_THRESHOLD and
+            features.get("color_atmosphere") == "warm"
+        )
+        if not is_street_scene:  # not conclusive: caller falls back to its own logic
+            return {"determined": False, "time_of_day": "unknown", "confidence": 0.5}
+        avg_brightness = features.get("avg_brightness", 128.0)
+        bright_spots_overall = features.get("bright_spot_count", 0)
+        # Night with street lights: dark overall but with more bright spots than the threshold
+        if (avg_brightness < thresholds.outdoor_night_thresh_brightness and
+            bright_spots_overall > thresholds.outdoor_night_lights_thresh):
+            diagnostics["visual_analysis_reason"] = (
+                f"P365 outdoor scene '{mapped_scene}' + visual low-warm light with spots -> night_with_lights."
+            )
+            return {
+                "determined": True,
+                "time_of_day": "night_with_lights",
+                "confidence": 0.88 + confidence * 0.1
+            }
+        # Sunset/sunrise. NOTE(review): no upper brightness bound here, so any warm street scene at or above the night threshold lands in this branch - confirm intended
+        elif avg_brightness >= thresholds.outdoor_night_thresh_brightness:
+            diagnostics["visual_analysis_reason"] = (
+                f"P365 outdoor scene '{mapped_scene}' + visual moderate-warm light -> sunset/sunrise."
+            )
+            return {
+                "determined": True,
+                "time_of_day": "sunset_sunrise",
+                "confidence": 0.88 + confidence * 0.1
+            }
+        # Remaining case: below the night brightness threshold without enough bright spots
+        else:
+            diagnostics["visual_analysis_reason"] = (
+                f"P365 outdoor scene '{mapped_scene}' + visual very low light -> night_dark."
+            )
+            return {
+                "determined": True,
+                "time_of_day": "night_dark",
+                "confidence": 0.75 + confidence * 0.1
+            }
+    def _classify_night_outdoor(self, features: Dict[str, Any],
+                               diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Classify a dark outdoor scene as lit night or deep night.
+
+        Args:
+            features: Visual feature dict; reads ``bright_spot_count``
+                (default 0) and ``dark_pixel_ratio`` (default 0.0).
+            diagnostics: Mutated in place; ``visual_analysis_reason`` is set.
+
+        Returns:
+            Dict with ``time_of_day`` and ``confidence``. The label is
+            ``night_with_lights`` when the bright-spot count exceeds
+            ``outdoor_night_lights_thresh`` (0.82 plus a bonus of at most
+            0.13), otherwise ``night_dark`` (0.78 plus at most 0.17). The
+            bonus grows with ``dark_pixel_ratio``.
+        """
+        bright_spots_overall = features.get("bright_spot_count", 0)
+        dark_pixel_ratio = features.get("dark_pixel_ratio", 0.0)
+        thresholds = self.config_manager.lighting_thresholds
+        if bright_spots_overall > thresholds.outdoor_night_lights_thresh:
+            confidence = 0.82 + min(0.13, dark_pixel_ratio / 2.5)  # bonus capped at 0.13
+            diagnostics["visual_analysis_reason"] = "Visual: Low brightness with light sources (street/car lights)."
+            return {
+                "time_of_day": "night_with_lights",
+                "confidence": confidence
+            }
+        else:
+            confidence = 0.78 + min(0.17, dark_pixel_ratio / 1.8)  # bonus capped at 0.17
+            diagnostics["visual_analysis_reason"] = "Visual: Very low brightness outdoor, deep night."
+            return {
+                "time_of_day": "night_dark",
+                "confidence": confidence
+            }
+    def _classify_sunset_sunrise(self, features: Dict[str, Any], p365_context: Dict[str, Any],
+                                diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Build the ``sunset_sunrise`` result and its confidence.
+
+        The label is always ``sunset_sunrise``; only the confidence varies.
+        It starts at 0.75 plus a bonus (at most 0.20) that grows with
+        ``yellow_orange_ratio``. If Places365 reports a natural-scene keyword
+        with confidence above ``self.P365_SCENE_MODERATE_CONF_THRESHOLD``,
+        0.15 is added and the total is capped at 0.95.
+
+        Args:
+            features: Visual feature dict; reads ``yellow_orange_ratio``
+                (default 0.0).
+            p365_context: Must contain ``mapped_scene`` and ``confidence``
+                (read by subscript).
+            diagnostics: Mutated in place; ``visual_analysis_reason`` is set
+                and, when the boost applies, extended.
+
+        Returns:
+            Dict with ``time_of_day`` and ``confidence``.
+        """
+        yellow_orange_ratio = features.get("yellow_orange_ratio", 0.0)
+        confidence = 0.75 + min(0.20, yellow_orange_ratio / 1.5)
+        diagnostics["visual_analysis_reason"] = "Visual: Moderate brightness, warm tones -> sunset/sunrise."
+        # Places365 natural-scene boost (keywords are matched as substrings of mapped_scene)
+        mapped_scene = p365_context["mapped_scene"]
+        p365_confidence = p365_context["confidence"]
+        if (any(kw in mapped_scene for kw in ["beach", "mountain", "lake", "ocean", "desert", "field", "natural_landmark", "sky"]) and
+            p365_confidence > self.P365_SCENE_MODERATE_CONF_THRESHOLD):
+            confidence = min(0.95, confidence + 0.15)  # boosted value never exceeds 0.95
+            diagnostics["visual_analysis_reason"] += f" P365 natural scene '{mapped_scene}' supports."
+        return {
+            "time_of_day": "sunset_sunrise",
+            "confidence": confidence
+        }
+    def _classify_bright_day_outdoor(self, features: Dict[str, Any],
+                                    diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Classify a bright outdoor scene as clear day, floodlit area or generic bright day.
+
+        Args:
+            features: Visual feature dict; reads ``sky_region_blue_dominance``
+                (default 0.0), ``sky_region_brightness_ratio`` (default 1.0),
+                ``top_region_texture_complexity`` (default 0.5),
+                ``brightness_uniformity`` (default 0.0) and
+                ``bright_spot_count`` (default 0).
+            diagnostics: Mutated in place; ``visual_analysis_reason`` is set.
+
+        Returns:
+            Dict with ``time_of_day`` and ``confidence``. Checked in order:
+            ``day_clear`` (0.80 plus a bonus of at most 0.15),
+            ``stadium_or_floodlit_area`` (0.78), then ``day_bright_general``
+            (0.68).
+        """
+        sky_like_blue_in_sky_region = features.get("sky_region_blue_dominance", 0.0)
+        sky_region_brightness_ratio = features.get("sky_region_brightness_ratio", 1.0)
+        texture_complexity = features.get("top_region_texture_complexity", 0.5)
+        thresholds = self.config_manager.lighting_thresholds
+        # Clear sky: enough blue in the sky region, or a top region that is brighter than average and smooth
+        if (sky_like_blue_in_sky_region > thresholds.outdoor_day_blue_thresh or
+            (sky_region_brightness_ratio > 1.05 and texture_complexity < 0.4)):
+            confidence = 0.80 + min(0.15, sky_like_blue_in_sky_region * 2 +
+                                  (sky_like_blue_in_sky_region * 1.5 if sky_region_brightness_ratio > 1.05 else 0))
+            diagnostics["visual_analysis_reason"] = "Visual: High brightness with blue/sky tones or bright smooth top."
+            return {
+                "time_of_day": "day_clear",
+                "confidence": confidence
+            }
+        # Stadium/floodlit detection: uniform brightness together with many bright spots
+        brightness_uniformity = features.get("brightness_uniformity", 0.0)
+        bright_spots_overall = features.get("bright_spot_count", 0)
+        if (brightness_uniformity > 0.70 and
+            bright_spots_overall > thresholds.stadium_min_spots_thresh):
+            diagnostics["visual_analysis_reason"] = (
+                "Visual: Very bright, uniform lighting with multiple sources, suggests floodlights (Outdoor)."
+            )
+            return {
+                "time_of_day": "stadium_or_floodlit_area",
+                "confidence": 0.78
+            }
+        # General bright day: fallback when neither pattern above matched
+        diagnostics["visual_analysis_reason"] = "Visual: High brightness outdoor, specific sky features unclear."
+        return {
+            "time_of_day": "day_bright_general",
+            "confidence": 0.68
+        }
+    def _classify_cloudy_day_outdoor(self, features: Dict[str, Any],
+                                    diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Classify a moderately bright outdoor scene as overcast, gray-cloudy or generic bright day.
+
+        Args:
+            features: Visual feature dict; reads
+                ``sky_region_brightness_ratio`` (default 1.0),
+                ``top_region_texture_complexity`` (default 0.5),
+                ``avg_saturation`` (default 100.0), ``gray_ratio``
+                (default 0.0) and ``brightness_uniformity`` (default 0.0).
+            diagnostics: Mutated in place; ``visual_analysis_reason`` is set.
+
+        Returns:
+            Dict with ``time_of_day`` and ``confidence``. Checked in order:
+            ``day_cloudy_overcast``, ``day_cloudy_gray`` (0.72 plus a bonus
+            of at most 0.23), then ``day_bright_general`` (0.68).
+        """
+        sky_region_brightness_ratio = features.get("sky_region_brightness_ratio", 1.0)
+        texture_complexity = features.get("top_region_texture_complexity", 0.5)
+        avg_saturation = features.get("avg_saturation", 100.0)
+        gray_ratio = features.get("gray_ratio", 0.0)
+        brightness_uniformity = features.get("brightness_uniformity", 0.0)
+        thresholds = self.config_manager.lighting_thresholds
+        # Overcast conditions: bright, smooth top region combined with low saturation
+        if (sky_region_brightness_ratio > 1.05 and texture_complexity < 0.45 and avg_saturation < 70):
+            confidence = 0.75 + min(0.20, gray_ratio / 1.5 + (brightness_uniformity - 0.5) / 1.5)  # NOTE(review): term is capped at 0.20 but has no floor; uniformity < 0.5 can pull confidence below 0.75 - confirm intended
+            diagnostics["visual_analysis_reason"] = (
+                "Visual: Good brightness, uniform bright top, lower saturation -> overcast."
+            )
+            return {
+                "time_of_day": "day_cloudy_overcast",
+                "confidence": confidence
+            }
+        # Gray cloudy conditions: gray ratio above the configured threshold
+        elif gray_ratio > thresholds.outdoor_day_gray_thresh:
+            confidence = 0.72 + min(0.23, gray_ratio / 1.8)
+            diagnostics["visual_analysis_reason"] = "Visual: Good brightness with higher gray tones."
+            return {
+                "time_of_day": "day_cloudy_gray",
+                "confidence": confidence
+            }
+        # General bright outdoor: fallback with a fixed confidence
+        else:
+            diagnostics["visual_analysis_reason"] = "Visual: Bright outdoor, specific type less clear."
+            return {
+                "time_of_day": "day_bright_general",
+                "confidence": 0.68
+            }
+    def _classify_general_outdoor(self, features: Dict[str, Any],
+                                 diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Produce a low-confidence outdoor label when no specific pattern matched.
+
+        Args:
+            features: Visual feature dict; reads ``color_atmosphere``
+                (default ``"neutral"``), ``yellow_orange_ratio``
+                (default 0.0) and ``sky_region_blue_dominance`` (default 0.0).
+            diagnostics: Mutated in place; ``visual_analysis_reason`` is set.
+
+        Returns:
+            Dict with ``time_of_day`` and a fixed ``confidence``. Checked in
+            order: ``sunset_sunrise_low_confidence`` (0.62),
+            ``day_hazy_or_partly_cloudy`` (0.62), then
+            ``outdoor_unknown_daylight`` (0.58).
+        """
+        color_atmosphere = features.get("color_atmosphere", "neutral")
+        yellow_orange_ratio = features.get("yellow_orange_ratio", 0.0)
+        sky_like_blue_in_sky_region = features.get("sky_region_blue_dominance", 0.0)
+        # Potential sunset/sunrise with low confidence: warm atmosphere plus some yellow/orange
+        if color_atmosphere == "warm" and yellow_orange_ratio > 0.08:
+            diagnostics["visual_analysis_reason"] = (
+                "Visual: Outdoor, specific conditions less clear; broader visual cues suggest warm lighting."
+            )
+            return {
+                "time_of_day": "sunset_sunrise_low_confidence",
+                "confidence": 0.62
+            }
+        # Potential hazy day conditions: a small amount of blue in the sky region
+        elif sky_like_blue_in_sky_region > 0.02:
+            diagnostics["visual_analysis_reason"] = (
+                "Visual: Outdoor, specific conditions less clear; some blue tones suggest daylight."
+            )
+            return {
+                "time_of_day": "day_hazy_or_partly_cloudy",
+                "confidence": 0.62
+            }
+        # Unknown outdoor daylight: final fallback
+        else:
+            diagnostics["visual_analysis_reason"] = (
+                "Visual: Outdoor, specific conditions less clear; broader visual cues."
+            )
+            return {
+                "time_of_day": "outdoor_unknown_daylight",
+                "confidence": 0.58
+            }
+    def _apply_commercial_indoor_refinement(self, features: Dict[str, Any], p365_context: Dict[str, Any],
+                                           time_of_day: str, confidence: float) -> Dict[str, Any]:
+        """Relabel an indoor result as ``indoor_commercial`` when lighting cues match.
+
+        The incoming label is returned unchanged if it already contains
+        ``residential``, ``restaurant`` or ``bar``, or if the Places365
+        ``mapped_scene`` contains ``home`` or ``residential`` (all substring
+        matches).
+
+        Args:
+            features: Visual feature dict; reads ``avg_brightness``
+                (default 100.0), ``bright_spot_count`` (default 0),
+                ``light_distribution_uniformity`` (default 0.5) and
+                ``ceiling_likelihood`` (default 0.0).
+            p365_context: Must contain ``mapped_scene`` (read by subscript).
+            time_of_day: Current lighting label.
+            confidence: Confidence of the current label.
+
+        Returns:
+            Dict with ``time_of_day`` and ``confidence``. When the commercial
+            conditions hold, the confidence is replaced by 0.70 plus a
+            bright-spot bonus of at most 0.2; otherwise the inputs are
+            returned as given.
+        """
+        # Skip if already classified as residential, restaurant, or bar
+        if any(category in time_of_day for category in ["residential", "restaurant", "bar"]):
+            return {"time_of_day": time_of_day, "confidence": confidence}
+        # Skip if P365 suggests home environment
+        mapped_scene = p365_context["mapped_scene"]
+        if any(kw in mapped_scene for kw in ["home", "residential"]):
+            return {"time_of_day": time_of_day, "confidence": confidence}
+        # Check commercial lighting indicators
+        avg_brightness = features.get("avg_brightness", 100.0)
+        bright_spots_overall = features.get("bright_spot_count", 0)
+        light_dist_uniformity = features.get("light_distribution_uniformity", 0.5)
+        ceiling_likelihood = features.get("ceiling_likelihood", 0.0)
+        thresholds = self.config_manager.lighting_thresholds
+        if (avg_brightness > thresholds.commercial_min_brightness_thresh and
+            bright_spots_overall > thresholds.commercial_min_spots_thresh and
+            (light_dist_uniformity > 0.5 or ceiling_likelihood > 0.4)):
+            refined_confidence = 0.70 + min(0.2, bright_spots_overall * 0.02)  # replaces, rather than raises, the incoming confidence
+            return {
+                "time_of_day": "indoor_commercial",
+                "confidence": refined_confidence
+            }
+        return {"time_of_day": time_of_day, "confidence": confidence}
+    def _apply_special_lighting_refinement(self, time_of_day: str, confidence: float,
+                                          features: Dict[str, Any], is_indoor: bool,
+                                          p365_context: Dict[str, Any],
+                                          diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Refine a lighting label for commercial-indoor and neon/sodium-vapor cases.
+
+        For indoor scenes the commercial refinement is applied first. The
+        neon check then runs only if the (possibly refined) label contains
+        ``night`` or equals ``indoor_dim_warm``.
+
+        Args:
+            time_of_day: Current lighting label.
+            confidence: Confidence of the current label.
+            features: Visual feature dict; reads ``yellow_orange_ratio``
+                (default 0.0), ``bright_spot_count`` (default 0),
+                ``color_atmosphere`` (default ``"neutral"``) and
+                ``avg_saturation`` (default 0.0).
+            is_indoor: Whether the scene was judged indoor.
+            p365_context: Must contain ``mapped_scene``; ``attributes`` is
+                also read (by subscript) once the neon conditions hold.
+            diagnostics: Mutated in place; ``special_lighting_detected`` is
+                set when a neon refinement is applied.
+
+        Returns:
+            Dict with ``time_of_day`` and ``confidence``. A neon refinement
+            never lowers the confidence; it is raised to a per-label floor
+            (0.77 to 0.82).
+        """
+        # Apply commercial refinement for indoor scenes first
+        if is_indoor:
+            commercial_result = self._apply_commercial_indoor_refinement(
+                features, p365_context, time_of_day, confidence
+            )
+            time_of_day = commercial_result["time_of_day"]
+            confidence = commercial_result["confidence"]
+        # Neon/sodium-vapor check only applies to night labels or indoor_dim_warm
+        is_current_night_or_dim_warm = "night" in time_of_day or time_of_day == "indoor_dim_warm"
+        if not is_current_night_or_dim_warm:
+            return {"time_of_day": time_of_day, "confidence": confidence}
+        # Extract features for neon detection
+        yellow_orange_ratio = features.get("yellow_orange_ratio", 0.0)
+        bright_spots_overall = features.get("bright_spot_count", 0)
+        color_atmosphere = features.get("color_atmosphere", "neutral")
+        avg_saturation = features.get("avg_saturation", 0.0)
+        # Get neon detection thresholds
+        thresholds = self.config_manager.lighting_thresholds
+        # Neon lighting requires all four cues: yellow/orange share, bright spots, warm atmosphere, saturation
+        if (yellow_orange_ratio > thresholds.neon_yellow_orange_thresh and
+            bright_spots_overall > thresholds.neon_bright_spots_thresh and
+            color_atmosphere == "warm" and
+            avg_saturation > thresholds.neon_avg_saturation_thresh):
+            old_time_of_day = time_of_day  # kept for the diagnostics message below
+            old_confidence = confidence
+            # Check P365 context for neon scenes
+            mapped_scene = p365_context["mapped_scene"]
+            attributes = p365_context["attributes"]
+            is_p365_neon_context = (
+                any(kw in mapped_scene for kw in ["neon", "nightclub", "bar_neon"]) or
+                "neon" in attributes
+            )
+            if is_indoor:
+                if (is_p365_neon_context or
+                    any(kw in mapped_scene for kw in self.P365_INDOOR_RESTAURANT_KEYWORDS)):
+                    time_of_day = "indoor_neon_lit"
+                    confidence = max(confidence, 0.80)
+                else:
+                    time_of_day = "indoor_dim_warm_neon_accent"
+                    confidence = max(confidence, 0.77)
+            else:
+                if (is_p365_neon_context or
+                    any(kw in mapped_scene for kw in ["street_night", "city_night", "downtown_night"])):
+                    time_of_day = "neon_or_sodium_vapor_night"
+                    confidence = max(confidence, 0.82)
+                else:
+                    time_of_day = "night_with_neon_lights"
+                    confidence = max(confidence, 0.79)
+            # Record the refinement (old and new label/confidence) for diagnostics
+            diagnostics["special_lighting_detected"] = (
+                f"Refined from {old_time_of_day} (Conf:{old_confidence:.2f}) "
+                f"to {time_of_day} (Conf:{confidence:.2f}) due to neon/sodium vapor light characteristics. "
+                f"P365 Context: {mapped_scene if is_p365_neon_context else 'N/A'}."
+            )
+        return {"time_of_day": time_of_day, "confidence": confidence}
+    def _combine_attribute_and_visual_results(self, attribute_result: Dict[str, Any],
+                                             visual_result: Dict[str, Any],
+                                             diagnostics: Dict[str, Any]) -> Dict[str, Any]:
+        """Choose between the Places365 attribute result and the visual result.
+
+        Args:
+            attribute_result: Dict with ``time_of_day`` and ``confidence``
+                derived from Places365 attributes.
+            visual_result: Dict with ``time_of_day`` and ``confidence``
+                derived from visual analysis.
+            diagnostics: Mutated in place; ``final_decision_source`` is
+                always set, and ``p365_attr_overridden_by_visual`` is set
+                when the visual result overrides a differing attribute label.
+
+        Returns:
+            One of the two input dicts, returned as-is (not copied).
+        """
+        # Visual wins when the labels disagree and its confidence exceeds 0.65, regardless of the attribute confidence
+        if (attribute_result["time_of_day"] != visual_result["time_of_day"] and
+            visual_result["confidence"] > 0.65):
+            diagnostics["final_decision_source"] = "Visual features (potentially P365-context-refined)."
+            diagnostics["p365_attr_overridden_by_visual"] = (
+                f"P365 Attr ToD {attribute_result['time_of_day']} "
+                f"(Conf {attribute_result['confidence']:.2f}) was less certain or overridden by "
+                f"visual logic result {visual_result['time_of_day']} (Conf {visual_result['confidence']:.2f})."
+            )
+            return visual_result
+        # Otherwise the more confident result wins; ties go to the attribute result
+        elif attribute_result["confidence"] >= visual_result["confidence"]:
+            diagnostics["final_decision_source"] = "High-confidence P365 attribute."
+            return attribute_result
+        # Use visual result
+        else:
+            diagnostics["final_decision_source"] = "Visual features (potentially P365-context-refined)."
+            return visual_result
+    def _check_home_environment_pattern(self, features: Dict[str, Any]) -> bool:
+        """Return True when ``home_environment_pattern`` exceeds 70% of the moderate home-pattern threshold.
+
+        The feature defaults to 0.0 when absent; the threshold is read from
+        ``self.config_manager.indoor_outdoor_thresholds``.
+        """
+        thresholds = self.config_manager.indoor_outdoor_thresholds
+        return features.get("home_environment_pattern", 0.0) > thresholds.home_pattern_thresh_moderate * 0.7
+    def _check_warm_sunset_conditions(self, features: Dict[str, Any]) -> bool:
+        """Return True when the features show warm sunset/sunrise colouring.
+
+        All three conditions must hold: ``yellow_orange_ratio`` (default 0.0)
+        above ``outdoor_dusk_dawn_color_thresh``, ``color_atmosphere``
+        (default ``"neutral"``) equal to ``"warm"``, and
+        ``sky_region_brightness_ratio`` (default 1.0) below 1.5.
+        """
+        thresholds = self.config_manager.lighting_thresholds
+        yellow_orange_ratio = features.get("yellow_orange_ratio", 0.0)
+        color_atmosphere = features.get("color_atmosphere", "neutral")
+        sky_brightness_ratio = features.get("sky_region_brightness_ratio", 1.0)
+        return (yellow_orange_ratio > thresholds.outdoor_dusk_dawn_color_thresh and
+                color_atmosphere == "warm" and
+                sky_brightness_ratio < 1.5)
+    def _get_default_lighting_result(self) -> Dict[str, Any]:
+        """Return the fallback lighting result used when analysis fails.
+
+        Returns:
+            A new dict with ``time_of_day`` ``"unknown"``, ``confidence`` 0.5
+            and a ``diagnostics`` dict holding a single ``error`` message.
+        """
+        return {
+            "time_of_day": "unknown",
+            "confidence": 0.5,
+            "diagnostics": {
+                "error": "Lighting analysis failed, using default values"
+            }
+        }

llm_enhancer.py CHANGED Viewed

@@ -1,1266 +1,504 @@
-import re
-import os
-import torch
-from typing import Dict, List, Tuple, Any, Optional
 import logging
 class LLMEnhancer:
     """
-    負責使用LLM (Large Language Model) 增強場景理解和描述。
-    未來可以再整合Llama或其他LLM模型進行場景描述的生成和豐富化。
     """
     def __init__(self,
-                model_path: Optional[str] = None,
-                tokenizer_path: Optional[str] = None,
-                device: Optional[str] = None,
-                max_length: int = 2048,
-                temperature: float = 0.3,
-                top_p: float = 0.85):
         """
-        初始化LLM增強器
         Args:
-            model_path: LLM模型的路徑或HuggingFace log in，默認使用Llama 3.2
-            tokenizer_path: token處理器的路徑，通常與model_path相同
-            device: 設備檢查 ('cpu'或'cuda')
-            max_length: 生成文本的最大長度
-            temperature: 生成文本的溫度（較高比較有創意，較低會偏保守）
             top_p: 生成文本時的核心採樣機率閾值
         """
-        self.logger = logging.getLogger("LLMEnhancer")
-        self.logger.setLevel(logging.INFO)
-        handler = logging.StreamHandler()
-        handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
-        self.logger.addHandler(handler)
-        # 默認用 Llama3.2
-        self.model_path = model_path or "meta-llama/Llama-3.2-3B-Instruct"
-        self.tokenizer_path = tokenizer_path or self.model_path
-        # check device
-        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
-        self.logger.info(f"Using device: {self.device}")
-        # create parameters
-        self.max_length = max_length
-        self.temperature = temperature
-        self.top_p = top_p
-        self.model = None
-        self.tokenizer = None
-        # 追蹤模型調用次數
-        self.call_count = 0
-        self._initialize_prompts()
-        # only if need to load the model
-        self._model_loaded = False
         try:
-            self.hf_token = os.environ.get("HF_TOKEN")
-            if self.hf_token:
-                self.logger.info("Logging in to Hugging Face with token")
-                from huggingface_hub import login
-                login(token=self.hf_token)
-            else:
-                self.logger.warning("HF_TOKEN not found in environment variables. Access to gated models may be limited.")
-        except Exception as e:
-            self.logger.error(f"Error during Hugging Face login: {e}")
-    def _load_model(self):
-        """只在首次需要時加載，使用 8 位量化以節省記憶體"""
-        if self._model_loaded:
-            return
-        try:
-            self.logger.info(f"Loading LLM model from {self.model_path} with 8-bit quantization")
-            import torch
-            from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-            torch.cuda.empty_cache()
-            if torch.cuda.is_available():
-                free_in_GB = torch.cuda.get_device_properties(0).total_memory / 1024**3
-                print(f"Total GPU memory: {free_in_GB:.2f} GB")
-            # 設置 8 位元配置(節省記憶體空間)
-            quantization_config = BitsAndBytesConfig(
-                load_in_8bit=True,
-                llm_int8_enable_fp32_cpu_offload=True
-            )
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                self.tokenizer_path,
-                padding_side="left",
-                use_fast=False,
-                token=self.hf_token
             )
-            # 特殊標記
-            self.tokenizer.pad_token = self.tokenizer.eos_token
-            # 加載 8 位量化模型
-            self.model = AutoModelForCausalLM.from_pretrained(
-                self.model_path,
-                quantization_config=quantization_config,
-                device_map="auto",
-                low_cpu_mem_usage=True,
-                token=self.hf_token
-            )
-            self.logger.info("Model loaded successfully with 8-bit quantization")
-            self._model_loaded = True
-        except Exception as e:
-            self.logger.error(f"Error loading LLM model: {e}")
-            import traceback
-            traceback.print_exc()
-            raise
-    def _initialize_prompts(self):
-        """Return an optimized prompt template specifically for Zephyr model"""
-        # the critical prompt for the model
-        self.enhance_description_template = """
-            <|system|>
-            You are an expert visual analyst. Your task is to improve the readability and fluency of scene descriptions using STRICT factual accuracy.
-            Your **top priority is to avoid hallucination** or fabrication. You are working in a computer vision pipeline using object detection (YOLO) and image embeddings. You MUST treat the input object list as a whitelist. Do not speculate beyond this list.
-            </|system|>
-            <|user|>
-            Rewrite the following scene description to be fluent and clear. DO NOT add any objects, events, or spatial relationships that are not explicitly present in the original or object list.
-            ORIGINAL:
-            {original_description}
-            CRITICAL RULES:
-            1. NEVER assume room type, object function, or scene purpose unless directly stated.
-            2. NEVER invent object types. You are limited to: {object_list}
-            3. NEVER speculate on object quantity. If the description says "10 people" , DO NOT say "dozens" or "many". Maintain the original quantity unless specified.
-            4. Use terms like "in the scene", "visible in the background", or "positioned in the lower left" instead of assuming direction or layout logic.
-            5. You MAY describe confirmed materials, colors, and composition style if visually obvious and non-speculative.
-            6. Write 2–4 complete, well-structured sentences with punctuation.
-            7. Final output MUST be a single fluent paragraph of 60–200 words (not longer).
-            8. Begin your response directly with the scene description. Do NOT include any introductory phrases, explanations, or formatting indicators.
-            9. Ensure grammatical completeness in all sentences. Each sentence must have a complete subject and predicate structure.
-            10. Vary sentence structures naturally while maintaining grammatical accuracy. Avoid incomplete phrases or dangling modifiers.
-            11. Limit repetition of descriptive verbs and spatial indicators to maintain text diversity and readability.
-            12. Create natural spatial flow by connecting object descriptions organically rather than listing positions mechanically.
-            13. Use transitional phrases to connect ideas smoothly, varying expression patterns throughout the description.
-            14. End with a conclusive observation about atmosphere, style, or overall impression rather than restating layout information.
-            15. When describing quantities or arrangements, use only information explicitly confirmed by the object detection system.
-            </|user|>
-            <|assistant|>
-            """
-        # 錯誤檢測的prompt
-        self.verify_detection_template = """
-            Task: You are an advanced vision system that verifies computer vision detections for accuracy.
-            Analyze the following detection results and identify any potential errors or inconsistencies:
-            SCENE TYPE: {scene_type}
-            SCENE NAME: {scene_name}
-            CONFIDENCE: {confidence:.2f}
-            DETECTED OBJECTS: {detected_objects}
-            CLIP ANALYSIS RESULTS:
-            {clip_analysis}
-            Possible Errors to Check:
-            1. Objects misidentified (e.g., architectural elements labeled as vehicles)
-            2. Cultural elements misunderstood (e.g., Asian temple structures labeled as boats)
-            3. Objects that seem out of place for this type of scene
-            4. Inconsistencies between different detection systems
-            If you find potential errors, list them clearly with explanations. If the detections seem reasonable, state that they appear accurate.
-            Verification Results:
-            """
-        # 無檢測處理的prompt
-        self.no_detection_template = """
-            Task: You are an advanced scene understanding system analyzing an image where standard object detection failed to identify specific objects.
-            Based on advanced image embeddings (CLIP analysis), we have the following information:
-            MOST LIKELY SCENE: {top_scene} (confidence: {top_confidence:.2f})
-            VIEWPOINT: {viewpoint}
-            LIGHTING: {lighting_condition}
-            CULTURAL ANALYSIS: {cultural_analysis}
-            Create a detailed description of what might be in this scene, considering:
-            1. The most likely type of location or setting
-            2. Possible architectural or natural elements present
-            3. The lighting and atmosphere
-            4. Potential cultural or regional characteristics
-            Your description should be natural, flowing, and offer insights into what the image likely contains despite the lack of specific object detection.
-            Scene Description:
-            """
-    def _clean_llama_response(self, response: str) -> str:
-        """處理 Llama 模型特有的輸出格式問題"""
-        # 首先應用通用清理
-        response = self._clean_model_response(response)
-        # 移除 Llama 常見的前綴短語
-        prefixes_to_remove = [
-            "Here's the enhanced description:",
-            "Enhanced description:",
-            "Here is the enhanced scene description:",
-            "I've enhanced the description while preserving all factual details:"
-        ]
-        for prefix in prefixes_to_remove:
-            if response.lower().startswith(prefix.lower()):
-                response = response[len(prefix):].strip()
-        # 移除可能的後綴說明
-        suffixes_to_remove = [
-            "I've maintained all the key factual elements",
-            "I've preserved all the factual details",
-            "All factual elements have been maintained"
-        ]
-        for suffix in suffixes_to_remove:
-            if response.lower().endswith(suffix.lower()):
-                response = response[:response.rfind(suffix)].strip()
-        return response
-    # For Future Usage
-    def _detect_scene_type(self, detected_objects: List[Dict]) -> str:
-        """
-        Detect scene type based on object distribution and patterns
-        """
-        # Default scene type
-        scene_type = "intersection"
-        # Count objects by class
-        object_counts = {}
-        for obj in detected_objects:
-            class_name = obj.get("class_name", "")
-            if class_name not in object_counts:
-                object_counts[class_name] = 0
-            object_counts[class_name] += 1
-        # 辨識人
-        people_count = object_counts.get("person", 0)
-        # 交通工具的
-        car_count = object_counts.get("car", 0)
-        bus_count = object_counts.get("bus", 0)
-        truck_count = object_counts.get("truck", 0)
-        total_vehicles = car_count + bus_count + truck_count
-        # Simple scene type detection logic
-        if people_count > 8 and total_vehicles < 2:
-            scene_type = "pedestrian_crossing"
-        elif people_count > 5 and total_vehicles > 2:
-            scene_type = "busy_intersection"
-        elif people_count < 3 and total_vehicles > 3:
-            scene_type = "traffic_junction"
-        return scene_type
-    def _clean_scene_type(self, scene_type: str) -> str:
-        """清理場景類型，使其更適合用於提示詞"""
-        if not scene_type:
-            return "scene"
-        # replace underline to space or sometime capital letter
-        if '_' in scene_type:
-            return ' '.join(word.capitalize() for word in scene_type.split('_'))
-        return scene_type
-    def _clean_model_response(self, response: str) -> str:
-        """清理模型回應以移除常見的標記和前綴"""
-        # 移除任何可能殘留的系統樣式標記
-        response = re.sub(r'<\|.*?\|>', '', response)
-        # 移除任何 "This european_plaza" 或類似前綴
-        response = re.sub(r'^This [a-z_]+\s+', '', response)
-        # 確保響應以大寫字母開頭
-        if response and not response[0].isupper():
-            response = response[0].upper() + response[1:]
-        return response.strip()
-    def reset_context(self):
-        """在處理新圖像前重置模型上下文"""
-        if self._model_loaded:
-            # 清除 GPU 緩存
-            torch.cuda.empty_cache()
-            self.logger.info("Model context reset")
-        else:
-            self.logger.info("Model not loaded, no context to reset")
-    def _remove_introduction_sentences(self, response: str) -> str:
-        """remove introduction sentences"""
-        # 識別常見的介紹性模式
-        intro_patterns = [
-            r'^Here is the (?:rewritten|enhanced) .*?description:',
-            r'^The (?:rewritten|enhanced) description:',
-            r'^Here\'s the (?:rewritten|enhanced) description of .*?:'
-        ]
-        for pattern in intro_patterns:
-            if re.match(pattern, response, re.IGNORECASE):
-                # 找到冒號後的內容
-                parts = re.split(r':', response, 1)
-                if len(parts) > 1:
-                    return parts[1].strip()
-        return response
-    def enhance_description(self, scene_data: Dict[str, Any]) -> str:
-        """場景描述增強器，處理各種場景類型並保留視角與光照資訊，並作為總窗口可運用於其他class"""
-        try:
-            # 重置上下文
-            self.reset_context()
-            # 確保模型已加載
-            if not self._model_loaded:
-                self._load_model()
-            # extract original description
-            original_desc = scene_data.get("original_description", "")
-            if not original_desc:
-                return "No original description provided."
-            # get scene type 並標準化
-            scene_type = scene_data.get("scene_type", "unknown scene")
-            scene_type = self._clean_scene_type(scene_type)
-            # 提取檢測到的物件並過濾低信心度物件
-            detected_objects = scene_data.get("detected_objects", [])
-            filtered_objects = []
-            # 高信心度閾值，嚴格過濾物件
             high_confidence_threshold = 0.65
-            for obj in detected_objects:
-                confidence = obj.get("confidence", 0)
-                class_name = obj.get("class_name", "")
-                # 為特殊類別設置更高閾值
-                special_classes = ["airplane", "helicopter", "boat"]
-                if class_name in special_classes:
-                    if confidence < 0.75:  # 為這些類別設置更高閾值
-                        continue
-                # 只保留高信心度物件
-                if confidence >= high_confidence_threshold:
-                    filtered_objects.append(obj)
-            # 優先使用��入的物體統計信息，如果不存在則計算
             object_statistics = scene_data.get("object_statistics", {})
             object_counts = {}
             if object_statistics:
-                # 使用預計算的統計資訊，確保數量準確
                 for class_name, stats in object_statistics.items():
                     if stats.get("count", 0) > 0 and stats.get("avg_confidence", 0) >= high_confidence_threshold:
                         object_counts[class_name] = stats["count"]
             else:
                 # 回退到原有的計算方式
                 for obj in filtered_objects:
                     class_name = obj.get("class_name", "")
                     if class_name not in object_counts:
                         object_counts[class_name] = 0
                     object_counts[class_name] += 1
-            # 將物件格式化為更精確的描述
-            high_confidence_objects = ", ".join([
                 f"{count} {obj}{'s' if count > 1 else ''}"
                 for obj, count in object_counts.items()
             ])
-            # 如果沒有高信心度物件，回退到使用原始描述中的關鍵詞
-            if not high_confidence_objects:
-                # 從原始描述中提取物件提及
-                object_keywords = self._extract_objects_from_description(original_desc)
-                high_confidence_objects = ", ".join(object_keywords) if object_keywords else "objects visible in the scene"
-            # 保留原始描述中的關鍵視角信息
-            perspective = self._extract_perspective_from_description(original_desc)
-            # 提取光照資訊
-            lighting_description = "unknown lighting"
-            if "lighting_info" in scene_data:
-                lighting_info = scene_data.get("lighting_info", {})
-                time_of_day = lighting_info.get("time_of_day", "unknown")
-                is_indoor = lighting_info.get("is_indoor", False)
-                lighting_description = f"{'indoor' if is_indoor else 'outdoor'} {time_of_day} lighting"
-            # 創建prompt，整合所有關鍵資訊
-            prompt = self.enhance_description_template.format(
-                scene_type=scene_type,
-                object_list=high_confidence_objects,
-                original_description=original_desc,
-                perspective=perspective,
-                lighting_description=lighting_description
-            )
-            # 生成增強描述
-            self.logger.info("Generating LLM response...")
-            response = self._generate_llm_response(prompt)
-            # 檢查回應完整性的更嚴格標準
-            is_landmark_only = (
-                    scene_data.get("scene_type") in ["tourist_landmark", "natural_landmark", "historical_monument"] and
-                    (not scene_data.get("detected_objects") or len(scene_data.get("detected_objects", [])) <= 1)
-                )
-            # 如果是只有地標的情況，調整相關邏輯
-            if is_landmark_only:
-                # 確保原始描述不為空
-                original_desc = scene_data.get("original_description", "")
-                if not original_desc or len(original_desc.strip()) < 10:
-                    # 從場景類型和地標信息生成基本描述
-                    scene_type = scene_data.get("scene_type", "unknown")
-                    scene_name = scene_data.get("scene_name", "Unknown")
-                    if "primary_landmark" in scene_data:
-                        landmark_name = scene_data["primary_landmark"].get("name", "unnamed landmark")
-                        original_desc = f"A {scene_type.replace('_', ' ')} scene featuring {landmark_name}."
-                    else:
-                        original_desc = f"A {scene_type.replace('_', ' ')} scene."
-                    # 更新場景數據
-                    scene_data["original_description"] = original_desc
-            # 檢查回應完整性的更嚴格標準 (保持不變)
-            is_incomplete = (
-                len(response) < 100 or  # too short
-                (len(response) < 200 and "." not in response[-30:]) or  # 結尾沒有適當的標點符號
-                any(response.endswith(phrase) for phrase in ["in the", "with the", "and the"])  # 以不完整短語結尾
-            )
             max_retries = 3
             attempts = 0
-            while attempts < max_retries and is_incomplete:
-                self.logger.warning(f"Generated incomplete response, retrying... Attempt {attempts+1}/{max_retries}")
                 # 重新生成
-                response = self._generate_llm_response(prompt)
                 attempts += 1
-                # 重新檢查完整性
-                is_incomplete = (len(response) < 100 or
-                                (len(response) < 200 and "." not in response[-30:]) or
-                                any(response.endswith(phrase) for phrase in ["in the", "with the", "and the"]))
             if not response or len(response.strip()) < 10:
                 self.logger.warning("Generated response was empty or too short, returning original description")
                 return original_desc
-            # 使用與模型相符的清理方法
-            if "llama" in self.model_path.lower():
-                result = self._clean_llama_response(response)
-            else:
-                result = self._clean_model_response(response)
-            # 移除介紹性type句子
-            result = self._remove_introduction_sentences(result)
-            # 移除explanation
-            result = self._remove_explanatory_notes(result)
-            # fact check
-            result = self._verify_factual_accuracy(original_desc, result, high_confidence_objects)
-            # 確保場景類型和視角一致性
-            result = self._ensure_scene_type_consistency(result, scene_type, original_desc)
-            if perspective and perspective.lower() not in result.lower():
-                result = f"{perspective}, {result[0].lower()}{result[1:]}"
-            final_result = str(result)
-            if not final_result or len(final_result.strip()) < 20:
-                self.logger.warning(f"WARNING: LLM enhanced description is empty or too short!")
-                self.logger.info(f"Original description: {original_desc[:50]}...")
-                self.logger.info(f"Input data: scene_type={scene_data.get('scene_type')}, objects={len(scene_data.get('detected_objects', []))}")
-            else:
-                self.logger.info(f"LLM enhanced description generated successfully ({len(final_result)} chars)")
-            return final_result
         except Exception as e:
-            self.logger.error(f"Enhancement failed: {str(e)}")
-            import traceback
-            self.logger.error(traceback.format_exc())
-            return original_desc  # 發生任何錯誤時返回原始描述
-    def _verify_factual_accuracy(self, original: str, generated: str, object_list: str) -> str:
-        """驗證生成的描述不包含原始描述或物體列表中沒有的信息，並檢測重複用詞問題"""
-        # 將原始描述和物體列表合併為授權詞彙源
-        authorized_content = original.lower() + " " + object_list.lower()
-        # 提取生成描述中具有實質意義的名詞
-        # 創建常見地點、文化和地域詞彙的列表
-        location_terms = ["plaza", "square", "market", "mall", "avenue", "boulevard"]
-        cultural_terms = ["european", "asian", "american", "african", "western", "eastern"]
-        # 檢查生成文本中的每個詞
-        for term in location_terms + cultural_terms:
-            # 僅當該詞出現在生成文本但不在授權內容中時進行替換
-            if term in generated.lower() and term not in authorized_content:
-                # 根據詞語類型選擇適當的替換詞
-                if term in location_terms:
-                    replacement = "area"
-                else:
-                    replacement = "scene"
-                # 使用正則表達式進行完整詞匹配替換
-                pattern = re.compile(r'\b' + term + r'\b', re.IGNORECASE)
-                generated = pattern.sub(replacement, generated)
-        # 檢查描述性詞彙重複問題
-        repetitive_patterns = [
-            (r'\b(visible)\b.*?\b(visible)\b', 'Multiple uses of "visible" detected'),
-            (r'\b(positioned)\b.*?\b(positioned)\b', 'Multiple uses of "positioned" detected'),
-            (r'\b(located)\b.*?\b(located)\b', 'Multiple uses of "located" detected'),
-            (r'\b(situated)\b.*?\b(situated)\b', 'Multiple uses of "situated" detected'),
-            (r'\b(appears)\b.*?\b(appears)\b', 'Multiple uses of "appears" detected'),
-            (r'\b(features)\b.*?\b(features)\b', 'Multiple uses of "features" detected'),
-            (r'\bThis\s+(\w+)\s+.*?\bThis\s+\1\b', 'Repetitive sentence structure detected')
-        ]
-        # 定義替換詞典，提供多樣化的表達方式
-        replacement_dict = {
-            'visible': ['present', 'evident', 'apparent', 'observable'],
-            'positioned': ['arranged', 'placed', 'set', 'organized'],
-            'located': ['found', 'placed', 'situated', 'established'],
-            'situated': ['placed', 'positioned', 'arranged', 'set'],
-            'appears': ['seems', 'looks', 'presents', 'exhibits'],
-            'features': ['includes', 'contains', 'displays', 'showcases']
-        }
-        for pattern, issue in repetitive_patterns:
-            matches = list(re.finditer(pattern, generated, re.IGNORECASE | re.DOTALL))
-            if matches:
-                self.logger.warning(f"Text quality issue detected: {issue}")
-                # 針對特定重複詞彙進行替換
-                for word in replacement_dict.keys():
-                    if word in issue.lower():
-                        word_pattern = re.compile(r'\b' + word + r'\b', re.IGNORECASE)
-                        word_matches = list(word_pattern.finditer(generated))
-                        # 保留第一次出現，替換後續出現
-                        for i, match in enumerate(word_matches[1:], 1):
-                            if i <= len(replacement_dict[word]):
-                                replacement = replacement_dict[word][(i-1) % len(replacement_dict[word])]
-                                # 保持原始大小寫格式
-                                if match.group().isupper():
-                                    replacement = replacement.upper()
-                                elif match.group().istitle():
-                                    replacement = replacement.capitalize()
-                                # 執行替換
-                                generated = generated[:match.start()] + replacement + generated[match.end():]
-                                # 重新計算後續匹配位置
-                                word_matches = list(word_pattern.finditer(generated))
-                        break
-        return generated
     def verify_detection(self,
                        detected_objects: List[Dict],
                        clip_analysis: Dict[str, Any],
                        scene_type: str,
                        scene_name: str,
                        confidence: float) -> Dict[str, Any]:
         """
         Ask the LLM to verify (and possibly question) the YOLO detection results.

         Args:
             detected_objects: Objects detected by YOLO
             clip_analysis: CLIP analysis results
             scene_type: Identified scene type
             scene_name: Scene name
             confidence: Confidence of the scene classification

         Returns:
             Dict: "verification_text" (raw LLM answer), "has_errors" (True unless
             the answer contains "appear accurate", case-insensitively) and
             "corrected_objects" (always None here)
         """
         # Make sure the model is available before prompting.
         self._load_model()

         # Fill the template; the two formatter calls run in the same order as
         # the keyword arguments are listed.
         prompt = self.verify_detection_template.format(
             scene_type=scene_type,
             scene_name=scene_name,
             confidence=confidence,
             detected_objects=self._format_objects_for_prompt(detected_objects),
             clip_analysis=self._format_clip_results(clip_analysis)
         )

         verification_text = self._generate_llm_response(prompt)
         judged_accurate = "appear accurate" in verification_text.lower()

         return {
             "verification_text": verification_text,
             "has_errors": not judged_accurate,
             "corrected_objects": None
         }
-    def _remove_explanatory_notes(self, response: str) -> str:
-        """移除解釋性注釋、說明和其他非描述性內容"""
-        # 識別常見的注釋和解釋模式
-        note_patterns = [
-            r'(?:^|\n)Note:.*?(?:\n|$)',
-            r'(?:^|\n)I have (?:followed|adhered to|ensured).*?(?:\n|$)',
-            r'(?:^|\n)This description (?:follows|adheres to|maintains).*?(?:\n|$)',
-            r'(?:^|\n)The enhanced description (?:maintains|preserves).*?(?:\n|$)'
-        ]
-        # 尋找第一段完整的描述內容
-        paragraphs = [p.strip() for p in response.split('\n\n') if p.strip()]
-        # 如果只有一個段落，檢查並清理它
-        if len(paragraphs) == 1:
-            for pattern in note_patterns:
-                paragraphs[0] = re.sub(pattern, '', paragraphs[0], flags=re.IGNORECASE)
-            return paragraphs[0].strip()
-        # 如果有多個段落，識別並移除注釋段落
-        content_paragraphs = []
-        for paragraph in paragraphs:
-            is_note = False
-            for pattern in note_patterns:
-                if re.search(pattern, paragraph, flags=re.IGNORECASE):
-                    is_note = True
-                    break
-            # 檢查段落是否以常見的注釋詞開頭
-            if paragraph.lower().startswith(('note:', 'please note:', 'remember:')):
-                is_note = True
-            if not is_note:
-                content_paragraphs.append(paragraph)
-        # 返回清理後的內容
-        return '\n\n'.join(content_paragraphs).strip()
     def handle_no_detection(self, clip_analysis: Dict[str, Any]) -> str:
         """
         Describe a scene for which YOLO detected no objects, using CLIP results only.

         Args:
             clip_analysis: CLIP analysis results

         Returns:
             str: The generated scene description after cleaning
         """
         # Make sure the model is available before prompting.
         self._load_model()

         def _label(key: str, default: str):
             """Return the first element of the (label, score) pair stored under ``key``."""
             return clip_analysis.get(key, (default, 0))[0]

         # Pull the CLIP results that the template needs.
         top_scene, top_confidence = clip_analysis.get("top_scene", ("unknown", 0))
         viewpoint = _label("viewpoint", "standard")
         lighting = _label("lighting_condition", "unknown")
         cultural_str = self._format_cultural_analysis(clip_analysis.get("cultural_analysis", {}))

         prompt = self.no_detection_template.format(
             top_scene=top_scene,
             top_confidence=top_confidence,
             viewpoint=viewpoint,
             lighting_condition=lighting,
             cultural_analysis=cultural_str
         )

         # Generate, then tidy up the raw output.
         raw_description = self._generate_llm_response(prompt)
         return self._clean_llm_response(raw_description)
-    def _clean_input_text(self, text: str) -> str:
         """
-        對輸入文本進行通用的格式清理，處理常見的格式問題。
-        Args:
-            text: 輸入文本
         Returns:
-            清理後的文本
         """
-        if not text:
-            return ""
-        # 清理格式的問題
-        # 1. 處理連續標點符號問題
-        text = re.sub(r'([.,;:!?])\1+', r'\1', text)
-        # 2. 修復不完整句子的標點（如 "Something," 後沒有繼續接續下去）
-        text = re.sub(r',\s*$', '.', text)
-        # 3. 修復如 "word." 後未加空格即接下一句的問題
-        text = re.sub(r'([.!?])([A-Z])', r'\1 \2', text)
-        # 4. 移除多餘空格
-        text = re.sub(r'\s+', ' ', text).strip()
-        # 5. 確保句子正確結束（句尾加句號）
-        if text and not text[-1] in '.!?':
-            text += '.'
-        return text
-    def _fact_check_description(self, original_desc: str, enhanced_desc: str, scene_type: str, detected_objects: List[str]) -> str:
         """
-        驗證並可能修正增強後的描述，確保有保持事實準確性。
-        Args:
-            original_desc: 原始場景描述
-            enhanced_desc: 增強後的描述待驗證
-            scene_type: 場景類型
-            detected_objects: 檢測到的物體名稱列表
         Returns:
-            經過事實檢查的描述
         """
-        # 如果增強描述為空或太短，返回原始描述
-        if not enhanced_desc or len(enhanced_desc) < 30:
-            return original_desc
-        # 1. 檢查數值一致性（如人數、物體數量等）
-        # 從原始描述中提取數字和相關名詞
-        number_patterns = [
-            (r'(\d+)\s+(people|person|pedestrians|individuals)', r'\1', r'\2'), # 人數
-            (r'(\d+)\s+(cars|vehicles|automobiles)', r'\1', r'\2'),            # 車輛數
-            (r'(\d+)\s+(buildings|structures)', r'\1', r'\2')                  # 建築數
-        ]
-        # 檢查原始描述中的每個數字
-        for pattern, num_group, word_group in number_patterns:
-            original_matches = re.finditer(pattern, original_desc, re.IGNORECASE)
-            for match in original_matches:
-                number = match.group(1)
-                noun = match.group(2)
-                # 檢查增強描述中是否保留了這個數字
-                # 創建一個更通用的模式來檢查增強描述中是否包含此數字和對象類別
-                enhanced_pattern = r'(\d+)\s+(' + re.escape(noun) + r'|' + re.escape(noun.rstrip('s')) + r'|' + re.escape(noun + 's') + r')'
-                enhanced_matches = list(re.finditer(enhanced_pattern, enhanced_desc, re.IGNORECASE))
-                if not enhanced_matches:
-                    # 數字+名詞未在增強描述中找到
-                    plural_form = noun if noun.endswith('s') or number == '1' else noun + 's'
-                    if enhanced_desc.startswith("This") or enhanced_desc.startswith("The"):
-                        enhanced_desc = enhanced_desc.replace("This ", f"This scene with {number} {plural_form} ", 1)
-                        enhanced_desc = enhanced_desc.replace("The ", f"The scene with {number} {plural_form} ", 1)
-                    else:
-                        enhanced_desc = f"The scene includes {number} {plural_form}. " + enhanced_desc
-                elif enhanced_matches and match.group(1) != number:
-                    # 存在但數字不一致，就要更正數字
-                    for ematch in enhanced_matches:
-                        wrong_number = ematch.group(1)
-                        enhanced_desc = enhanced_desc.replace(f"{wrong_number} {ematch.group(2)}", f"{number} {ematch.group(2)}")
-        # 2. 檢查視角的一致性
-        perspective_terms = {
-            "aerial": ["aerial", "bird's-eye", "overhead", "top-down", "above", "looking down"],
-            "ground": ["street-level", "ground level", "eye-level", "standing"],
-            "indoor": ["inside", "interior", "indoor", "within"],
-            "close-up": ["close-up", "detailed view", "close shot"]
-        }
-        # 確定原始視角
-        original_perspective = None
-        for persp, terms in perspective_terms.items():
-            if any(term in original_desc.lower() for term in terms):
-                original_perspective = persp
-                break
-        # 檢查是否保留了視角方面
-        if original_perspective:
-            enhanced_has_perspective = any(term in enhanced_desc.lower() for term in perspective_terms[original_perspective])
-            if not enhanced_has_perspective:
-                # 添加之前缺的視角方面
-                perspective_prefixes = {
-                    "aerial": "From an aerial perspective, ",
-                    "ground": "From street level, ",
-                    "indoor": "In this indoor setting, ",
-                    "close-up": "In this close-up view, "
-                }
-                prefix = perspective_prefixes.get(original_perspective, "")
-                if prefix:
-                    if enhanced_desc[0].isupper():
-                        enhanced_desc = prefix + enhanced_desc[0].lower() + enhanced_desc[1:]
-                    else:
-                        enhanced_desc = prefix + enhanced_desc
-        # 3. 檢查場景類型一致性
-        if scene_type and scene_type.lower() != "unknown" and scene_type.lower() not in enhanced_desc.lower():
-            # 添加場景類型
-            if enhanced_desc.startswith("This ") or enhanced_desc.startswith("The "):
-                # 避免產生 "This scene" 和 "This intersection" 的重複
-                if "scene" in enhanced_desc[:15].lower():
-                    fixed_type = scene_type.lower()
-                    enhanced_desc = enhanced_desc.replace("scene", fixed_type, 1)
-                else:
-                    enhanced_desc = enhanced_desc.replace("This ", f"This {scene_type} ", 1)
-                    enhanced_desc = enhanced_desc.replace("The ", f"The {scene_type} ", 1)
-            else:
-                enhanced_desc = f"This {scene_type} " + enhanced_desc
-        # 4. 確保文字長度適當，這邊的限制要與prompt相同,否則會產生矛盾
-        words = enhanced_desc.split()
-        if len(words) > 200:
-            # 找尋接近字數限制的句子結束處
-            truncated = ' '.join(words[:200])
-            last_period = max(truncated.rfind('.'), truncated.rfind('!'), truncated.rfind('?'))
-            if last_period > 0:
-                enhanced_desc = truncated[:last_period+1]
-            else:
-                enhanced_desc = truncated + '.'
-        return enhanced_desc
-    def _extract_perspective_from_description(self, description: str) -> str:
-        """從原始描述中提取視角/透視信息"""
-        perspective_terms = {
-            "aerial": ["aerial perspective", "aerial view", "bird's-eye view", "overhead view", "from above"],
-            "ground": ["ground level", "eye level", "street level"],
-            "indoor": ["indoor setting", "inside", "interior"]
-        }
-        for persp_type, terms in perspective_terms.items():
-            for term in terms:
-                if term.lower() in description.lower():
-                    return term
-        return ""
-    def _extract_objects_from_description(self, description: str) -> List[str]:
-        """從原始描述中提取物件提及"""
-        # 常見物件正則表達式模式
-        object_patterns = [
-            r'(\d+)\s+(people|persons|pedestrians|individuals)',
-            r'(\d+)\s+(cars|vehicles|automobiles)',
-            r'(\d+)\s+(buildings|structures)',
-            r'(\d+)\s+(plants|potted plants|flowers)',
-            r'(\d+)\s+(beds|furniture|tables|chairs)'
-        ]
-        extracted_objects = []
-        for pattern in object_patterns:
-            matches = re.finditer(pattern, description, re.IGNORECASE)
-            for match in matches:
-                number = match.group(1)
-                object_type = match.group(2)
-                extracted_objects.append(f"{number} {object_type}")
-        return extracted_objects
-    def _ensure_scene_type_consistency(self, description: str, scene_type: str, original_desc: str) -> str:
-        """確保描述中的場景類型與指定的場景類型一致"""
-        # 禁止使用的錯誤場景詞列表
-        prohibited_scene_words = ["plaza", "square", "european", "asian", "american"]
-        # 檢查是否包含禁止的場景詞
-        for word in prohibited_scene_words:
-            if word in description.lower() and word not in original_desc.lower() and word not in scene_type.lower():
-                # 替換錯誤場景詞為正確場景類型
-                pattern = re.compile(r'\b' + word + r'\b', re.IGNORECASE)
-                description = pattern.sub(scene_type, description)
-        # 確保場景類型在描述中被提及
-        if scene_type.lower() not in description.lower():
-            # 尋找通用場景詞並替換
-            for general_term in ["scene", "area", "place", "location"]:
-                if general_term in description.lower():
-                    pattern = re.compile(r'\b' + general_term + r'\b', re.IGNORECASE)
-                    description = pattern.sub(scene_type, description, count=1)
-                    break
-            else:
-                # 如果沒有找到通用詞，在開頭添加場景類型
-                if description.startswith("The "):
-                    description = description.replace("The ", f"The {scene_type} ", 1)
-                elif description.startswith("This "):
-                    description = description.replace("This ", f"This {scene_type} ", 1)
-                else:
-                    description = f"This {scene_type} " + description
-        return description
-    def _generate_llm_response(self, prompt: str) -> str:
-        """生成 LLM 的回應"""
-        self._load_model()
         try:
-            self.call_count += 1
-            self.logger.info(f"LLM call #{self.call_count}")
-            # 清除 GPU 緩存
-            torch.cuda.empty_cache()
-            # 設置固定種子以提高一致性
-            torch.manual_seed(42)
-            # 準備輸入
-            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=self.max_length).to(self.device)
-            # 根據模型類型調整參數
-            generation_params = {
-                "max_new_tokens": 120,
-                "pad_token_id": self.tokenizer.eos_token_id,
-                "attention_mask": inputs.attention_mask,
-                "use_cache": True,
-            }
-            # 為 Llama 模型設置特定參數
-            if "llama" in self.model_path.lower():
-                generation_params.update({
-                    "temperature": 0.35,        # 不要太高, 否則模型可能會太有主觀意見
-                    "max_new_tokens": 600,
-                    "do_sample": True,
-                    "top_p": 0.75,
-                    "repetition_penalty": 1.5,  # 重複的懲罰權重,可避免掉重複字
-                    "num_beams": 5 ,
-                    "length_penalty": 1,
-                    "no_repeat_ngram_size": 3
-                })
-            else:
-                # 如果用其他模型的參數
-                generation_params.update({
-                    "temperature": 0.6,
-                    "max_new_tokens": 300,
-                    "top_p": 0.9,
-                    "do_sample": True,
-                    "num_beams": 1,
-                    "repetition_penalty": 1.05
-                })
-            # 生成回應
-            with torch.no_grad():
-                outputs = self.model.generate(inputs.input_ids, **generation_params)
-            # 解碼完整輸出
-            full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-            # 提取生成的響應部分
-            assistant_tag = "<|assistant|>"
-            if assistant_tag in full_response:
-                response = full_response.split(assistant_tag)[-1].strip()
-                # 檢查是否有未閉合的 <|assistant|>
-                user_tag = "<|user|>"
-                if user_tag in response:
-                    response = response.split(user_tag)[0].strip()
-            else:
-                # 移除輸入提示
-                input_text = self.tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)
-                response = full_response
-                if response.startswith(input_text):
-                    response = response[len(input_text):].strip()
-            # 確保不返回空的回應
-            if not response or len(response.strip()) < 10:
-                self.logger.warning("response is too short or empty")
-                return "No detailed description could be generated."
-            return response
         except Exception as e:
-            self.logger.error(f"生成 LLM 響應時出錯: {str(e)}")
-            import traceback
-            self.logger.error(traceback.format_exc())
-            return "Unable to generate enhanced description."
-    def _clean_llm_response(self, response: str) -> str:
-        """
-        Clean the LLM response to ensure the output contains only clean descriptive text.
-        Sometimes it will not only display the description but display tags, notes...etc
-        Args:
-            response: Original response from the LLM
-        Returns:
-            Cleaned description text
-        """
-        if not response:
-            return ""
-        # Save original response as backup
-        original_response = response
-        # 1. Extract content between markers (if present)
-        output_start = response.find("[OUTPUT_START]")
-        output_end = response.find("[OUTPUT_END]")
-        if output_start != -1 and output_end != -1 and output_end > output_start:
-            response = response[output_start + len("[OUTPUT_START]"):output_end].strip()
-        # 2. Remove all remaining section markers and instructions
-        section_markers = [
-            r'\[.*?\]',                      # [any text]
-            r'OUTPUT_START\s*:|OUTPUT_END\s*:',  # OUTPUT_START: or OUTPUT_END:
-            r'ENHANCED DESCRIPTION\s*:',      # ENHANCED DESCRIPTION:
-            r'Scene Type\s*:.*?(?=\n|$)',    # Scene Type: text
-            r'Original Description\s*:.*?(?=\n|$)', # Original Description: text
-            r'GOOD\s*:|BAD\s*:',             # GOOD: or BAD:
-            r'PROBLEM\s*:.*?(?=\n|$)',       # PROBLEM: text
-            r'</?\|(?:assistant|system|user)\|>',  # Dialog markers
-            r'\(Note:.*?\)',                 # Notes in parentheses
-            r'\(.*?I\'ve.*?\)',              # Common explanatory content
-            r'\(.*?as per your request.*?\)' # References to instructions
-        ]
-        for marker in section_markers:
-            response = re.sub(marker, '', response, flags=re.IGNORECASE)
-        # 2.5. Deal with Here is...
-        intro_prefixes = [
-            r'^Here\s+is\s+(?:a\s+|the\s+)?(?:rewritten\s+|enhanced\s+)?scene\s+description.*?:\s*',
-            r'^The\s+(?:rewritten\s+|enhanced\s+)?(?:scene\s+)?description\s+is.*?:\s*',
-            r'^Here\'s\s+(?:a\s+|the\s+)?(?:rewritten\s+|enhanced\s+)?description.*?:\s*'
-        ]
-        for prefix_pattern in intro_prefixes:
-            response = re.sub(prefix_pattern, '', response, flags=re.IGNORECASE)
-        # 3. Remove common prefixes and suffixes
-        prefixes_to_remove = [
-            "Enhanced Description:",
-            "Scene Description:",
-            "Description:",
-            "Here is the enhanced description:",
-            "Here's the enhanced description:",
-            "Here is a rewritten scene description that adheres to the provided critical rules:",
-            "Here is the rewritten scene description:",
-            "Here's a rewritten scene description:",
-            "The rewritten scene description is as follows:"
-        ]
-        for prefix in prefixes_to_remove:
-            if response.lower().startswith(prefix.lower()):
-                response = response[len(prefix):].strip()
-        # 4. Remove any Context tags or text containing Context
-        response = re.sub(r'<\s*Context:.*?>', '', response)
-        response = re.sub(r'Context:.*?(?=\n|$)', '', response)
-        response = re.sub(r'Note:.*?(?=\n|$)', '', response, flags=re.IGNORECASE)
-        # 5. Clean improper scene type references
-        scene_type_pattern = r'This ([a-zA-Z_]+) (features|shows|displays|contains)'
-        match = re.search(scene_type_pattern, response)
-        if match and '_' in match.group(1):
-            fixed_text = f"This scene {match.group(2)}"
-            response = re.sub(scene_type_pattern, fixed_text, response)
-        # 6. Reduce dash usage for more natural punctuation
-        response = re.sub(r'—', ', ', response)
-        response = re.sub(r' - ', ', ', response)
-        # 7. Remove excess whitespace and line breaks
-        response = response.replace('\r', ' ')
-        response = re.sub(r'\n+', ' ', response)  # 將所有換行符替換為空格
-        response = re.sub(r'\s{2,}', ' ', response)  # 將多個空格替換為單個空格
-        # 8. Remove Markdown formatting
-        response = re.sub(r'\*\*|\*|__|\|', '', response)  # Remove Markdown indicators
-        # 9. Detect and remove sentence duplicates
-        sentences = re.split(r'(?<=[.!?])\s+', response)
-        unique_sentences = []
-        seen_content = set()
-        for sentence in sentences:
-            # Skip empty sentences
-            if not sentence.strip():
-                continue
-            # Create simplified version for comparison (lowercase, no punctuation)
-            simplified = re.sub(r'[^\w\s]', '', sentence.lower())
-            simplified = ' '.join(simplified.split())  # Standardize whitespace
-            # Check if we've seen a similar sentence
-            is_duplicate = False
-            for existing in seen_content:
-                if len(simplified) > 10 and (existing in simplified or simplified in existing):
-                    is_duplicate = True
-                    break
-            if not is_duplicate and simplified:
-                unique_sentences.append(sentence)
-                seen_content.add(simplified)
-        # Recombine unique sentences
-        response = ' '.join(unique_sentences)
-        # 9.5. Advanced repetition detection and replacement
-        repetitive_descriptors = ['visible', 'positioned', 'located', 'situated', 'appears', 'features', 'shows', 'displays']
-        word_usage_count = {}
-        # Count occurrences of each repetitive descriptor
-        for word in repetitive_descriptors:
-            count = len(re.findall(r'\b' + word + r'\b', response, re.IGNORECASE))
-            if count > 1:
-                word_usage_count[word] = count
-        # Replace excessive repetitions with varied alternatives
-        replacement_alternatives = {
-            'visible': ['present', 'evident', 'apparent', 'observable'],
-            'positioned': ['arranged', 'placed', 'set', 'organized'],
-            'located': ['found', 'placed', 'situated', 'established'],
-            'situated': ['placed', 'positioned', 'arranged', 'set'],
-            'appears': ['seems', 'looks', 'presents', 'exhibits'],
-            'features': ['includes', 'contains', 'displays', 'showcases'],
-            'shows': ['reveals', 'presents', 'exhibits', 'demonstrates'],
-            'displays': ['presents', 'exhibits', 'shows', 'reveals']
-        }
-        for word, count in word_usage_count.items():
-            if count > 1 and word in replacement_alternatives:
-                # Find all occurrences
-                pattern = re.compile(r'\b' + word + r'\b', re.IGNORECASE)
-                matches = list(pattern.finditer(response))
-                # Replace subsequent occurrences (keep first one)
-                for i, match in enumerate(matches[1:], 1):
-                    if i <= len(replacement_alternatives[word]):
-                        replacement = replacement_alternatives[word][(i-1) % len(replacement_alternatives[word])]
-                        # Maintain original case pattern
-                        if match.group().isupper():
-                            replacement = replacement.upper()
-                        elif match.group().istitle():
-                            replacement = replacement.capitalize()
-                        response = response[:match.start()] + replacement + response[match.end():]
-                        # Update remaining matches positions
-                        offset = len(replacement) - len(match.group())
-                        matches = list(pattern.finditer(response))
-        # 10. Ensure word count is within limits (50-150 words)
-        words = response.split()
-        if len(words) > 200:
-            # Find sentence ending near the word limit
-            truncated = ' '.join(words[:200])
-            last_period = max(truncated.rfind('.'), truncated.rfind('!'), truncated.rfind('?'))
-            if last_period > 0:
-                response = truncated[:last_period+1]
-            else:
-                response = truncated + "."
-        # 11. Check sentence completeness
-        if response and not response.strip()[-1] in ['.', '!', '?']:
-            # Find the last preposition or conjunction
-            common_prepositions = ["into", "onto", "about", "above", "across", "after", "along", "around", "at", "before", "behind", "below", "beneath", "beside", "between", "beyond", "by", "down", "during", "except", "for", "from", "in", "inside", "near", "of", "off", "on", "over", "through", "to", "toward", "under", "up", "upon", "with", "within"]
-            # Check if ending with preposition or conjunction
-            last_word = response.strip().split()[-1].lower() if response.strip().split() else ""
-            if last_word in common_prepositions or last_word in ["and", "or", "but"]:
-                # Find the last complete sentence
-                last_period = max(response.rfind('.'), response.rfind('!'), response.rfind('?'))
-                if last_period > 0:
-                    response = response[:last_period+1]
-                else:
-                    # If no complete sentence found, modify the ending
-                    words = response.strip().split()
-                    if words:
-                        # Remove the last preposition or conjunction
-                        response = " ".join(words[:-1]) + "."
-        # 12. Grammar completeness check
-        incomplete_patterns = [
-            r'\b(fine|the)\s+(the\s+)?(?:urban|area|scene)\b(?!\s+\w)',  # 檢測不完整的片語
-            r'\b(and|or|but|with|from|in|at|on)\s*[.!?]',              # 介詞後直接結束
-            r'\b\w+\s+\1\b'  # 重複詞語檢測
-        ]
-        for pattern in incomplete_patterns:
-            if re.search(pattern, response, re.IGNORECASE):
-                # 移除有問題的片段或進行修正
-                response = re.sub(pattern, '', response, flags=re.IGNORECASE)
-                response = re.sub(r'\s{2,}', ' ', response)  # 清理多餘空格
-        # 13. Ensure haven't over-filtered
-        if not response or len(response) < 40:
-            # Try to get the first meaningful paragraph from the original response
-            paragraphs = [p for p in original_response.split('\n\n') if p.strip()]
-            if paragraphs:
-                # Choose the longest paragraph as it's most likely the actual description
-                best_para = max(paragraphs, key=len)
-                # Clean it using a subset of the above rules
-                best_para = re.sub(r'\[.*?\]', '', best_para)  # Remove [SECTION] markers
-                best_para = re.sub(r'\s{2,}', ' ', best_para).strip()  # Clean whitespace
-                if len(best_para) >= 40:
-                    return best_para
-            # If still no good content, return a simple message
-            return "Unable to generate a valid enhanced description."
-        # 14. Final cleaning - catch any missed special cases
-        response = re.sub(r'</?\|.*?\|>', '', response)  # Any remaining tags
-        response = re.sub(r'\(.*?\)', '', response)  # Any remaining parenthetical content
-        response = re.sub(r'Note:.*?(?=\n|$)', '', response, flags=re.IGNORECASE)  # Any remaining notes
-        # Ensure proper spacing after punctuation
-        response = re.sub(r'([.!?])([A-Z])', r'\1 \2', response)
-        # Ensure first letter is capitalized
-        if response and response[0].islower():
-            response = response[0].upper() + response[1:]
-        # 15. 統一格式 - 確保輸出始終是單一段落
-        response = re.sub(r'\s*\n\s*', ' ', response)  # 將所有換行符替換為空格
-        response = ' '.join(response.split())
-        return response.strip()
-    def _format_objects_for_prompt(self, objects: List[Dict]) -> str:
-        """
-        Format the detected-object list for use in a prompt.
-        Args:
-            objects: Detection dicts; each one is indexed by 'class_name' and 'confidence'
-        Returns:
-            str: "No objects detected" when the list is empty; otherwise one
-                 newline-prefixed "- " bullet per object with its confidence to two decimals
-        """
-        if not objects:
-            return "No objects detected"
-        formatted = []
-        for obj in objects:
-            formatted.append(f"{obj['class_name']} (confidence: {obj['confidence']:.2f})")
-        return "\n- " + "\n- ".join(formatted)
-    def _format_clip_results(self, clip_analysis: Dict) -> str:
-        """
-        Format CLIP analysis results for use in a prompt.
-        Args:
-            clip_analysis: CLIP analysis dict; an empty dict or one containing an
-                           "error" key produces the placeholder text
-        Returns:
-            str: Newline-joined summary lines, or "No CLIP analysis available"
-        """
-        if not clip_analysis or "error" in clip_analysis:
-            return "No CLIP analysis available"
-        parts = ["CLIP Analysis Results:"]
-        # Add the top scene (unpacked as a (name, confidence) pair)
-        top_scene, confidence = clip_analysis.get("top_scene", ("unknown", 0))
-        parts.append(f"- Most likely scene: {top_scene} (confidence: {confidence:.2f})")
-        # Add the viewpoint (unpacked as a (name, confidence) pair)
-        viewpoint, vp_conf = clip_analysis.get("viewpoint", ("standard", 0))
-        parts.append(f"- Camera viewpoint: {viewpoint} (confidence: {vp_conf:.2f})")
-        # Add object combinations (only the first three entries are used)
-        if "object_combinations" in clip_analysis:
-            combos = []
-            for combo, score in clip_analysis["object_combinations"][:3]:
-                combos.append(f"{combo} ({score:.2f})")
-            parts.append(f"- Object combinations: {', '.join(combos)}")
-        # Add cultural analysis
-        if "cultural_analysis" in clip_analysis:
-            parts.append("- Cultural analysis:")
-            for culture_type, data in clip_analysis["cultural_analysis"].items():
-                best_desc = data.get("best_description", "")
-                desc_conf = data.get("confidence", 0)
-                parts.append(f"  * {culture_type}: {best_desc} ({desc_conf:.2f})")
-        return "\n".join(parts)
-    def _format_cultural_analysis(self, cultural_analysis: Dict) -> str:
-        """
-        Format cultural analysis results.
-        Args:
-            cultural_analysis: Mapping of culture type to a dict that may hold
-                               "best_description" and "confidence"
-        Returns:
-            str: One "type: description (confidence: x.xx)" line per entry, or
-                 "No specific cultural elements detected" when the mapping is empty
-        """
-        if not cultural_analysis:
-            return "No specific cultural elements detected"
-        parts = []
-        for culture_type, data in cultural_analysis.items():
-            best_desc = data.get("best_description", "")
-            desc_conf = data.get("confidence", 0)
-            parts.append(f"{culture_type}: {best_desc} (confidence: {desc_conf:.2f})")
-        return "\n".join(parts)

 import logging
+import traceback
+from typing import Dict, List, Any, Optional
+from model_manager import ModelManager
+from prompt_template_manager import PromptTemplateManager
+from response_processor import ResponseProcessor
+from text_quality_validator import TextQualityValidator
+from landmark_data import ALL_LANDMARKS
 class LLMEnhancer:
     """
+    LLM增強器的主要窗口，協調模型管理、提示模板、回應處理和品質驗證等組件。
+    提供統一的接口來處理場景描述增強、檢測結果驗證和無檢測情況處理。
     """
     def __init__(self,
+                 model_path: Optional[str] = None,
+                 tokenizer_path: Optional[str] = None,
+                 device: Optional[str] = None,
+                 max_length: int = 2048,
+                 temperature: float = 0.3,
+                 top_p: float = 0.85):
         """
+        Initialize the LLM enhancer facade.
         Args:
+            model_path: Path or HuggingFace model name of the LLM; defaults to Llama 3.2
+            tokenizer_path: Path of the tokenizer, usually the same as model_path
+            device: Execution device ('cpu' or 'cuda'); auto-detected when None
+            max_length: Maximum length of the input text
+            temperature: Temperature parameter for text generation
             top_p: Nucleus-sampling probability threshold for text generation
+        Raises:
+            Exception: If any component fails to initialize (chained from the original error)
         """
+        # Set up a dedicated logger; a handler is attached only when the logger has none yet
+        self.logger = logging.getLogger(self.__class__.__name__)
+        if not self.logger.handlers:
+            handler = logging.StreamHandler()
+            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+            handler.setFormatter(formatter)
+            self.logger.addHandler(handler)
+            self.logger.setLevel(logging.INFO)
+        try:
+            # Initialize the four core components
+            self.model_manager = ModelManager(
+                model_path=model_path,
+                tokenizer_path=tokenizer_path,
+                device=device,
+                max_length=max_length,
+                temperature=temperature,
+                top_p=top_p
+            )
+            self.prompt_manager = PromptTemplateManager()
+            self.response_processor = ResponseProcessor()
+            self.quality_validator = TextQualityValidator()
+            # Keep the model path for later use (it is passed to the response cleaner as the model type)
+            self.model_path = model_path or "meta-llama/Llama-3.2-3B-Instruct"
+            self.logger.info("LLMEnhancer facade initialized successfully")
+        except Exception as e:
+            error_msg = f"Failed to initialize LLMEnhancer facade: {str(e)}"
+            self.logger.error(error_msg)
+            self.logger.error(traceback.format_exc())
+            raise Exception(error_msg) from e
+    def enhance_description(self, scene_data: Dict[str, Any]) -> str:
+        """
+        Main entry point of the scene description enhancer; combines all components
+        to produce the enhanced description.
+        Args:
+            scene_data: Dict of scene information, including the original description,
+                        detected objects (with is_landmark), scene type, time/lighting info, etc.
+        Returns:
+            str: The enhanced scene description. Returns "No original description provided."
+                 when scene_data has no original description, and the original description
+                 when both the main and fallback generations are too short or any step raises.
+        """
         try:
+            self.logger.info("Starting scene description enhancement")
+            # 1. Reset the model context
+            self.model_manager.reset_context()
+            # 2. Fetch the original description
+            original_desc = scene_data.get("original_description", "")
+            if not original_desc:
+                self.logger.warning("No original description provided")
+                return "No original description provided."
+            # 3. Prepare object statistics; fall back to keywords taken from the description
+            object_list = self._prepare_object_statistics(scene_data)
+            if not object_list:
+                object_keywords = self.quality_validator.extract_objects_from_description(original_desc)
+                object_list = ", ".join(object_keywords) if object_keywords else "objects visible in the scene"
+            # 4. Detect a landmark and prepare its information
+            landmark_info = self._extract_landmark_info(scene_data)
+            # 5. Add the landmark information to a shallow copy of scene_data
+            enhanced_scene_data = scene_data.copy()
+            if landmark_info:
+                enhanced_scene_data["landmark_location_info"] = landmark_info
+            # 6. Build the prompt
+            prompt = self.prompt_manager.format_enhancement_prompt_with_landmark(
+                scene_data=enhanced_scene_data,
+                object_list=object_list,
+                original_description=original_desc
             )
+            # 7. Generate the LLM response
+            self.logger.info("Generating LLM response")
+            response = self.model_manager.generate_response(prompt)
+            # 8. Handle incomplete responses (retry mechanism)
+            response = self._handle_incomplete_response(response, prompt, original_desc)
+            # 9. Clean the LLM response
+            model_type = self.model_path
+            raw_cleaned = self.response_processor.clean_response(response, model_type)
+            # 10. Remove explanatory notes
+            cleaned_response = self.response_processor.remove_explanatory_notes(raw_cleaned)
+            # 11. Factual accuracy verification (best effort: a failure keeps the unverified text)
+            try:
+                cleaned_response = self.quality_validator.verify_factual_accuracy(
+                    original_desc, cleaned_response, object_list
+                )
+            except Exception:
+                self.logger.warning("Fact verification failed; using response without verification")
+            # 12. Ensure scene type consistency
+            scene_type = scene_data.get("scene_type", "unknown scene")
+            word_count = len(cleaned_response.split())
+            if word_count >= 5 and scene_type.lower() not in cleaned_response.lower():
+                cleaned_response = self.quality_validator.ensure_scene_type_consistency(
+                    cleaned_response, scene_type, original_desc
+                )
+            # 13. Perspective consistency handling
+            perspective = self.quality_validator.extract_perspective_from_description(original_desc)
+            if perspective and perspective.lower() not in cleaned_response.lower():
+                # NOTE(review): cleaned_response[0] raises IndexError when the cleaned response is
+                # empty; the outer except then returns the original description.
+                cleaned_response = f"{perspective}, {cleaned_response[0].lower()}{cleaned_response[1:]}"
+            # 14. Final validation: if the result is too short, attempt the fallback
+            final_result = cleaned_response.strip()
+            if not final_result or len(final_result) < 20:
+                self.logger.warning("Enhanced description too short; attempting fallback")
+                # Fallback prompt: same inputs with the is_fallback flag set
+                fallback_scene_data = enhanced_scene_data.copy()
+                fallback_scene_data["is_fallback"] = True
+                fallback_prompt = self.prompt_manager.format_enhancement_prompt_with_landmark(
+                    scene_data=fallback_scene_data,
+                    object_list=object_list,
+                    original_description=original_desc
+                )
+                fallback_resp = self.model_manager.generate_response(fallback_prompt)
+                fallback_cleaned = self.response_processor.clean_response(fallback_resp, model_type)
+                fallback_cleaned = self.response_processor.remove_explanatory_notes(fallback_cleaned)
+                final_result = fallback_cleaned.strip()
+                if not final_result or len(final_result) < 20:
+                    self.logger.warning("Fallback also insufficient; returning original")
+                    return original_desc
+            # 15. Log completion and return the enhanced description
+            self.logger.info(f"Scene description enhancement completed successfully ({len(final_result)} chars)")
+            return final_result
+        except Exception as e:
+            error_msg = f"Enhancement failed: {str(e)}"
+            self.logger.error(error_msg)
+            self.logger.error(traceback.format_exc())
+            return scene_data.get("original_description", "Unable to enhance description")
+    def _extract_landmark_info(self, scene_data: Dict[str, Any]) -> Optional[Dict[str, str]]:
+        """
+        Extract landmark information without building any prompt content.
+        Args:
+            scene_data: Scene data dict
+        Returns:
+            Optional[Dict[str, str]]: Landmark info dict with name, location and landmark_id;
+                                      None when no landmark is found, the id is not in
+                                      ALL_LANDMARKS, the location is empty, or an error occurs
+        """
+        try:
+            # Check whether scene_data names a landmark directly
+            lm_id_in_data = scene_data.get("landmark_id")
+            if not lm_id_in_data:
+                # Look for a landmark among the detected objects (first match wins)
+                for obj in scene_data.get("detected_objects", []):
+                    if obj.get("is_landmark") and obj.get("landmark_id"):
+                        lm_id_in_data = obj["landmark_id"]
+                        break
+            # No landmark detected: return None
+            if not lm_id_in_data:
+                return None
+            # Look up the landmark information in ALL_LANDMARKS (imported from landmark_data)
+            if lm_id_in_data in ALL_LANDMARKS:
+                lm_info = ALL_LANDMARKS[lm_id_in_data]
+                # A scene_name in scene_data takes precedence over the landmark's own name
+                landmark_name = scene_data.get("scene_name", lm_info.get("name", lm_id_in_data))
+                landmark_location = lm_info.get("location", "")
+                if landmark_location:
+                    return {
+                        "name": landmark_name,
+                        "location": landmark_location,
+                        "landmark_id": lm_id_in_data
+                    }
+            return None
+        except Exception as e:
+            self.logger.error(f"Error extracting landmark info: {str(e)}")
+            return None
+    def _prepare_object_statistics(self, scene_data: Dict[str, Any]) -> str:
+        """
+        Prepare object statistics for prompt generation.
+        Args:
+            scene_data: Scene data dict
+        Returns:
+            str: Comma-separated "<count> <class>" entries (e.g. "2 cars, 1 person");
+                 an empty string when no object passes the confidence filter, or
+                 "objects visible in the scene" if an error occurs
+        """
+        try:
+            # High-confidence threshold
             high_confidence_threshold = 0.65
+            # Prefer the precomputed statistics
             object_statistics = scene_data.get("object_statistics", {})
             object_counts = {}
             if object_statistics:
                 for class_name, stats in object_statistics.items():
                     if stats.get("count", 0) > 0 and stats.get("avg_confidence", 0) >= high_confidence_threshold:
                         object_counts[class_name] = stats["count"]
             else:
                 # Fall back to the original counting approach
+                detected_objects = scene_data.get("detected_objects", [])
+                filtered_objects = []
+                for obj in detected_objects:
+                    confidence = obj.get("confidence", 0)
+                    class_name = obj.get("class_name", "")
+                    # Use a higher threshold (0.75) for special classes
+                    special_classes = ["airplane", "helicopter", "boat"]
+                    if class_name in special_classes:
+                        if confidence < 0.75:
+                            continue
+                    if confidence >= high_confidence_threshold:
+                        filtered_objects.append(obj)
                 for obj in filtered_objects:
                     class_name = obj.get("class_name", "")
                     if class_name not in object_counts:
                         object_counts[class_name] = 0
                     object_counts[class_name] += 1
+            # Format the object description; plurals are formed by simply appending "s"
+            return ", ".join([
                 f"{count} {obj}{'s' if count > 1 else ''}"
                 for obj, count in object_counts.items()
             ])
+        except Exception as e:
+            self.logger.error(f"Object statistics preparation failed: {str(e)}")
+            return "objects visible in the scene"
+    def _handle_incomplete_response(self, response: str, prompt: str, original_desc: str) -> str:
+        """
+        Handle an incomplete response, regenerating it when necessary.
+        Args:
+            response: The original response
+            prompt: The prompt that was used
+            original_desc: The original description
+        Returns:
+            str: The processed response; the original description when the final
+                 response is empty or shorter than 10 characters after stripping
+        """
+        try:
+            # Check response completeness
+            is_complete, issue = self.quality_validator.validate_response_completeness(response)
             max_retries = 3
             attempts = 0
+            while not is_complete and attempts < max_retries:
+                self.logger.warning(f"Incomplete response detected ({issue}), retrying... Attempt {attempts+1}/{max_retries}")
                 # Regenerate with the same prompt
+                response = self.model_manager.generate_response(prompt)
+                is_complete, issue = self.quality_validator.validate_response_completeness(response)
                 attempts += 1
             if not response or len(response.strip()) < 10:
                 self.logger.warning("Generated response was empty or too short, returning original description")
                 return original_desc
+            return response
         except Exception as e:
+            self.logger.error(f"Incomplete response handling failed: {str(e)}")
+            return response  # Return the most recent response unchanged
     def verify_detection(self,
+                        detected_objects: List[Dict],
+                        clip_analysis: Dict[str, Any],
+                        scene_type: str,
+                        scene_name: str,
+                        confidence: float) -> Dict[str, Any]:
         """
         Verify, and possibly correct, the YOLO detection results.
         Args:
             detected_objects: List of objects detected by YOLO
             clip_analysis: CLIP analysis results
             scene_type: The identified scene type
             scene_name: The scene name
             confidence: Confidence of the scene classification
         Returns:
             Dict: Dict with keys "verification_text", "has_errors" and "corrected_objects"
         """
+        try:
+            self.logger.info("Starting detection verification")
+            # Format the verification prompt
+            prompt = self.prompt_manager.format_verification_prompt(
+                detected_objects=detected_objects,
+                clip_analysis=clip_analysis,
+                scene_type=scene_type,
+                scene_name=scene_name,
+                confidence=confidence
+            )
+            # Call the LLM to perform the verification
+            verification_result = self.model_manager.generate_response(prompt)
+            # Clean the response
+            cleaned_result = self.response_processor.clean_response(verification_result, self.model_path)
+            # Parse the verification result: errors are assumed unless the text contains
+            # "appear accurate"; corrected_objects is always None here
+            result = {
+                "verification_text": cleaned_result,
+                "has_errors": "appear accurate" not in cleaned_result.lower(),
+                "corrected_objects": None
+            }
+            self.logger.info("Detection verification completed")
+            return result
+        except Exception as e:
+            error_msg = f"Detection verification failed: {str(e)}"
+            self.logger.error(error_msg)
+            self.logger.error(traceback.format_exc())
+            # On failure the result reports has_errors=False
+            return {
+                "verification_text": "Verification failed",
+                "has_errors": False,
+                "corrected_objects": None
+            }
     def handle_no_detection(self, clip_analysis: Dict[str, Any]) -> str:
         """
         Handle the case where YOLO detected no objects.
         Args:
             clip_analysis: CLIP analysis results
         Returns:
             str: The generated scene description, or "Unable to generate scene description" on failure
         """
+        try:
+            self.logger.info("Handling no detection scenario")
+            # Format the no-detection prompt
+            prompt = self.prompt_manager.format_no_detection_prompt(clip_analysis)
+            # Call the LLM to generate the description
+            description = self.model_manager.generate_response(prompt)
+            # Clean the response
+            cleaned_description = self.response_processor.clean_response(description, self.model_path)
+            self.logger.info("No detection handling completed")
+            return cleaned_description
+        except Exception as e:
+            error_msg = f"No detection handling failed: {str(e)}"
+            self.logger.error(error_msg)
+            self.logger.error(traceback.format_exc())
+            return "Unable to generate scene description"
+    def reset_context(self):
+        """Reset the LLM model context; a failure is logged and not re-raised."""
+        try:
+            self.model_manager.reset_context()
+            self.logger.info("LLM context reset completed")
+        except Exception as e:
+            self.logger.error(f"Context reset failed: {str(e)}")
+    def get_call_count(self) -> int:
         """
+        Get the number of model calls, as reported by the model manager.
         Returns:
+            int: The call count
         """
+        return self.model_manager.get_call_count()
+    def get_model_info(self) -> Dict[str, Any]:
+        """
+        Get information about the model and the components.
+        Returns:
+            Dict[str, Any]: Combined info from every component plus "facade_status";
+                            on failure, a dict with "facade_status": "error" and "error_message"
+        """
+        try:
+            return {
+                "model_manager": self.model_manager.get_model_info(),
+                "prompt_manager": self.prompt_manager.get_template_info(),
+                "response_processor": self.response_processor.get_processor_info(),
+                "quality_validator": self.quality_validator.get_validator_info(),
+                "facade_status": "initialized"
+            }
+        except Exception as e:
+            self.logger.error(f"Failed to get component info: {str(e)}")
+            return {"facade_status": "error", "error_message": str(e)}
+    def is_model_loaded(self) -> bool:
         """
+        Check whether the model has been loaded, as reported by the model manager.
         Returns:
+            bool: The model's load state
         """
+        return self.model_manager.is_model_loaded()
+    def get_current_device(self) -> str:
+        """
+        Get the current execution device, as reported by the model manager.
+        Returns:
+            str: Name of the current device
+        """
+        return self.model_manager.get_current_device()
+    def _detect_scene_type(self, detected_objects: List[Dict]) -> str:
+        """
+        Detect the scene type from the distribution of detected objects.
+        Args:
+            detected_objects: List of detected objects
+        Returns:
+            str: One of "pedestrian_crossing", "busy_intersection", "traffic_junction",
+                 or the default "intersection" (also returned on error)
+        """
         try:
+            # Default scene type
+            scene_type = "intersection"
+            # Count objects per class name
+            object_counts = {}
+            for obj in detected_objects:
+                class_name = obj.get("class_name", "")
+                if class_name not in object_counts:
+                    object_counts[class_name] = 0
+                object_counts[class_name] += 1
+            # People count
+            people_count = object_counts.get("person", 0)
+            # Vehicle counts (cars, buses and trucks only)
+            car_count = object_counts.get("car", 0)
+            bus_count = object_counts.get("bus", 0)
+            truck_count = object_counts.get("truck", 0)
+            total_vehicles = car_count + bus_count + truck_count
+            # Simple scene-type rules; the first matching rule wins
+            if people_count > 8 and total_vehicles < 2:
+                scene_type = "pedestrian_crossing"
+            elif people_count > 5 and total_vehicles > 2:
+                scene_type = "busy_intersection"
+            elif people_count < 3 and total_vehicles > 3:
+                scene_type = "traffic_junction"
+            return scene_type
         except Exception as e:
+            self.logger.error(f"Scene type detection failed: {str(e)}")
+            return "intersection"

model_manager.py ADDED Viewed

	@@ -0,0 +1,358 @@

+import os
+import torch
+import logging
+from typing import Dict, Optional, Any
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from huggingface_hub import login
+class ModelLoadingError(Exception):
+    """Custom exception for model loading failures; raised by ModelManager._load_model."""
+    pass
+class ModelGenerationError(Exception):
+    """Custom exception for model generation failures, e.g. an empty or too-short response."""
+    pass
+class ModelManager:
+    """
+    負責LLM模型的載入、設備管理和文本生成。
+    管理模型、記憶體優化和設備配置。
+    """
+    def __init__(self,
+                 model_path: Optional[str] = None,
+                 tokenizer_path: Optional[str] = None,
+                 device: Optional[str] = None,
+                 max_length: int = 2048,
+                 temperature: float = 0.3,
+                 top_p: float = 0.85):
+        """
+        Initialize the model manager. The model and tokenizer are not loaded here;
+        they start as None.
+        Args:
+            model_path: Path or HuggingFace model name of the LLM; defaults to Llama 3.2
+            tokenizer_path: Path of the tokenizer, usually the same as model_path
+            device: Execution device ('cpu' or 'cuda'); auto-detected when None
+            max_length: Maximum length of the input text
+            temperature: Temperature parameter for text generation
+            top_p: Nucleus-sampling probability threshold for text generation
+        """
+        # Set up a dedicated logger; a handler is attached only when the logger has none yet
+        self.logger = logging.getLogger(self.__class__.__name__)
+        if not self.logger.handlers:
+            handler = logging.StreamHandler()
+            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+            handler.setFormatter(formatter)
+            self.logger.addHandler(handler)
+            self.logger.setLevel(logging.INFO)
+        # Model configuration (the tokenizer path defaults to the model path)
+        self.model_path = model_path or "meta-llama/Llama-3.2-3B-Instruct"
+        self.tokenizer_path = tokenizer_path or self.model_path
+        # Device management
+        self.device = self._detect_device(device)
+        self.logger.info(f"Device selected: {self.device}")
+        # Generation parameters
+        self.max_length = max_length
+        self.temperature = temperature
+        self.top_p = top_p
+        # Model state
+        self.model = None
+        self.tokenizer = None
+        self._model_loaded = False
+        self.call_count = 0
+        # HuggingFace authentication
+        self.hf_token = self._setup_huggingface_auth()
+    def _detect_device(self, device: Optional[str]) -> str:
+        """
+        Detect and choose the execution device.
+        Args:
+            device: Device requested by the user; auto-detected when None
+        Returns:
+            str: 'cuda' or 'cpu' when auto-detected; an explicitly requested device is
+                 returned as given, except 'cuda' without CUDA available, which becomes 'cpu'
+        """
+        if device:
+            if device == 'cuda' and not torch.cuda.is_available():
+                self.logger.warning("CUDA requested but not available, falling back to CPU")
+                return 'cpu'
+            return device
+        detected_device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        if detected_device == 'cuda':
+            # Log the total memory of GPU 0 in GiB
+            gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
+            self.logger.info(f"CUDA detected with {gpu_memory:.2f} GB GPU memory")
+        return detected_device
+    def _setup_huggingface_auth(self) -> Optional[str]:
+        """
+        Set up HuggingFace authentication using the HF_TOKEN environment variable.
+        Returns:
+            Optional[str]: The HuggingFace token if login succeeded; None when the
+                           variable is not set or login fails
+        """
+        hf_token = os.environ.get("HF_TOKEN")
+        if hf_token:
+            try:
+                login(token=hf_token)
+                self.logger.info("Successfully authenticated with HuggingFace")
+                return hf_token
+            except Exception as e:
+                self.logger.error(f"HuggingFace authentication failed: {e}")
+                return None
+        else:
+            self.logger.warning("HF_TOKEN not found. Access to gated models may be limited")
+            return None
+    def _load_model(self):
+        """
+        Load the LLM and tokenizer, using 8-bit quantization to save memory.
+        Does nothing when the model is already loaded.
+        Raises:
+            ModelLoadingError: When loading the model fails
+        """
+        if self._model_loaded:
+            return
+        try:
+            self.logger.info(f"Loading model from {self.model_path} with 8-bit quantization")
+            # Clear GPU memory
+            self._clear_gpu_cache()
+            # Configure 8-bit quantization
+            quantization_config = BitsAndBytesConfig(
+                load_in_8bit=True,
+                llm_int8_enable_fp32_cpu_offload=True
+            )
+            # Load the tokenizer
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                self.tokenizer_path,
+                padding_side="left",
+                use_fast=False,
+                token=self.hf_token
+            )
+            # Set special tokens: reuse the EOS token for padding when none is defined
+            if self.tokenizer.pad_token is None:
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+            # Load the model
+            self.model = AutoModelForCausalLM.from_pretrained(
+                self.model_path,
+                quantization_config=quantization_config,
+                device_map="auto",
+                low_cpu_mem_usage=True,
+                token=self.hf_token
+            )
+            self._model_loaded = True
+            self.logger.info("Model loaded successfully")
+        except Exception as e:
+            error_msg = f"Failed to load model: {str(e)}"
+            self.logger.error(error_msg)
+            raise ModelLoadingError(error_msg) from e
+    def _clear_gpu_cache(self):
+        """Clear the GPU memory cache; does nothing when CUDA is unavailable."""
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            self.logger.debug("GPU cache cleared")
+    def generate_response(self, prompt: str, **generation_kwargs) -> str:
+        """
+        生成LLM回應
+        Args:
+            prompt: 輸入提示詞
+            **generation_kwargs: 額外的生成參數，可覆蓋預設值
+        Returns:
+            str: 生成的回應文本
+        Raises:
+            ModelGenerationError: 當生成失敗時
+        """
+        # 確保模型已載入
+        if not self._model_loaded:
+            self._load_model()
+        try:
+            self.call_count += 1
+            self.logger.info(f"Generating response (call #{self.call_count})")
+            # clean GPU
+            self._clear_gpu_cache()
+            # 設置固定種子以提高一致性
+            torch.manual_seed(42)
+            # prepare input
+            inputs = self.tokenizer(
+                prompt,
+                return_tensors="pt",
+                truncation=True,
+                max_length=self.max_length
+            ).to(self.device)
+            # 準備生成參數
+            generation_params = self._prepare_generation_params(**generation_kwargs)
+            generation_params.update({
+                "pad_token_id": self.tokenizer.eos_token_id,
+                "attention_mask": inputs.attention_mask,
+                "use_cache": True,
+            })
+            # resposne
+            with torch.no_grad():
+                outputs = self.model.generate(inputs.input_ids, **generation_params)
+            # 解碼回應
+            full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+            response = self._extract_generated_response(full_response, prompt)
+            if not response or len(response.strip()) < 10:
+                raise ModelGenerationError("Generated response is too short or empty")
+            self.logger.info(f"Response generated successfully ({len(response)} characters)")
+            return response
+        except Exception as e:
+            error_msg = f"Text generation failed: {str(e)}"
+            self.logger.error(error_msg)
+            raise ModelGenerationError(error_msg) from e
+    def _prepare_generation_params(self, **kwargs) -> Dict[str, Any]:
+        """
+        準備生成參數，支援模型特定的優化
+        Args:
+            **kwargs: 用戶提供的生成參數
+        Returns:
+            Dict[str, Any]: 完整的生成參數配置
+        """
+        # basic parameters
+        params = {
+            "max_new_tokens": 120,
+            "temperature": self.temperature,
+            "top_p": self.top_p,
+            "do_sample": True,
+        }
+        # 針對Llama模型的特殊優化
+        if "llama" in self.model_path.lower():
+            params.update({
+                "max_new_tokens": 600,
+                "temperature": 0.35, # not too big
+                "top_p": 0.75,
+                "repetition_penalty": 1.5,
+                "num_beams": 5,
+                "length_penalty": 1,
+                "no_repeat_ngram_size": 3
+            })
+        else:
+            params.update({
+                "max_new_tokens": 300,
+                "temperature": 0.6,
+                "top_p": 0.9,
+                "num_beams": 1,
+                "repetition_penalty": 1.05
+            })
+        # 用戶參數覆蓋預設值
+        params.update(kwargs)
+        return params
+    def _extract_generated_response(self, full_response: str, prompt: str) -> str:
+        """
+        從完整回應中提取生成的部分
+        Args:
+            full_response: 模型的完整輸出
+            prompt: 原始提示詞
+        Returns:
+            str: 提取的生成回應
+        """
+        # 尋找assistant標記
+        assistant_tag = "<|assistant|>"
+        if assistant_tag in full_response:
+            response = full_response.split(assistant_tag)[-1].strip()
+            # 檢查是否有未閉合的user標記
+            user_tag = "<|user|>"
+            if user_tag in response:
+                response = response.split(user_tag)[0].strip()
+            return response
+        # 移除輸入提示詞
+        if full_response.startswith(prompt):
+            return full_response[len(prompt):].strip()
+        return full_response.strip()
+    def reset_context(self):
+        """重置模型上下文，清理GPU緩存"""
+        if self._model_loaded:
+            self._clear_gpu_cache()
+            self.logger.info("Model context reset")
+        else:
+            self.logger.info("Model not loaded, no context to reset")
+    def get_current_device(self) -> str:
+        """
+        獲取當前運行設備
+        Returns:
+            str: 當前設備名稱
+        """
+        return self.device
+    def is_model_loaded(self) -> bool:
+        """
+        檢查模型是否已載入
+        Returns:
+            bool: 模型載入狀態
+        """
+        return self._model_loaded
+    def get_call_count(self) -> int:
+        """
+        獲取模型調用次數
+        Returns:
+            int: 調用次數
+        """
+        return self.call_count
+    def get_model_info(self) -> Dict[str, Any]:
+        """
+        獲取模型信息
+        Returns:
+            Dict[str, Any]: 包含模型路徑、設備、載入狀態等信息
+        """
+        return {
+            "model_path": self.model_path,
+            "device": self.device,
+            "is_loaded": self._model_loaded,
+            "call_count": self.call_count,
+            "has_hf_token": self.hf_token is not None
+        }

object_description_generator.py ADDED Viewed

	@@ -0,0 +1,1266 @@

+import logging
+import traceback
+from typing import Dict, List, Tuple, Optional, Any
+import numpy as np
+class ObjectDescriptionError(Exception):
+    """物件描述生成過程中的自定義異常"""
+    pass
+class ObjectDescriptionGenerator:
+    """
+    物件描述生成器 - 負責將檢測到的物件轉換為自然語言描述
+    該類別處理物件相關的所有描述生成邏輯，包括重要物件的識別、
+    空間位置描述、物件列表格式化以及描述文本的優化。
+    """
+    def __init__(self,
+                 min_prominence_score: float = 0.1,
+                 max_categories_to_return: int = 5,
+                 max_total_objects: int = 7,
+                 confidence_threshold_for_description: float = 0.25,
+                 region_analyzer: Optional[Any] = None):
+        """
+        初始化物件描述生成器
+        Args:
+            min_prominence_score: 物件顯著性的最低分數閾值
+            max_categories_to_return: 返回的物件類別最大數量
+            max_total_objects: 返回的物件總數上限
+            confidence_threshold_for_description: 用於描述的置信度閾值
+        """
+        self.logger = logging.getLogger(self.__class__.__name__)
+        self.min_prominence_score = min_prominence_score
+        self.max_categories_to_return = max_categories_to_return
+        self.max_total_objects = max_total_objects
+        self.confidence_threshold_for_description = confidence_threshold_for_description
+        self.region_analyzer = region_analyzer
+        self.logger.info("ObjectDescriptionGenerator initialized with prominence_score=%.2f, "
+                        "max_categories=%d, max_objects=%d, confidence_threshold=%.2f",
+                        min_prominence_score, max_categories_to_return,
+                        max_total_objects, confidence_threshold_for_description)
+    def get_prominent_objects(self, detected_objects: List[Dict],
+                          min_prominence_score: float = 0.5,
+                          max_categories_to_return: Optional[int] = None) -> List[Dict]:
+        """
+        獲取最重要的物件，基於置信度、大小和位置計算重要性評分
+        Args:
+            detected_objects: 檢測到的物件列表
+            min_prominence_score: 最小重要性分數閾值，範圍 0.0-1.0
+            max_categories_to_return: 可選的最大返回類別數量限制
+        Returns:
+            List[Dict]: 按重要性排序的物件列表
+        """
+        try:
+            if not detected_objects:
+                return []
+            prominent_objects = []
+            for obj in detected_objects:
+                # 計算重要性評分
+                prominence_score = self._calculate_prominence_score(obj)
+                # 只保留超過閾值的物件
+                if prominence_score >= min_prominence_score:
+                    obj_copy = obj.copy()
+                    obj_copy['prominence_score'] = prominence_score
+                    prominent_objects.append(obj_copy)
+            # 按重要性評分排序（從高到低）
+            prominent_objects.sort(key=lambda x: x.get('prominence_score', 0), reverse=True)
+            # 如果指定了最大類別數量限制，進行過濾
+            if max_categories_to_return is not None and max_categories_to_return > 0:
+                categories_seen = set()
+                filtered_objects = []
+                for obj in prominent_objects:
+                    class_name = obj.get("class_name", "unknown")
+                    # 如果是新類別且未達到限制
+                    if class_name not in categories_seen:
+                        if len(categories_seen) < max_categories_to_return:
+                            categories_seen.add(class_name)
+                            filtered_objects.append(obj)
+                    else:
+                        # 已見過的類別，直接添加
+                        filtered_objects.append(obj)
+                return filtered_objects
+            return prominent_objects
+        except Exception as e:
+            self.logger.error(f"Error calculating prominent objects: {str(e)}")
+            return []
+    def set_region_analyzer(self, region_analyzer: Any) -> None:
+        """
+        設置RegionAnalyzer，用於標準化空間描述生成
+        Args:
+            region_analyzer: RegionAnalyzer實例
+        """
+        try:
+            self.region_analyzer = region_analyzer
+            self.logger.info("RegionAnalyzer instance set for ObjectDescriptionGenerator")
+        except Exception as e:
+            self.logger.warning(f"Error setting RegionAnalyzer: {str(e)}")
+    def _get_standardized_spatial_description(self, obj: Dict) -> str:
+        """
+        使用RegionAnalyzer生成標準化空間描述的內部方法
+        Args:
+            obj: 物件字典
+        Returns:
+            str: 標準化空間描述，失敗時���回空字串
+        """
+        try:
+            if hasattr(self, 'region_analyzer') and self.region_analyzer:
+                region = obj.get("region", "")
+                object_type = obj.get("class_name", "")
+                if hasattr(self.region_analyzer, 'get_contextual_spatial_description'):
+                    return self.region_analyzer.get_contextual_spatial_description(region, object_type)
+                elif hasattr(self.region_analyzer, 'get_spatial_description_phrase'):
+                    return self.region_analyzer.get_spatial_description_phrase(region)
+            return ""
+        except Exception as e:
+            self.logger.warning(f"Error getting standardized spatial description: {str(e)}")
+            if object_type:
+                return f"visible in the scene"
+            return "present in the view"
+    def _calculate_prominence_score(self, obj: Dict) -> float:
+        """
+        計算物件的重要性評分
+        Args:
+            obj: 物件字典，包含檢測信息
+        Returns:
+            float: 重要性評分 (0.0-1.0)
+        """
+        try:
+            # 基礎置信度評分 (權重: 40%)
+            confidence = obj.get("confidence", 0.5)
+            confidence_score = confidence * 0.4
+            # 大小評分 (權重: 30%)
+            normalized_area = obj.get("normalized_area", 0.1)
+            # 使用對數縮放避免過大物件主導評分
+            size_score = min(np.log(normalized_area * 10 + 1) / np.log(11), 1.0) * 0.3
+            # 位置評分 (權重: 20%)
+            # 中心區域的物件通常更重要
+            center_x, center_y = obj.get("normalized_center", [0.5, 0.5])
+            distance_from_center = np.sqrt((center_x - 0.5)**2 + (center_y - 0.5)**2)
+            position_score = (1 - min(distance_from_center * 2, 1.0)) * 0.2
+            # 類別重要性評分 (權重: 10%)
+            class_importance = self._get_class_importance(obj.get("class_name", "unknown"))
+            class_score = class_importance * 0.1
+            total_score = confidence_score + size_score + position_score + class_score
+            # 確保評分在有效範圍內
+            return max(0.0, min(1.0, total_score))
+        except Exception as e:
+            self.logger.warning(f"Error calculating prominence score for object: {str(e)}")
+            return 0.5  # 返回中等評分作為備用
+    def _get_class_importance(self, class_name: str) -> float:
+        """
+        根據物件類別返回重要性係數
+        Args:
+            class_name: 物件類別名稱
+        Returns:
+            float: 類別重要性係數 (0.0-1.0)
+        """
+        # 高重要性物件（人、車輛、建築）
+        high_importance = ["person", "car", "truck", "bus", "motorcycle", "bicycle", "building"]
+        # 中等重要性物件（家具、電器）
+        medium_importance = ["chair", "couch", "tv", "laptop", "refrigerator", "dining table", "bed"]
+        # 低重要性物件（小物品、配件）
+        low_importance = ["handbag", "backpack", "umbrella", "cell phone", "remote", "mouse"]
+        class_name_lower = class_name.lower()
+        if any(item in class_name_lower for item in high_importance):
+            return 1.0
+        elif any(item in class_name_lower for item in medium_importance):
+            return 0.7
+        elif any(item in class_name_lower for item in low_importance):
+            return 0.4
+        else:
+            return 0.6  # 預設中等重要性
+    def format_object_list_for_description(self,
+                                          objects: List[Dict],
+                                          use_indefinite_article_for_one: bool = False,
+                                          count_threshold_for_generalization: int = -1,
+                                          max_types_to_list: int = 5) -> str:
+        """
+        將物件列表格式化為人類可讀的字符串，包含計數信息
+        Args:
+            objects: 物件字典列表，每個應包含 'class_name'
+            use_indefinite_article_for_one: 單個物件是否使用 "a/an"，否則使用 "one"
+            count_threshold_for_generalization: 超過此計數時使用通用術語，-1表示精確計數
+            max_types_to_list: 列表中包含的不同物件類型最大數量
+        Returns:
+            str: 格式化的物件描述字符串
+        """
+        try:
+            if not objects:
+                return "no specific objects clearly identified"
+            counts: Dict[str, int] = {}
+            for obj in objects:
+                name = obj.get("class_name", "unknown object")
+                if name == "unknown object" or not name:
+                    continue
+                counts[name] = counts.get(name, 0) + 1
+            if not counts:
+                return "no specific objects clearly identified"
+            descriptions = []
+            # 按計數降序然後按名稱升序排序，��制物件類型數量
+            sorted_counts = sorted(counts.items(), key=lambda item: (-item[1], item[0]))[:max_types_to_list]
+            for name, count in sorted_counts:
+                if count == 1:
+                    if use_indefinite_article_for_one:
+                        if name[0].lower() in 'aeiou':
+                            descriptions.append(f"an {name}")
+                        else:
+                            descriptions.append(f"a {name}")
+                    else:
+                        descriptions.append(f"one {name}")
+                else:
+                    # 處理複數形式
+                    plural_name = name
+                    if name.endswith("y") and not name.lower().endswith(("ay", "ey", "iy", "oy", "uy")):
+                        plural_name = name[:-1] + "ies"
+                    elif name.endswith(("s", "sh", "ch", "x", "z")):
+                        plural_name = name + "es"
+                    elif not name.endswith("s"):
+                        plural_name = name + "s"
+                    if count_threshold_for_generalization != -1 and count > count_threshold_for_generalization:
+                        if count <= count_threshold_for_generalization + 3:
+                            descriptions.append(f"several {plural_name}")
+                        else:
+                            descriptions.append(f"many {plural_name}")
+                    else:
+                        descriptions.append(f"{count} {plural_name}")
+            if not descriptions:
+                return "no specific objects clearly identified"
+            if len(descriptions) == 1:
+                return descriptions[0]
+            elif len(descriptions) == 2:
+                return f"{descriptions[0]} and {descriptions[1]}"
+            else:
+                # 使用牛津逗號格式
+                return ", ".join(descriptions[:-1]) + f", and {descriptions[-1]}"
+        except Exception as e:
+            self.logger.warning(f"Error formatting object list: {str(e)}")
+            return "various objects"
+    def get_spatial_description(self, obj: Dict, image_width: Optional[int] = None,
+                           image_height: Optional[int] = None,
+                           region_analyzer: Optional[Any] = None) -> str:
+        """
+        為物件生成空間位置描述
+        Args:
+            obj: 物件字典
+            image_width: 可選的圖像寬度
+            image_height: 可選的圖像高度
+            region_analyzer: 可選的RegionAnalyzer實例，用於生成標準化描述
+        Returns:
+            str: 空間描述字符串，空值region時返回空字串
+        """
+        try:
+            region = obj.get("region") or ""
+            # 處理空值或無效region，直接返回空字串避免不完整描述
+            if not region.strip() or region == "unknown":
+                # 根據物件類型提供合適的預設位置描述
+                if object_type and any(vehicle in object_type.lower() for vehicle in ["car", "truck", "bus"]):
+                    return "positioned in the scene"
+                elif object_type and "person" in object_type.lower():
+                    return "present in the area"
+                else:
+                    return "located in the scene"
+            # 如果提供了RegionAnalyzer實例，使用其標準化方法
+            if region_analyzer and hasattr(region_analyzer, 'get_spatial_description_phrase'):
+                object_type = obj.get("class_name", "")
+                if hasattr(region_analyzer, 'get_contextual_spatial_description'):
+                    spatial_desc = region_analyzer.get_contextual_spatial_description(region, object_type)
+                else:
+                    spatial_desc = region_analyzer.get_spatial_description_phrase(region)
+                if spatial_desc:
+                    return spatial_desc
+            # 備用邏輯：使用改進的內建映射
+            clean_region = region.replace('_', ' ').strip().lower()
+            region_map = {
+                "top left": "in the upper left area",
+                "top center": "in the upper area",
+                "top right": "in the upper right area",
+                "middle left": "on the left side",
+                "middle center": "in the center",
+                "center": "in the center",
+                "middle right": "on the right side",
+                "bottom left": "in the lower left area",
+                "bottom center": "in the lower area",
+                "bottom right": "in the lower right area"
+            }
+            # 直接映射匹配
+            if clean_region in region_map:
+                return region_map[clean_region]
+            # 模糊匹配處理
+            if "top" in clean_region and "left" in clean_region:
+                return "in the upper left area"
+            elif "top" in clean_region and "right" in clean_region:
+                return "in the upper right area"
+            elif "bottom" in clean_region and "left" in clean_region:
+                return "in the lower left area"
+            elif "bottom" in clean_region and "right" in clean_region:
+                return "in the lower right area"
+            elif "top" in clean_region:
+                return "in the upper area"
+            elif "bottom" in clean_region:
+                return "in the lower area"
+            elif "left" in clean_region:
+                return "on the left side"
+            elif "right" in clean_region:
+                return "on the right side"
+            elif "center" in clean_region or "middle" in clean_region:
+                return "in the center"
+            # 如果region無法識別，使用normalized_center作為最後備用
+            norm_center = obj.get("normalized_center")
+            if norm_center and image_width and image_height:
+                x_norm, y_norm = norm_center
+                h_pos = "left" if x_norm < 0.4 else "right" if x_norm > 0.6 else "center"
+                v_pos = "upper" if y_norm < 0.4 else "lower" if y_norm > 0.6 else "center"
+                if h_pos == "center" and v_pos == "center":
+                    return "in the center"
+                return f"in the {v_pos} {h_pos} area"
+            # 如果所有方法都失敗，返回空字串
+            return ""
+        except Exception as e:
+            self.logger.warning(f"Error generating spatial description: {str(e)}")
+            return ""
+    def optimize_object_description(self, description: str) -> str:
+        """
+        Optimize an object description by collapsing repeated items in "with ..." lists.
+        Args:
+            description: Original description text.
+        Returns:
+            str: The optimized description; the text as it stood when an error
+                occurred if processing fails.
+        """
+        try:
+            import re
+            # Shorten the redundant "a bed in the room" wording.
+            if "bed in the room" in description:
+                description = description.replace("a bed in the room", "a bed")
+            # Capture the text after each "with ", lazily, up to the first "." or the word "and".
+            # NOTE(review): because the capture stops at the first "and", the final item of an
+            # "a, b and c" list is not part of the captured span - confirm this is intended.
+            object_lists = re.findall(r'with ([^\.]+?)(?:\.|\band\b)', description)
+            for obj_list in object_lists:
+                # Split the captured span into items and count how often each one occurs.
+                items = re.findall(r'([a-zA-Z\s]+)(?:,|\band\b|$)', obj_list)
+                item_counts = {}
+                for item in items:
+                    item = item.strip()
+                    if item and item not in ["and", "with"]:
+                        if item not in item_counts:
+                            item_counts[item] = 0
+                        item_counts[item] += 1
+                # Rebuild the list; repeated items become "<count> <item>s".
+                # NOTE(review): the item keeps any leading article, so a repeated "a chair"
+                # is rendered as "2 a chairs" - verify against real inputs.
+                if item_counts:
+                    new_items = []
+                    for item, count in item_counts.items():
+                        if count > 1:
+                            new_items.append(f"{count} {item}s")
+                        else:
+                            new_items.append(item)
+                    # Join the rebuilt items (Oxford comma for three or more).
+                    if len(new_items) == 1:
+                        new_list = new_items[0]
+                    elif len(new_items) == 2:
+                        new_list = f"{new_items[0]} and {new_items[1]}"
+                    else:
+                        new_list = ", ".join(new_items[:-1]) + f", and {new_items[-1]}"
+                    # Replace every occurrence of the captured span with the rebuilt list.
+                    # NOTE(review): items are stripped, so whitespace at the edges of the
+                    # captured span is not carried into the replacement - confirm.
+                    description = description.replace(obj_list, new_list)
+            return description
+        except Exception as e:
+            self.logger.warning(f"Error optimizing object description: {str(e)}")
+            return description
+    def generate_dynamic_everyday_description(self,
+                                            detected_objects: List[Dict],
+                                            lighting_info: Optional[Dict] = None,
+                                            viewpoint: str = "eye_level",
+                                            spatial_analysis: Optional[Dict] = None,
+                                            image_dimensions: Optional[Tuple[int, int]] = None,
+                                            places365_info: Optional[Dict] = None,
+                                            object_statistics: Optional[Dict] = None) -> str:
+        """
+        為日常場景動態生成描述，基於所有相關的檢測物件、計數和上下文
+        Args:
+            detected_objects: 檢測到的物件列表
+            lighting_info: 照明信息
+            viewpoint: 視角類型
+            spatial_analysis: 空間分析結果
+            image_dimensions: 圖像尺寸
+            places365_info: Places365場景分類信息
+            object_statistics: 物件統計信息
+        Returns:
+            str: 動態生成的場景描述
+        """
+        try:
+            description_segments = []
+            image_width, image_height = image_dimensions if image_dimensions else (None, None)
+            self.logger.debug(f"Generating dynamic description for {len(detected_objects)} objects, "
+                            f"viewpoint: {viewpoint}, lighting: {lighting_info is not None}")
+            # 1. 整體氛圍（照明和視角）
+            ambiance_parts = []
+            if lighting_info:
+                time_of_day = lighting_info.get("time_of_day", "unknown lighting")
+                is_indoor = lighting_info.get("is_indoor")
+                ambiance_statement = "This is"
+                if is_indoor is True:
+                    ambiance_statement += " an indoor scene"
+                elif is_indoor is False:
+                    ambiance_statement += " an outdoor scene"
+                else:
+                    ambiance_statement += " a scene"
+                # remove underline
+                readable_lighting = f"with {time_of_day.replace('_', ' ')} lighting conditions"
+                ambiance_statement += f", likely {readable_lighting}."
+                ambiance_parts.append(ambiance_statement)
+            if viewpoint and viewpoint != "eye_level":
+                if not ambiance_parts:
+                    ambiance_parts.append(f"From {viewpoint.replace('_', ' ')}, the general layout of the scene is observed.")
+                else:
+                    ambiance_parts[-1] = ambiance_parts[-1].rstrip('.') + f", viewed from {viewpoint.replace('_', ' ')}."
+            if ambiance_parts:
+                description_segments.append(" ".join(ambiance_parts))
+            # 2. 描述所有檢測到的物件，按類別分組，使用準確計數和位置
+            if not detected_objects:
+                if not description_segments:
+                    description_segments.append("A general scene is visible, but no specific objects were clearly identified.")
+                else:
+                    description_segments.append("Within this setting, no specific objects were clearly identified.")
+            else:
+                objects_by_class: Dict[str, List[Dict]] = {}
+                # 使用置信度過濾
+                confident_objects = [obj for obj in detected_objects
+                                   if obj.get("confidence", 0) >= self.confidence_threshold_for_description]
+                if not confident_objects:
+                    no_confident_obj_msg = "While some elements might be present, no objects were identified with sufficient confidence for a detailed description."
+                    if not description_segments:
+                        description_segments.append(no_confident_obj_msg)
+                    else:
+                        description_segments.append(no_confident_obj_msg.lower().capitalize())
+                else:
+                    if object_statistics:
+                        # 使用預計算的統計信息，採用動態的信心度
+                        for class_name, stats in object_statistics.items():
+                            count = stats.get("count", 0)
+                            avg_confidence = stats.get("avg_confidence", 0)
+                            # 動態調整置信度閾值
+                            dynamic_threshold = self.confidence_threshold_for_description
+                            if class_name in ["potted plant", "vase", "clock", "book"]:
+                                dynamic_threshold = max(0.15, self.confidence_threshold_for_description * 0.6)
+                            elif count >= 3:
+                                dynamic_threshold = max(0.2, self.confidence_threshold_for_description * 0.8)
+                            if count > 0 and avg_confidence >= dynamic_threshold:
+                                matching_objects = [obj for obj in confident_objects if obj.get("class_name") == class_name]
+                                if not matching_objects:
+                                    matching_objects = [obj for obj in detected_objects
+                                                      if obj.get("class_name") == class_name and obj.get("confidence", 0) >= dynamic_threshold]
+                                if matching_objects:
+                                    actual_count = min(stats["count"], len(matching_objects))
+                                    objects_by_class[class_name] = matching_objects[:actual_count]
+                    else:
+                        # 備用邏輯，同樣使用動態閾值
+                        for obj in confident_objects:
+                            name = obj.get("class_name", "unknown object")
+                            if name == "unknown object" or not name:
+                                continue
+                            if name not in objects_by_class:
+                                objects_by_class[name] = []
+                            objects_by_class[name].append(obj)
+                    if not objects_by_class:
+                        description_segments.append("No common objects were confidently identified for detailed description.")
+                    else:
+                        # 物件組排序函數
+                        def sort_key_object_groups(item_tuple: Tuple[str, List[Dict]]):
+                            class_name_key, obj_group_list = item_tuple
+                            priority = 3
+                            count = len(obj_group_list)
+                            # 確保類別名稱已標準化
+                            normalized_class_name = self._normalize_object_class_name(class_name_key)
+                            # 動態優先級
+                            if normalized_class_name == "person":
+                                priority = 0
+                            elif normalized_class_name in ["dining table", "chair", "sofa", "bed"]:
+                                priority = 1
+                            elif normalized_class_name in ["car", "bus", "truck", "traffic light"]:
+                                priority = 2
+                            elif count >= 3:
+                                priority = max(1, priority - 1)
+                            elif normalized_class_name in ["potted plant", "vase", "clock", "book"] and count >= 2:
+                                priority = 2
+                            avg_area = sum(o.get("normalized_area", 0.0) for o in obj_group_list) / len(obj_group_list) if obj_group_list else 0
+                            quantity_bonus = min(count / 5.0, 1.0)
+                            return (priority, -len(obj_group_list), -avg_area, -quantity_bonus)
+                        # remove duplicate
+                        deduplicated_objects_by_class = {}
+                        processed_positions = []
+                        for class_name, group_of_objects in objects_by_class.items():
+                            unique_objects = []
+                            for obj in group_of_objects:
+                                obj_position = obj.get("normalized_center", [0.5, 0.5])
+                                is_duplicate = False
+                                for processed_pos in processed_positions:
+                                    position_distance = abs(obj_position[0] - processed_pos[0]) + abs(obj_position[1] - processed_pos[1])
+                                    if position_distance < 0.15:
+                                        is_duplicate = True
+                                        break
+                                if not is_duplicate:
+                                    unique_objects.append(obj)
+                                    processed_positions.append(obj_position)
+                            if unique_objects:
+                                deduplicated_objects_by_class[class_name] = unique_objects
+                        objects_by_class = deduplicated_objects_by_class
+                        sorted_object_groups = sorted(objects_by_class.items(), key=sort_key_object_groups)
+                        object_clauses = []
+                        for class_name, group_of_objects in sorted_object_groups:
+                            count = len(group_of_objects)
+                            if count == 0:
+                                continue
+                            # 標準化class name
+                            normalized_class_name = self._normalize_object_class_name(class_name)
+                            # 使用統計信息確保準確的數量描述
+                            if object_statistics and class_name in object_statistics:
+                                actual_count = object_statistics[class_name]["count"]
+                                formatted_name_with_exact_count = self._format_object_count_description(
+                                    normalized_class_name, actual_count
+                                )
+                            else:
+                                formatted_name_with_exact_count = self._format_object_count_description(
+                                    normalized_class_name, count
+                                )
+                            if formatted_name_with_exact_count == "no specific objects clearly identified" or not formatted_name_with_exact_count:
+                                continue
+                            # 確定群組的集體位置
+                            location_description_suffix = ""
+                            if count == 1:
+                                spatial_desc = self.get_spatial_description(group_of_objects[0], image_width, image_height, self.region_analyzer)
+                                if spatial_desc:
+                                    location_description_suffix = f"is {spatial_desc}"
+                                else:
+                                    distinct_regions = sorted(list(set(obj.get("region", "") for obj in group_of_objects if obj.get("region"))))
+                                    valid_regions = [r for r in distinct_regions if r and r != "unknown" and r.strip()]
+                                    if not valid_regions:
+                                        location_description_suffix = "is positioned in the scene"
+                                    elif len(valid_regions) == 1:
+                                        spatial_desc = self.get_spatial_description_phrase(valid_regions[0])
+                                        location_description_suffix = f"is primarily {spatial_desc}" if spatial_desc else "is positioned in the scene"
+                                    elif len(valid_regions) == 2:
+                                        clean_region1 = valid_regions[0].replace('_', ' ')
+                                        clean_region2 = valid_regions[1].replace('_', ' ')
+                                        location_description_suffix = f"is mainly across the {clean_region1} and {clean_region2} areas"
+                                    else:
+                                        location_description_suffix = "is distributed in various parts of the scene"
+                            else:
+                                distinct_regions = sorted(list(set(obj.get("region", "") for obj in group_of_objects if obj.get("region"))))
+                                valid_regions = [r for r in distinct_regions if r and r != "unknown" and r.strip()]
+                                if not valid_regions:
+                                    location_description_suffix = "are visible in the scene"
+                                elif len(valid_regions) == 1:
+                                    clean_region = valid_regions[0].replace('_', ' ')
+                                    location_description_suffix = f"are primarily in the {clean_region} area"
+                                elif len(valid_regions) == 2:
+                                    clean_region1 = valid_regions[0].replace('_', ' ')
+                                    clean_region2 = valid_regions[1].replace('_', ' ')
+                                    location_description_suffix = f"are mainly across the {clean_region1} and {clean_region2} areas"
+                                else:
+                                    location_description_suffix = "are distributed in various parts of the scene"
+                            # 首字母大寫
+                            formatted_name_capitalized = formatted_name_with_exact_count[0].upper() + formatted_name_with_exact_count[1:]
+                            object_clauses.append(f"{formatted_name_capitalized} {location_description_suffix}")
+                        if object_clauses:
+                            if not description_segments:
+                                if object_clauses:
+                                    first_clause = object_clauses.pop(0)
+                                    description_segments.append(first_clause + ".")
+                            else:
+                                if object_clauses:
+                                    description_segments.append("The scene features:")
+                            if object_clauses:
+                                joined_object_clauses = ". ".join(object_clauses)
+                                if joined_object_clauses and not joined_object_clauses.endswith("."):
+                                    joined_object_clauses += "."
+                                description_segments.append(joined_object_clauses)
+                        elif not description_segments:
+                            return "The image depicts a scene, but specific objects could not be described with confidence or detail."
+            # 最終組裝和格式化
+            raw_description = ""
+            for i, segment in enumerate(filter(None, description_segments)):
+                segment = segment.strip()
+                if not segment:
+                    continue
+                if not raw_description:
+                    raw_description = segment
+                else:
+                    if not raw_description.endswith(('.', '!', '?')):
+                        raw_description += "."
+                    raw_description += " " + (segment[0].upper() + segment[1:] if len(segment) > 1 else segment.upper())
+            if raw_description and not raw_description.endswith(('.', '!', '?')):
+                raw_description += "."
+            if not raw_description or len(raw_description.strip()) < 20:
+                if 'confident_objects' in locals() and confident_objects:
+                    return "The scene contains several detected objects, but a detailed textual description could not be fully constructed."
+                else:
+                    return "A general scene is depicted with no objects identified with high confidence."
+            return raw_description
+        except Exception as e:
+            error_msg = f"Error generating dynamic everyday description: {str(e)}"
+            self.logger.error(f"{error_msg}\n{traceback.format_exc()}")
+            raise ObjectDescriptionError(error_msg) from e
+    def _format_object_count_description(self, class_name: str, count: int) -> str:
+        """
+        格式化物件數量描述，提供多樣化的表達方式
+        Args:
+            class_name: 標準化後的類別名稱
+            count: 物件數量
+        Returns:
+            str: 格式化的數量描述
+        """
+        try:
+            if count <= 0:
+                return ""
+            # 單數情況
+            if count == 1:
+                article = "an" if class_name[0].lower() in 'aeiou' else "a"
+                return f"{article} {class_name}"
+            # 複數情況
+            plural_form = self._get_plural_form(class_name)
+            # 根據數量選擇不同的表達方式
+            if count == 2:
+                return f"two {plural_form}"
+            elif count == 3:
+                return f"three {plural_form}"
+            elif count <= 5:
+                return f"{count} {plural_form}"
+            elif count <= 10:
+                return f"several {plural_form}"
+            else:
+                return f"numerous {plural_form}"
+        except Exception as e:
+            self.logger.warning(f"Error formatting object count for '{class_name}': {str(e)}")
+            return f"{count} {class_name}s" if count > 1 else class_name
+    def _get_plural_form(self, word: str) -> str:
+        """
+        獲取詞彙的複數形式
+        Args:
+            word: 單數詞彙
+        Returns:
+            str: 複數形式
+        """
+        try:
+            # 特殊複數形式
+            irregular_plurals = {
+                'person': 'people',
+                'child': 'children',
+                'foot': 'feet',
+                'tooth': 'teeth',
+                'mouse': 'mice',
+                'man': 'men',
+                'woman': 'women'
+            }
+            if word.lower() in irregular_plurals:
+                return irregular_plurals[word.lower()]
+            # 規則複數形式
+            if word.endswith(('s', 'sh', 'ch', 'x', 'z')):
+                return word + 'es'
+            elif word.endswith('y') and word[-2] not in 'aeiou':
+                return word[:-1] + 'ies'
+            elif word.endswith('f'):
+                return word[:-1] + 'ves'
+            elif word.endswith('fe'):
+                return word[:-2] + 'ves'
+            else:
+                return word + 's'
+        except Exception as e:
+            self.logger.warning(f"Error getting plural form for '{word}': {str(e)}")
+            return word + 's'
+    def _normalize_object_class_name(self, class_name: str) -> str:
+        """
+        標準化物件類別名稱，確保輸出自然語言格式
+        Args:
+            class_name: 原始類別名稱
+        Returns:
+            str: 標準化後的類別名稱
+        """
+        try:
+            if not class_name or not isinstance(class_name, str):
+                return "object"
+            # 移除可能的技術性前綴或後綴
+            import re
+            normalized = re.sub(r'^(class_|id_|type_)', '', class_name.lower())
+            normalized = re.sub(r'(_class|_id|_type)$', '', normalized)
+            # 將下劃線和連字符替換為空格
+            normalized = normalized.replace('_', ' ').replace('-', ' ')
+            # 移除多餘空格
+            normalized = ' '.join(normalized.split())
+            # 特殊類別名稱的標準化映射
+            class_name_mapping = {
+                'traffic light': 'traffic light',
+                'stop sign': 'stop sign',
+                'fire hydrant': 'fire hydrant',
+                'dining table': 'dining table',
+                'potted plant': 'potted plant',
+                'tv monitor': 'television',
+                'cell phone': 'mobile phone',
+                'wine glass': 'wine glass',
+                'hot dog': 'hot dog',
+                'teddy bear': 'teddy bear',
+                'hair drier': 'hair dryer',
+                'toothbrush': 'toothbrush'
+            }
+            return class_name_mapping.get(normalized, normalized)
+        except Exception as e:
+            self.logger.warning(f"Error normalizing class name '{class_name}': {str(e)}")
+            return class_name if isinstance(class_name, str) else "object"
+    def generate_basic_details(self, scene_type: str, detected_objects: List[Dict]) -> str:
+        """
+        當模板不可用時生成基本詳細信息
+        Args:
+            scene_type: 識別的場景類型
+            detected_objects: 檢測到的物件列表
+        Returns:
+            str: 基本場景詳細信息
+        """
+        try:
+            # 處理特定場景類型的自定義邏輯
+            if scene_type == "living_room":
+                tv_objs = [obj for obj in detected_objects if obj.get("class_id") == 62]  # TV
+                sofa_objs = [obj for obj in detected_objects if obj.get("class_id") == 57]  # Sofa
+                if tv_objs and sofa_objs:
+                    tv_region = tv_objs[0].get("region", "center")
+                    sofa_region = sofa_objs[0].get("region", "center")
+                    arrangement = f"The TV is in the {tv_region.replace('_', ' ')} of the image, "
+                    arrangement += f"while the sofa is in the {sofa_region.replace('_', ' ')}. "
+                    return f"{arrangement}This appears to be a space designed for relaxation and entertainment."
+            elif scene_type == "bedroom":
+                bed_objs = [obj for obj in detected_objects if obj.get("class_id") == 59]  # Bed
+                if bed_objs:
+                    bed_region = bed_objs[0].get("region", "center")
+                    extra_items = []
+                    for obj in detected_objects:
+                        if obj.get("class_id") == 74:  # Clock
+                            extra_items.append("clock")
+                        elif obj.get("class_id") == 73:  # Book
+                            extra_items.append("book")
+                    extras = ""
+                    if extra_items:
+                        extras = f" There is also a {' and a '.join(extra_items)} visible."
+                    return f"The bed is located in the {bed_region.replace('_', ' ')} of the image.{extras}"
+            elif scene_type in ["dining_area", "kitchen"]:
+                # 計算食物和餐飲相關物品
+                food_items = []
+                for obj in detected_objects:
+                    if obj.get("class_id") in [39, 41, 42, 43, 44, 45]:  # 廚房物品
+                        food_items.append(obj.get("class_name", "kitchen item"))
+                food_str = ""
+                if food_items:
+                    unique_items = list(set(food_items))
+                    if len(unique_items) <= 3:
+                        food_str = f" with {', '.join(unique_items)}"
+                    else:
+                        food_str = f" with {', '.join(unique_items[:3])} and other items"
+                return f"{food_str}."
+            elif scene_type == "city_street":
+                # 計算人員和車輛
+                people_count = len([obj for obj in detected_objects if obj.get("class_id") == 0])
+                vehicle_count = len([obj for obj in detected_objects
+                                   if obj.get("class_id") in [1, 2, 3, 5, 7]])  # Bicycle, car, motorbike, bus, truck
+                traffic_desc = ""
+                if people_count > 0 and vehicle_count > 0:
+                    traffic_desc = f" with {people_count} {'people' if people_count > 1 else 'person'} and "
+                    traffic_desc += f"{vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"
+                elif people_count > 0:
+                    traffic_desc = f" with {people_count} {'people' if people_count > 1 else 'person'}"
+                elif vehicle_count > 0:
+                    traffic_desc = f" with {vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"
+                return f"{traffic_desc}."
+            elif scene_type == "asian_commercial_street":
+                # 尋找關鍵城市元素
+                people_count = len([obj for obj in detected_objects if obj.get("class_id") == 0])
+                vehicle_count = len([obj for obj in detected_objects if obj.get("class_id") in [1, 2, 3]])
+                # 分析行人分布
+                people_positions = []
+                for obj in detected_objects:
+                    if obj.get("class_id") == 0:  # Person
+                        people_positions.append(obj.get("normalized_center", (0.5, 0.5)))
+                # 檢查人員是否沿線分布（表示步行路徑）
+                structured_path = False
+                if len(people_positions) >= 3:
+                    # 簡化檢查 - 查看多個人員的y坐標是否相似
+                    y_coords = [pos[1] for pos in people_positions]
+                    y_mean = sum(y_coords) / len(y_coords)
+                    y_variance = sum((y - y_mean)**2 for y in y_coords) / len(y_coords)
+                    if y_variance < 0.05:  # 低變異數表示線性排列
+                        structured_path = True
+                street_desc = "A commercial street with "
+                if people_count > 0:
+                    street_desc += f"{people_count} {'pedestrians' if people_count > 1 else 'pedestrian'}"
+                    if vehicle_count > 0:
+                        street_desc += f" and {vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"
+                elif vehicle_count > 0:
+                    street_desc += f"{vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"
+                else:
+                    street_desc += "various commercial elements"
+                if structured_path:
+                    street_desc += ". The pedestrians appear to be following a defined walking path"
+                # 添加文化元素
+                street_desc += ". The signage and architectural elements suggest an Asian urban setting."
+                return street_desc
+            # 默認通用描述
+            return "The scene contains various elements characteristic of this environment."
+        except Exception as e:
+            self.logger.warning(f"Error generating basic details for scene_type '{scene_type}': {str(e)}")
+            return "The scene contains various elements characteristic of this environment."
+    def generate_placeholder_content(self, placeholder: str, detected_objects: List[Dict], scene_type: str) -> str:
+        """
+        為模板佔位符生成內容
+        Args:
+            placeholder: 模板佔位符
+            detected_objects: 檢測到的物件列表
+            scene_type: 場景類型
+        Returns:
+            str: 生成的佔位符內容
+        """
+        try:
+            # 處理不同類型的佔位符與自定義邏輯
+            if placeholder == "furniture":
+                # 提取家具物品
+                furniture_ids = [56, 57, 58, 59, 60, 61]  # 家具類別ID示例
+                furniture_objects = [obj for obj in detected_objects if obj.get("class_id") in furniture_ids]
+                if furniture_objects:
+                    furniture_names = []
+                    for obj in furniture_objects[:3]:
+                        raw_name = obj.get("class_name", "furniture")
+                        normalized_name = self._normalize_object_class_name(raw_name)
+                        furniture_names.append(normalized_name)
+                    unique_names = list(set(furniture_names))
+                    if len(unique_names) == 1:
+                        return unique_names[0]
+                    elif len(unique_names) == 2:
+                        return f"{unique_names[0]} and {unique_names[1]}"
+                    else:
+                        return ", ".join(unique_names[:-1]) + f", and {unique_names[-1]}"
+                return "various furniture items"
+            elif placeholder == "electronics":
+                # 提取電子物品
+                electronics_ids = [62, 63, 64, 65, 66, 67, 68, 69, 70]  # 電子設備類別ID示例
+                electronics_objects = [obj for obj in detected_objects if obj.get("class_id") in electronics_ids]
+                if electronics_objects:
+                    electronics_names = [obj.get("class_name", "electronic device") for obj in electronics_objects[:3]]
+                    return ", ".join(set(electronics_names))
+                return "electronic devices"
+            elif placeholder == "people_count":
+                # 計算人數
+                people_count = len([obj for obj in detected_objects if obj.get("class_id") == 0])
+                if people_count == 0:
+                    return "no people"
+                elif people_count == 1:
+                    return "one person"
+                elif people_count < 5:
+                    return f"{people_count} people"
+                else:
+                    return "several people"
+            elif placeholder == "seating":
+                # 提取座位物品
+                seating_ids = [56, 57]  # chair, sofa
+                seating_objects = [obj for obj in detected_objects if obj.get("class_id") in seating_ids]
+                if seating_objects:
+                    seating_names = [obj.get("class_name", "seating") for obj in seating_objects[:2]]
+                    return ", ".join(set(seating_names))
+                return "seating arrangements"
+            # 默認情況 - 空字符串
+            return ""
+        except Exception as e:
+            self.logger.warning(f"Error generating placeholder content for '{placeholder}': {str(e)}")
+            return ""
+    def describe_functional_zones(self, functional_zones: Dict) -> str:
+        """
+        Generate a description of the scene's functional zones, with special
+        handling for pedestrian zones, people counts and repeated mentions.
+        At most one pedestrian-related zone and two other zones are described.
+        Args:
+            functional_zones: Identified functional zones, either a dict keyed
+                by zone name or a list of zones (converted to a dict first).
+        Returns:
+            str: Functional zone description passed through
+                self.optimize_object_description; "" when there is nothing to
+                describe, the input type is unsupported, or an error occurs.
+        """
+        try:
+            if not functional_zones:
+                return ""
+            # Accept different container types for functional_zones
+            if isinstance(functional_zones, list):
+                # A list is converted to a dict keyed by zone name; entries
+                # without a usable 'name' get a positional placeholder name
+                zones_dict = {}
+                for i, zone in enumerate(functional_zones):
+                    if isinstance(zone, dict) and 'name' in zone:
+                        zone_name = self._normalize_zone_name(zone['name'])
+                    else:
+                        zone_name = f"functional area {i+1}"
+                    zones_dict[zone_name] = zone if isinstance(zone, dict) else {"description": str(zone)}
+                functional_zones = zones_dict
+            elif not isinstance(functional_zones, dict):
+                return ""
+            # Normalize every zone key to drop internal identifier formatting.
+            # NOTE: keys that normalize to the same name overwrite one another
+            # here, so only the last such zone is kept.
+            normalized_zones = {}
+            for zone_key, zone_data in functional_zones.items():
+                normalized_key = self._normalize_zone_name(zone_key)
+                normalized_zones[normalized_key] = zone_data
+            functional_zones = normalized_zones
+            # Total number of people in the scene
+            total_people_count = 0
+            people_by_zone = {}
+            # Count "person" entries per zone and accumulate the total
+            for zone_name, zone_info in functional_zones.items():
+                if "objects" in zone_info:
+                    zone_people_count = zone_info["objects"].count("person")
+                    people_by_zone[zone_name] = zone_people_count
+                    total_people_count += zone_people_count
+            # Split zones into pedestrian-related zones and all other zones
+            pedestrian_zones = []
+            other_zones = []
+            for zone_name, zone_info in functional_zones.items():
+                # A zone is pedestrian-related when its name contains a keyword
+                if any(keyword in zone_name.lower() for keyword in ["pedestrian", "crossing", "people"]):
+                    pedestrian_zones.append((zone_name, zone_info))
+                else:
+                    other_zones.append((zone_name, zone_info))
+            # Pick the most important pedestrian zone (by people count) and
+            # the most important other zones (by number of objects)
+            main_pedestrian_zones = sorted(pedestrian_zones,
+                                        key=lambda z: people_by_zone.get(z[0], 0),
+                                        reverse=True)[:1]  # at most 1 main pedestrian zone
+            top_other_zones = sorted(other_zones,
+                                key=lambda z: len(z[1].get("objects", [])),
+                                reverse=True)[:2]  # at most 2 other zones
+            # Merge the selected zones
+            top_zones = main_pedestrian_zones + top_other_zones
+            if not top_zones:
+                return ""
+            # Build the summary sentence
+            summary = ""
+            max_mentioned_people = 0  # largest people count mentioned so far
+            # When the total is significant (more than 5), lead with a
+            # sentence stating the overall number of pedestrians
+            if total_people_count > 5:
+                summary = f"The scene contains a significant number of pedestrians ({total_people_count} people). "
+                max_mentioned_people = total_people_count  # record the count just mentioned
+            # Process each zone description, keeping people counts consistent
+            processed_zones = []
+            for zone_name, zone_info in top_zones:
+                zone_desc = zone_info.get("description", "a functional zone")
+                zone_people_count = people_by_zone.get(zone_name, 0)
+                # Does the description itself mention people ("with ... person/people")?
+                contains_people_info = "with" in zone_desc and ("person" in zone_desc.lower() or "people" in zone_desc.lower())
+                # If it does, and this zone's count is smaller than the largest
+                # count already mentioned, rewrite the description
+                if contains_people_info and zone_people_count < max_mentioned_people:
+                    parts = zone_desc.split("with")
+                    if len(parts) > 1:
+                        # Keep only the text before the first "with"
+                        zone_desc = parts[0].strip() + " area"
+                processed_zones.append((zone_name, {"description": zone_desc}))
+            # Assemble the final sentence according to the number of zones
+            final_desc = ""
+            if len(processed_zones) == 1:
+                _, zone_info = processed_zones[0]
+                zone_desc = zone_info["description"]
+                final_desc = summary + f"The scene includes {zone_desc}."
+            elif len(processed_zones) == 2:
+                _, zone1_info = processed_zones[0]
+                _, zone2_info = processed_zones[1]
+                zone1_desc = zone1_info["description"]
+                zone2_desc = zone2_info["description"]
+                final_desc = summary + f"The scene is divided into two main areas: {zone1_desc} and {zone2_desc}."
+            else:
+                zones_desc = ["The scene contains multiple functional areas including"]
+                zone_descriptions = [z[1]["description"] for z in processed_zones]
+                # Format the multi-zone list; at most three zones are selected
+                # above, so the generic fallback below is not expected to run
+                if len(zone_descriptions) == 3:
+                    formatted_desc = f"{zone_descriptions[0]}, {zone_descriptions[1]}, and {zone_descriptions[2]}"
+                else:
+                    formatted_desc = ", ".join(zone_descriptions[:-1]) + f", and {zone_descriptions[-1]}"
+                final_desc = summary + f"{zones_desc[0]} {formatted_desc}."
+            return self.optimize_object_description(final_desc)
+        except Exception as e:
+            self.logger.warning(f"Error describing functional zones: {str(e)}")
+            return ""
+    def _normalize_zone_name(self, zone_name: str) -> str:
+        """
+        將內部區域鍵名標準化為自然語言描述
+        Args:
+            zone_name: 原始區域名稱
+        Returns:
+            str: 標準化後的區域名稱
+        """
+        try:
+            if not zone_name or not isinstance(zone_name, str):
+                return "functional area"
+            # 移除數字後綴（如 crossing_zone_1 -> crossing_zone）
+            import re
+            base_name = re.sub(r'_\d+$', '', zone_name)
+            # 將下劃線替換為空格
+            normalized = base_name.replace('_', ' ')
+            # 標準化常見的區域類型名稱
+            zone_type_mapping = {
+                'crossing zone': 'pedestrian crossing area',
+                'vehicle zone': 'vehicle movement area',
+                'pedestrian zone': 'pedestrian activity area',
+                'traffic zone': 'traffic flow area',
+                'waiting zone': 'waiting area',
+                'seating zone': 'seating area',
+                'dining zone': 'dining area',
+                'furniture zone': 'furniture arrangement area',
+                'electronics zone': 'electronics area',
+                'people zone': 'social activity area',
+                'functional area': 'activity area'
+            }
+            # 檢查是否有對應的標準化名稱
+            for pattern, replacement in zone_type_mapping.items():
+                if pattern in normalized.lower():
+                    return replacement
+            # 如果沒有特定映射，使用通用格式
+            if 'zone' in normalized.lower():
+                normalized = normalized.replace('zone', 'area')
+            elif not any(keyword in normalized.lower() for keyword in ['area', 'space', 'region']):
+                normalized += ' area'
+            return normalized.strip()
+        except Exception as e:
+            self.logger.warning(f"Error normalizing zone name '{zone_name}': {str(e)}")
+            return "activity area"
+    def get_configuration(self) -> Dict[str, Any]:
+        """
+        獲取當前配置參數
+        Returns:
+            Dict[str, Any]: 配置參數字典
+        """
+        return {
+            "min_prominence_score": self.min_prominence_score,
+            "max_categories_to_return": self.max_categories_to_return,
+            "max_total_objects": self.max_total_objects,
+            "confidence_threshold_for_description": self.confidence_threshold_for_description
+        }
+    def update_configuration(self, **kwargs):
+        """
+        更新配置參數
+        Args:
+            **kwargs: 要更新的配置參數
+        """
+        try:
+            for key, value in kwargs.items():
+                if hasattr(self, key):
+                    old_value = getattr(self, key)
+                    setattr(self, key, value)
+                    self.logger.info(f"Updated {key}: {old_value} -> {value}")
+                else:
+                    self.logger.warning(f"Unknown configuration parameter: {key}")
+        except Exception as e:
+            self.logger.error(f"Error updating configuration: {str(e)}")
+            raise ObjectDescriptionError(f"Failed to update configuration: {str(e)}") from e

object_extractor.py ADDED Viewed

	@@ -0,0 +1,358 @@

+import logging
+import traceback
+from typing import Dict, List, Any, Optional
+# 設置日誌記錄器
+logger = logging.getLogger(__name__)
+class ObjectExtractor:
+    """
+    專門處理物件檢測結果的提取和預處理
+    負責從YOLO檢測結果提取物件資訊、物件分類和核心物件的辨識
+    """
+    def __init__(self, class_names: Dict[int, str] = None, object_categories: Dict[str, List[int]] = None):
+        """
+        初始化物件提取器
+        Args:
+            class_names: 類別ID到類別名稱的映射字典
+            object_categories: 物件類別分組字典
+        """
+        try:
+            self.class_names = class_names or {}
+            self.object_categories = object_categories or {}
+            # 1. 讀取並設定基本信心度門檻（如果外部沒傳，就預設 0.25）
+            self.base_conf_threshold = 0.25
+            # 2. 動態信心度調整映射表 (key: 小寫 class_name, value: 調整係數)
+            #    最終的門檻 = base_conf_threshold * factor
+            #    如果某個 class_name 沒在這裡，就直接用 base_conf_threshold（相當於 factor=1.0）
+            self.dynamic_conf_map = {
+                "traffic light": 0.6,  # 0.25 * 0.6 = 0.15
+                "car": 0.8,            # 0.25 * 0.8 = 0.20
+                "person": 0.7,         # 0.25 * 0.7 = 0.175
+            }
+            logger.info(f"ObjectExtractor initialized with {len(self.class_names)} class names and {len(self.object_categories)} object categories")
+        except Exception as e:
+            logger.error(f"Failed to initialize ObjectExtractor: {str(e)}")
+            logger.error(traceback.format_exc())
+            raise
+    def _get_dynamic_threshold(self, class_name: str) -> float:
+        """
+        根據 class_name 從 dynamic_conf_map 拿到 factor，計算最終的信心度門檻：
+            threshold = base_conf_threshold * factor
+        如果 class_name 不在映射表裡，就回傳 base_conf_threshold。
+        """
+        # 使用小寫做匹配，確保在 dynamic_conf_map 裡的 key 也都用小寫
+        key = class_name.lower()
+        factor = self.dynamic_conf_map.get(key, 1.0)
+        return self.base_conf_threshold * factor
+    def extract_detected_objects(
+            self,
+            detection_result: Any,
+            confidence_threshold: float = 0.25,
+            region_analyzer=None
+        ) -> List[Dict]:
+            """
+            從檢測結果中提取物件資訊，包含位置資訊
+            Args:
+                detection_result: YOLO檢測結果
+                confidence_threshold: 改由動態門檻決定
+                region_analyzer: 區域分析器實例，用於判斷物件所屬區域
+            Returns:
+                包含檢測物件資訊的字典列表
+            """
+            try:
+                # 調試信息：記錄當前類別映射狀態
+                logger.info(f"ObjectExtractor.extract_detected_objects called")
+                logger.info(f"Current class_names keys: {list(self.class_names.keys()) if self.class_names else 'None'}")
+                if detection_result is None:
+                    logger.warning("Detection result is None")
+                    return []
+                if not hasattr(detection_result, 'boxes'):
+                    logger.error("Detection result does not have boxes attribute")
+                    return []
+                boxes = detection_result.boxes.xyxy.cpu().numpy()
+                classes = detection_result.boxes.cls.cpu().numpy().astype(int)
+                confidences = detection_result.boxes.conf.cpu().numpy()
+                # 獲取圖像尺寸
+                img_height, img_width = detection_result.orig_shape[:2]
+                detected_objects = []
+                for box, class_id, confidence in zip(boxes, classes, confidences):
+                    try:
+                        # 1. 先拿到這筆偵測物件的 class_name
+                        class_name = self.class_names.get(int(class_id), f"unknown_class_{class_id}")
+                        # 2. 計算這個 class 應該採用的動態 threshold
+                        dyn_thr = self._get_dynamic_threshold(class_name)  # e.g. 0.25 * factor
+                        # 3. 如果 confidence < dyn_thr，就跳過這一筆
+                        if confidence < dyn_thr:
+                            continue
+                        # 後面維持原本的座標、中心、大小、區域等資訊計算
+                        x1, y1, x2, y2 = box
+                        width = x2 - x1
+                        height = y2 - y1
+                        # 中心點計算
+                        center_x = (x1 + x2) / 2
+                        center_y = (y1 + y2) / 2
+                        # 標準化位置 (0-1)
+                        norm_x = center_x / img_width
+                        norm_y = center_y / img_height
+                        norm_width = width / img_width
+                        norm_height = height / img_height
+                        # 面積計算
+                        area = width * height
+                        norm_area = area / (img_width * img_height)
+                        # 區域判斷
+                        object_region = "unknown"
+                        if region_analyzer:
+                            object_region = region_analyzer.determine_region(norm_x, norm_y)
+                        # 調試信息：記錄映射過程
+                        if class_name.startswith("unknown_class_"):
+                            logger.warning(
+                                f"Class ID {class_id} not found in class_names. "
+                                f"Available keys: {list(self.class_names.keys())}"
+                            )
+                        else:
+                            logger.debug(f"Successfully mapped class ID {class_id} to '{class_name}'")
+                        detected_objects.append({
+                            "class_id": int(class_id),
+                            "class_name": class_name,
+                            "confidence": float(confidence),
+                            "box": [float(x1), float(y1), float(x2), float(y2)],
+                            "center": [float(center_x), float(center_y)],
+                            "normalized_center": [float(norm_x), float(norm_y)],
+                            "size": [float(width), float(height)],
+                            "normalized_size": [float(norm_width), float(norm_height)],
+                            "area": float(area),
+                            "normalized_area": float(norm_area),
+                            "region": object_region
+                        })
+                    except Exception as e:
+                        logger.error(f"Error processing object with class_id {class_id}: {str(e)}")
+                        continue
+                logger.info(f"Extracted {len(detected_objects)} objects from detection result")
+                return detected_objects
+            except Exception as e:
+                logger.error(f"Error extracting detected objects: {str(e)}")
+                logger.error(traceback.format_exc())
+                return []
+    def update_class_names(self, class_names: Dict[int, str]):
+        """
+        動態更新類別名稱映射
+        Args:
+            class_names: 新的類別名稱映射字典
+        """
+        try:
+            self.class_names = class_names or {}
+            logger.info(f"Class names updated: {len(self.class_names)} classes")
+            logger.debug(f"Updated class names: {self.class_names}")
+        except Exception as e:
+            logger.error(f"Failed to update class names: {str(e)}")
+    def categorize_object(self, obj: Dict) -> str:
+        """
+        將檢測到的物件分類到功能類別中，用於區域識別
+        Args:
+            obj: 物件字典
+        Returns:
+            物件功能類別字串
+        """
+        try:
+            class_id = obj.get("class_id", -1)
+            class_name = obj.get("class_name", "").lower()
+            # 使用現有的類別映射（如果可用）
+            if self.object_categories:
+                for category, ids in self.object_categories.items():
+                    if class_id in ids:
+                        return category
+            # 基於COCO類別名稱的後備分類
+            furniture_items = ["chair", "couch", "bed", "dining table", "toilet"]
+            plant_items = ["potted plant"]
+            electronic_items = ["tv", "laptop", "mouse", "remote", "keyboard", "cell phone"]
+            vehicle_items = ["bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat"]
+            person_items = ["person"]
+            kitchen_items = ["bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
+                            "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog",
+                            "pizza", "donut", "cake", "refrigerator", "oven", "toaster", "sink", "microwave"]
+            sports_items = ["frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
+                        "baseball glove", "skateboard", "surfboard", "tennis racket"]
+            personal_items = ["handbag", "tie", "suitcase", "umbrella", "backpack"]
+            if any(item in class_name for item in furniture_items):
+                return "furniture"
+            elif any(item in class_name for item in plant_items):
+                return "plant"
+            elif any(item in class_name for item in electronic_items):
+                return "electronics"
+            elif any(item in class_name for item in vehicle_items):
+                return "vehicle"
+            elif any(item in class_name for item in person_items):
+                return "person"
+            elif any(item in class_name for item in kitchen_items):
+                return "kitchen_items"
+            elif any(item in class_name for item in sports_items):
+                return "sports"
+            elif any(item in class_name for item in personal_items):
+                return "personal_items"
+            else:
+                return "misc"
+        except Exception as e:
+            logger.error(f"Error categorizing object: {str(e)}")
+            logger.error(traceback.format_exc())
+            return "misc"
+    def get_object_categories(self, detected_objects: List[Dict]) -> set:
+        """
+        從檢測到的物件中取得唯一的物件類別
+        Args:
+            detected_objects: 檢測到的物件列表
+        Returns:
+            唯一物件類別的集合
+        """
+        try:
+            object_categories = set()
+            for obj in detected_objects:
+                category = self.categorize_object(obj)
+                if category:
+                    object_categories.add(category)
+            logger.info(f"Found {len(object_categories)} unique object categories")
+            return object_categories
+        except Exception as e:
+            logger.error(f"Error getting object categories: {str(e)}")
+            logger.error(traceback.format_exc())
+            return set()
+    def identify_core_objects_for_scene(self, detected_objects: List[Dict], scene_type: str) -> List[Dict]:
+        """
+        識別定義特定場景類型的核心物件
+        Args:
+            detected_objects: 檢測到的物件列表
+            scene_type: 場景類型
+        Returns:
+            場景的核心物件列表
+        """
+        try:
+            core_objects = []
+            # 場景核心物件映射
+            scene_core_mapping = {
+                "bedroom": [59],  # bed
+                "kitchen": [68, 69, 71, 72],  # microwave, oven, sink, refrigerator
+                "living_room": [57, 58, 62],  # sofa, chair, tv
+                "dining_area": [60, 46, 47],  # dining table, fork, knife
+                "office_workspace": [63, 64, 66, 73]  # laptop, mouse, keyboard, book
+            }
+            if scene_type in scene_core_mapping:
+                core_class_ids = scene_core_mapping[scene_type]
+                for obj in detected_objects:
+                    if obj.get("class_id") in core_class_ids and obj.get("confidence", 0) >= 0.4:
+                        core_objects.append(obj)
+            logger.info(f"Identified {len(core_objects)} core objects for scene type '{scene_type}'")
+            return core_objects
+        except Exception as e:
+            logger.error(f"Error identifying core objects for scene '{scene_type}': {str(e)}")
+            logger.error(traceback.format_exc())
+            return []
+    def group_objects_by_category_and_region(self, detected_objects: List[Dict]) -> Dict:
+        """
+        將物件按類別和區域分組
+        Args:
+            detected_objects: 檢測到的物件列表
+        Returns:
+            按類別和區域分組的物件字典
+        """
+        try:
+            category_regions = {}
+            for obj in detected_objects:
+                category = self.categorize_object(obj)
+                if not category:
+                    continue
+                if category not in category_regions:
+                    category_regions[category] = {}
+                region = obj.get("region", "center")
+                if region not in category_regions[category]:
+                    category_regions[category][region] = []
+                category_regions[category][region].append(obj)
+            logger.info(f"Grouped objects into {len(category_regions)} categories across regions")
+            return category_regions
+        except Exception as e:
+            logger.error(f"Error grouping objects by category and region: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}
+    def filter_objects_by_confidence(self, detected_objects: List[Dict], min_confidence: float) -> List[Dict]:
+        """
+        根據信心度過濾物件
+        Args:
+            detected_objects: 檢測到的物件列表
+            min_confidence: 最小信心度閾值
+        Returns:
+            過濾後的物件列表
+        """
+        try:
+            filtered_objects = [
+                obj for obj in detected_objects
+                if obj.get("confidence", 0) >= min_confidence
+            ]
+            logger.info(f"Filtered {len(detected_objects)} objects to {len(filtered_objects)} objects with confidence >= {min_confidence}")
+            return filtered_objects
+        except Exception as e:
+            logger.error(f"Error filtering objects by confidence: {str(e)}")
+            logger.error(traceback.format_exc())
+            return detected_objects  # 發生錯誤時返回原始列表

prompt_template_manager.py ADDED Viewed

	@@ -0,0 +1,547 @@

+import logging
+import traceback
+from typing import Dict, List, Any, Optional
+class PromptTemplateError(Exception):
+    """提示模板相關錯誤的自定義異常"""
+    pass
+class PromptTemplateManager:
+    """
+    負責管理和格式化各種LLM提示模板。
+    包含場景描述增強、錯誤檢測、無檢測處理等不同場景的模板。
+    """
+    def __init__(self):
+        """初始化提示模板管理器"""
+        # set the logger
+        self.logger = logging.getLogger(self.__class__.__name__)
+        if not self.logger.handlers:
+            handler = logging.StreamHandler()
+            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+            handler.setFormatter(formatter)
+            self.logger.addHandler(handler)
+            self.logger.setLevel(logging.INFO)
+        # initialize all templates
+        self._initialize_templates()
+        self.logger.info("PromptTemplateManager initialized successfully")
+    def _initialize_templates(self):
+        """初始化所有提示模板"""
+        try:
+            self._setup_enhancement_template()
+            self._setup_verification_template()
+            self._setup_no_detection_template()
+            self.logger.info("All prompt templates initialized")
+        except Exception as e:
+            self.logger.error(f"Failed to initialize templates: {str(e)}")
+            self.logger.error(traceback.format_exc())
+            raise PromptTemplateError(f"Template initialization failed: {str(e)}") from e
+    def format_enhancement_prompt_with_landmark(self, scene_data: Dict[str, Any], object_list: str, original_description: str) -> str:
+        try:
+            # 確保場景類型被正確清理
+            scene_type = scene_data.get("scene_type", "unknown scene")
+            cleaned_scene_type = self._clean_scene_type(scene_type)
+            # 通用文本格式清理：處理底線和格式化問題
+            cleaned_description = self._clean_text_formatting(original_description)
+            # 額外清理場景類型底線格式
+            cleaned_description = self._clean_scene_type_underscores(cleaned_description)
+            # 強化輸入清理
+            cleaned_description = self._enhance_input_cleaning(cleaned_description)
+            # 在原始描述中替換未清理的場景類型
+            if scene_type != cleaned_scene_type:
+                cleaned_description = cleaned_description.replace(scene_type, cleaned_scene_type)
+            # 檢查是否有地標資訊
+            landmark_info = scene_data.get("landmark_location_info")
+            is_fallback = scene_data.get("is_fallback", False)
+            # 準備額外的地標指導內容
+            additional_guidance = ""
+            if landmark_info:
+                landmark_name = landmark_info.get("name", "")
+                landmark_location = landmark_info.get("location", "")
+                additional_guidance = f"""
+            LANDMARK LOCATION REQUIREMENT: This scene features {landmark_name} located in {landmark_location}.
+            16. MANDATORY: Include the specific location "{landmark_location}" when first mentioning {landmark_name}. Use natural phrasing such as "Located in {landmark_location}, the {landmark_name}..." or "The {landmark_name} in {landmark_location}..." or "Standing majestically in {landmark_location}, {landmark_name}...".
+            17. Avoid mechanical openings like "The tourist landmark is centered around" or "The scene is centered around". Instead, begin with the landmark itself as the subject.
+            18. NEVER use terms with underscores like "tourist_landmark" or "historical_site" in your response. Use natural language: "tourist landmark", "historical site", "cultural attraction" etc.
+            19. The geographical reference must appear naturally in the opening sentence, integrated as essential context rather than supplementary information."""
+            elif is_fallback:
+                additional_guidance = """
+            FALLBACK MODE: The previous enhancement was insufficient. Provide a more detailed description focusing on key visual elements, human activities, atmospheric details, and architectural features."""
+            # 建構完整的模板內容
+            if additional_guidance:
+                # 在CRITICAL RULES後添加地標相關指導
+                enhanced_template = self.enhance_description_template.replace(
+                    "15. When describing quantities or arrangements, use only information explicitly confirmed by the object detection system.",
+                    f"15. When describing quantities or arrangements, use only information explicitly confirmed by the object detection system.{additional_guidance}"
+                )
+            else:
+                enhanced_template = self.enhance_description_template
+            formatted_prompt = enhanced_template.format(
+                original_description=cleaned_description,
+                object_list=object_list
+            )
+            return formatted_prompt
+        except Exception as e:
+            self.logger.error(f"Failed to format enhancement prompt: {str(e)}")
+            self.logger.error(traceback.format_exc())
+            raise PromptTemplateError(f"Prompt formatting failed: {e}") from e
+    def _clean_text_formatting(self, text: str) -> str:
+        """
+        通用文本格式清理方法，處理底線、格式化等問題
+        Args:
+            text: 需要清理的原始文本
+        Returns:
+            str: 清理後的文本
+        """
+        if not text:
+            return text
+        try:
+            import re
+            # 替換常見的技術性詞彙
+            replacements = {
+                'tourist_landmark': 'tourist landmark',
+                'historical_site': 'historical site',
+                'religious_building': 'religious building',
+                'cultural_landmark': 'cultural landmark',
+                'architectural_site': 'architectural site',
+                'natural_landmark': 'natural landmark'
+            }
+            cleaned = text
+            for old_term, new_term in replacements.items():
+                cleaned = cleaned.replace(old_term, new_term)
+            # 處理其他底線情況
+            cleaned = re.sub(r'(\w+)_(\w+)', lambda m: f"{m.group(1)} {m.group(2)}", cleaned)
+            # 處理多個連續底線
+            cleaned = re.sub(r'_+', ' ', cleaned)
+            # 清理多餘空格
+            cleaned = re.sub(r'\s+', ' ', cleaned)
+            return cleaned.strip()
+        except Exception as e:
+            self.logger.warning(f"Error in text formatting cleanup: {str(e)}")
+            return text
+    def _clean_scene_type_underscores(self, text: str) -> str:
+        """
+        專門清理場景類型中的底線格式
+        Args:
+            text: 需要清理的文本
+        Returns:
+            str: 清理後的文本
+        """
+        if not text:
+            return text
+        try:
+            import re
+            # 專門處理場景類型的底線格式
+            scene_type_patterns = [
+                'urban_intersection', 'city_street', 'downtown_area', 'business_district',
+                'residential_area', 'commercial_zone', 'industrial_area', 'shopping_center',
+                'traffic_intersection', 'pedestrian_crossing', 'public_square'
+            ]
+            for pattern in scene_type_patterns:
+                if pattern in text:
+                    replacement = pattern.replace('_', ' ')
+                    text = text.replace(pattern, replacement)
+            # 處理任何剩餘的場景類型底線模式
+            text = re.sub(r'\b([a-z]+)_([a-z]+)(?=\s+(?:features|shows|displays|contains|is|area|zone|scene))',
+                        r'\1 \2', text, flags=re.IGNORECASE)
+            return text
+        except Exception as e:
+            self.logger.warning(f"Error in scene type underscore cleanup: {str(e)}")
+            return text
+    def _enhance_input_cleaning(self, description: str) -> str:
+        """
+        增強輸入描述的清理功能
+        Args:
+            description: 待清理的描述
+        Returns:
+            str: 清理後的描述
+        """
+        if not description:
+            return description
+        try:
+            import re
+            # 預防性清理底線格式
+            description = re.sub(r'\b(\w+)_(\w+)\b', r'\1 \2', description)
+            # 清理可能導致語法問題的模式
+            problematic_patterns = [
+                (r'\s+,\s+', ', '),  # 修正空格-逗號問題
+                (r'\bIn\s*,', 'In the area,'),  # 預防性修正
+                (r'\s+\.', '.'),  # 修正句號前空格
+            ]
+            for pattern, replacement in problematic_patterns:
+                description = re.sub(pattern, replacement, description)
+            return description.strip()
+        except Exception as e:
+            self.logger.warning(f"Error in enhanced input cleaning: {str(e)}")
+            return description
+    def _setup_enhancement_template(self):
+        """
+        Define the scene-description enhancement prompt template.
+        Stores the template on ``self.enhance_description_template``. The text is a
+        ``str.format`` template with two placeholders, ``{original_description}``
+        and ``{object_list}`` (the latter appears in several rules).
+        """
+        # NOTE: format_enhancement_prompt_with_landmark looks for the text of
+        # rule 15 below to insert extra guidance, so keep that rule's wording in
+        # sync with that method.
+        self.enhance_description_template = """
+            <|system|>
+            You are an expert visual analyst. Your task is to improve the readability and fluency of scene descriptions using STRICT factual accuracy.
+            Your **top priority is to avoid hallucination** or fabrication. You are working in a computer vision pipeline using object detection (YOLO) and image embeddings. You MUST treat the input object list as a whitelist. Do not speculate beyond this list.
+            </|system|>
+            <|user|>
+            Rewrite the following scene description to be fluent and clear. DO NOT add any objects, events, or spatial relationships that are not explicitly present in the original or object list.
+            ORIGINAL:
+            {original_description}
+            CRITICAL RULES:
+            1. CRITICAL ADHERENCE TO INPUT: Strictly adhere to the information explicitly provided in the ORIGINAL description and the {object_list}.
+               a. NEVER assume or infer room types, object functions, scene purposes, or abstract conceptual zones (e.g., 'personal items zone', 'activity area') unless such concepts, along with their specific constituent objects and locations, are explicitly detailed in the ORIGINAL description or clearly supported by multiple items in the {object_list}.
+               b. Your role is to rephrase and enhance the provided factual data, not to introduce new conceptual layers or interpretations not directly supported by the input.
+            2. OBJECT WHITELIST & DETAIL ACCURACY:
+               a. The provided {object_list} is an exhaustive list of objects confirmed by the vision system. Mention ONLY objects from this list or objects explicitly detailed in the ORIGINAL description.
+               b. DO NOT invent additional objects or infer the presence of 'various scattered objects' if only a single specific item (e.g., one 'handbag') is mentioned in relation to a category or area. Describe only what is explicitly listed.
+            3. NEVER speculate on object quantity. If the description says "10 people" , DO NOT say "dozens" or "many". Maintain the original quantity unless specified.
+            4. SPATIAL ACCURACY - STRICTLY FROM ORIGINAL:
+               a. Base ALL descriptions of object locations (e.g., 'foreground', 'background', 'middle center') and spatial relationships STRICTLY on the information explicitly provided in the ORIGINAL description.
+               b. If the ORIGINAL description states an object is 'in the background,' use that exact term. If it specifies 'in the foreground,' use that. If it describes an object as being 'carried by a person', reflect this precise relationship.
+               c. If the ORIGINAL description is less specific about an object's location (e.g., 'a car is present'), then use general, non-committal terms like 'visible in the scene' or 'present in the image.'
+               d. DO NOT re-interpret object positions from any perceived understanding of the raw image; your sole source for spatial information is the ORIGINAL description. Do not relocate objects (e.g., moving a carried handbag from the person to 'the background').
+            5. You MAY describe confirmed materials, colors, and composition style if visually obvious and non-speculative, AND if such details are hinted at or present in the ORIGINAL description or {object_list}.
+            6. Write 2–4 complete, well-structured sentences with punctuation.
+            7. Final output MUST be a single fluent paragraph of 60–200 words (not longer). Within this concise format, every sentence should aim to introduce new information or build upon previous statements without significant overlap.
+            8. Begin your response directly with the scene description. Do NOT include any introductory phrases, explanations, or formatting indicators.
+            9. Ensure grammatical completeness in all sentences. Each sentence must have a complete subject and predicate structure.
+               a. NEVER use underscore formatting (e.g., tourist_landmark, urban_intersection). Always use natural spacing (tourist landmark, urban intersection).
+               b. NEVER begin sentences with incomplete phrases like "In ," or "Overall," without proper subjects. Always ensure complete sentence structure.
+               c. AVOID redundant or circular phrasing such as "with lights turned illuminating" or "atmosphere of is one of."
+               d. If you encounter incomplete spatial descriptions like "visible in ," or "positioned in the middle of.", complete them naturally by adding appropriate context such as "visible in the scene" or "positioned in the middle of the frame", ensuring these completions are consistent with the ORIGINAL description. Always ensure spatial descriptions have complete prepositional phrases.
+               e. GRAMMAR AND FLUENCY CHECK: Ensure all sentences are grammatically flawless and flow naturally. Avoid awkward phrasing or dangling prepositions (e.g., 'glow over ,'). Mentally re-read your generated description to catch and correct such minor errors before finalizing.
+            10. Vary sentence structures naturally while maintaining grammatical accuracy.
+            11. CRITICAL: Avoid repeating the mention of specific objects, groups of objects, or their spatial arrangements. Once an object or layout aspect is described, only refer to it again if providing genuinely NEW and DISTINCT information or a significantly different perspective that adds substantial value. Strive for conciseness and information density.
+            12. Create natural spatial flow by connecting object descriptions organically rather than listing positions mechanically.
+            13. Use transitional phrases to connect ideas smoothly, varying expression patterns throughout the description.
+            14. For the concluding sentence, focus on the overall atmosphere, style, perceived activity, or overarching impression of the scene. DO NOT simply restate the primary objects or their layout as a summary or 'backdrop' if they have already been clearly described earlier in the paragraph. The conclusion should offer a higher-level takeaway.
+            15. When describing quantities or arrangements, use only information explicitly confirmed by the object detection system or ORIGINAL description.
+            </|user|>
+            <|assistant|>
+            """
+    def _setup_verification_template(self):
+        """
+        Define the detection-verification prompt template.
+        Stores the template on ``self.verify_detection_template``. It is a
+        ``str.format`` template with the placeholders ``{scene_type}``,
+        ``{scene_name}``, ``{confidence:.2f}``, ``{detected_objects}`` and
+        ``{clip_analysis}``.
+        """
+        self.verify_detection_template = """
+            Task: You are an advanced vision system that verifies computer vision detections for accuracy.
+            Analyze the following detection results and identify any potential errors or inconsistencies:
+            SCENE TYPE: {scene_type}
+            SCENE NAME: {scene_name}
+            CONFIDENCE: {confidence:.2f}
+            DETECTED OBJECTS: {detected_objects}
+            CLIP ANALYSIS RESULTS:
+            {clip_analysis}
+            Possible Errors to Check:
+            1. Objects misidentified (e.g., architectural elements labeled as vehicles)
+            2. Cultural elements misunderstood (e.g., Asian temple structures labeled as boats)
+            3. Objects that seem out of place for this type of scene
+            4. Inconsistencies between different detection systems
+            If you find potential errors, list them clearly with explanations. If the detections seem reasonable, state that they appear accurate.
+            Verification Results:
+            """
+    def _setup_no_detection_template(self):
+        """
+        Define the prompt template used when object detection found nothing.
+        Stores the template on ``self.no_detection_template``. It is a
+        ``str.format`` template with the placeholders ``{top_scene}``,
+        ``{top_confidence:.2f}``, ``{viewpoint}``, ``{lighting_condition}`` and
+        ``{cultural_analysis}``.
+        """
+        self.no_detection_template = """
+            Task: You are an advanced scene understanding system analyzing an image where standard object detection failed to identify specific objects.
+            Based on advanced image embeddings (CLIP analysis), we have the following information:
+            MOST LIKELY SCENE: {top_scene} (confidence: {top_confidence:.2f})
+            VIEWPOINT: {viewpoint}
+            LIGHTING: {lighting_condition}
+            CULTURAL ANALYSIS: {cultural_analysis}
+            Create a detailed description of what might be in this scene, considering:
+            1. The most likely type of location or setting
+            2. Possible architectural or natural elements present
+            3. The lighting and atmosphere
+            4. Potential cultural or regional characteristics
+            Your description should be natural, flowing, and offer insights into what the image likely contains despite the lack of specific object detection.
+            Scene Description:
+            """
+    def format_enhancement_prompt(self, scene_data: Dict[str, Any], object_list: str, original_description: str) -> str:
+        try:
+            # 確保場景類型被正確清理
+            scene_type = scene_data.get("scene_type", "unknown scene")
+            cleaned_scene_type = self._clean_scene_type(scene_type)
+            # 在原始描述中替換未清理的場景類型
+            if scene_type != cleaned_scene_type:
+                original_description = original_description.replace(scene_type, cleaned_scene_type)
+            formatted_prompt = self.enhance_description_template.format(
+                original_description=original_description,
+                object_list=object_list
+            )
+            return formatted_prompt
+        except Exception as e:
+            self.logger.error(f"Failed to format enhancement prompt: {str(e)}")
+            self.logger.error(traceback.format_exc())
+            raise PromptTemplateError(f"Prompt formatting failed: {e}") from e
+    def format_verification_prompt(self,
+                                 detected_objects: List[Dict],
+                                 clip_analysis: Dict[str, Any],
+                                 scene_type: str,
+                                 scene_name: str,
+                                 confidence: float) -> str:
+        """
+        格式化檢測結果驗證提示
+        Args:
+            detected_objects: 檢測到的物件列表
+            clip_analysis: CLIP分析結果
+            scene_type: 場景類型
+            scene_name: 場景名稱
+            confidence: 場景分類信心度
+        Returns:
+            str: 格式化後的驗證提示字符串
+        Raises:
+            PromptTemplateError: 當模板格式化失敗時
+        """
+        try:
+            self.logger.debug("Formatting verification prompt")
+            # 格式化物件列表和CLIP分析結果
+            objects_str = self._format_objects_for_prompt(detected_objects)
+            clip_str = self._format_clip_results(clip_analysis)
+            # 格式化提示
+            formatted_prompt = self.verify_detection_template.format(
+                scene_type=scene_type,
+                scene_name=scene_name,
+                confidence=confidence,
+                detected_objects=objects_str,
+                clip_analysis=clip_str
+            )
+            self.logger.debug(f"Verification prompt formatted successfully (length: {len(formatted_prompt)})")
+            return formatted_prompt
+        except Exception as e:
+            error_msg = f"Failed to format verification prompt: {str(e)}"
+            self.logger.error(error_msg)
+            self.logger.error(traceback.format_exc())
+            raise PromptTemplateError(error_msg) from e
+    def format_no_detection_prompt(self, clip_analysis: Dict[str, Any]) -> str:
+        """
+        格式化無檢測結果處理提示
+        Args:
+            clip_analysis: CLIP分析結果字典
+        Returns:
+            str: 格式化後的無檢測處理提示字符串
+        Raises:
+            PromptTemplateError: 當模板格式化失敗時
+        """
+        try:
+            self.logger.debug("Formatting no-detection prompt")
+            # 提取CLIP分析結果
+            top_scene, top_confidence = clip_analysis.get("top_scene", ("unknown", 0))
+            viewpoint = clip_analysis.get("viewpoint", ("standard", 0))[0]
+            lighting = clip_analysis.get("lighting_condition", ("unknown", 0))[0]
+            # 格式化文化分析
+            cultural_str = self._format_cultural_analysis(clip_analysis.get("cultural_analysis", {}))
+            # 格式化提示
+            formatted_prompt = self.no_detection_template.format(
+                top_scene=top_scene,
+                top_confidence=top_confidence,
+                viewpoint=viewpoint,
+                lighting_condition=lighting,
+                cultural_analysis=cultural_str
+            )
+            self.logger.debug(f"No-detection prompt formatted successfully (length: {len(formatted_prompt)})")
+            return formatted_prompt
+        except Exception as e:
+            error_msg = f"Failed to format no-detection prompt: {str(e)}"
+            self.logger.error(error_msg)
+            self.logger.error(traceback.format_exc())
+            raise PromptTemplateError(error_msg) from e
+    def _clean_scene_type(self, scene_type: str) -> str:
+        """
+        清理場景類型，使其更適合用於提示詞
+        Args:
+            scene_type: 原始場景類型
+        Returns:
+            str: 清理後的場景類型
+        """
+        if not scene_type:
+            return "scene"
+        # 將底線替換為空格並首字母大寫
+        if '_' in scene_type:
+            return ' '.join(word.capitalize() for word in scene_type.split('_'))
+        return scene_type
+    def _format_objects_for_prompt(self, objects: List[Dict]) -> str:
+        """
+        格式化物件列表以用於提示
+        Args:
+            objects: 檢測到的物件列表
+        Returns:
+            str: 格式化後的物件字符串
+        """
+        if not objects:
+            return "No objects detected"
+        try:
+            formatted = []
+            for obj in objects:
+                class_name = obj.get("class_name", "unknown")
+                confidence = obj.get("confidence", 0)
+                formatted.append(f"{class_name} (confidence: {confidence:.2f})")
+            return "\n- " + "\n- ".join(formatted)
+        except Exception as e:
+            self.logger.warning(f"Error formatting objects: {str(e)}")
+            return "Object formatting error"
+    def _format_clip_results(self, clip_analysis: Dict) -> str:
+        """
+        格式化CLIP分析結果以用於提示
+        Args:
+            clip_analysis: CLIP分析結果字典
+        Returns:
+            str: 格式化後的CLIP分析字符串
+        """
+        if not clip_analysis or "error" in clip_analysis:
+            return "No CLIP analysis available"
+        try:
+            parts = ["CLIP Analysis Results:"]
+            # 添加頂級場景
+            top_scene, confidence = clip_analysis.get("top_scene", ("unknown", 0))
+            parts.append(f"- Most likely scene: {top_scene} (confidence: {confidence:.2f})")
+            # 添加視角
+            viewpoint, vp_conf = clip_analysis.get("viewpoint", ("standard", 0))
+            parts.append(f"- Camera viewpoint: {viewpoint} (confidence: {vp_conf:.2f})")
+            # 添加物件組合
+            if "object_combinations" in clip_analysis:
+                combos = []
+                for combo, score in clip_analysis["object_combinations"][:3]:
+                    combos.append(f"{combo} ({score:.2f})")
+                parts.append(f"- Object combinations: {', '.join(combos)}")
+            # 添加文化分析
+            if "cultural_analysis" in clip_analysis:
+                parts.append("- Cultural analysis:")
+                for culture_type, data in clip_analysis["cultural_analysis"].items():
+                    best_desc = data.get("best_description", "")
+                    desc_conf = data.get("confidence", 0)
+                    parts.append(f"  * {culture_type}: {best_desc} ({desc_conf:.2f})")
+            return "\n".join(parts)
+        except Exception as e:
+            self.logger.warning(f"Error formatting CLIP results: {str(e)}")
+            return "CLIP analysis formatting error"
+    def _format_cultural_analysis(self, cultural_analysis: Dict) -> str:
+        """
+        格式化文化分析結果
+        Args:
+            cultural_analysis: 文化分析結果字典
+        Returns:
+            str: 格式化後的文化分析字符串
+        """
+        if not cultural_analysis:
+            return "No specific cultural elements detected"
+        try:
+            parts = []
+            for culture_type, data in cultural_analysis.items():
+                best_desc = data.get("best_description", "")
+                desc_conf = data.get("confidence", 0)
+                parts.append(f"{culture_type}: {best_desc} (confidence: {desc_conf:.2f})")
+            return "\n".join(parts)
+        except Exception as e:
+            self.logger.warning(f"Error formatting cultural analysis: {str(e)}")
+            return "Cultural analysis formatting error"
+    def get_template_info(self) -> Dict[str, Any]:
+        """
+        獲取模板管理器的信息
+        Returns:
+            Dict[str, Any]: 包含模板數量和狀態的信息
+        """
+        return {
+            "templates_count": 3,
+            "available_templates": [
+                "enhance_description_template",
+                "verify_detection_template",
+                "no_detection_template"
+            ],
+            "initialization_status": "success"
+        }

region_analyzer.py ADDED Viewed

	@@ -0,0 +1,487 @@

+import logging
+import traceback
+from typing import Dict, List, Any
+logger = logging.getLogger(__name__)
+class RegionAnalyzer:
+    """
+    負責處理圖像區域劃分和基礎空間分析功能
+    專注於3x3網格的區域劃分、物件分布分析和空間多樣性計算
+    """
+    def __init__(self):
+        """初始化區域分析器，定義3x3網格區域"""
+        try:
+            # 定義圖像的3x3網格區域
+            self.regions = {
+                "top_left": (0, 0, 1/3, 1/3),
+                "top_center": (1/3, 0, 2/3, 1/3),
+                "top_right": (2/3, 0, 1, 1/3),
+                "middle_left": (0, 1/3, 1/3, 2/3),
+                "middle_center": (1/3, 1/3, 2/3, 2/3),
+                "middle_right": (2/3, 1/3, 1, 2/3),
+                "bottom_left": (0, 2/3, 1/3, 1),
+                "bottom_center": (1/3, 2/3, 2/3, 1),
+                "bottom_right": (2/3, 2/3, 1, 1)
+            }
+            logger.info("RegionAnalyzer initialized successfully with 3x3 grid regions")
+        except Exception as e:
+            logger.error(f"Failed to initialize RegionAnalyzer: {str(e)}")
+            logger.error(traceback.format_exc())
+            raise
+    def determine_region(self, x: float, y: float) -> str:
+        """
+        判斷點位於哪個區域
+        Args:
+            x: 標準化x座標 (0-1)
+            y: 標準化y座標 (0-1)
+        Returns:
+            區域名稱
+        """
+        try:
+            for region_name, (x1, y1, x2, y2) in self.regions.items():
+                if x1 <= x < x2 and y1 <= y < y2:
+                    return region_name
+            logger.warning(f"Point ({x}, {y}) does not fall into any defined region")
+            return "unknown"
+        except Exception as e:
+            logger.error(f"Error determining region for point ({x}, {y}): {str(e)}")
+            logger.error(traceback.format_exc())
+            return "unknown"
+    def get_spatial_description_phrase(self, region: str) -> str:
+        """
+        將region ID轉換為完整的空間描述短語，包含適當的介詞結構
+        Args:
+            region: 區域標識符（如 "middle_center", "top_left"）
+        Returns:
+            str: 完整的空間描述短語，空值時返回空字串
+        """
+        try:
+            # 處理空值或無效輸入
+            if not region or region.strip() == "" or region == "unknown":
+                return "within the visible area"
+            # 清理region格式，移除底線
+            clean_region = region.replace('_', ' ').strip().lower()
+            # 根據區域位置生成自然語言描述
+            region_mappings = {
+                "top left": "in the upper left area",
+                "top center": "in the upper area",
+                "top right": "in the upper right area",
+                "middle left": "on the left side",
+                "middle center": "in the center",
+                "center": "in the center",
+                "middle right": "on the right side",
+                "bottom left": "in the lower left area",
+                "bottom center": "in the lower area",
+                "bottom right": "in the lower right area"
+            }
+            # 直接映射匹配
+            if clean_region in region_mappings:
+                return region_mappings[clean_region]
+            # 模糊匹配方位的處理
+            if "top" in clean_region and "left" in clean_region:
+                return "in the upper left area"
+            elif "top" in clean_region and "right" in clean_region:
+                return "in the upper right area"
+            elif "bottom" in clean_region and "left" in clean_region:
+                return "in the lower left area"
+            elif "bottom" in clean_region and "right" in clean_region:
+                return "in the lower right area"
+            elif "top" in clean_region:
+                return "in the upper area"
+            elif "bottom" in clean_region:
+                return "in the lower area"
+            elif "left" in clean_region:
+                return "on the left side"
+            elif "right" in clean_region:
+                return "on the right side"
+            elif "center" in clean_region or "middle" in clean_region:
+                return "in the center"
+            else:
+                # 對於無法辨識的區域，返回通用描述
+                return f"in the {clean_region} area"
+        except Exception as e:
+            logger.warning(f"Error generating spatial description for region '{region}': {str(e)}")
+            return ""
+    def get_contextual_spatial_description(self, region: str, object_type: str = "") -> str:
+        """
+        根據物件類型提供更具情境的空間描述
+        Args:
+            region: 區域標識符
+            object_type: 物件類型，用於優化描述語境
+        Returns:
+            str: 情境化的空間描述短語
+        """
+        try:
+            # 獲取基礎空間描述
+            base_description = self.get_spatial_description_phrase(region)
+            if not base_description:
+                return ""
+            # 根據物件類型調整描述語境
+            if object_type:
+                object_type_lower = object_type.lower()
+                # 對於辨識到人相關，用更自然的位置描述
+                if "person" in object_type_lower or "people" in object_type_lower:
+                    if "center" in base_description:
+                        return "in the central area"
+                    elif "upper" in base_description:
+                        return "in the background"
+                    elif "lower" in base_description:
+                        return "in the foreground"
+                # 對於車輛，強調道路位置
+                elif any(vehicle in object_type_lower for vehicle in ["car", "vehicle", "truck", "bus"]):
+                    if "left" in base_description:
+                        return "on the left side of the scene"
+                    elif "right" in base_description:
+                        return "on the right side of the scene"
+                    elif "center" in base_description:
+                        return "in the central area"
+                # 對於交通設施，使用更具體的位置描述
+                elif "traffic" in object_type_lower:
+                    if "upper" in base_description:
+                        return "positioned in the upper portion"
+                    elif "center" in base_description:
+                        return "centrally positioned"
+                    else:
+                        return base_description.replace("in the", "positioned in the")
+            return base_description
+        except Exception as e:
+            logger.warning(f"Error generating contextual spatial description: {str(e)}")
+            return self.get_spatial_description_phrase(region)
+    def validate_region_input(self, region: str) -> bool:
+        """
+        驗證region輸入是否有效
+        Args:
+            region: 待驗證的區域標識符
+        Returns:
+            bool: 是否為有效的region
+        """
+        try:
+            if not region or region.strip() == "":
+                return False
+            # 清理並檢查是否為已知區域
+            clean_region = region.replace('_', ' ').strip().lower()
+            known_regions = [
+                "top left", "top center", "top right",
+                "middle left", "middle center", "middle right",
+                "bottom left", "bottom center", "bottom right",
+                "center", "unknown"
+            ]
+            # 直接匹配或包含關鍵詞匹配
+            if clean_region in known_regions:
+                return True
+            # 檢查是否包含有效的位置關鍵詞組合
+            position_keywords = ["top", "bottom", "left", "right", "center", "middle"]
+            has_valid_keyword = any(keyword in clean_region for keyword in position_keywords)
+            return has_valid_keyword
+        except Exception as e:
+            logger.warning(f"Error validating region input '{region}': {str(e)}")
+            return False
+    def get_enhanced_directional_description(self, region: str) -> str:
+        """
+        增強版的方位描述生成，提供更豐富的方位資訊
+        擴展原有的get_directional_description方法功能
+        Args:
+            region: 區域名稱
+        Returns:
+            str: 增強的方位描述字串
+        """
+        try:
+            if not self.validate_region_input(region):
+                return "central"
+            region_lower = region.replace('_', ' ').strip().lower()
+            # 用比較準確的方位映射
+            direction_mappings = {
+                "top left": "northwest",
+                "top center": "north",
+                "top right": "northeast",
+                "middle left": "west",
+                "middle center": "central",
+                "center": "central",
+                "middle right": "east",
+                "bottom left": "southwest",
+                "bottom center": "south",
+                "bottom right": "southeast"
+            }
+            if region_lower in direction_mappings:
+                return direction_mappings[region_lower]
+            # 模糊匹配邏輯保持與原方法相同
+            if "top" in region_lower and "left" in region_lower:
+                return "northwest"
+            elif "top" in region_lower and "right" in region_lower:
+                return "northeast"
+            elif "bottom" in region_lower and "left" in region_lower:
+                return "southwest"
+            elif "bottom" in region_lower and "right" in region_lower:
+                return "southeast"
+            elif "top" in region_lower:
+                return "north"
+            elif "bottom" in region_lower:
+                return "south"
+            elif "left" in region_lower:
+                return "west"
+            elif "right" in region_lower:
+                return "east"
+            else:
+                return "central"
+        except Exception as e:
+            logger.error(f"Error getting enhanced directional description for region '{region}': {str(e)}")
+            return "central"
+    def analyze_regions(self, detected_objects: List[Dict]) -> Dict:
+        """
+        分析物件在各區域的分布情況
+        Args:
+            detected_objects: 包含位置資訊的檢測物件列表
+        Returns:
+            包含區域分析結果的字典
+        """
+        try:
+            if not detected_objects:
+                logger.warning("No detected objects provided for region analysis")
+                return {
+                    "counts": {region: 0 for region in self.regions.keys()},
+                    "main_focus": [],
+                    "objects_by_region": {region: [] for region in self.regions.keys()}
+                }
+            # 計算每個區域的物件數量
+            region_counts = {region: 0 for region in self.regions.keys()}
+            region_objects = {region: [] for region in self.regions.keys()}
+            for obj in detected_objects:
+                try:
+                    region = obj.get("region", "unknown")
+                    if region in region_counts:
+                        region_counts[region] += 1
+                        region_objects[region].append({
+                            "class_id": obj.get("class_id"),
+                            "class_name": obj.get("class_name")
+                        })
+                    else:
+                        logger.warning(f"Unknown region '{region}' found in object")
+                except Exception as e:
+                    logger.error(f"Error processing object in region analysis: {str(e)}")
+                    continue
+            # 確定主要焦點區域（按物件數量排序的前1-2個區域）
+            sorted_regions = sorted(region_counts.items(), key=lambda x: x[1], reverse=True)
+            main_regions = [region for region, count in sorted_regions if count > 0][:2]
+            result = {
+                "counts": region_counts,
+                "main_focus": main_regions,
+                "objects_by_region": region_objects
+            }
+            logger.info(f"Region analysis completed. Main focus areas: {main_regions}")
+            return result
+        except Exception as e:
+            logger.error(f"Error in region analysis: {str(e)}")
+            logger.error(traceback.format_exc())
+            # 返回空的結果結構而不是拋出異常
+            return {
+                "counts": {region: 0 for region in self.regions.keys()},
+                "main_focus": [],
+                "objects_by_region": {region: [] for region in self.regions.keys()}
+            }
+    def create_distribution_map(self, detected_objects: List[Dict]) -> Dict:
+        """
+        創建物件在各區域分布的詳細地圖，用於空間分析
+        Args:
+            detected_objects: 檢測到的物件列表
+        Returns:
+            包含各區域分布詳情的字典
+        """
+        try:
+            if not detected_objects:
+                logger.warning("No detected objects provided for distribution map creation")
+                return self._get_empty_distribution_map()
+            distribution = {}
+            # 初始化所有區域
+            for region in self.regions.keys():
+                distribution[region] = {
+                    "total": 0,
+                    "objects": {},
+                    "density": 0
+                }
+            # 填充分布資料
+            for obj in detected_objects:
+                try:
+                    region = obj.get("region", "unknown")
+                    class_id = obj.get("class_id")
+                    class_name = obj.get("class_name", "unknown")
+                    if region not in distribution:
+                        logger.warning(f"Unknown region '{region}' found, skipping object")
+                        continue
+                    distribution[region]["total"] += 1
+                    if class_id not in distribution[region]["objects"]:
+                        distribution[region]["objects"][class_id] = {
+                            "name": class_name,
+                            "count": 0,
+                            "positions": []
+                        }
+                    distribution[region]["objects"][class_id]["count"] += 1
+                    # 儲存位置資訊用於空間關係分析
+                    normalized_center = obj.get("normalized_center")
+                    if normalized_center:
+                        distribution[region]["objects"][class_id]["positions"].append(normalized_center)
+                except Exception as e:
+                    logger.error(f"Error processing object in distribution map: {str(e)}")
+                    continue
+            # 計算每個區域的物件密度
+            for region, data in distribution.items():
+                # 假設所有區域在網格中大小相等
+                data["density"] = data["total"] / 1
+            logger.info("Distribution map created successfully")
+            return distribution
+        except Exception as e:
+            logger.error(f"Error creating distribution map: {str(e)}")
+            logger.error(traceback.format_exc())
+            return self._get_empty_distribution_map()
+    def calculate_spatial_diversity(self, detected_objects: List[Dict]) -> float:
+        """
+        計算物件空間分布的多樣性
+        評估物件是否分散在不同區域，避免所有物件集中在單一區域
+        Args:
+            detected_objects: 檢測到的物件列表
+        Returns:
+            空間多樣性評分 (0.0-1.0)
+        """
+        try:
+            if not detected_objects:
+                logger.warning("No detected objects provided for spatial diversity calculation")
+                return 0.0
+            regions = set()
+            for obj in detected_objects:
+                region = obj.get("region", "center")
+                regions.add(region)
+            unique_regions = len(regions)
+            diversity_score = min(unique_regions / 2.0, 1.0)
+            logger.info(f"Spatial diversity calculated: {diversity_score:.3f} (regions: {unique_regions})")
+            return diversity_score
+        except Exception as e:
+            logger.error(f"Error calculating spatial diversity: {str(e)}")
+            logger.error(traceback.format_exc())
+            return 0.0
+    def get_directional_description(self, region: str) -> str:
+        """
+        將區域名稱轉換為方位描述（東西南北）
+        Args:
+            region: 區域名稱
+        Returns:
+            方位描述字串
+        """
+        try:
+            region_lower = region.lower()
+            if "top" in region_lower and "left" in region_lower:
+                return "northwest"
+            elif "top" in region_lower and "right" in region_lower:
+                return "northeast"
+            elif "bottom" in region_lower and "left" in region_lower:
+                return "southwest"
+            elif "bottom" in region_lower and "right" in region_lower:
+                return "southeast"
+            elif "top" in region_lower:
+                return "north"
+            elif "bottom" in region_lower:
+                return "south"
+            elif "left" in region_lower:
+                return "west"
+            elif "right" in region_lower:
+                return "east"
+            else:
+                return "central"
+        except Exception as e:
+            logger.error(f"Error getting directional description for region '{region}': {str(e)}")
+            return "central"
+    def _get_empty_distribution_map(self) -> Dict:
+        """
+        返回空的分布地圖結構
+        Returns:
+            空的分布地圖字典
+        """
+        distribution = {}
+        for region in self.regions.keys():
+            distribution[region] = {
+                "total": 0,
+                "objects": {},
+                "density": 0
+            }
+        return distribution

requirements.txt CHANGED Viewed

@@ -14,5 +14,4 @@ accelerate
 bitsandbytes
 sentencepiece
 huggingface_hub>=0.19.0
-scikit-image
 urllib3>=1.26.0

 bitsandbytes
 sentencepiece
 huggingface_hub>=0.19.0
 urllib3>=1.26.0

response_processor.py ADDED Viewed

	@@ -0,0 +1,1049 @@

+import re
+import logging
+import traceback
+from typing import Dict, List, Any, Optional, Set
+class ResponseProcessingError(Exception):
+    """回應處理相關錯誤的自定義異常"""
+    pass
+class ResponseProcessor:
+    """
+    負責處理和清理LLM模型輸出的回應。
+    包含格式清理、重複內容檢測、語法完整性確保等功能。
+    """
+    def __init__(self):
+        """初始化回應處理器"""
+        # set the logger
+        self.logger = logging.getLogger(self.__class__.__name__)
+        if not self.logger.handlers:
+            handler = logging.StreamHandler()
+            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+            handler.setFormatter(formatter)
+            self.logger.addHandler(handler)
+            self.logger.setLevel(logging.INFO)
+        # 初始化清理規則和替換字典
+        self._initialize_cleaning_rules()
+        self.logger.info("ResponseProcessor initialized successfully")
+    def _initialize_cleaning_rules(self) -> None:
+        """
+        Populate the rule tables used when cleaning LLM responses.
+        Sets self.replacement_alternatives, self.prefixes_to_remove,
+        self.suffixes_to_remove, self.repetitive_patterns,
+        self.slash_replacements and self.underscore_replacements.
+        Raises:
+            ResponseProcessingError: If building the tables fails.
+        """
+        try:
+            # Alternative words for vocabulary that tends to be repeated.
+            self.replacement_alternatives = {
+                'visible': ['present', 'evident', 'apparent', 'observable'],
+                'positioned': ['arranged', 'placed', 'set', 'organized'],
+                'located': ['found', 'placed', 'situated', 'established'],
+                'situated': ['placed', 'positioned', 'arranged', 'set'],
+                'appears': ['seems', 'looks', 'presents', 'exhibits'],
+                'features': ['includes', 'contains', 'displays', 'showcases'],
+                'shows': ['reveals', 'presents', 'exhibits', 'demonstrates'],
+                'displays': ['presents', 'exhibits', 'shows', 'reveals']
+            }
+            # Introductory phrases to strip from the start of a response.
+            # NOTE(review): "Here's the enhanced description:" is listed twice;
+            # presumably harmless, confirm against the consumer of this list.
+            self.prefixes_to_remove = [
+                "Here's the enhanced description:",
+                "Enhanced description:",
+                "Here is the enhanced scene description:",
+                "I've enhanced the description while preserving all factual details:",
+                "Enhanced Description:",
+                "Scene Description:",
+                "Description:",
+                "Here is the enhanced description:",
+                "Here's the enhanced description:",
+                "Here is a rewritten scene description that adheres to the provided critical rules:",
+                "Here is the rewritten scene description:",
+                "Here's a rewritten scene description:",
+                "The rewritten scene description is as follows:"
+            ]
+            # Closing phrases to strip from the end of a response.
+            self.suffixes_to_remove = [
+                "I've maintained all the key factual elements",
+                "I've preserved all the factual details",
+                "All factual elements have been maintained"
+            ]
+            # (regex, message) pairs used to detect repetition.
+            self.repetitive_patterns = [
+                (r'\b(visible)\b.*?\b(visible)\b', 'Multiple uses of "visible" detected'),
+                (r'\b(positioned)\b.*?\b(positioned)\b', 'Multiple uses of "positioned" detected'),
+                (r'\b(located)\b.*?\b(located)\b', 'Multiple uses of "located" detected'),
+                (r'\b(situated)\b.*?\b(situated)\b', 'Multiple uses of "situated" detected'),
+                (r'\b(appears)\b.*?\b(appears)\b', 'Multiple uses of "appears" detected'),
+                (r'\b(features)\b.*?\b(features)\b', 'Multiple uses of "features" detected'),
+                (r'\bThis\s+(\w+)\s+.*?\bThis\s+\1\b', 'Repetitive sentence structure detected')
+            ]
+            # Replacements for slash-joined adjective pairs (model output
+            # sometimes contains this slash formatting problem).
+            self.slash_replacements = {
+                'sunrise/sunset': 'warm lighting',
+                'sunset/sunrise': 'warm lighting',
+                'day/night': 'ambient lighting',
+                'night/day': 'ambient lighting',
+                'morning/evening': 'soft lighting',
+                'evening/morning': 'soft lighting',
+                'dawn/dusk': 'gentle lighting',
+                'dusk/dawn': 'gentle lighting',
+                'sunny/cloudy': 'natural lighting',
+                'cloudy/sunny': 'natural lighting',
+                'bright/dark': 'varied lighting',
+                'dark/bright': 'varied lighting',
+                'light/shadow': 'contrasting illumination',
+                'shadow/light': 'contrasting illumination',
+                'indoor/outdoor': 'mixed environment',
+                'outdoor/indoor': 'mixed environment',
+                'inside/outside': 'transitional space',
+                'outside/inside': 'transitional space',
+                'urban/rural': 'diverse landscape',
+                'rural/urban': 'diverse landscape',
+                'modern/traditional': 'architectural blend',
+                'traditional/modern': 'architectural blend',
+                'old/new': 'varied architecture',
+                'new/old': 'varied architecture',
+                'busy/quiet': 'dynamic atmosphere',
+                'quiet/busy': 'dynamic atmosphere',
+                'crowded/empty': 'varying occupancy',
+                'empty/crowded': 'varying occupancy',
+                'hot/cold': 'comfortable temperature',
+                'cold/hot': 'comfortable temperature',
+                'wet/dry': 'mixed conditions',
+                'dry/wet': 'mixed conditions',
+                'summer/winter': 'seasonal atmosphere',
+                'winter/summer': 'seasonal atmosphere',
+                'spring/autumn': 'transitional season',
+                'autumn/spring': 'transitional season',
+                'left/right': 'balanced composition',
+                'right/left': 'balanced composition',
+                'near/far': 'layered perspective',
+                'far/near': 'layered perspective',
+                'high/low': 'varied elevation',
+                'low/high': 'varied elevation',
+                'big/small': 'diverse scale',
+                'small/big': 'diverse scale',
+                'wide/narrow': 'varied width',
+                'narrow/wide': 'varied width',
+                'open/closed': 'flexible space',
+                'closed/open': 'flexible space',
+                'public/private': 'community space',
+                'private/public': 'community space',
+                'formal/informal': 'relaxed setting',
+                'informal/formal': 'relaxed setting',
+                'commercial/residential': 'mixed-use area',
+                'residential/commercial': 'mixed-use area'
+            }
+            # Extended table mapping underscore identifiers to plain words.
+            self.underscore_replacements = {
+                'urban_intersection': 'urban intersection',
+                'tourist_landmark': 'tourist landmark',
+                'historical_site': 'historical site',
+                'religious_building': 'religious building',
+                'natural_landmark': 'natural landmark',
+                'commercial_area': 'commercial area',
+                'residential_area': 'residential area',
+                'public_space': 'public space',
+                'outdoor_scene': 'outdoor scene',
+                'indoor_scene': 'indoor scene',
+                'street_scene': 'street scene',
+                'city_center': 'city center',
+                'shopping_district': 'shopping district',
+                'business_district': 'business district',
+                'traffic_light': 'traffic light',
+                'street_lamp': 'street lamp',
+                'parking_meter': 'parking meter',
+                'fire_hydrant': 'fire hydrant',
+                'bus_stop': 'bus stop',
+                'train_station': 'train station',
+                'police_car': 'police car',
+                'fire_truck': 'fire truck',
+                'school_bus': 'school bus',
+                'time_of_day': 'time of day',
+                'weather_condition': 'weather condition',
+                'lighting_condition': 'lighting condition',
+                'atmospheric_condition': 'atmospheric condition',
+                'human_activity': 'human activity',
+                'pedestrian_traffic': 'pedestrian traffic',
+                'vehicle_traffic': 'vehicle traffic',
+                'social_gathering': 'social gathering',
+                'object_detection': 'object detection',
+                'scene_analysis': 'scene analysis',
+                'image_classification': 'image classification',
+                'computer_vision': 'computer vision'
+            }
+            self.logger.info("Cleaning rules initialized successfully")
+        except Exception as e:
+            error_msg = f"Failed to initialize cleaning rules: {str(e)}"
+            self.logger.error(error_msg)
+            self.logger.error(traceback.format_exc())
+            raise ResponseProcessingError(error_msg) from e
+    def clean_response(self, response: str, model_type: str = "general") -> str:
+        """
+        清理LLM回應
+        Args:
+            response: 原始LLM回應
+            model_type: 模型類型（用於特定清理規則）
+        Returns:
+            str: 清理後的回應
+        Raises:
+            ResponseProcessingError: 當回應處理失敗時
+        """
+        if not response:
+            raise ResponseProcessingError("Empty response provided for cleaning")
+        try:
+            self.logger.debug(f"Starting response cleaning (original length: {len(response)})")
+            # 保存原始回應作為備份
+            original_response = response
+            # 根據模型類型選擇清理策略
+            if "llama" in model_type.lower():
+                cleaned_response = self._clean_llama_response(response)
+            else:
+                cleaned_response = self._clean_general_response(response)
+            # 如果清理後內容過短，嘗試���原始回應中恢復
+            if len(cleaned_response.strip()) < 40:
+                self.logger.warning("Cleaned response too short, attempting recovery")
+                cleaned_response = self._recover_from_overcleaning(original_response)
+            # 最終驗證
+            self._validate_cleaned_response(cleaned_response)
+            self.logger.debug(f"Response cleaning completed (final length: {len(cleaned_response)})")
+            return cleaned_response
+        except Exception as e:
+            error_msg = f"Response cleaning failed: {str(e)}"
+            self.logger.error(error_msg)
+            self.logger.error(traceback.format_exc())
+            raise ResponseProcessingError(error_msg) from e
+    def _clean_llama_response(self, response: str) -> str:
+        """
+        專門處理Llama模型的回應清理
+        Args:
+            response: 原始Llama回應
+        Returns:
+            str: 清理後的回應
+        """
+        # 首先應用通用清理
+        response = self._clean_general_response(response)
+        # Llama特有的前綴清理
+        llama_prefixes = [
+            "Here's the enhanced description:",
+            "Enhanced description:",
+            "Here is the enhanced scene description:",
+            "I've enhanced the description while preserving all factual details:"
+        ]
+        for prefix in llama_prefixes:
+            if response.lower().startswith(prefix.lower()):
+                response = response[len(prefix):].strip()
+        # Llama特有的後綴清理
+        llama_suffixes = [
+            "I've maintained all the key factual elements",
+            "I've preserved all the factual details",
+            "All factual elements have been maintained"
+        ]
+        for suffix in llama_suffixes:
+            if response.lower().endswith(suffix.lower()):
+                response = response[:response.rfind(suffix)].strip()
+        return response
+    def _clean_general_response(self, response: str) -> str:
+        """
+        通用回應清理方法
+        Args:
+            response: 原始回應
+        Returns:
+            str: 清理後的回應
+        """
+        response = self._critical_format_preprocess(response)
+        # 1. 移除系統remark
+        response = self._remove_system_markers(response)
+        # 2. 移除介紹性prefix
+        response = self._remove_introduction_prefixes(response)
+        # 3. 移除格式標記和上下文標籤
+        response = self._remove_format_markers(response)
+        # 4. 清理場景類型引用
+        response = self._clean_scene_type_references(response)
+        # 5. 標準化標點符號
+        response = self._normalize_punctuation(response)
+        # 6. 移除重複句子
+        response = self._remove_duplicate_sentences(response)
+        # 7. 處理重複詞彙
+        response = self._handle_repetitive_vocabulary(response)
+        # 8. ensure completement
+        response = self._ensure_grammatical_completeness(response)
+        # 9. 控制字數長度
+        response = self._control_word_length(response)
+        # 10. 最終格式化
+        response = self._final_formatting(response)
+        return response
+    def _critical_format_preprocess(self, response: str) -> str:
+        """
+        關鍵格式預處理，處理最常見的格式問題
+        Args:
+            response: 原始回應
+        Returns:
+            str: 預處理後的回應
+        """
+        if not response:
+            return response
+        try:
+            import re
+            # 第一優先級：處理斜線問題
+            # 首先處理已知的斜線組合，使用形容詞替換
+            for slash_combo, replacement in self.slash_replacements.items():
+                if slash_combo.lower() in response.lower():
+                    # 保持原始大小寫格式
+                    if slash_combo.upper() in response:
+                        replacement_formatted = replacement.upper()
+                    elif slash_combo.title() in response:
+                        replacement_formatted = replacement.title()
+                    else:
+                        replacement_formatted = replacement
+                    # 執行替換（不區分大小寫）
+                    response = re.sub(re.escape(slash_combo), replacement_formatted, response, flags=re.IGNORECASE)
+                    self.logger.debug(f"Replaced slash pattern '{slash_combo}' with '{replacement_formatted}'")
+            # 處理其他未預定義的斜線模式
+            # 標準斜線模式：word/word
+            slash_pattern = r'\b([a-zA-Z]+)/([a-zA-Z]+)\b'
+            matches = list(re.finditer(slash_pattern, response))
+            for match in reversed(matches):  # 從後往前處理避免位置偏移
+                word1, word2 = match.groups()
+                # 選擇較短或更常見的詞作為替換
+                if len(word1) <= len(word2):
+                    replacement = word1
+                else:
+                    replacement = word2
+                response = response[:match.start()] + replacement + response[match.end():]
+                self.logger.debug(f"Replaced general slash pattern '{match.group(0)}' with '{replacement}'")
+            # 第二優先級：處理底線格式
+            # 首先處理已知的底線組合
+            for underscore_combo, replacement in self.underscore_replacements.items():
+                if underscore_combo in response:
+                    response = response.replace(underscore_combo, replacement)
+                    self.logger.debug(f"Replaced underscore pattern '{underscore_combo}' with '{replacement}'")
+            # 處理三個詞的底線組合：word_word_word → word word word
+            response = re.sub(r'\b([a-z]+)_([a-z]+)_([a-z]+)\b', r'\1 \2 \3', response)
+            # 處理任何剩餘的底線模式：word_word → word word
+            response = re.sub(r'\b([a-zA-Z]+)_([a-zA-Z]+)\b', r'\1 \2', response)
+            # 第三優先級：修正不完整句子
+            incomplete_sentence_fixes = [
+                (r'\bIn\s*,\s*', 'Throughout the area, '),
+                (r'\bOverall,\s+exudes\b', 'Overall, the scene exudes'),
+                (r'\bThe overall atmosphere of\s+is\b', 'The overall atmosphere'),
+                (r'\bwith its lights turned illuminating\b', 'with its lights illuminating'),
+                (r'\bwhere it stands as\b', 'where it stands as'),
+            ]
+            for pattern, replacement in incomplete_sentence_fixes:
+                response = re.sub(pattern, replacement, response, flags=re.IGNORECASE)
+            # 第四優先級：語法修正處理(像是person and people)
+            grammar_fixes = [
+                (r'\b(\d+)\s+persons\b', r'\1 people'),
+                (r'\bone\s+persons\b', 'one person'),
+                (r'\btwo\s+persons\b', 'two people'),
+                (r'\bthree\s+persons\b', 'three people'),
+                (r'\bfour\s+persons\b', 'four people'),
+                (r'\bfive\s+persons\b', 'five people'),
+                (r'\bsix\s+persons\b', 'six people'),
+                (r'\bseven\s+persons\b', 'seven people'),
+                (r'\beight\s+persons\b', 'eight people'),
+                (r'\bnine\s+persons\b', 'nine people'),
+                (r'\bten\s+persons\b', 'ten people'),
+                (r'\bmultiple\s+persons\b', 'multiple people'),
+                (r'\bseveral\s+persons\b', 'several people'),
+                (r'\bmany\s+persons\b', 'many people'),
+                (r'\ba\s+few\s+persons\b', 'a few people'),
+                (r'\bsome\s+persons\b', 'some people')
+            ]
+            for pattern, replacement in grammar_fixes:
+                response = re.sub(pattern, replacement, response, flags=re.IGNORECASE)
+            return response
+        except Exception as e:
+            self.logger.warning(f"Error in critical format preprocessing: {str(e)}")
+            return response
+    def _remove_system_markers(self, response: str) -> str:
+        """移除系統樣式標記"""
+        # 移除對話remark
+        response = re.sub(r'<\|.*?\|>', '', response)
+        # 移除輸出remark
+        output_start = response.find("[OUTPUT_START]")
+        output_end = response.find("[OUTPUT_END]")
+        if output_start != -1 and output_end != -1 and output_end > output_start:
+            response = response[output_start + len("[OUTPUT_START]"):output_end].strip()
+        # 移除其他remark
+        section_markers = [
+            r'\[.*?\]',
+            r'OUTPUT_START\s*:|OUTPUT_END\s*:',
+            r'ENHANCED DESCRIPTION\s*:',
+            r'Scene Type\s*:.*?(?=\n|$)',
+            r'Original Description\s*:.*?(?=\n|$)',
+            r'GOOD\s*:|BAD\s*:',
+            r'PROBLEM\s*:.*?(?=\n|$)',
+            r'</?\|(?:assistant|system|user)\|>',
+            r'\(Note:.*?\)',
+            r'\(.*?I\'ve.*?\)',
+            r'\(.*?as per your request.*?\)'
+        ]
+        for marker in section_markers:
+            response = re.sub(marker, '', response, flags=re.IGNORECASE)
+        return response
+    def _remove_introduction_prefixes(self, response: str) -> str:
+        """移除介紹性前綴"""
+        # 處理 "Here is..." 類型的prefix
+        intro_prefixes = [
+            r'^Here\s+is\s+(?:a\s+|the\s+)?(?:rewritten\s+|enhanced\s+)?scene\s+description.*?:\s*',
+            r'^The\s+(?:rewritten\s+|enhanced\s+)?(?:scene\s+)?description\s+is.*?:\s*',
+            r'^Here\'s\s+(?:a\s+|the\s+)?(?:rewritten\s+|enhanced\s+)?description.*?:\s*'
+        ]
+        for prefix_pattern in intro_prefixes:
+            response = re.sub(prefix_pattern, '', response, flags=re.IGNORECASE)
+        # 處理固定prefix
+        for prefix in self.prefixes_to_remove:
+            if response.lower().startswith(prefix.lower()):
+                response = response[len(prefix):].strip()
+        return response
+    def _remove_format_markers(self, response: str) -> str:
+        """移除格式標記和上下文標籤（保留括號內的地理與細節資訊）"""
+        # 移除上下文相關remark
+        response = re.sub(r'<\s*Context:.*?>', '', response)
+        response = re.sub(r'Context:.*?(?=\n|$)', '', response)
+        response = re.sub(r'Note:.*?(?=\n|$)', '', response, flags=re.IGNORECASE)
+        # 移除Markdown格式
+        response = re.sub(r'\*\*|\*|__|\|', '', response)
+        # 移除任何剩餘的特殊標記 (避開括號內容，以免剔除地理位置等有用資訊)
+        response = re.sub(r'</?\|.*?\|>', '', response)
+        # ※ 以下移除「刪除整個括號及其內文」的方式已註解，以保留地理位置資訊
+        # response = re.sub(r'\(.*?\)', '', response)
+        return response
+    def _clean_scene_type_references(self, response: str) -> str:
+        """清理不當的場景類型引用"""
+        scene_type_pattern = r'This ([a-zA-Z_]+) (features|shows|displays|contains)'
+        match = re.search(scene_type_pattern, response)
+        if match and '_' in match.group(1):
+            fixed_text = f"This scene {match.group(2)}"
+            response = re.sub(scene_type_pattern, fixed_text, response)
+        return response
+    def _normalize_punctuation(self, response: str) -> str:
+        """標準化標點符號"""
+        # 減少破折號使用
+        response = re.sub(r'—', ', ', response)
+        response = re.sub(r' - ', ', ', response)
+        # 處理連續標點符號
+        response = re.sub(r'([.,;:!?])\1+', r'\1', response)
+        # 修復不完整句子的標點
+        response = re.sub(r',\s*$', '.', response)
+        # 修復句號後缺少空格的問題
+        response = re.sub(r'([.!?])([A-Z])', r'\1 \2', response)
+        # 清理多餘空格和換行
+        response = response.replace('\r', ' ')
+        response = re.sub(r'\n+', ' ', response)
+        response = re.sub(r'\s{2,}', ' ', response)
+        return response
+    def _remove_duplicate_sentences(self, response: str, similarity_threshold: float = 0.85) -> str:
+        """
+        移除重複或高度相似的句子，使用 Jaccard 相似度進行比較。
+        Args:
+            response: 原始回應文本。
+            similarity_threshold: 認定句子重複的相似度閾值 (0.0 到 1.0)。
+                                  較高的閾值表示句子需要非常相似才會被移除。
+        Returns:
+            str: 移除重複句子後的文本。
+        """
+        try:
+            if not response or not response.strip():
+                return ""
+            # (?<=[.!?]) 會保留分隔符在句尾, \s+ 會消耗句尾的空格
+            # 這樣用 ' ' join 回去時, 標點和下個句子間剛好一個空格
+            sentences = re.split(r'(?<=[.!?])\s+', response.strip())
+            unique_sentences_data = [] # Store tuples of (original_sentence, simplified_word_set)
+            min_sentence_len_for_check = 8 # 簡化後詞彙數少於此值，除非完全相同否則不輕易判斷為重複
+            for sentence in sentences:
+                sentence = sentence.strip()
+                if not sentence:
+                    continue
+                # 創建簡化版本用於比較 (小寫，移除標點，分割為詞彙集合)
+                # 保留數字，因為數字可能是關鍵資訊
+                simplified_text = re.sub(r'[^\w\s\d]', '', sentence.lower())
+                current_sentence_words = set(simplified_text.split())
+                if not current_sentence_words: # 如果處理後是空集合，跳過
+                    continue
+                is_duplicate = False
+                # 與已保留的唯一句子比較
+                for i, (kept_sentence_text, kept_sentence_words) in enumerate(unique_sentences_data):
+                    # Jaccard Index
+                    intersection_len = len(current_sentence_words.intersection(kept_sentence_words))
+                    union_len = len(current_sentence_words.union(kept_sentence_words))
+                    if union_len == 0: # 兩個都是空集合，代表相同句子
+                        jaccard_similarity = 1.0
+                    else:
+                        jaccard_similarity = intersection_len / union_len
+                    # 用Jaccard 相似度超過閾值，不是兩個都非常短的句子 (避免 "Yes." 和 "No." 被錯誤合併)
+                    # 新句子完全被舊句子包含 (且舊句子更長)
+                    # 舊句子完全被新句子包含 (且新句子更長) -> 這種情況就需要替換
+                    if jaccard_similarity >= similarity_threshold:
+                        # 如果當前句子比已保留的句子短，且高度相似，則認為是重複
+                        if len(current_sentence_words) < len(kept_sentence_words):
+                            is_duplicate = True
+                            self.logger.debug(f"Sentence \"{sentence[:30]}...\" marked duplicate (shorter, similar to \"{kept_sentence_text[:30]}...\") Jaccard: {jaccard_similarity:.2f}")
+                            break
+                        # 如果當前句子比已保留的句子長，且高度相似，則替換掉已保留的
+                        elif len(current_sentence_words) > len(kept_sentence_words):
+                            self.logger.debug(f"Sentence \"{kept_sentence_text[:30]}...\" replaced by longer similar sentence \"{sentence[:30]}...\" Jaccard: {jaccard_similarity:.2f}")
+                            unique_sentences_data.pop(i) # 移除舊的、較短的句子
+                        # 如果長度差不多，但相似度高，保留第一個出現的
+                        elif current_sentence_words != kept_sentence_words : # 避免完全相同的句子被錯誤地跳過替換邏輯
+                             is_duplicate = True # 保留先出現的
+                             self.logger.debug(f"Sentence \"{sentence[:30]}...\" marked duplicate (similar length, similar to \"{kept_sentence_text[:30]}...\") Jaccard: {jaccard_similarity:.2f}")
+                             break
+                if not is_duplicate:
+                    unique_sentences_data.append((sentence, current_sentence_words))
+            # 重組唯一句子
+            final_sentences = [s_data[0] for s_data in unique_sentences_data]
+            # 確保每個句子以標點結尾 (因為 split 可能會產生沒有標點的最後一個片段)
+            reconstructed_response = ""
+            for i, s in enumerate(final_sentences):
+                s = s.strip()
+                if not s: continue
+                if not s[-1] in ".!?":
+                    s += "."
+                reconstructed_response += s
+                if i < len(final_sentences) - 1:
+                     reconstructed_response += " " # 在句子間添加空格
+            return reconstructed_response.strip()
+        except Exception as e:
+            self.logger.error(f"Error in _remove_duplicate_sentences: {str(e)}")
+            self.logger.error(traceback.format_exc())
+            return response # 發生錯誤時返回原始回應
+    def _handle_repetitive_vocabulary(self, response: str) -> str:
+        """處理重複詞彙，使用 re.sub 和可呼叫的替換函數以提高效率和準確性。"""
+        try:
+            # 檢測重複模式 (僅警告)
+            if hasattr(self, 'repetitive_patterns'):
+                for pattern, issue in self.repetitive_patterns:
+                    if re.search(pattern, response, re.IGNORECASE | re.DOTALL):
+                        self.logger.warning(f"Text quality issue detected: {issue} in response: \"{response[:100]}...\"")
+            if not hasattr(self, 'replacement_alternatives') or not self.replacement_alternatives:
+                return response
+            processed_response = response
+            for word_to_replace, alternatives in self.replacement_alternatives.items():
+                if not alternatives:  # 如果沒有可用的替代詞，則跳過
+                    continue
+                # 為每個詞創建一個獨立的計數器和替代索引
+                # 使用閉包或一個小類來封裝狀態
+                class WordReplacer:
+                    def __init__(self, alternatives_list):
+                        self.count = 0
+                        self.alternative_idx = 0
+                        self.alternatives_list = alternatives_list
+                    def __call__(self, match_obj):
+                        self.count += 1
+                        original_word = match_obj.group(0)
+                        if self.count > 1:  # 從第二次出現開始替換
+                            replacement = self.alternatives_list[self.alternative_idx % len(self.alternatives_list)]
+                            self.alternative_idx += 1
+                            # 保持原始大小寫格式
+                            if original_word.isupper():
+                                return replacement.upper()
+                            elif original_word.istitle():
+                                return replacement.capitalize()
+                            return replacement
+                        return original_word # 因為第一次出現, 就不用替換
+                replacer_instance = WordReplacer(alternatives)
+                # 使用 \b 確保匹配的是整個單詞
+                pattern = re.compile(r'\b' + re.escape(word_to_replace) + r'\b', re.IGNORECASE)
+                processed_response = pattern.sub(replacer_instance, processed_response)
+            return processed_response
+        except Exception as e:
+            self.logger.error(f"Error in _handle_repetitive_vocabulary: {str(e)}")
+            self.logger.error(traceback.format_exc())
+            return response # 發生錯誤時返回原始回應
+    def _ensure_grammatical_completeness(self, response: str) -> str:
+        """
+        確保語法完整性，處理不��整句子和格式問題
+        Args:
+            response: 待檢查的回應文本
+        Returns:
+            str: 語法完整的回應文本
+        """
+        try:
+            if not response or not response.strip():
+                return response
+            # 第一階段：檢查並修正不完整的句子模式
+            incomplete_patterns = [
+                # 介詞後直接結束的問題（針對 "over ." 等情況）
+                (r'\b(over|under|through|across|along|beneath|beyond|throughout)\s*\.', 'incomplete_preposition'),
+                (r'\b(with|without|against|towards|beside|between|among)\s*\.', 'incomplete_preposition'),
+                (r'\b(into|onto|upon|within|behind|below|above)\s*\.', 'incomplete_preposition'),
+                # 處理 "In ," 這類缺失詞彙的問題
+                (r'\bIn\s*,', 'incomplete_location'),
+                (r'\bAt\s*,', 'incomplete_location'),
+                (r'\bOn\s*,', 'incomplete_location'),
+                (r'\bWith\s*,', 'incomplete_context'),
+                # 不完整的描述模式
+                (r'\b(fine|the)\s+(the\s+)?(?:urban|area|scene)\b(?!\s+\w)', 'incomplete_description'),
+                # 連詞或介詞後直接標點的問題
+                (r'\b(and|or|but|with|from|in|at|on|by|for|to)\s*[.!?]', 'incomplete_conjunction'),
+                # 重複詞彙
+                (r'\b(\w+)\s+\1\b', 'word_repetition'),
+                # 不完整的場景類型引用（如 "urban_intersection" 格式問題）
+                (r'\b(\w+)_(\w+)\b', 'underscore_format'),
+                # 地標場景特有問題
+                (r'\btourist_landmark\b', 'underscore_format'),
+                (r'\burban_intersection\b', 'underscore_format'),
+                (r'\bIn\s*,\s*(?=\w)', 'incomplete_prepositional'),
+                (r'\bOverall,\s+(?=exudes|shows|displays)(?!\s+(?:the|this|it))', 'missing_subject'),
+                (r'\batmosphere of\s+is one of\b', 'redundant_structure'),
+                (r'\bwith.*?turned\s+illuminating\b', 'redundant_participle')
+            ]
+            for pattern, issue_type in incomplete_patterns:
+                try:
+                    matches = list(re.finditer(pattern, response, re.IGNORECASE))
+                    for match in matches:
+                        if issue_type == 'incomplete_preposition':
+                            # 處理介詞後直接結束的情況
+                            response = self._fix_incomplete_preposition(response, match)
+                        elif issue_type == 'underscore_format':
+                            # 將下劃線格式轉換為空格分隔
+                            original = match.group(0)
+                            replacement = original.replace('_', ' ')
+                            response = response.replace(original, replacement)
+                        elif issue_type == 'word_repetition':
+                            # 移除重複的詞彙
+                            repeated_word = match.group(1)
+                            response = response.replace(f"{repeated_word} {repeated_word}", repeated_word)
+                        elif issue_type == 'incomplete_location' or issue_type == 'incomplete_context':
+                            # 移除不完整的位置或上下文引用
+                            response = response.replace(match.group(0), '')
+                        elif issue_type == 'incomplete_prepositional':
+                            # 處理不完整的介詞短語
+                            response = re.sub(r'\bIn\s*,\s*', 'Throughout the scene, ', response)
+                        elif issue_type == 'missing_subject':
+                            # 為Overall句子添加主語
+                            response = re.sub(r'\bOverall,\s+(?=exudes)', 'Overall, the scene ', response)
+                        elif issue_type == 'redundant_structure':
+                            # 簡化冗餘結構
+                            response = re.sub(r'\batmosphere of\s+is one of\b', 'atmosphere is one of', response)
+                        elif issue_type == 'redundant_participle':
+                            # 清理冗餘分詞
+                            response = re.sub(r'turned\s+illuminating', 'illuminating', response)
+                        else:
+                            # 對於其他不完整模式，直接移除
+                            response = response.replace(match.group(0), '')
+                    # 清理多餘空格
+                    response = re.sub(r'\s{2,}', ' ', response).strip()
+                except re.error as e:
+                    self.logger.warning(f"Regular expression pattern error for {issue_type}: {pattern} - {str(e)}")
+                    continue
+            # 第二階段：處理物件類別格式問題
+            response = self._clean_object_class_references(response)
+            # 第三階段：確保句子正確結束
+            response = self._ensure_proper_sentence_ending(response)
+            # 第四階段：最終語法檢查
+            response = self._final_grammar_check(response)
+            return response.strip()
+        except Exception as e:
+            self.logger.error(f"Error in _ensure_grammatical_completeness: {str(e)}")
+            return response
+    def _fix_incomplete_preposition(self, response: str, match) -> str:
+        """
+        修正不完整的介詞短語
+        Args:
+            response: 回應文本
+            match: 正則匹配對象
+        Returns:
+            str: 修正後的回應
+        """
+        preposition = match.group(1)
+        match_start = match.start()
+        # 找到句子的開始位置
+        sentence_start = response.rfind('.', 0, match_start)
+        sentence_start = sentence_start + 1 if sentence_start != -1 else 0
+        # 提取句子片段
+        sentence_fragment = response[sentence_start:match_start].strip()
+        # 如果句子片段有意義，嘗試移除不完整的介詞部分
+        if len(sentence_fragment) > 10:
+            # 移除介詞及其後的內容，添加適當的句號
+            response = response[:match_start].rstrip() + '.'
+        else:
+            # 如果句子片段太短，移除整個不完整的句子
+            response = response[:sentence_start] + response[match.end():]
+        return response
+    def _clean_object_class_references(self, response: str) -> str:
+        """
+        清理物件類別引用中的格式問題
+        Args:
+            response: 回應文本
+        Returns:
+            str: 清理後的回應
+        """
+        # 移除類別ID引用（如 "unknown-class 2", "Class 0" 等）
+        class_id_patterns = [
+            r'\bunknown[- ]?class\s*\d+\s*objects?',
+            r'\bclass[- ]?\d+\s*objects?',
+            r'\b[Cc]lass\s*\d+\s*objects?',
+            r'\bunknown[- ][Cc]lass\s*\d+\s*objects?'
+        ]
+        for pattern in class_id_patterns:
+            try:
+                # 替換為更自然的描述
+                response = re.sub(pattern, 'objects', response, flags=re.IGNORECASE)
+            except re.error as e:
+                self.logger.warning(f"Error cleaning class reference pattern {pattern}: {str(e)}")
+                continue
+        # 處理數量描述中的問題
+        response = re.sub(r'\b(\w+)\s+unknown[- ]?\w*\s*objects?', r'\1 objects', response, flags=re.IGNORECASE)
+        return response
+    def _ensure_proper_sentence_ending(self, response: str) -> str:
+        """
+        確保句子有適當的結尾
+        Args:
+            response: 回應文本
+        Returns:
+            str: 具有適當結尾的回應
+        """
+        if not response or not response.strip():
+            return response
+        response = response.strip()
+        # 檢查是否以標點符號結尾
+        if response and response[-1] not in ['.', '!', '?']:
+            # 常見介詞和連詞列表
+            problematic_endings = [
+                "into", "onto", "about", "above", "across", "after", "along", "around",
+                "at", "before", "behind", "below", "beneath", "beside", "between",
+                "beyond", "by", "down", "during", "except", "for", "from", "in",
+                "inside", "near", "of", "off", "on", "over", "through", "to",
+                "toward", "under", "up", "upon", "with", "within", "and", "or", "but"
+            ]
+            words = response.split()
+            if words:
+                last_word = words[-1].lower().rstrip('.,!?')
+                if last_word in problematic_endings:
+                    # 找到最後完整的句子
+                    last_period_pos = max(
+                        response.rfind('.'),
+                        response.rfind('!'),
+                        response.rfind('?')
+                    )
+                    if last_period_pos > len(response) // 2:  # 如果有較近的完整句子
+                        response = response[:last_period_pos + 1]
+                    else:
+                        # 移除問題詞彙並添加句號
+                        if len(words) > 1:
+                            response = " ".join(words[:-1]) + "."
+                        else:
+                            response = "The scene displays various elements."
+                else:
+                    # 正常情況下添加句號
+                    response += "."
+        return response
+    def _final_grammar_check(self, response: str) -> str:
+        """
+        最終語法檢查和清理
+        Args:
+            response: 回應文本
+        Returns:
+            str: 最終清理後的回應
+        """
+        if not response:
+            return response
+        # 修正連續標點符號
+        response = re.sub(r'([.!?]){2,}', r'\1', response)
+        # 修正句號前的空格
+        response = re.sub(r'\s+([.!?])', r'\1', response)
+        # 修正句號後缺少空格的問題
+        response = re.sub(r'([.!?])([A-Z])', r'\1 \2', response)
+        # 確保首字母大寫
+        if response and response[0].islower():
+            response = response[0].upper() + response[1:]
+        # 移除多餘的空格
+        response = re.sub(r'\s{2,}', ' ', response)
+        # 處理空句子或過短的回應
+        if len(response.strip()) < 20:
+            return "The scene contains various visual elements."
+        return response.strip()
+    def _control_word_length(self, response: str) -> str:
+        """控制文字長度在合理範圍內"""
+        words = response.split()
+        if len(words) > 200:
+            # 找到接近字數限制的句子結束處
+            truncated = ' '.join(words[:200])
+            last_period = max(truncated.rfind('.'), truncated.rfind('!'), truncated.rfind('?'))
+            if last_period > 0:
+                response = truncated[:last_period+1]
+            else:
+                response = truncated + "."
+        return response
+    def _final_formatting(self, response: str) -> str:
+        """最終格式化處理"""
+        # 確保首字母大寫
+        if response and response[0].islower():
+            response = response[0].upper() + response[1:]
+        # 統一格式為單一段落
+        response = re.sub(r'\s*\n\s*', ' ', response)
+        response = ' '.join(response.split())
+        return response.strip()
+    def _recover_from_overcleaning(self, original_response: str) -> str:
+        """從過度清理中恢復內容"""
+        try:
+            # 嘗試從原始回應中找到最佳段落
+            paragraphs = [p for p in original_response.split('\n\n') if p.strip()]
+            if paragraphs:
+                # 選擇最長的段落作為主要描述
+                best_para = max(paragraphs, key=len)
+                # 使用基本清理規則
+                best_para = re.sub(r'\[.*?\]', '', best_para)
+                best_para = re.sub(r'\s{2,}', ' ', best_para).strip()
+                if len(best_para) >= 40:
+                    return best_para
+            return "Unable to generate a valid enhanced description."
+        except Exception as e:
+            self.logger.error(f"Recovery from overcleaning failed: {str(e)}")
+            return "Description generation error."
+    def _validate_cleaned_response(self, response: str):
+        """驗證清理後的回應"""
+        if not response:
+            raise ResponseProcessingError("Response is empty after cleaning")
+        if len(response.strip()) < 20:
+            raise ResponseProcessingError("Response is too short after cleaning")
+        # 檢查是否包含基本的句子結構
+        if not re.search(r'[.!?]', response):
+            raise ResponseProcessingError("Response lacks proper sentence structure")
+    def remove_explanatory_notes(self, response: str) -> str:
+        """
+        移除解釋性注釋和說明
+        Args:
+            response: 包含可能注釋的回應
+        Returns:
+            str: 移除注釋後的回應
+        """
+        try:
+            # 識別常見的注釋和解釋模式
+            note_patterns = [
+                r'(?:^|\n)Note:.*?(?:\n|$)',
+                r'(?:^|\n)I have (?:followed|adhered to|ensured).*?(?:\n|$)',
+                r'(?:^|\n)This description (?:follows|adheres to|maintains).*?(?:\n|$)',
+                r'(?:^|\n)The enhanced description (?:maintains|preserves).*?(?:\n|$)'
+            ]
+            # 尋找段落
+            paragraphs = [p.strip() for p in response.split('\n\n') if p.strip()]
+            # 如果只有一個段落，檢查並清理它
+            if len(paragraphs) == 1:
+                for pattern in note_patterns:
+                    paragraphs[0] = re.sub(pattern, '', paragraphs[0], flags=re.IGNORECASE)
+                return paragraphs[0].strip()
+            # 如果有多個段落，移除注釋段落
+            content_paragraphs = []
+            for paragraph in paragraphs:
+                is_note = False
+                for pattern in note_patterns:
+                    if re.search(pattern, paragraph, flags=re.IGNORECASE):
+                        is_note = True
+                        break
+                # 檢查段落是否以常見的注釋詞開頭
+                if paragraph.lower().startswith(('note:', 'please note:', 'remember:')):
+                    is_note = True
+                if not is_note:
+                    content_paragraphs.append(paragraph)
+            return '\n\n'.join(content_paragraphs).strip()
+        except Exception as e:
+            self.logger.error(f"Failed to remove explanatory notes: {str(e)}")
+            return response
+    def get_processor_info(self) -> Dict[str, Any]:
+        """
+        獲取處理器信息
+        Returns:
+            Dict[str, Any]: 包含處理器狀態和配置的信息
+        """
+        return {
+            "replacement_alternatives_count": len(self.replacement_alternatives),
+            "prefixes_to_remove_count": len(self.prefixes_to_remove),
+            "suffixes_to_remove_count": len(self.suffixes_to_remove),
+            "repetitive_patterns_count": len(self.repetitive_patterns),
+            "initialization_status": "success"
+        }

result_cache_manager.py ADDED Viewed

	@@ -0,0 +1,234 @@

+import logging
+import traceback
+from typing import Dict, Any, Tuple, Optional, Union
+from PIL import Image
+import numpy as np
+class ResultCacheManager:
+    """
+    專門處理結果快取和性能優化，包括快取策略管理、快取大小控制和快取命中率優化
+    """
+    def __init__(self, cache_max_size: int = 100):
+        """
+        初始化結果快取管理器
+        Args:
+            cache_max_size: 最大快取項目數
+        """
+        self.logger = logging.getLogger(__name__)
+        # 初始化結果快取
+        self.results_cache = {}  # 使用圖像hash作為鍵
+        self.cache_max_size = cache_max_size  # 最大快取項目數
+    def generate_cache_key(self, image_hash: int, additional_params: Tuple) -> Tuple:
+        """
+        生成快取鍵
+        Args:
+            image_hash
+            additional_params: 附加參數元組
+        Returns:
+            Tuple: 快取鍵
+        """
+        try:
+            return (image_hash, additional_params)
+        except Exception as e:
+            self.logger.error(f"Error generating cache key: {e}")
+            self.logger.error(traceback.format_exc())
+            return (0, additional_params)
+    def get_region_cache_key(self, image_hash: int, box: Tuple[float, ...],
+                           detection_type: str) -> Tuple:
+        """
+        生成區域分析的快取鍵
+        Args:
+            image_hash
+            box: 邊界框
+            detection_type: 檢測類型
+        Returns:
+            Tuple: 區域快取鍵
+        """
+        try:
+            return self.generate_cache_key(image_hash, (tuple(box), detection_type))
+        except Exception as e:
+            self.logger.error(f"Error generating region cache key: {e}")
+            self.logger.error(traceback.format_exc())
+            return (0, (tuple(box), detection_type))
+    def get_image_cache_key(self, image_hash: int, analysis_type: str,
+                          detailed_analysis: bool = False) -> Tuple:
+        """
+        生成整張圖像分析的快取鍵
+        Args:
+            image_hash: 圖像哈希值
+            analysis_type: 分析類型
+            detailed_analysis: 是否詳細分析
+        Returns:
+            Tuple: 圖像快取鍵
+        """
+        try:
+            return self.generate_cache_key(image_hash, (analysis_type, detailed_analysis))
+        except Exception as e:
+            self.logger.error(f"Error generating image cache key: {e}")
+            self.logger.error(traceback.format_exc())
+            return (0, (analysis_type, detailed_analysis))
+    def get_cached_result(self, cache_key: Tuple) -> Optional[Dict[str, Any]]:
+        """
+        獲取快取結果
+        Args:
+            cache_key: 快取鍵
+        Returns:
+            Optional[Dict[str, Any]]: 快取結果，如果不存在則返回None
+        """
+        try:
+            return self.results_cache.get(cache_key)
+        except Exception as e:
+            self.logger.error(f"Error getting cached result: {e}")
+            self.logger.error(traceback.format_exc())
+            return None
+    def set_cached_result(self, cache_key: Tuple, result: Dict[str, Any]):
+        """
+        設置快取結果
+        Args:
+            cache_key: 快取鍵
+            result: 要快取的結果
+        """
+        try:
+            self.results_cache[cache_key] = result
+            self.manage_cache_size()
+        except Exception as e:
+            self.logger.error(f"Error setting cached result: {e}")
+            self.logger.error(traceback.format_exc())
+    def manage_cache_size(self):
+        """
+        管理結果快取大小
+        """
+        try:
+            if len(self.results_cache) > self.cache_max_size:
+                oldest_key = next(iter(self.results_cache))
+                del self.results_cache[oldest_key]
+        except Exception as e:
+            self.logger.error(f"Error managing cache size: {e}")
+            self.logger.error(traceback.format_exc())
+    def clear_cache(self):
+        """
+        清空快取
+        """
+        try:
+            self.results_cache.clear()
+            self.logger.info("Cache cleared successfully")
+        except Exception as e:
+            self.logger.error(f"Error clearing cache: {e}")
+            self.logger.error(traceback.format_exc())
+    def get_cache_stats(self) -> Dict[str, Any]:
+        """
+        獲取快取統計信息
+        Returns:
+            Dict[str, Any]: 快取統計信息
+        """
+        try:
+            return {
+                "cache_size": len(self.results_cache),
+                "max_cache_size": self.cache_max_size,
+                "cache_usage_ratio": len(self.results_cache) / self.cache_max_size if self.cache_max_size > 0 else 0
+            }
+        except Exception as e:
+            self.logger.error(f"Error getting cache stats: {e}")
+            self.logger.error(traceback.format_exc())
+            return {
+                "cache_size": 0,
+                "max_cache_size": self.cache_max_size,
+                "cache_usage_ratio": 0
+            }
+    def set_max_cache_size(self, max_size: int):
+        """
+        設置最大快取大小
+        Args:
+            max_size: 新的最大快取大小
+        """
+        try:
+            self.cache_max_size = max(1, max_size)
+            self.manage_cache_size()
+            self.logger.info(f"Max cache size set to {self.cache_max_size}")
+        except Exception as e:
+            self.logger.error(f"Error setting max cache size: {e}")
+            self.logger.error(traceback.format_exc())
+    def remove_cached_result(self, cache_key: Tuple) -> bool:
+        """
+        移除特定的快取結果
+        Args:
+            cache_key: 快取鍵
+        Returns:
+            bool: 是否成功移除
+        """
+        try:
+            if cache_key in self.results_cache:
+                del self.results_cache[cache_key]
+                return True
+            return False
+        except Exception as e:
+            self.logger.error(f"Error removing cached result: {e}")
+            self.logger.error(traceback.format_exc())
+            return False
+    def is_cache_enabled(self) -> bool:
+        """
+        檢查快取是否啟用
+        Returns:
+            bool: 快取啟用狀態
+        """
+        return self.cache_max_size > 0
+    def get_cache_keys(self) -> list:
+        """
+        獲取所有快取鍵
+        Returns:
+            list: 快取鍵列表
+        """
+        try:
+            return list(self.results_cache.keys())
+        except Exception as e:
+            self.logger.error(f"Error getting cache keys: {e}")
+            self.logger.error(traceback.format_exc())
+            return []
+    def has_cached_result(self, cache_key: Tuple) -> bool:
+        """
+        檢查是否存在快取結果
+        Args:
+            cache_key: 快取鍵
+        Returns:
+            bool: 是否存在快取結果
+        """
+        try:
+            return cache_key in self.results_cache
+        except Exception as e:
+            self.logger.error(f"Error checking cached result: {e}")
+            self.logger.error(traceback.format_exc())
+            return False

scene_analysis_coordinator.py ADDED Viewed

	@@ -0,0 +1,973 @@

+import logging
+import traceback
+import numpy as np
+from typing import Dict, List, Tuple, Optional, Any
+from PIL import Image
+class SceneAnalysisCoordinator:
+    """
+    負責整個場景分析流程的協調和控制邏輯，包含主要的分析流程、
+    處理無檢測結果的回退邏輯，以及多源分析結果的整合。
+    """
+    def __init__(self, component_initializer, scene_scoring_engine, landmark_processing_manager,
+                 scene_confidence_threshold: float = 0.6):
+        """
+        初始化場景分析協調器。
+        Args:
+            component_initializer: 組件初始化器實例
+            scene_scoring_engine: 場景評分引擎實例
+            landmark_processing_manager: 地標處理管理器實例
+            scene_confidence_threshold: 場景置信度閾值
+        """
+        self.logger = logging.getLogger(__name__)
+        self.component_initializer = component_initializer
+        self.scene_scoring_engine = scene_scoring_engine
+        self.landmark_processing_manager = landmark_processing_manager
+        self.scene_confidence_threshold = scene_confidence_threshold
+        # 獲取必要的組件和數據
+        self.spatial_analyzer = component_initializer.get_component('spatial_analyzer')
+        self.descriptor = component_initializer.get_component('descriptor')
+        self.scene_describer = component_initializer.get_component('scene_describer')
+        self.clip_analyzer = component_initializer.get_component('clip_analyzer')
+        self.llm_enhancer = component_initializer.get_component('llm_enhancer')
+        self.scene_types = component_initializer.get_data_structure('SCENE_TYPES')
+        # 從組件初始化器獲取功能開關狀態
+        self.use_clip = component_initializer.use_clip
+        self.use_llm = component_initializer.use_llm
+        self.enable_landmark = component_initializer.enable_landmark
+    def analyze(self, detection_result: Any, lighting_info: Optional[Dict] = None,
+                class_confidence_threshold: float = 0.25, scene_confidence_threshold: float = 0.6,
+                enable_landmark: bool = True, places365_info: Optional[Dict] = None) -> Dict:
+        """
+        分析檢測結果以確定場景類型並提供理解。
+        Args:
+            detection_result: 來自 YOLOv8 或類似系統的檢測結果
+            lighting_info: 可選的照明條件分析結果
+            class_confidence_threshold: 考慮物體的最小置信度
+            scene_confidence_threshold: 確定場景的最小置信度
+            enable_landmark: 是否為此次運行啟用地標檢測和識別
+            places365_info: 可選的 Places365 場景分類結果
+        Returns:
+            包含場景分析結果的字典
+        """
+        current_run_enable_landmark = enable_landmark
+        self.logger.info(f"DIAGNOSTIC (SceneAnalyzer.analyze): Called with current_run_enable_landmark={current_run_enable_landmark}")
+        self.logger.debug(f"SceneAnalyzer received lighting_info type: {type(lighting_info)}")
+        self.logger.debug(f"SceneAnalyzer lighting_info source: {lighting_info.get('source', 'unknown') if isinstance(lighting_info, dict) else 'not_dict'}")
+        # 記錄 Places365 資訊
+        if places365_info:
+            self.logger.info(f"DIAGNOSTIC: Places365 info received - scene: {places365_info.get('scene_label', 'unknown')}, "
+                           f"mapped: {places365_info.get('mapped_scene_type', 'unknown')}, "
+                           f"confidence: {places365_info.get('confidence', 0.0):.3f}")
+        # 同步 enable_landmark 狀態到子組件（為此次分析運行）
+        self._sync_landmark_status_to_components(current_run_enable_landmark)
+        # 提取和處理原始圖像
+        original_image_pil, image_dims_val = self._extract_image_info(detection_result)
+        # 處理無 YOLO 檢測結果的情況
+        no_yolo_detections = self._check_no_yolo_detections(detection_result)
+        if no_yolo_detections:
+            return self._handle_no_yolo_detections(
+                original_image_pil, image_dims_val, current_run_enable_landmark,
+                lighting_info, places365_info
+            )
+        # 主處理流程（有 YOLO 檢測結果）
+        return self._handle_main_analysis_flow(
+            detection_result, original_image_pil, image_dims_val,
+            class_confidence_threshold, scene_confidence_threshold,
+            current_run_enable_landmark, lighting_info, places365_info
+        )
+    def _sync_landmark_status_to_components(self, current_run_enable_landmark: bool):
+        """同步地標狀態到所有相關組件。"""
+        # 更新場景評分引擎
+        self.scene_scoring_engine.update_enable_landmark_status(current_run_enable_landmark)
+        # 更新地標處理管理器
+        self.landmark_processing_manager.update_enable_landmark_status(current_run_enable_landmark)
+        # 更新其他組件的地標狀態
+        for component_name in ['scene_describer', 'clip_analyzer', 'landmark_classifier']:
+            component = self.component_initializer.get_component(component_name)
+            if component and hasattr(component, 'enable_landmark'):
+                component.enable_landmark = current_run_enable_landmark
+        # 更新實例狀態
+        self.enable_landmark = current_run_enable_landmark
+    def _extract_image_info(self, detection_result) -> Tuple[Optional[Image.Image], Optional[Tuple[int, int]]]:
+        """
+        Extract the source image and its dimensions from a detection result.
+        Args:
+            detection_result: Object that may expose an ``orig_img`` attribute
+                holding either a NumPy array or a PIL-like image.
+        Returns:
+            ``(image, (width, height))``; both are None when ``orig_img`` is
+            missing, of an unrecognised type, or the NumPy conversion fails
+            (a warning is logged in each of those cases).
+        """
+        original_image_pil = None
+        image_dims_val = None  # will be (width, height)
+        if (detection_result is not None and hasattr(detection_result, 'orig_img') and
+            detection_result.orig_img is not None):
+            if isinstance(detection_result.orig_img, np.ndarray):
+                try:
+                    img_array = detection_result.orig_img
+                    if img_array.ndim == 3 and img_array.shape[2] == 4:  # four channels (treated as RGBA)
+                        img_array = img_array[:, :, :3]  # drop the fourth channel
+                    if img_array.ndim == 2:  # grayscale
+                        original_image_pil = Image.fromarray(img_array).convert("RGB")
+                    else:  # assumed to be RGB or BGR. NOTE(review): nothing here reorders channels — confirm whether BGR input (e.g. from cv2) needs an explicit swap
+                        original_image_pil = Image.fromarray(img_array)
+                    if hasattr(original_image_pil, 'mode') and original_image_pil.mode == 'BGR':  # only fires if PIL itself reports mode 'BGR' — TODO confirm this can happen
+                        original_image_pil = original_image_pil.convert('RGB')
+                    image_dims_val = (original_image_pil.width, original_image_pil.height)
+                except Exception as e:
+                    self.logger.warning(f"Error converting NumPy orig_img to PIL: {e}")
+            elif hasattr(detection_result.orig_img, 'size') and callable(getattr(detection_result.orig_img, 'convert', None)):
+                # Duck-typed PIL image: work on a copy so the caller's image is untouched.
+                original_image_pil = detection_result.orig_img.copy().convert("RGB")  # ensure RGB
+                image_dims_val = original_image_pil.size
+            else:
+                self.logger.warning(f"detection_result.orig_img (type: {type(detection_result.orig_img)}) is not a recognized NumPy array or PIL Image.")
+        else:
+            self.logger.warning("detection_result.orig_img not available. Image-based analysis will be limited.")
+        return original_image_pil, image_dims_val
+    def _check_no_yolo_detections(self, detection_result) -> bool:
+        """檢查是否沒有 YOLO 檢測結果。"""
+        return (detection_result is None or
+                not hasattr(detection_result, 'boxes') or
+                not hasattr(detection_result.boxes, 'xyxy') or
+                len(detection_result.boxes.xyxy) == 0)
+    def _handle_no_yolo_detections(self, original_image_pil, image_dims_val,
+                                 current_run_enable_landmark, lighting_info, places365_info) -> Dict:
+        """處理無 YOLO 檢測結果的情況。"""
+        tried_landmark_detection = False
+        landmark_detection_result = None
+        # 嘗試地標檢測
+        if original_image_pil and self.use_clip and current_run_enable_landmark:
+            landmark_detection_result = self._attempt_landmark_detection_no_yolo(
+                original_image_pil, image_dims_val, lighting_info
+            )
+            tried_landmark_detection = True
+            if landmark_detection_result:
+                return landmark_detection_result
+        # 如果地標檢測失敗或未嘗試，使用 CLIP 進行一般場景分析
+        if not landmark_detection_result and self.use_clip and original_image_pil:
+            clip_fallback_result = self._attempt_clip_fallback_analysis(
+                original_image_pil, image_dims_val, current_run_enable_landmark, lighting_info
+            )
+            if clip_fallback_result:
+                return clip_fallback_result
+        # 最終回退邏輯
+        return self._get_final_fallback_result(places365_info, lighting_info)
+    def _attempt_landmark_detection_no_yolo(self, original_image_pil, image_dims_val, lighting_info) -> Optional[Dict]:
+        """在無 YOLO 檢測的情況下嘗試地標檢測。"""
+        try:
+            # 初始化地標分類器（如果需要）
+            landmark_classifier = self.component_initializer.get_component('landmark_classifier')
+            if not landmark_classifier and self.clip_analyzer:
+                if hasattr(self.clip_analyzer, 'get_clip_instance'):
+                    try:
+                        model, preprocess, device = self.clip_analyzer.get_clip_instance()
+                        landmark_classifier = CLIPZeroShotClassifier(device=device)
+                        self.landmark_processing_manager.set_landmark_classifier(landmark_classifier)
+                        self.logger.info("Initialized landmark classifier with shared CLIP model")
+                    except Exception as e:
+                        self.logger.warning(f"Could not initialize landmark classifier: {e}")
+                        return None
+            if landmark_classifier:
+                self.logger.info("Attempting landmark detection with no YOLO boxes")
+                landmark_results_no_yolo = landmark_classifier.intelligent_landmark_search(
+                    original_image_pil, yolo_boxes=None, base_threshold=0.2  # 略微降低閾值，提高靈敏度
+                )
+                # 確保在無地標場景時返回有效結果
+                if landmark_results_no_yolo is None:
+                    landmark_results_no_yolo = {"is_landmark_scene": False, "detected_landmarks": []}
+                if (landmark_results_no_yolo and landmark_results_no_yolo.get("is_landmark_scene", False)):
+                    return self._process_landmark_detection_result(
+                        landmark_results_no_yolo, image_dims_val, lighting_info
+                    )
+        except Exception as e:
+            self.logger.error(f"Error in landmark-only detection path (analyze method): {e}")
+            traceback.print_exc()
+        return None
+    def _process_landmark_detection_result(self, landmark_results, image_dims_val, lighting_info) -> Dict:
+        """處理地標檢測結果並生成最終輸出。"""
+        primary_landmark = landmark_results.get("primary_landmark")
+        # 放寬閾值條件，以便捕獲更多潛在地標
+        if not primary_landmark or primary_landmark.get("confidence", 0) <= 0.25:
+            return None
+        detected_objects_from_landmarks_list = []
+        w_img, h_img = image_dims_val if image_dims_val else (1, 1)
+        for lm_info_item in landmark_results.get("detected_landmarks", []):
+            if lm_info_item.get("confidence", 0) > 0.25:  # 降低閾值與上面保持一致
+                # 安全獲取 box 值，避免索引錯誤
+                box = lm_info_item.get("box", [0, 0, w_img, h_img])
+                if len(box) < 4:
+                    box = [0, 0, w_img, h_img]
+                # 計算中心點和標準化坐標
+                center_x, center_y = (box[0] + box[2]) / 2, (box[1] + box[3]) / 2
+                norm_cx = center_x / w_img if w_img > 0 else 0.5
+                norm_cy = center_y / h_img if h_img > 0 else 0.5
+                # 決定地標類型
+                landmark_type = "architectural"  # 預設類型
+                landmark_id = lm_info_item.get("landmark_id", "")
+                landmark_classifier = self.component_initializer.get_component('landmark_classifier')
+                if (landmark_classifier and hasattr(landmark_classifier, '_determine_landmark_type') and landmark_id):
+                    try:
+                        landmark_type = landmark_classifier._determine_landmark_type(landmark_id)
+                    except Exception as e:
+                        self.logger.error(f"Error determining landmark type: {e}")
+                else:
+                    # 使用簡單的基於 ID 的啟發式方法推斷類型
+                    landmark_id_lower = landmark_id.lower() if isinstance(landmark_id, str) else ""
+                    if "natural" in landmark_id_lower or any(term in landmark_id_lower for term in ["mountain", "waterfall", "canyon", "lake"]):
+                        landmark_type = "natural"
+                    elif "monument" in landmark_id_lower or "memorial" in landmark_id_lower or "historical" in landmark_id_lower:
+                        landmark_type = "monument"
+                # 決定區域位置
+                region = "center"  # 預設值
+                if self.spatial_analyzer and hasattr(self.spatial_analyzer, '_determine_region'):
+                    try:
+                        region = self.spatial_analyzer._determine_region(norm_cx, norm_cy)
+                    except Exception as e:
+                        self.logger.error(f"Error determining region: {e}")
+                # 取得並補 location
+                loc_lm = lm_info_item.get("location", "")
+                if not loc_lm and landmark_id in ALL_LANDMARKS:
+                    loc_lm = ALL_LANDMARKS[landmark_id].get("location", "")
+                # 創建地標物體
+                landmark_obj = {
+                    "class_id": lm_info_item.get("landmark_id", f"LM_{lm_info_item.get('landmark_name','unk')}")[:15],
+                    "class_name": lm_info_item.get("landmark_name", "Unknown Landmark"),
+                    "confidence": lm_info_item.get("confidence", 0.0),
+                    "box": box,
+                    "center": (center_x, center_y),
+                    "normalized_center": (norm_cx, norm_cy),
+                    "size": (box[2] - box[0], box[3] - box[1]),
+                    "normalized_size": (
+                        (box[2] - box[0])/(w_img if w_img>0 else 1),
+                        (box[3] - box[1])/(h_img if h_img>0 else 1)
+                    ),
+                    "area": (box[2] - box[0]) * (box[3] - box[1]),
+                    "normalized_area": (
+                        (box[2] - box[0]) * (box[3] - box[1])
+                    ) / ((w_img*h_img) if w_img*h_img >0 else 1),
+                    "is_landmark": True,
+                    "landmark_id": landmark_id,
+                    "location": loc_lm or "Unknown Location",
+                    "region": region,
+                    "year_built": lm_info_item.get("year_built", ""),
+                    "architectural_style": lm_info_item.get("architectural_style", ""),
+                    "significance": lm_info_item.get("significance", ""),
+                    "landmark_type": landmark_type
+                }
+                detected_objects_from_landmarks_list.append(landmark_obj)
+        if not detected_objects_from_landmarks_list:
+            return None
+        # 設定場景類型
+        best_scene_val = "tourist_landmark"  # 預設
+        if primary_landmark:
+            try:
+                lm_type = primary_landmark.get("landmark_type", "architectural")
+                if lm_type and "natural" in lm_type.lower():
+                    best_scene_val = "natural_landmark"
+                elif lm_type and ("historical" in lm_type.lower() or "monument" in lm_type.lower()):
+                    best_scene_val = "historical_monument"
+            except Exception as e:
+                self.logger.error(f"Error determining scene type from landmark type: {e}")
+        # 確保場景類型有效
+        if best_scene_val not in self.scene_types:
+            best_scene_val = "tourist_landmark"  # 預設場景類型
+        # 設定置信度
+        scene_confidence = primary_landmark.get("confidence", 0.0) if primary_landmark else 0.0
+        # 生成其他必要的分析結果
+        region_analysis = self._generate_region_analysis(detected_objects_from_landmarks_list)
+        functional_zones = self._generate_functional_zones(
+            detected_objects_from_landmarks_list,
+            best_scene_val
+        )
+        scene_description = self._generate_scene_description(
+            best_scene_val, detected_objects_from_landmarks_list, scene_confidence,
+            lighting_info, functional_zones, image_dims_val
+        )
+        enhanced_description = self._enhance_description_with_llm(
+            scene_description, best_scene_val, detected_objects_from_landmarks_list,
+            scene_confidence, lighting_info, functional_zones, landmark_results, image_dims_val
+        )
+        possible_activities = self._extract_possible_activities(detected_objects_from_landmarks_list, landmark_results)
+        # 準備最終結果
+        return {
+            "scene_type": best_scene_val,
+            "scene_name": self.scene_types.get(best_scene_val, {}).get("name", "Landmark"),
+            "confidence": round(float(scene_confidence), 4),
+            "description": scene_description,
+            "enhanced_description": enhanced_description,
+            "objects_present": detected_objects_from_landmarks_list,
+            "object_count": len(detected_objects_from_landmarks_list),
+            "regions": region_analysis,
+            "possible_activities": possible_activities,
+            "functional_zones": functional_zones,
+            "detected_landmarks": [lm for lm in detected_objects_from_landmarks_list if lm.get("is_landmark", False)],
+            "primary_landmark": primary_landmark,
+            "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0.0}
+        }
+    def _attempt_clip_fallback_analysis(self, original_image_pil, image_dims_val,
+                                      current_run_enable_landmark, lighting_info) -> Optional[Dict]:
+        """嘗試使用 CLIP 進行一般場景分析。"""
+        try:
+            clip_analysis_val = None
+            if self.clip_analyzer and hasattr(self.clip_analyzer, 'analyze_image'):
+                try:
+                    clip_analysis_val = self.clip_analyzer.analyze_image(
+                        original_image_pil,
+                        enable_landmark=current_run_enable_landmark
+                    )
+                except Exception as e:
+                    self.logger.error(f"Error in CLIP analysis: {e}")
+            scene_type_llm = "llm_inferred_no_yolo"
+            confidence_llm = 0.0
+            if clip_analysis_val and isinstance(clip_analysis_val, dict):
+                top_scene = clip_analysis_val.get("top_scene")
+                if top_scene and isinstance(top_scene, tuple) and len(top_scene) >= 2:
+                    confidence_llm = top_scene[1]
+                    if isinstance(top_scene[0], str):
+                        scene_type_llm = top_scene[0]
+            desc_llm = "Primary object detection did not yield results. This description is based on overall image context."
+            w_llm, h_llm = image_dims_val if image_dims_val else (1, 1)
+            enhanced_desc_llm = self._enhance_no_detection_description(
+                desc_llm, scene_type_llm, confidence_llm, lighting_info,
+                clip_analysis_val, current_run_enable_landmark, w_llm, h_llm
+            )
+            # 安全類型轉換
+            try:
+                confidence_float = float(confidence_llm)
+            except (ValueError, TypeError):
+                confidence_float = 0.0
+            # 確保增強描述不為空
+            if not enhanced_desc_llm or not isinstance(enhanced_desc_llm, str):
+                enhanced_desc_llm = desc_llm
+            # 返回結果
+            return {
+                "scene_type": scene_type_llm,
+                "confidence": round(confidence_float, 4),
+                "description": desc_llm,
+                "enhanced_description": enhanced_desc_llm,
+                "objects_present": [],
+                "object_count": 0,
+                "regions": {},
+                "possible_activities": [],
+                "safety_concerns": [],
+                "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0.0}
+            }
+        except Exception as e:
+            self.logger.error(f"Error in CLIP no-detection fallback (analyze method): {e}")
+            traceback.print_exc()
+            return None
+    def _get_final_fallback_result(self, places365_info, lighting_info) -> Dict:
+        """獲取最終的回退結果。"""
+        # 檢查 Places365 是否提供有用的場景信息（即使沒有 YOLO 檢測）
+        fallback_scene_type = "unknown"
+        fallback_confidence = 0.0
+        fallback_description = "No objects were detected in the image, and contextual analysis could not be performed or failed."
+        if places365_info and places365_info.get('confidence', 0) > 0.3:
+            fallback_scene_type = places365_info.get('mapped_scene_type', 'unknown')
+            fallback_confidence = places365_info.get('confidence', 0.0)
+            fallback_description = f"Scene appears to be {places365_info.get('scene_label', 'an unidentified location')} based on overall visual context."
+        return {
+            "scene_type": fallback_scene_type,
+            "confidence": fallback_confidence,
+            "description": fallback_description,
+            "enhanced_description": "The image analysis system could not detect any recognizable objects or landmarks in this image.",
+            "objects_present": [],
+            "object_count": 0,
+            "regions": {},
+            "possible_activities": [],
+            "safety_concerns": [],
+            "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0.0}
+        }
+    def _handle_main_analysis_flow(self, detection_result, original_image_pil, image_dims_val,
+                                 class_confidence_threshold, scene_confidence_threshold,
+                                 current_run_enable_landmark, lighting_info, places365_info) -> Dict:
+        """
+        Run the main analysis flow (used when YOLO produced detections).
+        Steps, in order: extract objects above ``class_confidence_threshold``,
+        analyse regions, optionally identify landmarks among the objects,
+        compute YOLO-based and CLIP-based scene scores, fuse them, pick the
+        final scene type and assemble the final result.
+        Args:
+            detection_result: Detection result; its ``names`` attribute, when
+                present, is copied onto the spatial analyzer's ``class_names``.
+            original_image_pil: Source image, or None (CLIP analysis is then skipped).
+            image_dims_val: ``(width, height)`` of the image, or None.
+            class_confidence_threshold: Minimum confidence for extracted objects.
+            scene_confidence_threshold: Forwarded to ``_generate_final_result``.
+            current_run_enable_landmark: Landmark switch for this run.
+            lighting_info: Optional lighting analysis dict.
+            places365_info: Optional Places365 classification dict.
+        Returns:
+            Dict: An "unknown" scene result when no object passes the
+            confidence threshold, otherwise whatever ``_generate_final_result`` returns.
+        """
+        # Refresh the class-name mapping from the detection result.
+        if hasattr(detection_result, 'names'):
+            if hasattr(self.spatial_analyzer, 'class_names'):
+                self.spatial_analyzer.class_names = detection_result.names
+        # Extract the detected objects.
+        detected_objects_main = self.spatial_analyzer._extract_detected_objects(
+            detection_result,
+            confidence_threshold=class_confidence_threshold
+        )
+        if not detected_objects_main:
+            return {
+                "scene_type": "unknown", "confidence": 0.0,
+                "description": "No objects detected with sufficient confidence by the primary vision system.",
+                "objects_present": [], "object_count": 0, "regions": {}, "possible_activities": [],
+                "safety_concerns": [], "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0.0}
+            }
+        # Spatial analysis.
+        region_analysis_val = self.spatial_analyzer._analyze_regions(detected_objects_main)
+        # Landmark processing and integration.
+        landmark_objects_identified = []
+        landmark_specific_activities = []
+        final_landmark_info = {}
+        if self.use_clip and current_run_enable_landmark:
+            detected_objects_main, landmark_objects_identified = self.landmark_processing_manager.process_unknown_objects(
+                detection_result, detected_objects_main, self.clip_analyzer
+            )
+            if landmark_objects_identified:
+                landmark_specific_activities = self.landmark_processing_manager.extract_landmark_specific_activities(
+                    landmark_objects_identified
+                )
+                final_landmark_info = {
+                    "detected_landmarks": landmark_objects_identified,
+                    "primary_landmark": max(landmark_objects_identified, key=lambda x: x.get("confidence", 0.0), default=None),
+                    "detailed_landmarks": landmark_objects_identified
+                }
+        # When landmark detection is disabled for this run, strip any landmark objects.
+        if not current_run_enable_landmark:
+            detected_objects_main = [obj for obj in detected_objects_main if not obj.get("is_landmark", False)]
+            final_landmark_info = {}
+        # Compute the scene scores that will be fused below.
+        yolo_scene_scores = self.scene_scoring_engine.compute_scene_scores(
+            detected_objects_main, spatial_analysis_results=region_analysis_val
+        )
+        clip_scene_scores = {}
+        clip_analysis_results = None
+        if self.use_clip and original_image_pil is not None:
+            clip_analysis_results, clip_scene_scores = self._perform_clip_analysis(
+                original_image_pil, current_run_enable_landmark, lighting_info
+            )
+        # Fuse the scene scores; the YOLO statistics ignore landmark objects.
+        yolo_only_objects = [obj for obj in detected_objects_main if not obj.get("is_landmark")]
+        num_yolo_detections = len(yolo_only_objects)
+        avg_yolo_confidence = (sum(obj.get('confidence', 0.0) for obj in yolo_only_objects) / num_yolo_detections
+                              if num_yolo_detections > 0 else 0.0)
+        scene_scores_fused = self.scene_scoring_engine.fuse_scene_scores(
+            yolo_scene_scores, clip_scene_scores,
+            num_yolo_detections=num_yolo_detections,
+            avg_yolo_confidence=avg_yolo_confidence,
+            lighting_info=lighting_info,
+            places365_info=places365_info
+        )
+        # Determine the final scene type.
+        final_best_scene, final_scene_confidence = self.scene_scoring_engine.determine_scene_type(scene_scores_fused)
+        # With landmarks disabled, swap a landmark scene type for an alternative one.
+        if (not current_run_enable_landmark and
+            final_best_scene in ["tourist_landmark", "natural_landmark", "historical_monument"]):
+            alt_scene_type = self.landmark_processing_manager.get_alternative_scene_type(
+                final_best_scene, detected_objects_main, scene_scores_fused
+            )
+            final_best_scene = alt_scene_type
+            # Confidence comes from the fused scores; 0.6 is used when the alternative has no score.
+            final_scene_confidence = scene_scores_fused.get(alt_scene_type, 0.6)
+        # Generate the final descriptive content.
+        final_result = self._generate_final_result(
+            final_best_scene, final_scene_confidence, detected_objects_main,
+            landmark_specific_activities, landmark_objects_identified, final_landmark_info,
+            region_analysis_val, lighting_info, scene_scores_fused, current_run_enable_landmark,
+            clip_analysis_results, image_dims_val, scene_confidence_threshold
+        )
+        return final_result
+    def _perform_clip_analysis(self, original_image_pil, current_run_enable_landmark, lighting_info) -> Tuple[Optional[Dict], Dict]:
+        """執行 CLIP 分析。"""
+        clip_analysis_results = None
+        clip_scene_scores = {}
+        try:
+            clip_analysis_results = self.clip_analyzer.analyze_image(
+                original_image_pil,
+                enable_landmark=current_run_enable_landmark,
+                exclude_categories=["landmark", "tourist", "monument", "tower", "attraction", "scenic", "historical", "famous"] if not current_run_enable_landmark else None
+            )
+            if isinstance(clip_analysis_results, dict):
+                clip_scene_scores = clip_analysis_results.get("scene_scores", {})
+                # 如果禁用地標檢測，再次過濾
+                if not current_run_enable_landmark:
+                    clip_scene_scores = {k: v for k, v in clip_scene_scores.items()
+                                       if not any(kw in k.lower() for kw in ["landmark", "monument", "tourist"])}
+                    if "cultural_analysis" in clip_analysis_results:
+                        del clip_analysis_results["cultural_analysis"]
+                    if ("top_scene" in clip_analysis_results and
+                        any(term in clip_analysis_results.get("top_scene", ["unknown", 0.0])[0].lower()
+                            for term in ["landmark", "monument", "tourist"])):
+                        non_lm_cs = sorted([item for item in clip_scene_scores.items() if item[1] > 0],
+                                         key=lambda x: x[1], reverse=True)
+                        clip_analysis_results["top_scene"] = non_lm_cs[0] if non_lm_cs else ("unknown", 0.0)
+                # 處理照明信息回退
+                if (not lighting_info and "lighting_condition" in clip_analysis_results):
+                    lt, lc = clip_analysis_results.get("lighting_condition", ("unknown", 0.0))
+                    lighting_info = {"time_of_day": lt, "confidence": lc, "source": "CLIP_fallback"}
+        except Exception as e:
+            self.logger.error(f"Error in main CLIP analysis for YOLO path (analyze method): {e}")
+        return clip_analysis_results, clip_scene_scores
+    def _generate_final_result(self, final_best_scene, final_scene_confidence, detected_objects_main,
+                             landmark_specific_activities, landmark_objects_identified, final_landmark_info,
+                             region_analysis_val, lighting_info, scene_scores_fused, current_run_enable_landmark,
+                             clip_analysis_results, image_dims_val, scene_confidence_threshold) -> Dict:
+        """生成最終的分析結果。"""
+        # 生成最終的描述性內容（活動、安全、區域）
+        final_activities = []
+        # 通用活動推斷
+        generic_activities = []
+        if self.descriptor and hasattr(self.descriptor, '_infer_possible_activities'):
+            generic_activities = self.descriptor._infer_possible_activities(
+                final_best_scene, detected_objects_main,
+                enable_landmark=current_run_enable_landmark, scene_scores=scene_scores_fused
+            )
+        # 優先處理策略：使用特定地標活動，不足時才從通用活動補充
+        if landmark_specific_activities:
+            # 如果有特定活動，優先保留，去除與特定活動重複的通用活動
+            unique_generic_activities = [act for act in generic_activities if act not in landmark_specific_activities]
+            # 如果特定活動少於3個，從通用活動中補充
+            if len(landmark_specific_activities) < 3:
+                # 補充通用活動但總數不超過7個
+                supplement_count = min(3 - len(landmark_specific_activities), len(unique_generic_activities))
+                if supplement_count > 0:
+                    final_activities.extend(unique_generic_activities[:supplement_count])
+        else:
+            # 若無特定活動，則使用所有通用活動
+            final_activities.extend(generic_activities)
+        # 去重並排序，但確保特定地標活動保持在前面
+        final_activities_set = set(final_activities)
+        final_activities = []
+        # 先加入特定地標活動（按原順序）
+        for activity in landmark_specific_activities:
+            if activity in final_activities_set:
+                final_activities.append(activity)
+                final_activities_set.remove(activity)
+        # 再加入通用活動（按字母排序）
+        final_activities.extend(sorted(list(final_activities_set)))
+        # 安全問題識別
+        final_safety_concerns = []
+        if self.descriptor and hasattr(self.descriptor, '_identify_safety_concerns'):
+            final_safety_concerns = self.descriptor._identify_safety_concerns(detected_objects_main, final_best_scene)
+        # 功能區域識別
+        final_functional_zones = {}
+        if self.spatial_analyzer and hasattr(self.spatial_analyzer, '_identify_functional_zones'):
+            general_zones = self.spatial_analyzer._identify_functional_zones(detected_objects_main, final_best_scene)
+            final_functional_zones.update(general_zones)
+        # 地標相關的功能區域
+        if landmark_objects_identified and self.spatial_analyzer and hasattr(self.spatial_analyzer, '_identify_landmark_zones'):
+            landmark_zones = self.spatial_analyzer._identify_landmark_zones(landmark_objects_identified)
+            final_functional_zones.update(landmark_zones)
+        # 如果當前運行禁用地標檢測，過濾相關內容
+        if not current_run_enable_landmark:
+            final_functional_zones = {
+                        str(k): v
+                        for k, v in final_functional_zones.items()
+                        if (not str(k).isdigit())
+                        and (not any(kw in str(k).lower() for kw in ["landmark", "monument", "viewing", "tourist"]))
+                    }
+            current_activities_temp = [act for act in final_activities
+                                     if not any(kw in act.lower() for kw in ["sightsee", "photograph", "tour", "histor", "landmark", "monument", "cultur"])]
+            final_activities = current_activities_temp
+            if not final_activities and self.descriptor and hasattr(self.descriptor, '_infer_possible_activities'):
+                final_activities = self.descriptor._infer_possible_activities("generic_street_view", detected_objects_main, enable_landmark=False)
+        # 創建淨化的光線資訊，避免不合理的時間描述
+        lighting_info_clean = None
+        if lighting_info:
+            lighting_info_clean = {
+                "is_indoor": lighting_info.get("is_indoor"),
+                "confidence": lighting_info.get("confidence", 0.0),
+                "time_of_day": lighting_info.get("time_of_day", "unknown")
+            }
+        # 生成場景描述
+        base_scene_description = self._generate_scene_description(
+            final_best_scene, detected_objects_main, final_scene_confidence,
+            lighting_info_clean, final_functional_zones, image_dims_val
+        )
+        # 清理地標引用（如果禁用地標檢測）
+        if not current_run_enable_landmark:
+            base_scene_description = self.landmark_processing_manager.remove_landmark_references(base_scene_description)
+        # LLM 增強
+        enhanced_final_description = self._enhance_final_description(
+            base_scene_description, final_best_scene, final_scene_confidence, detected_objects_main,
+            final_functional_zones, final_activities, final_safety_concerns, lighting_info,
+            clip_analysis_results, current_run_enable_landmark, image_dims_val, final_landmark_info
+        )
+        # 清理增強描述的地標引用
+        if not current_run_enable_landmark:
+            enhanced_final_description = self.landmark_processing_manager.remove_landmark_references(enhanced_final_description)
+        # 構建最終輸出字典
+        output_result = {
+            "scene_type": final_best_scene if final_scene_confidence >= scene_confidence_threshold else "unknown",
+            "scene_name": (self.scene_types.get(final_best_scene, {}).get("name", "Unknown Scene")
+                          if final_scene_confidence >= scene_confidence_threshold else "Unknown Scene"),
+            "confidence": round(float(final_scene_confidence), 4),
+            "description": base_scene_description,
+            "enhanced_description": enhanced_final_description,
+            "objects_present": [{"class_id": obj.get("class_id", -1),
+                               "class_name": obj.get("class_name", "unknown"),
+                               "confidence": round(float(obj.get("confidence", 0.0)), 4)}
+                              for obj in detected_objects_main],
+            "object_count": len(detected_objects_main),
+            "regions": region_analysis_val,
+            "possible_activities": final_activities,
+            "safety_concerns": final_safety_concerns,
+            "functional_zones": final_functional_zones,
+            "lighting_conditions": lighting_info if lighting_info else {"time_of_day": "unknown", "confidence": 0.0, "source": "default"}
+        }
+        # 添加替代場景
+        if self.descriptor and hasattr(self.descriptor, '_get_alternative_scenes'):
+            output_result["alternative_scenes"] = self.descriptor._get_alternative_scenes(
+                scene_scores_fused, scene_confidence_threshold, top_k=2
+            )
+        # 添加地標相關信息
+        if current_run_enable_landmark and final_landmark_info and final_landmark_info.get("detected_landmarks"):
+            output_result.update(final_landmark_info)
+            if final_best_scene in ["tourist_landmark", "natural_landmark", "historical_monument"]:
+                output_result["scene_source"] = "landmark_detection"
+        elif not current_run_enable_landmark:
+            for key_rm in ["detected_landmarks", "primary_landmark", "detailed_landmarks", "scene_source"]:
+                if key_rm in output_result:
+                    del output_result[key_rm]
+        # 添加 CLIP 分析結果
+        if clip_analysis_results and isinstance(clip_analysis_results, dict) and "error" not in clip_analysis_results:
+            top_scene_clip = clip_analysis_results.get("top_scene", ("unknown", 0.0))
+            output_result["clip_analysis"] = {
+                "top_scene": (top_scene_clip[0], round(float(top_scene_clip[1]), 4)),
+                "cultural_analysis": clip_analysis_results.get("cultural_analysis", {}) if current_run_enable_landmark else {}
+            }
+        return output_result
+    # 輔助方法
+    def _generate_region_analysis(self, detected_objects):
+        """生成區域分析結果。"""
+        if self.spatial_analyzer and hasattr(self.spatial_analyzer, '_analyze_regions'):
+            try:
+                return self.spatial_analyzer._analyze_regions(detected_objects)
+            except Exception as e:
+                self.logger.error(f"Error analyzing regions: {e}")
+        return {}
+    def _generate_functional_zones(self, detected_objects, scene_type):
+        """
+        生成功能區域。
+        由於原本直接呼叫 _identify_landmark_zones，導致非地標場景必定回 {}。
+        這裡改為呼叫 _identify_functional_zones，並帶入 scene_type。
+        """
+        try:
+            # 如果 spatial_analyzer 可以識別 functional zones，就調用它
+            if self.spatial_analyzer and hasattr(self.spatial_analyzer, '_identify_functional_zones'):
+                return self.spatial_analyzer._identify_functional_zones(detected_objects, scene_type)
+        except Exception as e:
+            self.logger.error(f"Error identifying functional zones: {e}")
+            self.logger.error(traceback.format_exc())
+        return {}
+    def _generate_scene_description(self, scene_type, detected_objects, confidence,
+                                  lighting_info, functional_zones, image_dims):
+        """生成場景描述。"""
+        if self.scene_describer and hasattr(self.scene_describer, 'generate_description'):
+            try:
+                for obj in detected_objects:
+                    if obj.get("is_landmark"):
+                        loc_obj = obj.get("location", "")
+                        lm_id_obj = obj.get("landmark_id")
+                        if (not loc_obj) and lm_id_obj and lm_id_obj in ALL_LANDMARKS:
+                            obj["location"] = ALL_LANDMARKS[lm_id_obj].get("location", "")
+                return self.scene_describer.generate_description(
+                    scene_type=scene_type,
+                    detected_objects=detected_objects,
+                    confidence=confidence,
+                    lighting_info=lighting_info,
+                    functional_zones=list(functional_zones.keys()) if functional_zones else [],
+                    enable_landmark=self.enable_landmark,
+                    scene_scores={scene_type: confidence},
+                    spatial_analysis={},
+                    image_dimensions=image_dims
+                )
+            except Exception as e:
+                self.logger.error(f"Error generating scene description: {e}")
+        return f"A {scene_type} scene."
+    def _enhance_description_with_llm(self, scene_description, scene_type, detected_objects,
+                                    confidence, lighting_info, functional_zones, landmark_results, image_dims):
+        """使用 LLM 增強描述。"""
+        if not self.use_llm or not self.llm_enhancer:
+            return scene_description
+        try:
+            prominent_objects_detail = ""
+            if self.scene_describer and hasattr(self.scene_describer, 'format_object_list_for_description'):
+                try:
+                    prominent_objects_detail = self.scene_describer.format_object_list_for_description(
+                        detected_objects[:min(1, len(detected_objects))]
+                    )
+                except Exception as e:
+                    self.logger.error(f"Error formatting object list: {e}")
+            w_img, h_img = image_dims if image_dims else (1, 1)
+            scene_data_llm = {
+                "original_description": scene_description,
+                "scene_type": scene_type,
+                "scene_name": self.scene_types.get(scene_type, {}).get("name", "Landmark"),
+                "detected_objects": detected_objects,
+                "object_list": "landmark",
+                "confidence": confidence,
+                "lighting_info": lighting_info,
+                "functional_zones": functional_zones,
+                "clip_analysis": landmark_results.get("clip_analysis_on_full_image", {}),
+                "enable_landmark": True,
+                "image_width": w_img,
+                "image_height": h_img,
+                "prominent_objects_detail": prominent_objects_detail
+            }
+            return self.llm_enhancer.enhance_description(scene_data_llm)
+        except Exception as e:
+            self.logger.error(f"Error enhancing description with LLM: {e}")
+            traceback.print_exc()
+            return scene_description
+    def _enhance_no_detection_description(self, desc, scene_type, confidence, lighting_info,
+                                        clip_analysis, enable_landmark, width, height):
+        """增強無檢測結果的描述。"""
+        if not self.use_llm or not self.llm_enhancer:
+            return desc
+        try:
+            clip_analysis_safe = {}
+            if isinstance(clip_analysis, dict):
+                clip_analysis_safe = clip_analysis
+            scene_data_llm = {
+                "original_description": desc,
+                "scene_type": scene_type,
+                "scene_name": "Contextually Inferred (No Detections)",
+                "detected_objects": [],
+                "object_list": "general ambiance",
+                "confidence": confidence,
+                "lighting_info": lighting_info or {"time_of_day": "unknown", "confidence": 0.0},
+                "clip_analysis": clip_analysis_safe,
+                "enable_landmark": enable_landmark,
+                "image_width": width,
+                "image_height": height,
+                "prominent_objects_detail": "the overall visual context"
+            }
+            if hasattr(self.llm_enhancer, 'enhance_description'):
+                try:
+                    enhanced = self.llm_enhancer.enhance_description(scene_data_llm)
+                    if enhanced and len(enhanced.strip()) >= 20:
+                        return enhanced
+                except Exception as e:
+                    self.logger.error(f"Error in enhance_description: {e}")
+            if hasattr(self.llm_enhancer, 'handle_no_detection'):
+                try:
+                    return self.llm_enhancer.handle_no_detection(clip_analysis_safe)
+                except Exception as e:
+                    self.logger.error(f"Error in handle_no_detection: {e}")
+        except Exception as e:
+            self.logger.error(f"Error preparing data for LLM enhancement: {e}")
+            traceback.print_exc()
+        return desc
+    def _extract_possible_activities(self, detected_objects, landmark_results):
+        """提取可能的活動。"""
+        possible_activities = ["Sightseeing"]
+        # 檢查是否��主要地標活動從 CLIP 分析結果中獲取
+        primary_landmark_activities = landmark_results.get("primary_landmark_activities", [])
+        if primary_landmark_activities:
+            self.logger.info(f"Using {len(primary_landmark_activities)} landmark-specific activities")
+            possible_activities = primary_landmark_activities
+        else:
+            # 從檢測到的地標中提取特定活動
+            landmark_specific_activities = self.landmark_processing_manager.extract_landmark_specific_activities(detected_objects)
+            if landmark_specific_activities:
+                possible_activities = list(set(landmark_specific_activities))  # 去重
+                self.logger.info(f"Extracted {len(possible_activities)} activities from landmark data")
+            else:
+                # 回退到通用活動推斷
+                if self.descriptor and hasattr(self.descriptor, '_infer_possible_activities'):
+                    try:
+                        possible_activities = self.descriptor._infer_possible_activities(
+                            "tourist_landmark",
+                            detected_objects,
+                            enable_landmark=True,
+                            scene_scores={"tourist_landmark": 0.8}
+                        )
+                    except Exception as e:
+                        self.logger.error(f"Error inferring possible activities: {e}")
+        return possible_activities
+    def _enhance_final_description(self, base_description, scene_type, scene_confidence, detected_objects,
+                                 functional_zones, activities, safety_concerns, lighting_info,
+                                 clip_analysis_results, enable_landmark, image_dims, landmark_info):
+        """增強最終描述。"""
+        if not self.use_llm or not self.llm_enhancer:
+            return base_description
+        try:
+            obj_list_for_llm = ", ".join(sorted(list(set(
+                obj["class_name"] for obj in detected_objects
+                if obj.get("confidence", 0) > 0.4 and not obj.get("is_landmark")
+            ))))
+            if not obj_list_for_llm and enable_landmark and landmark_info.get("primary_landmark"):
+                obj_list_for_llm = landmark_info["primary_landmark"].get("class_name", "a prominent feature")
+            elif not obj_list_for_llm:
+                obj_list_for_llm = "various visual elements"
+            # 生成物體統計信息
+            object_statistics = {}
+            for obj in detected_objects:
+                class_name = obj.get("class_name", "unknown")
+                if class_name not in object_statistics:
+                    object_statistics[class_name] = {
+                        "count": 0,
+                        "avg_confidence": 0.0,
+                        "max_confidence": 0.0,
+                        "instances": []
+                    }
+                stats = object_statistics[class_name]
+                stats["count"] += 1
+                stats["instances"].append(obj)
+                stats["max_confidence"] = max(stats["max_confidence"], obj.get("confidence", 0.0))
+            # 計算平均信心度
+            for class_name, stats in object_statistics.items():
+                if stats["count"] > 0:
+                    total_conf = sum(inst.get("confidence", 0.0) for inst in stats["instances"])
+                    stats["avg_confidence"] = total_conf / stats["count"]
+            llm_scene_data = {
+                "original_description": base_description,
+                "scene_type": scene_type,
+                "scene_name": self.scene_types.get(scene_type, {}).get("name", "Unknown Scene"),
+                "detected_objects": detected_objects,
+                "object_list": obj_list_for_llm,
+                "object_statistics": object_statistics,
+                "confidence": scene_confidence,
+                "lighting_info": lighting_info,
+                "functional_zones": functional_zones,
+                "activities": activities,
+                "safety_concerns": safety_concerns,
+                "clip_analysis": clip_analysis_results if isinstance(clip_analysis_results, dict) else None,
+                "enable_landmark": enable_landmark,
+                "image_width": image_dims[0] if image_dims else None,
+                "image_height": image_dims[1] if image_dims else None,
+                "prominent_objects_detail": ""
+            }
+            # 添加顯著物體詳細信息
+            if self.scene_describer and hasattr(self.scene_describer, 'get_prominent_objects') and hasattr(self.scene_describer, 'format_object_list_for_description'):
+                try:
+                    prominent_objects = self.scene_describer.get_prominent_objects(
+                        detected_objects, min_prominence_score=0.1, max_categories_to_return=3, max_total_objects=7
+                    )
+                    llm_scene_data["prominent_objects_detail"] = self.scene_describer.format_object_list_for_description(prominent_objects)
+                except Exception as e:
+                    self.logger.error(f"Error getting prominent objects: {e}")
+            if enable_landmark and landmark_info.get("primary_landmark"):
+                llm_scene_data["primary_landmark_info"] = landmark_info["primary_landmark"]
+            return self.llm_enhancer.enhance_description(llm_scene_data)
+        except Exception as e:
+            self.logger.error(f"Error in LLM Enhancement in main flow (analyze method): {e}")
+            return base_description

scene_analyzer.py CHANGED Viewed

The diff for this file is too large to render. See raw diff

scene_scoring_engine.py ADDED Viewed

	@@ -0,0 +1,491 @@

+import logging
+import traceback
+from typing import Dict, List, Tuple, Optional, Any
+from scene_type import SCENE_TYPES
+class SceneScoringEngine:
+    """
+    Handles all scene-scoring computation: scoring scenes from YOLO detections,
+    fusing several sets of scene scores, and determining the final scene type.
+    Score computation here mixes YOLO, CLIP and Places365 results.
+    """
+    # Everyday scene types, which receive special treatment during scoring
+    EVERYDAY_SCENE_TYPE_KEYS = [
+        "general_indoor_space", "generic_street_view",
+        "desk_area_workspace", "outdoor_gathering_spot",
+        "kitchen_counter_or_utility_area"
+    ]
+    def __init__(self, scene_types: Dict[str, Any], enable_landmark: bool = True):
+        """
+        初始化場景評分引擎。
+        Args:
+            scene_types: 場景類型定義字典
+            enable_landmark: 是否啟用地標檢測功能
+        """
+        self.logger = logging.getLogger(__name__)
+        self.scene_types = scene_types
+        self.enable_landmark = enable_landmark
+    def compute_scene_scores(self, detected_objects: List[Dict],
+                           spatial_analysis_results: Optional[Dict] = None) -> Dict[str, float]:
+        """
+        基於檢測到的物體計算各場景類型的置信度分數。
+        增強了對日常場景的評分能力，並考慮物體豐富度和空間聚合性。
+        Args:
+            detected_objects: 檢測到的物體列表，包含物體詳細資訊
+            spatial_analysis_results: 空間分析器的輸出結果，特別是 'objects_by_region' 部分
+        Returns:
+            場景類型到置信度分數的映射字典
+        """
+        scene_scores = {}
+        if not detected_objects:
+            for scene_type_key in self.scene_types:
+                scene_scores[scene_type_key] = 0.0
+            return scene_scores
+        # 準備檢測物體的數據
+        detected_class_ids_all = [obj["class_id"] for obj in detected_objects]
+        detected_classes_set_all = set(detected_class_ids_all)
+        class_counts_all = {}
+        for obj in detected_objects:
+            class_id = obj["class_id"]
+            class_counts_all[class_id] = class_counts_all.get(class_id, 0) + 1
+        # 評估 scene_types 中定義的每個場景類型
+        for scene_type, scene_def in self.scene_types.items():
+            required_obj_ids_defined = set(scene_def.get("required_objects", []))
+            optional_obj_ids_defined = set(scene_def.get("optional_objects", []))
+            min_required_matches_needed = scene_def.get("minimum_required", 0)
+            # 確定哪些實際檢測到的物體與此場景類型相關
+            # 這些列表將存儲實際檢測到的物體字典，而不僅僅是 class_ids
+            actual_required_objects_found_list = []
+            for req_id in required_obj_ids_defined:
+                if req_id in detected_classes_set_all:
+                    # 找到此必需物體的第一個實例添加到列表中（用於後續的聚合性檢查）
+                    for dobj in detected_objects:
+                        if dobj['class_id'] == req_id:
+                            actual_required_objects_found_list.append(dobj)
+                            break
+            num_required_matches_found = len(actual_required_objects_found_list)
+            actual_optional_objects_found_list = []
+            for opt_id in optional_obj_ids_defined:
+                if opt_id in detected_classes_set_all:
+                    for dobj in detected_objects:
+                        if dobj['class_id'] == opt_id:
+                            actual_optional_objects_found_list.append(dobj)
+                            break
+            num_optional_matches_found = len(actual_optional_objects_found_list)
+            # 初始分數計算權重
+            # 基礎分數：55% 來自必需物體，25% 來自可選物體，10% 豐富度，10% 聚合性（最大值）
+            required_weight = 0.55
+            optional_weight = 0.25
+            richness_bonus_max = 0.10
+            cohesion_bonus_max = 0.10  # _get_object_spatial_cohesion_score 的最大獎勵是 0.1
+            current_scene_score = 0.0
+            objects_to_check_for_cohesion = []  # 用於空間聚合性評分
+            # 檢查 minimum_required 條件並計算基礎分數
+            if num_required_matches_found >= min_required_matches_needed:
+                if len(required_obj_ids_defined) > 0:
+                    required_ratio = num_required_matches_found / len(required_obj_ids_defined)
+                else:  # 沒有定義必需物體，但 min_required_matches_needed 可能為 0
+                    required_ratio = 1.0 if min_required_matches_needed == 0 else 0.0
+                current_scene_score = required_ratio * required_weight
+                objects_to_check_for_cohesion.extend(actual_required_objects_found_list)
+                # 從可選物體添加分數
+                if len(optional_obj_ids_defined) > 0:
+                    optional_ratio = num_optional_matches_found / len(optional_obj_ids_defined)
+                    current_scene_score += optional_ratio * optional_weight
+                objects_to_check_for_cohesion.extend(actual_optional_objects_found_list)
+            # 日常場景的靈活處理，如果嚴格的 minimum_required（基於 'required_objects'）未滿足
+            elif scene_type in self.EVERYDAY_SCENE_TYPE_KEYS:
+                # 如果日常場景有許多可選項目，它仍可能是一個弱候選
+                # 檢查是否存在相當比例的 'optional_objects'
+                if (len(optional_obj_ids_defined) > 0 and
+                    (num_optional_matches_found / len(optional_obj_ids_defined)) >= 0.25):  # 例如，至少 25% 的典型可選項目
+                    # 對這些類型的基礎分數更多地基於可選物體的滿足度
+                    current_scene_score = (num_optional_matches_found / len(optional_obj_ids_defined)) * (required_weight + optional_weight * 0.5)  # 給予一些基礎分數
+                    objects_to_check_for_cohesion.extend(actual_optional_objects_found_list)
+                else:
+                    scene_scores[scene_type] = 0.0
+                    continue  # 跳過此場景類型
+            else:  # 對於非日常場景，如果未滿足 minimum_required，分數為 0
+                scene_scores[scene_type] = 0.0
+                continue
+            # 物體豐富度/多樣性的獎勵
+            # 考慮找到的與場景定義相關的唯一物體類別
+            relevant_defined_class_ids = required_obj_ids_defined.union(optional_obj_ids_defined)
+            unique_relevant_detected_classes = relevant_defined_class_ids.intersection(detected_classes_set_all)
+            object_richness_score = 0.0
+            if len(relevant_defined_class_ids) > 0:
+                richness_ratio = len(unique_relevant_detected_classes) / len(relevant_defined_class_ids)
+                object_richness_score = min(richness_bonus_max, richness_ratio * 0.15)  # 豐富度最大 10% 獎勵
+            current_scene_score += object_richness_score
+            # 空間聚合性的獎勵（如果提供了 spatial_analysis_results）
+            spatial_cohesion_bonus = 0.0
+            if spatial_analysis_results and objects_to_check_for_cohesion:
+                spatial_cohesion_bonus = self._get_object_spatial_cohesion_score(
+                    objects_to_check_for_cohesion,  # 傳遞實際檢測到的物體字典列表
+                    spatial_analysis_results
+                )
+            current_scene_score += spatial_cohesion_bonus  # 此獎勵最大 0.1
+            # 關鍵物體多個實例的獎勵（原始邏輯的精煉版）
+            multiple_instance_bonus = 0.0
+            # 對於多實例獎勵，專注於場景定義中心的物體
+            key_objects_for_multi_instance_check = required_obj_ids_defined
+            if scene_type in self.EVERYDAY_SCENE_TYPE_KEYS and len(optional_obj_ids_defined) > 0:
+                # 對於日常場景，如果某些可選物體多次出現，也可以是關鍵的
+                # 例如，"general_indoor_space" 中的多把椅子
+                key_objects_for_multi_instance_check = key_objects_for_multi_instance_check.union(
+                    set(list(optional_obj_ids_defined)[:max(1, len(optional_obj_ids_defined)//2)])  # 考慮前半部分的可選物體
+                )
+            for class_id_check in key_objects_for_multi_instance_check:
+                if class_id_check in detected_classes_set_all and class_counts_all.get(class_id_check, 0) > 1:
+                    multiple_instance_bonus += 0.025  # 每種類型稍微小一點的獎勵
+            current_scene_score += min(0.075, multiple_instance_bonus)  # 最大 7.5% 獎勵
+            # 應用 SCENE_TYPES 中定義的場景特定優先級
+            if "priority" in scene_def:
+                current_scene_score *= scene_def["priority"]
+            scene_scores[scene_type] = min(1.0, max(0.0, current_scene_score))
+        # 如果通過實例屬性 self.enable_landmark 禁用地標檢測，
+        # 確保地標特定場景類型的分數被歸零。
+        if not self.enable_landmark:
+            landmark_scene_types = ["tourist_landmark", "natural_landmark", "historical_monument"]
+            for lm_scene_type in landmark_scene_types:
+                if lm_scene_type in scene_scores:
+                    scene_scores[lm_scene_type] = 0.0
+        return scene_scores
+    def _get_object_spatial_cohesion_score(self, objects_for_scene: List[Dict],
+                                         spatial_analysis_results: Optional[Dict]) -> float:
+        """
+        基於場景關鍵物體的空間聚合程度計算分數。
+        較高的分數意味著物體在較少的區域中更加集中。
+        這是一個啟發式方法，可以進一步精煉。
+        Args:
+            objects_for_scene: 與當前評估��景類型相關的檢測物體列表（至少包含 'class_id' 的字典）
+            spatial_analysis_results: SpatialAnalyzer._analyze_regions 的輸出
+                                    預期格式：{'objects_by_region': {'region_name': [{'class_id': id, ...}, ...]}}
+        Returns:
+            float: 聚合性分數，通常是小額獎勵（例如，0.0 到 0.1）
+        """
+        if (not objects_for_scene or not spatial_analysis_results or
+            "objects_by_region" not in spatial_analysis_results or
+            not spatial_analysis_results["objects_by_region"]):
+            return 0.0
+        # 獲取定義當前場景類型的關鍵物體的 class_ids 集合
+        key_object_class_ids = {obj.get('class_id') for obj in objects_for_scene if obj.get('class_id') is not None}
+        if not key_object_class_ids:
+            return 0.0
+        # 找出這些關鍵物體出現在哪些區域
+        regions_containing_key_objects = set()
+        # 計算找到的關鍵物體實例數量
+        # 這有助於區分 1 個區域中的 1 把椅子與分佈在 5 個區域中的 5 把椅子
+        total_key_object_instances_found = 0
+        for region_name, objects_in_region_list in spatial_analysis_results["objects_by_region"].items():
+            region_has_key_object = False
+            for obj_in_region in objects_in_region_list:
+                if obj_in_region.get('class_id') in key_object_class_ids:
+                    region_has_key_object = True
+                    total_key_object_instances_found += 1  # 計算每個實例
+            if region_has_key_object:
+                regions_containing_key_objects.add(region_name)
+        num_distinct_key_objects_in_scene = len(key_object_class_ids)  # 關鍵物體的類型數量
+        num_instances_of_key_objects_passed = len(objects_for_scene)  # 傳遞的實例數量
+        if not regions_containing_key_objects or num_instances_of_key_objects_passed == 0:
+            return 0.0
+        # 簡單的啟發式方法：
+        if (len(regions_containing_key_objects) == 1 and
+            total_key_object_instances_found >= num_instances_of_key_objects_passed * 0.75):
+            return 0.10  # 最強聚合性：大部分/所有關鍵物體實例在單個區域中
+        elif (len(regions_containing_key_objects) <= 2 and
+              total_key_object_instances_found >= num_instances_of_key_objects_passed * 0.60):
+            return 0.05  # 中等聚合性：大部分/所有關鍵物體實例在最多兩個區域中
+        elif (len(regions_containing_key_objects) <= 3 and
+              total_key_object_instances_found >= num_instances_of_key_objects_passed * 0.50):
+            return 0.02  # 較弱聚合性
+        return 0.0
+    def determine_scene_type(self, scene_scores: Dict[str, float]) -> Tuple[str, float]:
+        """
+        基於分數確定最可能的場景類型。如果偵測到地標分數夠高，則優先回傳 "tourist_landmark"。
+        Args:
+            scene_scores: 場景類型到置信度分數的映射字典
+        Returns:
+            (最佳場景類型, 置信度) 的元組
+        """
+        if not scene_scores:
+            return "unknown", 0.0
+        # 檢查地標相關分數是否達到門檻，如果是，直接回傳 "tourist_landmark"
+        # 假設場景分數 dictionary 中，"tourist_landmark"、"historical_monument"、"natural_landmark" 三個 key
+        # 分別代表不同類型地標。將它們加總，若總分超過 0.3，就認定為地標場景。
+        landmark_score = (
+            scene_scores.get("tourist_landmark", 0.0) +
+            scene_scores.get("historical_monument", 0.0) +
+            scene_scores.get("natural_landmark", 0.0)
+        )
+        if landmark_score >= 0.3:
+            # 回傳地標場景類型，以及該分數總和
+            return "tourist_landmark", float(landmark_score)
+        # 找分數最高的那個場景
+        best_scene = max(scene_scores, key=scene_scores.get)
+        best_score = scene_scores[best_scene]
+        return best_scene, float(best_score)
+    def fuse_scene_scores(self, yolo_scene_scores: Dict[str, float],
+                         clip_scene_scores: Dict[str, float],
+                         num_yolo_detections: int = 0,
+                         avg_yolo_confidence: float = 0.0,
+                         lighting_info: Optional[Dict] = None,
+                         places365_info: Optional[Dict] = None) -> Dict[str, float]:
+        """
+        融合來自 YOLO 物體檢測、CLIP 分析和 Places365 場景分類的場景分數。
+        根據場景類型、YOLO 檢測的豐富度、照明資訊和 Places365 置信度調整權重。
+        Args:
+            yolo_scene_scores: 基於 YOLO 物體檢測的場景分數
+            clip_scene_scores: 基於 CLIP 分析的場景分數
+            num_yolo_detections: YOLO 檢測到的置信度足夠的非地標物體總數
+            avg_yolo_confidence: YOLO 檢測到的非地標物體��平均置信度
+            lighting_info: 可選的照明條件分析結果，預期包含 'is_indoor' (bool) 和 'confidence' (float)
+            places365_info: 可選的 Places365 場景分類結果，預期包含 'mapped_scene_type'、'confidence' 和 'is_indoor'
+        Returns:
+            Dict: 融合了所有三個分析來源的場景分數
+        """
+        # 處理其中一個分數字典可能為空或所有分數實際上為零的情況
+        # 提取和處理 Places365 場景分數
+        places365_scene_scores_map = {}  # 修改變數名稱以避免與傳入的字典衝突
+        if places365_info and places365_info.get('confidence', 0) > 0.1:
+            mapped_scene_type = places365_info.get('mapped_scene_type', 'unknown')
+            places365_confidence = places365_info.get('confidence', 0.0)
+            if mapped_scene_type in self.scene_types.keys():
+                places365_scene_scores_map[mapped_scene_type] = places365_confidence  # 使用新的字典
+                self.logger.info(f"Places365 contributing: {mapped_scene_type} with confidence {places365_confidence:.3f}")
+        # 檢查各個數據來源是否具有有意義的分數
+        yolo_has_meaningful_scores = bool(yolo_scene_scores and any(s > 1e-5 for s in yolo_scene_scores.values()))  # 確保是布林值
+        clip_has_meaningful_scores = bool(clip_scene_scores and any(s > 1e-5 for s in clip_scene_scores.values()))  # 確保是布林值
+        places365_has_meaningful_scores = bool(places365_scene_scores_map and any(s > 1e-5 for s in places365_scene_scores_map.values()))
+        # 計算有意義的數據來源數量
+        meaningful_sources_count = sum([
+            yolo_has_meaningful_scores,
+            clip_has_meaningful_scores,
+            places365_has_meaningful_scores
+        ])
+        # 處理特殊情況：無有效數據源或僅有單一數據源
+        if meaningful_sources_count == 0:
+            return {st: 0.0 for st in self.scene_types.keys()}
+        elif meaningful_sources_count == 1:
+            if yolo_has_meaningful_scores:
+                return {st: yolo_scene_scores.get(st, 0.0) for st in self.scene_types.keys()}
+            elif clip_has_meaningful_scores:
+                return {st: clip_scene_scores.get(st, 0.0) for st in self.scene_types.keys()}
+            elif places365_has_meaningful_scores:
+                return {st: places365_scene_scores_map.get(st, 0.0) for st in self.scene_types.keys()}
+        # 初始化融合分數結果字典
+        fused_scores = {}
+        all_relevant_scene_types = set(self.scene_types.keys())
+        all_possible_scene_types = all_relevant_scene_types.union(
+            set(yolo_scene_scores.keys()),
+            set(clip_scene_scores.keys()),
+            set(places365_scene_scores_map.keys())
+        )
+        # 基礎權重 - 調整以適應三個來源
+        default_yolo_weight = 0.5
+        default_clip_weight = 0.3
+        default_places365_weight = 0.2
+        is_lighting_indoor = None
+        lighting_analysis_confidence = 0.0
+        if lighting_info and isinstance(lighting_info, dict):
+            is_lighting_indoor = lighting_info.get("is_indoor")
+            lighting_analysis_confidence = lighting_info.get("confidence", 0.0)
+        for scene_type in all_possible_scene_types:
+            yolo_score = yolo_scene_scores.get(scene_type, 0.0)
+            clip_score = clip_scene_scores.get(scene_type, 0.0)
+            places365_score = places365_scene_scores_map.get(scene_type, 0.0)
+            current_yolo_weight = default_yolo_weight
+            current_clip_weight = default_clip_weight
+            current_places365_weight = default_places365_weight
+            scene_definition = self.scene_types.get(scene_type, {})
+            # 基於場景類型性質和 YOLO 豐富度的權重調整
+            if scene_type in self.EVERYDAY_SCENE_TYPE_KEYS:
+                # Places365 在日常場景分類方面表現出色
+                if num_yolo_detections >= 5 and avg_yolo_confidence >= 0.45:  # 豐富的 YOLO 用於日常場景
+                    current_yolo_weight = 0.60
+                    current_clip_weight = 0.15
+                    current_places365_weight = 0.25
+                elif num_yolo_detections >= 3:  # 中等 YOLO 用於日常場景
+                    current_yolo_weight = 0.50
+                    current_clip_weight = 0.20
+                    current_places365_weight = 0.30
+                else:  # 降低 YOLO 用於日常場景，更多依賴 Places365
+                    current_yolo_weight = 0.35
+                    current_clip_weight = 0.25
+                    current_places365_weight = 0.40
+            # 對於 CLIP 的全域理解或特定訓練通常更有價值的場景
+            elif any(keyword in scene_type.lower() for keyword in ["asian", "cultural", "aerial", "landmark", "monument", "tourist", "natural_landmark", "historical_monument"]):
+                current_yolo_weight = 0.25
+                current_clip_weight = 0.65
+                current_places365_weight = 0.10  # 地標場景的較低權重
+            # 對於特定室內常見場景（非地標），物體檢測是關鍵，但 Places365 提供強大的場景上下文
+            elif any(keyword in scene_type.lower() for keyword in
+                    ["room", "kitchen", "office", "bedroom", "desk_area", "indoor_space",
+                     "professional_kitchen", "cafe", "library", "gym", "retail_store",
+                     "supermarket", "classroom", "conference_room", "medical_facility",
+                     "educational_setting", "dining_area"]):
+                current_yolo_weight = 0.55
+                current_clip_weight = 0.20
+                current_places365_weight = 0.25
+            # 對於特定室外常見場景（非地標），物體仍然重要
+            elif any(keyword in scene_type.lower() for keyword in
+                    ["parking_lot", "park_area", "beach", "harbor", "playground", "sports_field", "bus_stop", "train_station", "airport"]):
+                current_yolo_weight = 0.50
+                current_clip_weight = 0.25
+                current_places365_weight = 0.25
+            # 如果為此次運行全域禁用地標檢測
+            if not self.enable_landmark:
+                if any(keyword in scene_type.lower() for keyword in ["landmark", "monument", "tourist"]):
+                    yolo_score = 0.0  # 應該已經從 compute_scene_scores 中為 0
+                    clip_score *= 0.05  # 重度懲罰
+                    places365_score *= 0.8 if scene_type not in self.EVERYDAY_SCENE_TYPE_KEYS else 1.0  # 地標場景的輕微懲罰
+                elif (scene_type not in self.EVERYDAY_SCENE_TYPE_KEYS and
+                      not any(keyword in scene_type.lower() for keyword in ["asian", "cultural", "aerial"])):
+                    # 將權重從 CLIP 重新分配給 YOLO 和 Places365
+                    weight_boost = 0.05
+                    current_yolo_weight = min(0.9, current_yolo_weight + weight_boost)
+                    current_places365_weight = min(0.9, current_places365_weight + weight_boost)
+                    current_clip_weight = max(0.1, current_clip_weight - weight_boost * 2)
+            # 如果 Places365 對此特定場景類型有高置信度，則提升其權重
+            if places365_score > 0.0 and places365_info:  # 這裡的 places365_score 已經是從 map 中獲取
+                places365_original_confidence = places365_info.get('confidence', 0.0)  # 獲取原始的 Places365 信心度
+                if places365_original_confidence > 0.7:
+                    boost_factor = min(0.2, (places365_original_confidence - 0.7) * 0.4)
+                    current_places365_weight += boost_factor
+                    total_other_weight = current_yolo_weight + current_clip_weight
+                    if total_other_weight > 0:
+                        reduction_factor = boost_factor / total_other_weight
+                        current_yolo_weight *= (1 - reduction_factor)
+                        current_clip_weight *= (1 - reduction_factor)
+            # 權重標準化處理
+            total_weight = current_yolo_weight + current_clip_weight + current_places365_weight
+            if total_weight > 0:  # 避免除以零
+                current_yolo_weight /= total_weight
+                current_clip_weight /= total_weight
+                current_places365_weight /= total_weight
+            else:
+                current_yolo_weight = 1/3
+                current_clip_weight = 1/3
+                current_places365_weight = 1/3
+             # 計算融合score
+            fused_score = (yolo_score * current_yolo_weight) + (clip_score * current_clip_weight) + (places365_score * current_places365_weight)
+            # 處理室內外判斷的衝突分析
+            places365_is_indoor = None
+            places365_confidence_for_indoor = 0.0
+            effective_is_indoor = is_lighting_indoor
+            effective_confidence = lighting_analysis_confidence
+            if places365_info and isinstance(places365_info, dict):
+                places365_is_indoor = places365_info.get('is_indoor')
+                places365_confidence_for_indoor = places365_info.get('confidence', 0.0)
+                # Places365 在置信度高時覆蓋照明分析
+                if places365_confidence_for_indoor >= 0.8 and places365_is_indoor is not None:
+                    effective_is_indoor = places365_is_indoor
+                    effective_confidence = places365_confidence_for_indoor
+                    # 只在特定場景類型首次處理時輸出調試資訊
+                    if (scene_type == "intersection" or
+                        (scene_type in ["urban_intersection", "street_view"] and
+                         scene_type == sorted(all_possible_scene_types)[0])):
+                        self.logger.debug(f"Using Places365 indoor/outdoor decision: {places365_is_indoor} (confidence: {places365_confidence_for_indoor:.3f}) over lighting analysis")
+            if effective_is_indoor is not None and effective_confidence >= 0.65:
+                # 基於其定義確定場景類型本質上是室內還是室外
+                is_defined_as_indoor = ("indoor" in scene_definition.get("description", "").lower() or
+                                       any(kw in scene_type.lower() for kw in ["room", "kitchen", "office", "indoor", "library", "cafe", "gym"]))
+                is_defined_as_outdoor = ("outdoor" in scene_definition.get("description", "").lower() or
+                                        any(kw in scene_type.lower() for kw in ["street", "park", "aerial", "beach", "harbor", "intersection", "crosswalk"]))
+                lighting_adjustment_strength = 0.20  # 最大調整因子（例如，20%）
+                # 根據分析在閾值以上的置信度來縮放調整
+                adjustment_scale = (effective_confidence - 0.65) / (1.0 - 0.65)  # 從 0 到 1 縮放
+                adjustment = lighting_adjustment_strength * adjustment_scale
+                adjustment = min(lighting_adjustment_strength, max(0, adjustment))  # 限制調整
+                if effective_is_indoor and is_defined_as_outdoor:
+                    fused_score *= (1.0 - adjustment)
+                elif not effective_is_indoor and is_defined_as_indoor:
+                    fused_score *= (1.0 - adjustment)
+                elif effective_is_indoor and is_defined_as_indoor:
+                    fused_score = min(1.0, fused_score * (1.0 + adjustment * 0.5))
+                elif not effective_is_indoor and is_defined_as_outdoor:
+                    fused_score = min(1.0, fused_score * (1.0 + adjustment * 0.5))
+            fused_scores[scene_type] = min(1.0, max(0.0, fused_score))
+        return fused_scores
+    def update_enable_landmark_status(self, enable_landmark: bool) -> None:
+        """
+        Update whether landmark detection is enabled.
+        The flag is stored on the instance as self.enable_landmark.
+        Args:
+            enable_landmark: True to enable landmark detection, False to disable it.
+        """
+        self.enable_landmark = enable_landmark

scene_viewpoint_analyzer.py ADDED Viewed

	@@ -0,0 +1,311 @@

+import logging
+import traceback
+import numpy as np
+from typing import Dict, List, Any, Optional, Tuple
+logger = logging.getLogger(__name__)
+class SceneViewpointAnalyzer:
+    """
+    負責場景視角檢測和模式識別
+    專注於檢測場景視角（俯視、平視等）並識別特殊場景模式（如十字路口、人流方向等）
+    提供詳細的場景空間分析和視角相關的場景理解功能
+    """
+    def __init__(self, enhanced_scene_describer=None):
+        """
+        初始化場景視角分析器
+        Args:
+            enhanced_scene_describer: 增強場景描述器實例，用於基本視角檢測
+        """
+        try:
+            self.enhanced_scene_describer = enhanced_scene_describer
+            logger.info("SceneViewpointAnalyzer initialized successfully")
+        except Exception as e:
+            logger.error(f"Failed to initialize SceneViewpointAnalyzer: {str(e)}")
+            logger.error(traceback.format_exc())
+            raise
+    def detect_viewpoint(self, detected_objects: List[Dict]) -> str:
+        """
+        檢測圖像視角類型
+        Args:
+            detected_objects: 檢測到的物件列表
+        Returns:
+            str: 檢測到的視角類型
+        """
+        try:
+            # 使用內部的場景視角檢測方法
+            viewpoint_info = self.detect_scene_viewpoint(detected_objects)
+            return viewpoint_info.get("viewpoint", "eye_level")
+        except Exception as e:
+            logger.warning(f"Error detecting viewpoint: {str(e)}")
+            return "eye_level"
+    def get_viewpoint_confidence(self, detected_objects: List[Dict]) -> Tuple[str, float]:
+        """
+        獲取視角檢測結果及其信心度
+        Args:
+            detected_objects: 檢測到的物件列表
+        Returns:
+            Tuple[str, float]: (視角類型, 信心度)
+        """
+        try:
+            viewpoint_info = self.detect_scene_viewpoint(detected_objects)
+            viewpoint = viewpoint_info.get("viewpoint", "eye_level")
+            # 根據檢測到的模式計算信心度
+            patterns = viewpoint_info.get("patterns", [])
+            confidence = 0.5  # 基礎信心度
+            if "crosswalk_intersection" in patterns:
+                confidence += 0.3
+            if "consistent_object_size" in patterns:
+                confidence += 0.2
+            if "multi_directional_movement" in patterns:
+                confidence += 0.1
+            confidence = min(confidence, 1.0)
+            return viewpoint, confidence
+        except Exception as e:
+            logger.error(f"Error getting viewpoint confidence: {str(e)}")
+            return "eye_level", 0.5
+    def detect_scene_viewpoint(self, detected_objects: List[Dict]) -> Dict:
+        """
+        檢測場景視角並識別特殊場景模式
+        Args:
+            detected_objects: 檢測到的物件列表
+        Returns:
+            包含視角和場景模式資訊的字典
+        """
+        try:
+            if not detected_objects:
+                logger.warning("No detected objects provided for viewpoint detection")
+                return {"viewpoint": "eye_level", "patterns": []}
+            # 從物件位置中提取資訊
+            patterns = []
+            # 檢測行人位置模式 - 篩選出所有行人物件
+            pedestrian_objs = [obj for obj in detected_objects if obj.get("class_id") == 0]
+            # 檢查是否有足夠的行人來識別模式 - 至少需要4個行人才能進行模式分析
+            if len(pedestrian_objs) >= 4:
+                # 提取行人的標準化中心座標用於模式分析
+                pedestrian_positions = [obj["normalized_center"] for obj in pedestrian_objs]
+                # 檢測十字交叉模式 - 這通常出現在斑馬線交叉口的俯視圖
+                if self._detect_cross_pattern(pedestrian_positions):
+                    patterns.append("crosswalk_intersection")
+                # 檢測多方向行人流 - 分析行人是否在多個方向移動
+                directions = self._analyze_movement_directions(pedestrian_positions)
+                if len(directions) >= 2:
+                    patterns.append("multi_directional_movement")
+            # 檢查物件的大小一致性 - 在空中俯視圖中，物件大小通常更一致
+            # 因為距離相對均勻，不像地面視角會有遠近差異
+            if len(detected_objects) >= 5:
+                sizes = [obj.get("normalized_area", 0) for obj in detected_objects]
+                # 計算標準化變異數，避免受平均值影響
+                size_variance = np.var(sizes) / (np.mean(sizes) ** 2) if np.mean(sizes) > 0 else 0
+                # 低變異表示大小一致，可能是俯視角度
+                if size_variance < 0.3:
+                    patterns.append("consistent_object_size")
+            # 基本視角檢測 - 使用增強場景描述器進行基礎視角判斷
+            viewpoint = "eye_level"  # 預設值
+            if self.enhanced_scene_describer and hasattr(self.enhanced_scene_describer, '_detect_viewpoint'):
+                viewpoint = self.enhanced_scene_describer._detect_viewpoint(detected_objects)
+            # 根據檢測到的模式增強視角判斷
+            # 如果檢測到斑馬線交叉但視角判斷不是空中視角，優先採用模式判斷
+            if "crosswalk_intersection" in patterns and viewpoint != "aerial":
+                viewpoint = "aerial"
+            result = {
+                "viewpoint": viewpoint,
+                "patterns": patterns
+            }
+            logger.info(f"Viewpoint detection completed: {viewpoint}, patterns: {patterns}")
+            return result
+        except Exception as e:
+            logger.error(f"Error in scene viewpoint detection: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {"viewpoint": "eye_level", "patterns": []}
+    def _detect_cross_pattern(self, positions: List[List[float]]) -> bool:
+        """
+        檢測位置中的十字交叉模式
+        這種模式通常出現在十字路口的俯視圖中，行人分布呈現十字形
+        Args:
+            positions: 位置列表 [[x1, y1], [x2, y2], ...]
+        Returns:
+            是否檢測到十字交叉模式
+        """
+        try:
+            if len(positions) < 8:  # 需要足夠多的點才能形成有意義的十字模式
+                return False
+            # 提取 x 和 y 座標進行分析
+            x_coords = [pos[0] for pos in positions]
+            y_coords = [pos[1] for pos in positions]
+            # 計算座標的平均值，用於確定中心線位置
+            x_mean = np.mean(x_coords)
+            y_mean = np.mean(y_coords)
+            # 計算在中心線附近的點數量
+            # 如果有足夠多的點在垂直和水平中心線附近，可能是十字交叉
+            near_x_center = sum(1 for x in x_coords if abs(x - x_mean) < 0.1)  # 容忍10%的偏差
+            near_y_center = sum(1 for y in y_coords if abs(y - y_mean) < 0.1)  # 容忍10%的偏差
+            # 十字交叉模式的判斷條件：垂直和水平方向都有足夠的點聚集
+            is_cross_pattern = near_x_center >= 3 and near_y_center >= 3
+            if is_cross_pattern:
+                logger.info(f"Cross pattern detected with {near_x_center} points near vertical center and {near_y_center} points near horizontal center")
+            return is_cross_pattern
+        except Exception as e:
+            logger.error(f"Error detecting cross pattern: {str(e)}")
+            logger.error(traceback.format_exc())
+            return False
+    def _analyze_movement_directions(self, positions: List[List[float]]) -> List[str]:
+        """
+        分析位置中的移動方向
+        通過分析座標分布範圍來推斷主要的移動方向
+        Args:
+            positions: 位置列表 [[x1, y1], [x2, y2], ...]
+        Returns:
+            檢測到的主要方向列表
+        """
+        try:
+            if len(positions) < 6:  # 需要足夠的點才能分析方向性
+                return []
+            # 提取 x 和 y 座標
+            x_coords = [pos[0] for pos in positions]
+            y_coords = [pos[1] for pos in positions]
+            directions = []
+            # 水平移動分析（左右移動）
+            # 計算x座標的標準差和範圍來判斷水平方向的分散程度
+            x_std = np.std(x_coords)
+            x_range = max(x_coords) - min(x_coords)
+            # 垂直移動分析（上下移動）
+            # 計算y座標的標準差和範圍來判斷垂直方向的分散程度
+            y_std = np.std(y_coords)
+            y_range = max(y_coords) - min(y_coords)
+            # 足夠大的範圍表示該方向有明顯的運動或分散
+            # 40%的圖像範圍被認為是有意義的移動範圍
+            if x_range > 0.4:
+                directions.append("horizontal")
+                logger.debug(f"Horizontal movement detected with range: {x_range:.3f}")
+            if y_range > 0.4:
+                directions.append("vertical")
+                logger.debug(f"Vertical movement detected with range: {y_range:.3f}")
+            logger.info(f"Movement directions analyzed: {directions}")
+            return directions
+        except Exception as e:
+            logger.error(f"Error analyzing movement directions: {str(e)}")
+            logger.error(traceback.format_exc())
+            return []
+    def detect_aerial_view_indicators(self, detected_objects: List[Dict]) -> Dict:
+        """
+        檢測俯視角度的指標
+        分析物件分布特徵來判斷是否為俯視角度
+        Args:
+            detected_objects: 檢測到的物件列表
+        Returns:
+            包含俯視角度指標的字典
+        """
+        try:
+            indicators = {
+                "consistent_sizing": False,
+                "grid_like_distribution": False,
+                "high_object_density": False,
+                "aerial_score": 0.0
+            }
+            if not detected_objects:
+                return indicators
+            # 檢查物件大小的一致性
+            sizes = [obj.get("normalized_area", 0) for obj in detected_objects]
+            if len(sizes) >= 3:
+                size_variance = np.var(sizes) / (np.mean(sizes) ** 2) if np.mean(sizes) > 0 else 1
+                # 俯視角度通常物件大小較為一致
+                indicators["consistent_sizing"] = size_variance < 0.3
+            # 檢查是否有網格狀分布（如停車場的俯視圖）
+            positions = [obj.get("normalized_center", [0.5, 0.5]) for obj in detected_objects]
+            if len(positions) >= 6:
+                # 簡化的網格檢測：檢查是否有規律的行列分布
+                x_coords = [pos[0] for pos in positions]
+                y_coords = [pos[1] for pos in positions]
+                # 計算座標的分布是否接近規律網格
+                x_unique = len(set([round(x, 1) for x in x_coords]))  # 四捨五入到0.1精度
+                y_unique = len(set([round(y, 1) for y in y_coords]))
+                # 如果x和y方向都有多個不同的規律位置，可能是網格分布
+                indicators["grid_like_distribution"] = x_unique >= 3 and y_unique >= 3
+            # 檢查物件密度
+            total_objects = len(detected_objects)
+            # 俯視角度通常能看到更多物件
+            indicators["high_object_density"] = total_objects >= 8
+            # 計算俯視角度評分
+            score = 0
+            if indicators["consistent_sizing"]:
+                score += 0.4
+            if indicators["grid_like_distribution"]:
+                score += 0.4
+            if indicators["high_object_density"]:
+                score += 0.2
+            indicators["aerial_score"] = score
+            logger.info(f"Aerial view indicators: score={score:.2f}, consistent_sizing={indicators['consistent_sizing']}, grid_distribution={indicators['grid_like_distribution']}, high_density={indicators['high_object_density']}")
+            return indicators
+        except Exception as e:
+            logger.error(f"Error detecting aerial view indicators: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {
+                "consistent_sizing": False,
+                "grid_like_distribution": False,
+                "high_object_density": False,
+                "aerial_score": 0.0
+            }

scene_zone_identifier.py ADDED Viewed

	@@ -0,0 +1,1728 @@

+import logging
+import traceback
+import numpy as np
+from typing import Dict, List, Any, Optional
+logger = logging.getLogger(__name__)
+class SceneZoneIdentifier:
+    """
+    負責不同場景類型的區域識別邏輯
+    專注於根據場景類型執行相應的功能區域識別策略
+    """
+    def __init__(self):
+        """初始化場景區域辨識器"""
+        try:
+            logger.info("SceneZoneIdentifier initialized successfully")
+        except Exception as e:
+            logger.error(f"Failed to initialize SceneZoneIdentifier: {str(e)}")
+            logger.error(traceback.format_exc())
+            raise
+    def identify_indoor_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
+        """
+        平衡化的室內功能區域識別並標準化命名
+        採用通用的物件關聯性分析，避免只針對特定場景
+        Args:
+            category_regions: 按類別和區域分組的物件字典
+            detected_objects: 檢測到的物件列表
+            scene_type: 場景類型
+        Returns:
+            識別出的室內功能區域字典，使用描述性鍵名
+        """
+        try:
+            zones = {}
+            # 主要功能區域（基於物件關聯性而非場景類型）
+            primary_zone = self._identify_primary_functional_area(detected_objects)
+            if primary_zone:
+                # 基於區域內容生成描述性鍵名
+                descriptive_key = self._generate_descriptive_zone_key_from_data(primary_zone, "primary")
+                zones[descriptive_key] = primary_zone
+            # 只有明確證據且物件數量足夠時創建次要功能區域
+            if len(zones) >= 1 and len(detected_objects) >= 6:
+                secondary_zone = self._identify_secondary_functional_area(detected_objects, zones)
+                if secondary_zone:
+                    # 基於區域內容生成描述性鍵名
+                    descriptive_key = self._generate_descriptive_zone_key_from_data(secondary_zone, "secondary")
+                    zones[descriptive_key] = secondary_zone
+            logger.info(f"Identified {len(zones)} indoor zones for scene type '{scene_type}'")
+            return zones
+        except Exception as e:
+            logger.error(f"Error identifying indoor zones: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}
+    def _generate_descriptive_zone_key_from_data(self, zone_data: Dict, priority_level: str) -> str:
+        """
+        基於區域數據生成描述性鍵名
+        Args:
+            zone_data: 區域數據字典
+            priority_level: 優先級別（primary/secondary）
+        Returns:
+            str: 描述性區域鍵名
+        """
+        try:
+            objects = zone_data.get("objects", [])
+            region = zone_data.get("region", "")
+            description = zone_data.get("description", "")
+            # 基於物件內容確定功能類型
+            if any("dining" in obj.lower() or "table" in obj.lower() for obj in objects):
+                base_name = "dining area"
+            elif any("chair" in obj.lower() or "sofa" in obj.lower() for obj in objects):
+                base_name = "seating area"
+            elif any("bed" in obj.lower() for obj in objects):
+                base_name = "sleeping area"
+            elif any("laptop" in obj.lower() or "keyboard" in obj.lower() for obj in objects):
+                base_name = "workspace area"
+            elif any("plant" in obj.lower() or "vase" in obj.lower() for obj in objects):
+                base_name = "decorative area"
+            elif any("refrigerator" in obj.lower() or "microwave" in obj.lower() for obj in objects):
+                base_name = "kitchen area"
+            else:
+                # 基於描述內容推斷
+                if "dining" in description.lower():
+                    base_name = "dining area"
+                elif "seating" in description.lower() or "relaxation" in description.lower():
+                    base_name = "seating area"
+                elif "work" in description.lower():
+                    base_name = "workspace area"
+                elif "decorative" in description.lower():
+                    base_name = "decorative area"
+                else:
+                    base_name = "functional area"
+            # 為次要區域添加位置標識以區分
+            if priority_level == "secondary" and region:
+                spatial_context = self._get_spatial_context_description(region)
+                if spatial_context:
+                    return f"{spatial_context} {base_name}"
+            return base_name
+        except Exception as e:
+            logger.warning(f"Error generating descriptive zone key: {str(e)}")
+            return "activity area"
+    def _get_spatial_context_description(self, region: str) -> str:
+        """
+        獲取空間上下文描述
+        Args:
+            region: 區域位置標識
+        Returns:
+            str: 空間上下文描述
+        """
+        try:
+            spatial_mapping = {
+                "top_left": "upper left",
+                "top_center": "upper",
+                "top_right": "upper right",
+                "middle_left": "left side",
+                "middle_center": "central",
+                "middle_right": "right side",
+                "bottom_left": "lower left",
+                "bottom_center": "lower",
+                "bottom_right": "lower right"
+            }
+            return spatial_mapping.get(region, "")
+        except Exception as e:
+            logger.warning(f"Error getting spatial context for region '{region}': {str(e)}")
+            return ""
+    def identify_outdoor_general_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
+        """
+        識別一般戶外場景的功能區域
+        Args:
+            category_regions: 按類別和區域分組的物件字典
+            detected_objects: 檢測到的物件列表
+            scene_type: 特定戶外場景類型
+        Returns:
+            戶外功能區域字典
+        """
+        try:
+            zones = {}
+            # 識別行人區域
+            people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
+            if people_objs:
+                people_regions = {}
+                for obj in people_objs:
+                    region = obj["region"]
+                    if region not in people_regions:
+                        people_regions[region] = []
+                    people_regions[region].append(obj)
+                if people_regions:
+                    # 找到主要的行人活動區域
+                    main_people_regions = sorted(people_regions.items(),
+                                            key=lambda x: len(x[1]),
+                                            reverse=True)[:2]  # 取前2個區域
+                    for idx, (region, objs) in enumerate(main_people_regions):
+                        if len(objs) > 0:
+                            # 生成基於位置的描述性鍵名
+                            spatial_desc = self._get_directional_description(region)
+                            if spatial_desc and spatial_desc != "central":
+                                zone_key = f"{spatial_desc} pedestrian area"
+                            else:
+                                zone_key = "main pedestrian area" if idx == 0 else "secondary pedestrian area"
+                            zones[zone_key] = {
+                                "region": region,
+                                "objects": ["person"] * len(objs),
+                                "description": f"Pedestrian area with {len(objs)} {'people' if len(objs) > 1 else 'person'}"
+                            }
+            # 識別車輛區域，適用於街道和停車場
+            vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [1, 2, 3, 5, 6, 7]]
+            if vehicle_objs:
+                vehicle_regions = {}
+                for obj in vehicle_objs:
+                    region = obj["region"]
+                    if region not in vehicle_regions:
+                        vehicle_regions[region] = []
+                    vehicle_regions[region].append(obj)
+                if vehicle_regions:
+                    main_vehicle_region = max(vehicle_regions.items(),
+                                        key=lambda x: len(x[1]),
+                                        default=(None, []))
+                    if main_vehicle_region[0] is not None:
+                        vehicle_types = [obj["class_name"] for obj in main_vehicle_region[1]]
+                        zones["vehicle_zone"] = {
+                            "region": main_vehicle_region[0],
+                            "objects": vehicle_types,
+                            "description": f"Traffic area with {', '.join(list(set(vehicle_types))[:3])}"
+                        }
+            # 針對公園區域的特殊處理
+            if scene_type == "park_area":
+                zones.update(self._identify_park_recreational_zones(detected_objects))
+            # 針對停車場的特殊處理
+            if scene_type == "parking_lot":
+                zones.update(self._identify_parking_zones(detected_objects))
+            logger.info(f"Identified {len(zones)} outdoor zones for scene type '{scene_type}'")
+            return zones
+        except Exception as e:
+            logger.error(f"Error identifying outdoor general zones: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}
+    def identify_intersection_zones(self, category_regions: Dict, detected_objects: List[Dict], viewpoint: str) -> Dict:
+        """
+        辨識城市十字路口的功能區域，無論是否有行人，只要偵測到紅綠燈就一定顯示 Traffic Control Area；
+        若有行人，則額外建立 Crossing Zone 並把行人 + 同 region 的紅綠燈歸在一起。
+        Args:
+            category_regions: 按類別和 region 分組的物件字典
+            detected_objects: YOLO 檢測到的所有物件列表
+            viewpoint: 偵測到的視角字串
+        Returns:
+            zones: 最終的十字路口功能區域字典
+        """
+        try:
+            zones = {}
+            # 1. 按 class_id 分出行人、車輛、紅綠燈
+            pedestrian_objs    = [obj for obj in detected_objects if obj["class_id"] == 0]
+            vehicle_objs       = [obj for obj in detected_objects if obj["class_id"] in [1, 2, 3, 5, 7]]
+            traffic_light_objs = [obj for obj in detected_objects if obj["class_id"] == 9]
+            # 2. Step A: 無條件建立 Traffic Control Area
+            #    把每個 region 下的紅綠燈都先分群，生成對應 zone，確保「只要偵測到紅綠燈就一定顯示」
+            signal_regions_all = {}
+            for t in traffic_light_objs:
+                region = t["region"]
+                signal_regions_all.setdefault(region, []).append(t)
+            for idx, (region, signals) in enumerate(signal_regions_all.items()):
+                # 先決定 zone_key (依 direction 或 primary/auxiliary)
+                direction = self._get_directional_description(region)
+                if direction and direction != "central":
+                    zone_key = f"{direction} traffic control area"
+                else:
+                    zone_key = "primary traffic control area" if idx == 0 else "auxiliary traffic control area"
+                # 確保命名不衝突
+                if zone_key in zones:
+                    suffix = 1
+                    new_key = f"{zone_key} ({suffix})"
+                    while new_key in zones:
+                        suffix += 1
+                        new_key = f"{zone_key} ({suffix})"
+                    zone_key = new_key
+                zones[zone_key] = {
+                    "region": region,
+                    "objects": ["traffic light"] * len(signals),
+                    "description": f"Traffic control area with {len(signals)} traffic lights in {region}"
+                }
+            # (用於後面計算 Crossing 使用掉的 traffic light)
+            used_tl_count_per_region = dict.fromkeys(signal_regions_all.keys(), 0)
+            # 3. Step B: 如果有行人，就建立 Crossing Zone，並移除已被打包的紅綠燈
+            if pedestrian_objs:
+                # 先呼叫 _analyze_crossing_patterns，讓它回傳「行人 + 同 region 的紅綠燈」區
+                crossing_zones = self._analyze_crossing_patterns(pedestrian_objs, traffic_light_objs)
+                # 把 Crossing Zone 加到最終 zones，並同時記錄已使用掉的紅綠燈數量
+                for zone_key, zone_info in crossing_zones.items():
+                    region = zone_info.get("region", "")
+                    obj_list = zone_info.get("objects", [])
+                    # 如果該 zone_info["objects"] 裡含有紅綠燈，就累加到 used_tl_count_per_region
+                    count_in_zone = obj_list.count("traffic light")
+                    if count_in_zone > 0:
+                        used_tl_count_per_region[region] = used_tl_count_per_region.get(region, 0) + count_in_zone
+                    # 加入最終結果
+                    # 如果 key 重複，也可以在此加上 index，或直接覆蓋
+                    if zone_key in zones:
+                        suffix = 1
+                        new_key = f"{zone_key} ({suffix})"
+                        while new_key in zones:
+                            suffix += 1
+                            new_key = f"{zone_key} ({suffix})"
+                        zone_key = new_key
+                    zones[zone_key] = {
+                        "region": region,
+                        "objects": obj_list,
+                        "description": zone_info.get("description", "")
+                    }
+            # 4. Step C: 計算並顯示 debug 資訊 (Total / Used / Remaining)
+            for region, signals in signal_regions_all.items():
+                total = len(signals)
+                used = used_tl_count_per_region.get(region, 0)
+                remaining = total - used
+                # print(f"[DEBUG] Region '{region}': Total TL = {total}, Used in crossing = {used}, Remaining = {remaining}")
+            # 5. Step D: 分析車輛交通區域（Vehicle Zones）
+            if vehicle_objs:
+                traffic_zones = self._analyze_traffic_zones(vehicle_objs)
+                # _analyze_traffic_zones 內部已用英文 debug，直接更新
+                for zone_key, zone_info in traffic_zones.items():
+                    if zone_key in zones:
+                        suffix = 1
+                        new_key = f"{zone_key} ({suffix})"
+                        while new_key in zones:
+                            suffix += 1
+                            new_key = f"{zone_key} ({suffix})"
+                        zone_key = new_key
+                    zones[zone_key] = zone_info
+            logger.info(f"Identified {len(zones)} intersection zones")
+            return zones
+        except Exception as e:
+            logger.error(f"Error in identify_intersection_zones: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}
+    def identify_aerial_view_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
+        """
+        辨識空中視角場景的功能區域
+        專注於模式和流動而非特定區域
+        Args:
+            category_regions: 按類別和區域分組的物件字典
+            detected_objects: 檢測到的物件列表
+            scene_type: 特定場景類型
+        Returns:
+            空中視角功能區域字典
+        """
+        try:
+            zones = {}
+            # 識別行人模式
+            people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
+            if people_objs:
+                # 將位置轉換為數組進行模式分析
+                positions = np.array([obj["normalized_center"] for obj in people_objs])
+                if len(positions) >= 3:
+                    # 計算分布指標
+                    x_coords = positions[:, 0]
+                    y_coords = positions[:, 1]
+                    x_mean = np.mean(x_coords)
+                    y_mean = np.mean(y_coords)
+                    x_std = np.std(x_coords)
+                    y_std = np.std(y_coords)
+                    # 判斷人群是否組織成線性模式
+                    if x_std < 0.1 or y_std < 0.1:
+                        # 沿一個軸的線性分布
+                        pattern_direction = "vertical" if x_std < y_std else "horizontal"
+                        zones["pedestrian_pattern"] = {
+                            "region": "central",
+                            "objects": ["person"] * len(people_objs),
+                            "description": f"Aerial view shows a {pattern_direction} pedestrian movement pattern"
+                        }
+                    else:
+                        # 更分散的模式
+                        zones["pedestrian_distribution"] = {
+                            "region": "wide",
+                            "objects": ["person"] * len(people_objs),
+                            "description": f"Aerial view shows pedestrians distributed across the area"
+                        }
+            # 識別車輛模式進行交通分析
+            vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [1, 2, 3, 5, 6, 7]]
+            if vehicle_objs:
+                zones.update(self._analyze_aerial_traffic_patterns(vehicle_objs))
+            # 針對十字路口特定空中視角的處理
+            if "intersection" in scene_type:
+                zones.update(self._identify_aerial_intersection_features(detected_objects))
+            # 針對廣場空中視角的處理
+            if "plaza" in scene_type:
+                zones.update(self._identify_aerial_plaza_features(people_objs))
+            logger.info(f"Identified {len(zones)} aerial view zones")
+            return zones
+        except Exception as e:
+            logger.error(f"Error identifying aerial view zones: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}
+    def identify_asian_cultural_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
+        """
+        辨識有亞洲文化背景的場景功能區域
+        Args:
+            category_regions: 按類別和區域分組的物件字典
+            detected_objects: 檢測到的物件列表
+            scene_type: 特定場景類型
+        Returns:
+            亞洲文化功能區域字典
+        """
+        try:
+            zones = {}
+            # 識別店面區域
+            # 由於店面不能直接檢測，從情境推斷
+            # 例如，尋找有標誌、行人和小物件的區域
+            storefront_regions = {}
+            for obj in detected_objects:
+                if obj["class_id"] == 0:  # Person
+                    region = obj["region"]
+                    if region not in storefront_regions:
+                        storefront_regions[region] = []
+                    storefront_regions[region].append(obj)
+            # 將人最多的區域作為店面區域
+            if storefront_regions:
+                main_storefront_regions = sorted(storefront_regions.items(),
+                                            key=lambda x: len(x[1]),
+                                            reverse=True)[:2]  # 前2個區域
+                for idx, (region, objs) in enumerate(main_storefront_regions):
+                    # 生成基於位置的描述性鍵名
+                    spatial_desc = self._get_directional_description(region)
+                    if spatial_desc and spatial_desc != "central":
+                        zone_key = f"{spatial_desc} commercial area"
+                    else:
+                        zone_key = "main commercial area" if idx == 0 else "secondary commercial area"
+                    zones[zone_key] = {
+                        "region": region,
+                        "objects": [obj["class_name"] for obj in objs],
+                        "description": f"Asian commercial storefront with pedestrian activity"
+                    }
+            # 辨識行人通道
+            zones.update(self._identify_asian_pedestrian_pathway(detected_objects))
+            # 辨識攤販區域（小攤/商店 - 從情境推斷）
+            zones.update(self._identify_vendor_zones(detected_objects))
+            # 針對夜市的特殊處理
+            if scene_type == "asian_night_market":
+                zones["food_stall_zone"] = {
+                    "region": "middle_center",
+                    "objects": ["inferred food stalls"],
+                    "description": "Food stall area typical of Asian night markets"
+                }
+            logger.info(f"Identified {len(zones)} Asian cultural zones")
+            return zones
+        except Exception as e:
+            logger.error(f"Error identifying Asian cultural zones: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}
+    def identify_upscale_dining_zones(self, category_regions: Dict, detected_objects: List[Dict]) -> Dict:
+        """
+        辨識高級餐飲設置的功能區域
+        Args:
+            category_regions: 按類別和區域分組的物件字典
+            detected_objects: 檢測到的物件列表
+        Returns:
+            高級餐飲功能區域字典
+        """
+        try:
+            zones = {}
+            # 辨識餐桌區域
+            dining_items = []
+            dining_regions = {}
+            for obj in detected_objects:
+                if obj["class_id"] in [40, 41, 42, 43, 44, 45, 60]:  # Wine glass, cup, fork, knife, spoon, bowl, table
+                    region = obj["region"]
+                    if region not in dining_regions:
+                        dining_regions[region] = []
+                    dining_regions[region].append(obj)
+                    dining_items.append(obj["class_name"])
+            if dining_items:
+                main_dining_region = max(dining_regions.items(),
+                                    key=lambda x: len(x[1]),
+                                    default=(None, []))
+                if main_dining_region[0] is not None:
+                    zones["formal_dining_zone"] = {
+                        "region": main_dining_region[0],
+                        "objects": list(set(dining_items)),
+                        "description": f"Formal dining area with {', '.join(list(set(dining_items))[:3])}"
+                    }
+            # 識別裝飾區域，增強檢測
+            zones.update(self._identify_upscale_decorative_zones(detected_objects))
+            # 識別座位安排區域
+            zones.update(self._identify_dining_seating_zones(detected_objects))
+            # 識別服務區域（如果與餐飲區域不同）
+            zones.update(self._identify_serving_zones(detected_objects, zones))
+            logger.info(f"Identified {len(zones)} upscale dining zones")
+            return zones
+        except Exception as e:
+            logger.error(f"Error identifying upscale dining zones: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}
+    def identify_financial_district_zones(self, category_regions: Dict, detected_objects: List[Dict]) -> Dict:
+        """
+        金融區場景的功能區域
+        Args:
+            category_regions: 按類別和區域分組的物件字典
+            detected_objects: 檢測到的物件列表
+        Returns:
+            金融區功能區域字典
+        """
+        try:
+            zones = {}
+            # 識別交通區域
+            traffic_items = []
+            traffic_regions = {}
+            for obj in detected_objects:
+                if obj["class_id"] in [1, 2, 3, 5, 6, 7, 9]:  # 各種車輛和交通燈
+                    region = obj["region"]
+                    if region not in traffic_regions:
+                        traffic_regions[region] = []
+                    traffic_regions[region].append(obj)
+                    traffic_items.append(obj["class_name"])
+            if traffic_items:
+                main_traffic_region = max(traffic_regions.items(),
+                                    key=lambda x: len(x[1]),
+                                    default=(None, []))
+                if main_traffic_region[0] is not None:
+                    zones["traffic_zone"] = {
+                        "region": main_traffic_region[0],
+                        "objects": list(set(traffic_items)),
+                        "description": f"Urban traffic area with {', '.join(list(set(traffic_items))[:3])}"
+                    }
+            # 側邊建築區域（從場景情境推斷）
+            zones.update(self._identify_building_zones(detected_objects))
+            # 行人區域
+            zones.update(self._identify_financial_pedestrian_zones(detected_objects))
+            logger.info(f"Identified {len(zones)} financial district zones")
+            return zones
+        except Exception as e:
+            logger.error(f"Error identifying financial district zones: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}
+    def identify_landmark_zones(self, landmark_objects: List[Dict]) -> Dict:
+        """
+        辨識與地標相關的功能區域
+        Args:
+            landmark_objects: 被辨識為地標的物體列表
+        Returns:
+            地標相關的功能區域字典
+        """
+        try:
+            landmark_zones = {}
+            # 如果沒有任何地標，就直接回空字典
+            if not landmark_objects:
+                logger.warning("No landmark objects provided to identify_landmark_zones")
+                return landmark_zones
+            # 只取第一個地標來示範：至少產生一個地標
+            landmark = landmark_objects[0]
+            # 確保傳入的 landmark 是 dict
+            if not isinstance(landmark, dict):
+                logger.warning("First landmark object is not a dict")
+                return landmark_zones
+            # 從 landmark dict 拿出必要欄位
+            landmark_id = landmark.get("landmark_id", "unknown_landmark")
+            landmark_name = landmark.get("class_name", "Landmark")
+            landmark_type = landmark.get("landmark_type", "architectural")
+            landmark_region = landmark.get("region", "middle_center")
+            # 如果 location 沒提供，就給預設 "this area"
+            location = landmark.get("location")
+            if not location:
+                location = "this area"
+            # 為地標創建主要觀景區
+            zone_id = f"{landmark_name.lower().replace(' ', '_')}_viewing_area"
+            zone_name = f"{landmark_name} Viewing Area"
+            # 根據地標類型調整描述，並確保帶入地點
+            if landmark_type == "natural":
+                zone_description = (
+                    f"Scenic viewpoint for observing {landmark_name}, "
+                    f"a notable natural landmark in {location}."
+                )
+                primary_function = "Nature observation and photography"
+            elif landmark_type == "monument":
+                zone_description = (
+                    f"Viewing area around {landmark_name}, "
+                    f"a significant monument in {location}."
+                )
+                primary_function = "Historical appreciation and cultural tourism"
+            else:  # architectural
+                zone_description = (
+                    f"Area centered around {landmark_name}, "
+                    f"where visitors can observe and appreciate this iconic structure in {location}."
+                )
+                primary_function = "Architectural tourism and photography"
+            # 確定與地標相關的物體（如果被偵測到）
+            related_objects = []
+            for o in landmark_objects:
+                cn = o.get("class_name", "").lower()
+                if cn in ["person", "camera", "cell phone", "backpack"]:
+                    related_objects.append(cn)
+            # 建立地標功能區
+            landmark_zones[zone_id] = {
+                "name": zone_name,
+                "description": zone_description,
+                "objects": ["landmark"] + related_objects,
+                "region": landmark_region,
+                "primary_function": primary_function
+            }
+            # 創建相關輔助功能區，如攝影區、紀念品販賣區
+            auxiliary_zones = self._create_landmark_auxiliary_zones(landmark, 0)
+            if auxiliary_zones:
+                landmark_zones.update(auxiliary_zones)
+            logger.info(f"Identified {len(landmark_zones)} landmark zones")
+            return landmark_zones
+        except Exception as e:
+            logger.error(f"Error in identify_landmark_zones: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}
+    def _identify_primary_functional_area(self, detected_objects: List[Dict]) -> Dict:
+        """
+        識別主要功能區域，基於最強的物件關聯性組合
+        採用通用邏輯處理各種室內場景
+        Args:
+            detected_objects: 檢測到的物件列表
+        Returns:
+            主要功能區域字典或None
+        """
+        try:
+            # 用餐區域檢測（桌椅組合）
+            dining_area = self._detect_functional_combination(
+                detected_objects,
+                primary_objects=[60],  # dining table
+                supporting_objects=[56, 40, 41, 42, 43],  # chair, wine glass, cup, fork, knife
+                min_supporting=2,
+                description_template="Dining area with table and seating arrangement"
+            )
+            if dining_area:
+                return dining_area
+            # 休息區域檢測（沙發電視組合或床）
+            seating_area = self._detect_functional_combination(
+                detected_objects,
+                primary_objects=[57, 59],  # sofa, bed
+                supporting_objects=[62, 58, 56],  # tv, potted plant, chair
+                min_supporting=1,
+                description_template="Seating and relaxation area"
+            )
+            if seating_area:
+                return seating_area
+            # 工作區域檢測（電子設備與家具組合）
+            work_area = self._detect_functional_combination(
+                detected_objects,
+                primary_objects=[63, 66],  # laptop, keyboard
+                supporting_objects=[60, 56, 64],  # dining table, chair, mouse
+                min_supporting=2,
+                description_template="Workspace area with electronics and furniture"
+            )
+            if work_area:
+                return work_area
+            return None
+        except Exception as e:
+            logger.error(f"Error identifying primary functional area: {str(e)}")
+            logger.error(traceback.format_exc())
+            return None
+    def _identify_secondary_functional_area(self, detected_objects: List[Dict], existing_zones: Dict) -> Dict:
+        """
+        識別次要功能區域，避免與主要區域重疊
+        Args:
+            detected_objects: 檢測到的物件列表
+            existing_zones: 已存在的功能區域
+        Returns:
+            次要功能區域字典或None
+        """
+        try:
+            # 獲取已使用的區域
+            used_regions = set(zone.get("region") for zone in existing_zones.values())
+            # 裝飾區域檢測（植物集中區域）
+            decorative_area = self._detect_functional_combination(
+                detected_objects,
+                primary_objects=[58],  # potted plant
+                supporting_objects=[75],  # vase
+                min_supporting=0,
+                min_primary=3,  # 至少需要3個植物
+                description_template="Decorative area with plants and ornamental items",
+                exclude_regions=used_regions
+            )
+            if decorative_area:
+                return decorative_area
+            # 儲存區域檢測（廚房電器組合）
+            storage_area = self._detect_functional_combination(
+                detected_objects,
+                primary_objects=[72, 68, 69],  # refrigerator, microwave, oven
+                supporting_objects=[71],  # sink
+                min_supporting=0,
+                min_primary=2,
+                description_template="Kitchen appliance and storage area",
+                exclude_regions=used_regions
+            )
+            if storage_area:
+                return storage_area
+            return None
+        except Exception as e:
+            logger.error(f"Error identifying secondary functional area: {str(e)}")
+            logger.error(traceback.format_exc())
+            return None
+    def _detect_functional_combination(self, detected_objects: List[Dict], primary_objects: List[int],
+                                    supporting_objects: List[int], min_supporting: int,
+                                    description_template: str, min_primary: int = 1,
+                                    exclude_regions: set = None) -> Dict:
+        """
+        通用的功能組合檢測方法
+        基於主要物件和支持物件的組合判斷功能區域
+        Args:
+            detected_objects: 檢測到的物件列表
+            primary_objects: 主要物件的class_id列表
+            supporting_objects: 支持物件的class_id列表
+            min_supporting: 最少需要的支持物件數量
+            description_template: 描述模板
+            min_primary: 最少需要的主要物件數量
+            exclude_regions: 需要排除的區域集合
+        Returns:
+            功能區域資訊字典，如果不符合條件則返回None
+        """
+        try:
+            if exclude_regions is None:
+                exclude_regions = set()
+            # 收集主要物件
+            primary_objs = [obj for obj in detected_objects
+                        if obj.get("class_id") in primary_objects and obj.get("confidence", 0) >= 0.4]
+            # 收集支持物件
+            supporting_objs = [obj for obj in detected_objects
+                            if obj.get("class_id") in supporting_objects and obj.get("confidence", 0) >= 0.4]
+            # 檢查是否滿足最少數量要求
+            if len(primary_objs) < min_primary or len(supporting_objs) < min_supporting:
+                return None
+            # 按區域組織物件
+            region_combinations = {}
+            all_relevant_objs = primary_objs + supporting_objs
+            for obj in all_relevant_objs:
+                region = obj.get("region")
+                # 排除指定區域
+                if region in exclude_regions:
+                    continue
+                if region not in region_combinations:
+                    region_combinations[region] = {"primary": [], "supporting": [], "all": []}
+                region_combinations[region]["all"].append(obj)
+                if obj.get("class_id") in primary_objects:
+                    region_combinations[region]["primary"].append(obj)
+                else:
+                    region_combinations[region]["supporting"].append(obj)
+            # 找到最佳區域組合
+            best_region = None
+            best_score = 0
+            for region, objs in region_combinations.items():
+                # 計算該區域的評分
+                primary_count = len(objs["primary"])
+                supporting_count = len(objs["supporting"])
+                # 必須滿足最低要求
+                if primary_count < min_primary or supporting_count < min_supporting:
+                    continue
+                # 計算組合評分（主要物件權重較高）
+                score = primary_count * 2 + supporting_count
+                if score > best_score:
+                    best_score = score
+                    best_region = region
+            if best_region is None:
+                return None
+            best_combination = region_combinations[best_region]
+            all_objects = [obj["class_name"] for obj in best_combination["all"]]
+            return {
+                "region": best_region,
+                "objects": all_objects,
+                "description": description_template
+            }
+        except Exception as e:
+            logger.error(f"Error detecting functional combination: {str(e)}")
+            logger.error(traceback.format_exc())
+            return None
+    def _analyze_crossing_patterns(self, pedestrians: List[Dict], traffic_lights: List[Dict]) -> Dict:
+        """
+        Analyze pedestrian crossing patterns to identify crossing zones.
+        若同一 region 中同時有行人與紅綠燈，則將兩者都放入該區域的 objects。
+        Args:
+            pedestrians: 行人物件列表（每個 obj 應包含 'class_id', 'region', 'confidence' 等）
+            traffic_lights: 紅綠燈物件列表（每個 obj 應包含 'class_id', 'region', 'confidence' 等）
+        Returns:
+            crossing_zones: 字典，key 為 zone 名稱，value 包含 'region', 'objects', 'description'
+        """
+        try:
+            crossing_zones = {}
+            # 如果沒有任何行人，就不辨識任何 crossing zone
+            if not pedestrians:
+                return crossing_zones
+            # (1) 按照 region 分組行人
+            pedestrian_regions = {}
+            for p in pedestrians:
+                region = p["region"]
+                pedestrian_regions.setdefault(region, []).append(p)
+            # (2) 針對每個 region，看是否同時有紅綠燈
+            # 建立一個 mapping： region -> { "pedestrians": [...], "traffic_lights": [...] }
+            combined_regions = {}
+            for region, peds in pedestrian_regions.items():
+                # 取得該 region 下所有紅綠燈
+                tls_in_region = [t for t in traffic_lights if t["region"] == region]
+                combined_regions[region] = {
+                    "pedestrians": peds,
+                    "traffic_lights": tls_in_region
+                }
+            # (3) 按照行人數量排序，找出前兩個需要建立 crossing zone 的 region
+            sorted_regions = sorted(
+                combined_regions.items(),
+                key=lambda x: len(x[1]["pedestrians"]),
+                reverse=True
+            )
+            # (4) 將前兩個 region 建立 Crossing Zone，objects 同時包含行人與紅綠燈
+            for idx, (region, group) in enumerate(sorted_regions[:2]):
+                peds = group["pedestrians"]
+                tls  = group["traffic_lights"]
+                has_nearby_signals = len(tls) > 0
+                # 生成 zone_name（基於 region 方向 + idx 決定主/次 crossing）
+                direction = self._get_directional_description(region)
+                if direction and direction != "central":
+                    zone_name = f"{direction} crossing area"
+                else:
+                    zone_name = "main crossing area" if idx == 0 else "secondary crossing area"
+                # 組合 description
+                description = f"Pedestrian crossing area with {len(peds)} "
+                description += "person" if len(peds) == 1 else "people"
+                if direction:
+                    description += f" in {direction} direction"
+                if has_nearby_signals:
+                    description += " near traffic signals"
+                # ======= 將行人 + 同區紅綠燈一併放入 objects =======
+                obj_list = ["pedestrian"] * len(peds)
+                if has_nearby_signals:
+                    obj_list += ["traffic light"] * len(tls)
+                crossing_zones[zone_name] = {
+                    "region": region,
+                    "objects": obj_list,
+                    "description": description
+                }
+            return crossing_zones
+        except Exception as e:
+            logger.error(f"Error in _analyze_crossing_patterns: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}
+    def _analyze_traffic_zones(self, vehicles: List[Dict]) -> Dict:
+        """
+        分析車輛分布以識別具有方向感知的交通區域
+        Args:
+            vehicles: 車輛物件列表
+        Returns:
+            識別出的交通區域字典
+        """
+        try:
+            traffic_zones = {}
+            if not vehicles:
+                return traffic_zones
+            # 按區域分組車輛
+            vehicle_regions = {}
+            for v in vehicles:
+                region = v["region"]
+                if region not in vehicle_regions:
+                    vehicle_regions[region] = []
+                vehicle_regions[region].append(v)
+            # 為有車輛的區域創建交通區域
+            main_traffic_region = max(vehicle_regions.items(), key=lambda x: len(x[1]), default=(None, []))
+            if main_traffic_region[0] is not None:
+                region = main_traffic_region[0]
+                vehicles_in_region = main_traffic_region[1]
+                # 獲取車輛類型列表用於描述
+                vehicle_types = [v["class_name"] for v in vehicles_in_region]
+                unique_types = list(set(vehicle_types))
+                # 獲取方向描述
+                direction = self._get_directional_description(region)
+                # 創建描述性區域
+                traffic_zones["vehicle_zone"] = {
+                    "region": region,
+                    "objects": vehicle_types,
+                    "description": f"Vehicle traffic area with {', '.join(unique_types[:3])}" +
+                                (f" in {direction} area" if direction else "")
+                }
+                # 如果車輛分布在多個區域，創建次要區域
+                if len(vehicle_regions) > 1:
+                    # 獲取第二大車輛聚集區域
+                    sorted_regions = sorted(vehicle_regions.items(), key=lambda x: len(x[1]), reverse=True)
+                    if len(sorted_regions) > 1:
+                        second_region, second_vehicles = sorted_regions[1]
+                        direction = self._get_directional_description(second_region)
+                        vehicle_types = [v["class_name"] for v in second_vehicles]
+                        unique_types = list(set(vehicle_types))
+                        traffic_zones["secondary_vehicle_zone"] = {
+                            "region": second_region,
+                            "objects": vehicle_types,
+                            "description": f"Secondary traffic area with {', '.join(unique_types[:2])}" +
+                                        (f" in {direction} direction" if direction else "")
+                        }
+            return traffic_zones
+        except Exception as e:
+            logger.error(f"Error analyzing traffic zones: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}
+    def _get_directional_description(self, region: str) -> str:
+        """
+        將區域名稱轉換為方位描述（東西南北）
+        Args:
+            region: 區域名稱
+        Returns:
+            方位描述字串
+        """
+        try:
+            region_lower = region.lower()
+            if "top" in region_lower and "left" in region_lower:
+                return "northwest"
+            elif "top" in region_lower and "right" in region_lower:
+                return "northeast"
+            elif "bottom" in region_lower and "left" in region_lower:
+                return "southwest"
+            elif "bottom" in region_lower and "right" in region_lower:
+                return "southeast"
+            elif "top" in region_lower:
+                return "north"
+            elif "bottom" in region_lower:
+                return "south"
+            elif "left" in region_lower:
+                return "west"
+            elif "right" in region_lower:
+                return "east"
+            else:
+                return "central"
+        except Exception as e:
+            logger.error(f"Error getting directional description for region '{region}': {str(e)}")
+            return "central"
+    def _identify_park_recreational_zones(self, detected_objects: List[Dict]) -> Dict:
+        """
+        識別公園的休閒活動區域
+        Args:
+            detected_objects: 檢測到的物件列表
+        Returns:
+            休閒區域字典
+        """
+        try:
+            zones = {}
+            # 尋找休閒物件（運動球、風箏等）
+            rec_items = []
+            rec_regions = {}
+            for obj in detected_objects:
+                if obj["class_id"] in [32, 33, 34, 35, 38]:  # sports ball, kite, baseball bat, glove, tennis racket
+                    region = obj["region"]
+                    if region not in rec_regions:
+                        rec_regions[region] = []
+                    rec_regions[region].append(obj)
+                    rec_items.append(obj["class_name"])
+            if rec_items:
+                main_rec_region = max(rec_regions.items(),
+                                key=lambda x: len(x[1]),
+                                default=(None, []))
+                if main_rec_region[0] is not None:
+                    zones["recreational_zone"] = {
+                        "region": main_rec_region[0],
+                        "objects": list(set(rec_items)),
+                        "description": f"Recreational area with {', '.join(list(set(rec_items)))}"
+                    }
+            return zones
+        except Exception as e:
+            logger.error(f"Error identifying park recreational zones: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}
+    def _identify_parking_zones(self, detected_objects: List[Dict]) -> Dict:
+        """
+        停車場的停車區域
+        Args:
+            detected_objects: 檢測到的物件列表
+        Returns:
+            停車區域字典
+        """
+        try:
+            zones = {}
+            # 尋找停放的汽車
+            car_objs = [obj for obj in detected_objects if obj["class_id"] == 2]  # cars
+            if len(car_objs) >= 3:
+                # 檢查汽車是否按模式排列（簡化）
+                car_positions = [obj["normalized_center"] for obj in car_objs]
+                # 通過分析垂直位置檢查行模式
+                y_coords = [pos[1] for pos in car_positions]
+                y_clusters = {}
+                # 簡化聚類 - 按相似y坐標分組汽車
+                for i, y in enumerate(y_coords):
+                    assigned = False
+                    for cluster_y in y_clusters.keys():
+                        if abs(y - cluster_y) < 0.1:  # 圖像高度的10%內
+                            y_clusters[cluster_y].append(i)
+                            assigned = True
+                            break
+                    if not assigned:
+                        y_clusters[y] = [i]
+                # 如果有行模式
+                if max(len(indices) for indices in y_clusters.values()) >= 2:
+                    zones["parking_row"] = {
+                        "region": "central",
+                        "objects": ["car"] * len(car_objs),
+                        "description": f"Organized parking area with vehicles arranged in rows"
+                    }
+                else:
+                    zones["parking_area"] = {
+                        "region": "wide",
+                        "objects": ["car"] * len(car_objs),
+                        "description": f"Parking area with {len(car_objs)} vehicles"
+                    }
+            return zones
+        except Exception as e:
+            logger.error(f"Error identifying parking zones: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}
+    def _analyze_aerial_traffic_patterns(self, vehicle_objs: List[Dict]) -> Dict:
+        """
+        分析空中視角的車輛交通模式
+        Args:
+            vehicle_objs: 車輛物件列表
+        Returns:
+            交通模式區域字典
+        """
+        try:
+            zones = {}
+            if not vehicle_objs:
+                return zones
+            # 將位置轉換為數組進行模式分析
+            positions = np.array([obj["normalized_center"] for obj in vehicle_objs])
+            if len(positions) >= 2:
+                # 計算分布指標
+                x_coords = positions[:, 0]
+                y_coords = positions[:, 1]
+                x_mean = np.mean(x_coords)
+                y_mean = np.mean(y_coords)
+                x_std = np.std(x_coords)
+                y_std = np.std(y_coords)
+                # 判斷車輛是否組織成車道
+                if x_std < y_std * 0.5:
+                    # 車輛垂直對齊 - 表示南北交通
+                    zones["vertical_traffic_flow"] = {
+                        "region": "central_vertical",
+                        "objects": [obj["class_name"] for obj in vehicle_objs[:5]],
+                        "description": "North-south traffic flow visible from aerial view"
+                    }
+                elif y_std < x_std * 0.5:
+                    # 車輛水平對齊 - 表示東西交通
+                    zones["horizontal_traffic_flow"] = {
+                        "region": "central_horizontal",
+                        "objects": [obj["class_name"] for obj in vehicle_objs[:5]],
+                        "description": "East-west traffic flow visible from aerial view"
+                    }
+                else:
+                    # 車輛多方向 - 表示十字路口
+                    zones["intersection_traffic"] = {
+                        "region": "central",
+                        "objects": [obj["class_name"] for obj in vehicle_objs[:5]],
+                        "description": "Multi-directional traffic at intersection visible from aerial view"
+                    }
+            return zones
+        except Exception as e:
+            logger.error(f"Error analyzing aerial traffic patterns: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}
+    def _identify_aerial_intersection_features(self, detected_objects: List[Dict]) -> Dict:
+        """
+        空中視角十字路口特徵
+        Args:
+            detected_objects: 檢測到的物件列表
+        Returns:
+            十字路口特徵區域字典
+        """
+        try:
+            zones = {}
+            # 檢查交通信號
+            traffic_light_objs = [obj for obj in detected_objects if obj["class_id"] == 9]
+            if traffic_light_objs:
+                zones["traffic_control_pattern"] = {
+                    "region": "intersection",
+                    "objects": ["traffic light"] * len(traffic_light_objs),
+                    "description": f"Intersection traffic control with {len(traffic_light_objs)} signals visible from above"
+                }
+            # 人行道從空中視角的情境推斷
+            zones["crossing_pattern"] = {
+                "region": "central",
+                "objects": ["inferred crosswalk"],
+                "description": "Crossing pattern visible from aerial perspective"
+            }
+            return zones
+        except Exception as e:
+            logger.error(f"Error identifying aerial intersection features: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}
+    def _identify_aerial_plaza_features(self, people_objs: List[Dict]) -> Dict:
+        """
+        識別空中視角廣場特徵
+        Args:
+            people_objs: 行人物件列表
+        Returns:
+            廣場特徵區域字典
+        """
+        try:
+            zones = {}
+            if people_objs:
+                # 檢查人群是否聚集在中央區域
+                central_people = [obj for obj in people_objs
+                                if "middle" in obj["region"]]
+                if central_people:
+                    zones["central_gathering"] = {
+                        "region": "middle_center",
+                        "objects": ["person"] * len(central_people),
+                        "description": f"Central plaza gathering area with {len(central_people)} people viewed from above"
+                    }
+            return zones
+        except Exception as e:
+            logger.error(f"Error identifying aerial plaza features: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}
+    def _identify_asian_pedestrian_pathway(self, detected_objects: List[Dict]) -> Dict:
+        """
+        亞洲文化場景中的行人通道
+        Args:
+            detected_objects: 檢測到的物件列表
+        Returns:
+            行人通道區域字典
+        """
+        try:
+            zones = {}
+            pathway_items = []
+            pathway_regions = {}
+            # 提取人群用於通道分析
+            people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
+            # 分析人群是否形成線形（商業街的特徵）
+            people_positions = [obj["normalized_center"] for obj in people_objs]
+            structured_path = False
+            path_direction = "meandering"
+            if len(people_positions) >= 3:
+                # 檢查人群是否沿相似y坐標排列（水平路徑）
+                y_coords = [pos[1] for pos in people_positions]
+                y_mean = sum(y_coords) / len(y_coords)
+                y_variance = sum((y - y_mean)**2 for y in y_coords) / len(y_coords)
+                horizontal_path = y_variance < 0.05  # 低變異表示水平對齊
+                # 檢查人群是否沿相似x坐標排列（垂直路徑）
+                x_coords = [pos[0] for pos in people_positions]
+                x_mean = sum(x_coords) / len(x_coords)
+                x_variance = sum((x - x_mean)**2 for x in x_coords) / len(x_coords)
+                vertical_path = x_variance < 0.05  # 低變異表示垂直對齊
+                structured_path = horizontal_path or vertical_path
+                path_direction = "horizontal" if horizontal_path else "vertical" if vertical_path else "meandering"
+            # 收集通道物件（人、自行車、摩托車在中間區域）
+            for obj in detected_objects:
+                if obj["class_id"] in [0, 1, 3]:  # Person, bicycle, motorcycle
+                    y_pos = obj["normalized_center"][1]
+                    # 按垂直位置分組（圖像中間可能是通道）
+                    if 0.25 <= y_pos <= 0.75:
+                        region = obj["region"]
+                        if region not in pathway_regions:
+                            pathway_regions[region] = []
+                        pathway_regions[region].append(obj)
+                        pathway_items.append(obj["class_name"])
+            if pathway_items:
+                path_desc = "Pedestrian walkway with people moving through the commercial area"
+                if structured_path:
+                    path_desc = f"{path_direction.capitalize()} pedestrian walkway with organized foot traffic"
+                zones["pedestrian_pathway"] = {
+                    "region": "middle_center",  # 假設：通道通常在中間
+                    "objects": list(set(pathway_items)),
+                    "description": path_desc
+                }
+            return zones
+        except Exception as e:
+            logger.error(f"Error identifying Asian pedestrian pathway: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}
+    def _identify_vendor_zones(self, detected_objects: List[Dict]) -> Dict:
+        """
+        識別攤販區域
+        Args:
+            detected_objects: 檢測到的物件列表
+        Returns:
+            攤販區域字典
+        """
+        try:
+            zones = {}
+            # 識別攤販區域（小攤/商店 - 從情境推斷）
+            has_small_objects = any(obj["class_id"] in [24, 26, 39, 41] for obj in detected_objects)  # bags, bottles, cups
+            has_people = any(obj["class_id"] == 0 for obj in detected_objects)
+            if has_small_objects and has_people:
+                # 可能的攤販區域是人群和小物件聚集的地方
+                small_obj_regions = {}
+                for obj in detected_objects:
+                    if obj["class_id"] in [24, 26, 39, 41, 67]:  # bags, bottles, cups, phones
+                        region = obj["region"]
+                        if region not in small_obj_regions:
+                            small_obj_regions[region] = []
+                        small_obj_regions[region].append(obj)
+                if small_obj_regions:
+                    main_vendor_region = max(small_obj_regions.items(),
+                                        key=lambda x: len(x[1]),
+                                        default=(None, []))
+                    if main_vendor_region[0] is not None:
+                        vendor_items = [obj["class_name"] for obj in main_vendor_region[1]]
+                        zones["vendor_zone"] = {
+                            "region": main_vendor_region[0],
+                            "objects": list(set(vendor_items)),
+                            "description": "Vendor or market stall area with small merchandise"
+                        }
+            return zones
+        except Exception as e:
+            logger.error(f"Error identifying vendor zones: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}
+    def _identify_upscale_decorative_zones(self, detected_objects: List[Dict]) -> Dict:
+        """
+        識別高級餐飲的裝飾區域
+        Args:
+            detected_objects: 檢測到的物件列表
+        Returns:
+            裝飾區域字典
+        """
+        try:
+            zones = {}
+            decor_items = []
+            decor_regions = {}
+            # 尋找裝飾元素（花瓶、酒杯、未使用的餐具）
+            for obj in detected_objects:
+                if obj["class_id"] in [75, 40]:  # Vase, wine glass
+                    region = obj["region"]
+                    if region not in decor_regions:
+                        decor_regions[region] = []
+                    decor_regions[region].append(obj)
+                    decor_items.append(obj["class_name"])
+            if decor_items:
+                main_decor_region = max(decor_regions.items(),
+                                    key=lambda x: len(x[1]),
+                                    default=(None, []))
+                if main_decor_region[0] is not None:
+                    zones["decorative_zone"] = {
+                        "region": main_decor_region[0],
+                        "objects": list(set(decor_items)),
+                        "description": f"Decorative area with {', '.join(list(set(decor_items)))}"
+                    }
+            return zones
+        except Exception as e:
+            logger.error(f"Error identifying upscale decorative zones: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}
+    def _identify_dining_seating_zones(self, detected_objects: List[Dict]) -> Dict:
+        """
+        識別餐廳座位安排區域
+        Args:
+            detected_objects: 檢測到的物件���表
+        Returns:
+            座位區域字典
+        """
+        try:
+            zones = {}
+            # 識別座位安排區域
+            chairs = [obj for obj in detected_objects if obj["class_id"] == 56]  # chairs
+            if len(chairs) >= 2:
+                chair_regions = {}
+                for obj in chairs:
+                    region = obj["region"]
+                    if region not in chair_regions:
+                        chair_regions[region] = []
+                    chair_regions[region].append(obj)
+                if chair_regions:
+                    main_seating_region = max(chair_regions.items(),
+                                        key=lambda x: len(x[1]),
+                                        default=(None, []))
+                    if main_seating_region[0] is not None:
+                        zones["dining_seating_zone"] = {
+                            "region": main_seating_region[0],
+                            "objects": ["chair"] * len(main_seating_region[1]),
+                            "description": f"Formal dining seating arrangement with {len(main_seating_region[1])} chairs"
+                        }
+            return zones
+        except Exception as e:
+            logger.error(f"Error identifying dining seating zones: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}
+    def _identify_serving_zones(self, detected_objects: List[Dict], existing_zones: Dict) -> Dict:
+        """
+        識別服務區域
+        Args:
+            detected_objects: 檢測到的物件列表
+            existing_zones: 已存在的功能區域
+        Returns:
+            服務區域字典
+        """
+        try:
+            zones = {}
+            serving_items = []
+            serving_regions = {}
+            # 服務區域可能有瓶子、碗、容器
+            for obj in detected_objects:
+                if obj["class_id"] in [39, 45]:  # Bottle, bowl
+                    # 檢查是否在與主餐桌不同的區域
+                    if "formal_dining_zone" in existing_zones and obj["region"] != existing_zones["formal_dining_zone"]["region"]:
+                        region = obj["region"]
+                        if region not in serving_regions:
+                            serving_regions[region] = []
+                        serving_regions[region].append(obj)
+                        serving_items.append(obj["class_name"])
+            if serving_items:
+                main_serving_region = max(serving_regions.items(),
+                                    key=lambda x: len(x[1]),
+                                    default=(None, []))
+                if main_serving_region[0] is not None:
+                    zones["serving_zone"] = {
+                        "region": main_serving_region[0],
+                        "objects": list(set(serving_items)),
+                        "description": f"Serving or sideboard area with {', '.join(list(set(serving_items)))}"
+                    }
+            return zones
+        except Exception as e:
+            logger.error(f"Error identifying serving zones: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}
+    def _identify_building_zones(self, detected_objects: List[Dict]) -> Dict:
+        """
+        識別建築區域（從場景情境推斷）
+        Args:
+            detected_objects: 檢測到的物件列表
+        Returns:
+            建築區域字典
+        """
+        try:
+            zones = {}
+            # 側邊建築區域（從場景情境推斷）
+            # 檢查是否有實際可能包含建築物的區域
+            left_side_regions = ["top_left", "middle_left", "bottom_left"]
+            right_side_regions = ["top_right", "middle_right", "bottom_right"]
+            # 檢查左側
+            left_building_evidence = True
+            for region in left_side_regions:
+                # 如果此區域有很多車輛或人群，不太可能是建築物
+                vehicle_in_region = any(obj["region"] == region and obj["class_id"] in [1, 2, 3, 5, 7]
+                                    for obj in detected_objects)
+                people_in_region = any(obj["region"] == region and obj["class_id"] == 0
+                                    for obj in detected_objects)
+                if vehicle_in_region or people_in_region:
+                    left_building_evidence = False
+                    break
+            # 檢查右側
+            right_building_evidence = True
+            for region in right_side_regions:
+                # 如果此區域有很多車輛或人群，不太可能是建築物
+                vehicle_in_region = any(obj["region"] == region and obj["class_id"] in [1, 2, 3, 5, 7]
+                                    for obj in detected_objects)
+                people_in_region = any(obj["region"] == region and obj["class_id"] == 0
+                                    for obj in detected_objects)
+                if vehicle_in_region or people_in_region:
+                    right_building_evidence = False
+                    break
+            # 如果證據支持，添加建築區域
+            if left_building_evidence:
+                zones["building_zone_left"] = {
+                    "region": "middle_left",
+                    "objects": ["building"],  # 推斷
+                    "description": "Tall buildings line the left side of the street"
+                }
+            if right_building_evidence:
+                zones["building_zone_right"] = {
+                    "region": "middle_right",
+                    "objects": ["building"],  # 推斷
+                    "description": "Tall buildings line the right side of the street"
+                }
+            return zones
+        except Exception as e:
+            logger.error(f"Error identifying building zones: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}
+    def _identify_financial_pedestrian_zones(self, detected_objects: List[Dict]) -> Dict:
+        """
+        識別金融區的行人區域
+        Args:
+            detected_objects: 檢測到的物件列表
+        Returns:
+            行人區域字典
+        """
+        try:
+            zones = {}
+            # 識別行人區域（如果有人群）
+            people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
+            if people_objs:
+                people_regions = {}
+                for obj in people_objs:
+                    region = obj["region"]
+                    if region not in people_regions:
+                        people_regions[region] = []
+                    people_regions[region].append(obj)
+                if people_regions:
+                    main_pedestrian_region = max(people_regions.items(),
+                                            key=lambda x: len(x[1]),
+                                            default=(None, []))
+                    if main_pedestrian_region[0] is not None:
+                        zones["pedestrian_zone"] = {
+                            "region": main_pedestrian_region[0],
+                            "objects": ["person"] * len(main_pedestrian_region[1]),
+                            "description": f"Pedestrian area with {len(main_pedestrian_region[1])} people navigating the financial district"
+                        }
+            return zones
+        except Exception as e:
+            logger.error(f"Error identifying financial pedestrian zones: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}
+    def _create_landmark_auxiliary_zones(self, landmark: Dict, index: int) -> Dict:
+        """
+        創建地標相關的輔助區域（攝影區、紀念品區等）
+        Args:
+            landmark: 地標物件字典
+            index: 地標索引
+        Returns:
+            輔助區域字典
+        """
+        try:
+            auxiliary_zones = {}
+            landmark_region = landmark.get("region", "middle_center")
+            landmark_name = landmark.get("class_name", "Landmark")
+            # 創建攝影區
+            # 根據地標位置調整攝影區位置（地標前方通常是攝影區）
+            region_mapping = {
+                "top_left": "bottom_right",
+                "top_center": "bottom_center",
+                "top_right": "bottom_left",
+                "middle_left": "middle_right",
+                "middle_center": "bottom_center",
+                "middle_right": "middle_left",
+                "bottom_left": "top_right",
+                "bottom_center": "top_center",
+                "bottom_right": "top_left"
+            }
+            photo_region = region_mapping.get(landmark_region, landmark_region)
+            photo_key = f"{landmark_name.lower().replace(' ', '_')}_photography_spot"
+            auxiliary_zones[photo_key] = {
+                "name": f"{landmark_name} Photography Spot",
+                "description": f"Popular position for photographing {landmark_name} with optimal viewing angle.",
+                "objects": ["camera", "person", "cell phone"],
+                "region": photo_region,
+                "primary_function": "Tourist photography"
+            }
+            # 如果是著名地標，可能有紀念品販售區
+            if landmark.get("confidence", 0) > 0.7:  # 高置信度地標更可能有紀念品區
+                # 根據地標位置找到適合的紀念品區位置（通常在地標附近但不直接在地標上）
+                adjacent_regions = {
+                    "top_left": ["top_center", "middle_left"],
+                    "top_center": ["top_left", "top_right"],
+                    "top_right": ["top_center", "middle_right"],
+                    "middle_left": ["top_left", "bottom_left"],
+                    "middle_center": ["middle_left", "middle_right"],
+                    "middle_right": ["top_right", "bottom_right"],
+                    "bottom_left": ["middle_left", "bottom_center"],
+                    "bottom_center": ["bottom_left", "bottom_right"],
+                    "bottom_right": ["bottom_center", "middle_right"]
+                }
+                if landmark_region in adjacent_regions:
+                    souvenir_region = adjacent_regions[landmark_region][0]  # 選擇第一個相鄰區域
+                    souvenir_key = f"{landmark_name.lower().replace(' ', '_')}_souvenir_area"
+                    auxiliary_zones[souvenir_key] = {
+                        "name": f"{landmark_name} Souvenir Area",
+                        "description": f"Area where visitors can purchase souvenirs and memorabilia related to {landmark_name}.",
+                        "objects": ["person", "handbag", "backpack"],
+                        "region": souvenir_region,
+                        "primary_function": "Tourism commerce"
+                    }
+            return auxiliary_zones
+        except Exception as e:
+            logger.error(f"Error creating landmark auxiliary zones: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}

spatial_analyzer.py CHANGED Viewed

@@ -1,1895 +1,443 @@
 import os
 import numpy as np
 from typing import Dict, List, Tuple, Any, Optional
-from scene_type import SCENE_TYPES
-from enhance_scene_describer import EnhancedSceneDescriber
 class SpatialAnalyzer:
     """
-    Analyzes spatial relationships between objects in an image.
-    Handles region assignment, object positioning, and functional zone identification.
     """
     def __init__(self, class_names: Dict[int, str] = None, object_categories=None):
         """
         Set up the 3x3 image region grid and the analyzer's lookup tables.

         Args:
             class_names: Mapping from detector class id to class name
             object_categories: Mapping from category name to class ids; an empty
                 dict is stored when nothing (or a falsy value) is supplied
         """
         # Cut points of the normalized image axes, shared by rows and columns.
         cuts = (0, 1/3, 2/3, 1)
         row_labels = ("top", "middle", "bottom")
         col_labels = ("left", "center", "right")
         # Every region is stored as (x1, y1, x2, y2) in normalized coordinates.
         self.regions = {
             f"{row}_{col}": (cuts[c], cuts[r], cuts[c + 1], cuts[r + 1])
             for r, row in enumerate(row_labels)
             for c, col in enumerate(col_labels)
         }
         self.class_names = class_names
         self.OBJECT_CATEGORIES = object_categories or {}
         self.enhance_descriptor = EnhancedSceneDescriber(scene_types=SCENE_TYPES)
         # Normalized distance threshold used for proximity analysis.
         self.proximity_threshold = 0.2
-    def _determine_region(self, x: float, y: float) -> str:
         """
-        Determine which region a point falls into.
         Args:
-            x: Normalized x-coordinate (0-1)
-            y: Normalized y-coordinate (0-1)
-        Returns:
-            Region name
         """
-        for region_name, (x1, y1, x2, y2) in self.regions.items():
-            if x1 <= x < x2 and y1 <= y < y2:
-                return region_name
-        return "unknown"
-    def _analyze_regions(self, detected_objects: List[Dict]) -> Dict:
-        """
-        Analyze object distribution across image regions.
-        Args:
-            detected_objects: List of detected objects with position information
-        Returns:
-            Dictionary with region analysis
-        """
-        # Count objects in each region
-        region_counts = {region: 0 for region in self.regions.keys()}
-        region_objects = {region: [] for region in self.regions.keys()}
-        for obj in detected_objects:
-            region = obj["region"]
-            if region in region_counts:
-                region_counts[region] += 1
-                region_objects[region].append({
-                    "class_id": obj["class_id"],
-                    "class_name": obj["class_name"]
-                })
-        # Determine main focus regions (top 1-2 regions by object count)
-        sorted_regions = sorted(region_counts.items(), key=lambda x: x[1], reverse=True)
-        main_regions = [region for region, count in sorted_regions if count > 0][:2]
-        return {
-            "counts": region_counts,
-            "main_focus": main_regions,
-            "objects_by_region": region_objects
-        }
-    def _extract_detected_objects(self, detection_result: Any, confidence_threshold: float = 0.25) -> List[Dict]:
-        """
-        Extract detected objects from detection result with position information.
-        Args:
-            detection_result: Detection result from YOLOv8
-            confidence_threshold: Minimum confidence threshold
-        Returns:
-            List of dictionaries with detected object information
-        """
-        boxes = detection_result.boxes.xyxy.cpu().numpy()
-        classes = detection_result.boxes.cls.cpu().numpy().astype(int)
-        confidences = detection_result.boxes.conf.cpu().numpy()
-        # Image dimensions
-        img_height, img_width = detection_result.orig_shape[:2]
-        detected_objects = []
-        for box, class_id, confidence in zip(boxes, classes, confidences):
-            # Skip objects with confidence below threshold
-            if confidence < confidence_threshold:
-                continue
-            x1, y1, x2, y2 = box
-            width = x2 - x1
-            height = y2 - y1
-            # Center point
-            center_x = (x1 + x2) / 2
-            center_y = (y1 + y2) / 2
-            # Normalized positions (0-1)
-            norm_x = center_x / img_width
-            norm_y = center_y / img_height
-            norm_width = width / img_width
-            norm_height = height / img_height
-            # Area calculation
-            area = width * height
-            norm_area = area / (img_width * img_height)
-            # Region determination
-            object_region = self._determine_region(norm_x, norm_y)
-            detected_objects.append({
-                "class_id": int(class_id),
-                "class_name": self.class_names[int(class_id)],
-                "confidence": float(confidence),
-                "box": [float(x1), float(y1), float(x2), float(y2)],
-                "center": [float(center_x), float(center_y)],
-                "normalized_center": [float(norm_x), float(norm_y)],
-                "size": [float(width), float(height)],
-                "normalized_size": [float(norm_width), float(norm_height)],
-                "area": float(area),
-                "normalized_area": float(norm_area),
-                "region": object_region
-            })
-        return detected_objects
-    def _detect_scene_viewpoint(self, detected_objects: List[Dict]) -> Dict:
         """
-        檢測場景視角並識別特殊場景模式。
         Args:
-            detected_objects: 檢測到的物體列表
-        Returns:
-            Dict: 包含視角和場景模式信息的字典
         """
-        if not detected_objects:
-            return {"viewpoint": "eye_level", "patterns": []}
-        # 從物體位置中提取信息
-        patterns = []
-        # 檢測行人位置模式
-        pedestrian_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
-        # 檢查是否有足夠的行人來識別模式
-        if len(pedestrian_objs) >= 4:
-            pedestrian_positions = [obj["normalized_center"] for obj in pedestrian_objs]
-            # 檢測十字交叉模式
-            if self._detect_cross_pattern(pedestrian_positions):
-                patterns.append("crosswalk_intersection")
-            # 檢測多方向行人流
-            directions = self._analyze_movement_directions(pedestrian_positions)
-            if len(directions) >= 2:
-                patterns.append("multi_directional_movement")
-        # 檢查物體的大小一致性 - 在空中俯視圖中，物體大小通常更一致
-        if len(detected_objects) >= 5:
-            sizes = [obj.get("normalized_area", 0) for obj in detected_objects]
-            size_variance = np.var(sizes) / (np.mean(sizes) ** 2)  # 標準化變異數，不會受到平均值影響
-            if size_variance < 0.3:  # 低變異表示大小一致
-                patterns.append("consistent_object_size")
-        # 基本視角檢測
-        viewpoint = self.enhance_descriptor._detect_viewpoint(detected_objects)
-        # 根據檢測到的模式增強視角判斷
-        if "crosswalk_intersection" in patterns and viewpoint != "aerial":
-            # 如果檢測到斑馬線交叉但視角判斷不是空中視角，優先採用模式判斷
-            viewpoint = "aerial"
-        return {
-            "viewpoint": viewpoint,
-            "patterns": patterns
-        }
-    def _detect_cross_pattern(self, positions):
         """
-        檢測位置中的十字交叉模式
         Args:
-            positions: 位置列表 [[x1, y1], [x2, y2], ...]
         Returns:
-            bool: 是否檢測到十字交叉模式
         """
-        if len(positions) < 8:  # 需要足夠多的點
-            return False
-        # 提取 x 和 y 坐標
-        x_coords = [pos[0] for pos in positions]
-        y_coords = [pos[1] for pos in positions]
-        # 檢測 x 和 y 方向的聚類
-        x_clusters = []
-        y_clusters = []
-        # 簡化的聚類分析
-        x_mean = np.mean(x_coords)
-        y_mean = np.mean(y_coords)
-        # 計算在中心線附近的點
-        near_x_center = sum(1 for x in x_coords if abs(x - x_mean) < 0.1)
-        near_y_center = sum(1 for y in y_coords if abs(y - y_mean) < 0.1)
-        # 如果有足夠的點在中心線附近，可能是十字交叉
-        return near_x_center >= 3 and near_y_center >= 3
-    def _analyze_movement_directions(self, positions):
         """
-        分析位置中的移動方向
         Args:
-            positions: 位置列表 [[x1, y1], [x2, y2], ...]
         Returns:
-            list: 檢測到的主要方向
         """
-        if len(positions) < 6:
-            return []
-        # extract x 和 y 坐標
-        x_coords = [pos[0] for pos in positions]
-        y_coords = [pos[1] for pos in positions]
-        directions = []
-        # horizontal move (left --> right)
-        x_std = np.std(x_coords)
-        x_range = max(x_coords) - min(x_coords)
-        # vertical move(up --> down)
-        y_std = np.std(y_coords)
-        y_range = max(y_coords) - min(y_coords)
-        # 足夠大的範圍表示該方向有運動
-        if x_range > 0.4:
-            directions.append("horizontal")
-        if y_range > 0.4:
-            directions.append("vertical")
-        return directions
-    def _identify_functional_zones(self, detected_objects: List[Dict], scene_type: str) -> Dict:
         """
-        Identify functional zones within the scene with improved detection for different viewpoints
-        and cultural contexts.
         Args:
-            detected_objects: List of detected objects
-            scene_type: Identified scene type
         Returns:
-            Dictionary of functional zones with their descriptions
         """
-        # Group objects by category and region
-        category_regions = {}
-        if not getattr(self, 'enable_landmark', True):
-            detected_objects = [obj for obj in detected_objects if not obj.get("is_landmark", False)]
-        # 過濾地標相關場景類型
-        if scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"]:
-            scene_type = "city_street"
-        # MODIFIED: Smart threshold evaluation instead of fixed values
-        should_identify = self._evaluate_zone_identification_feasibility(detected_objects, scene_type)
-        if not should_identify:
-            return {}
-        # MODIFIED: Build category_regions mapping (was missing in original)
-        for obj in detected_objects:
-            category = self._categorize_object(obj)
-            if not category:
-                continue
-            if category not in category_regions:
-                category_regions[category] = {}
-            region = obj.get("region", "center")
-            if region not in category_regions[category]:
-                category_regions[category][region] = []
-            category_regions[category][region].append(obj)
-        # Identify zones based on object groupings
-        zones = {}
-        # Detect viewpoint to adjust zone identification strategy
-        viewpoint = self._detect_scene_viewpoint(detected_objects)
-        # Choose appropriate zone identification strategy based on scene type and viewpoint
-        if scene_type in ["living_room", "bedroom", "dining_area", "kitchen", "office_workspace", "meeting_room"]:
-            # Indoor scenes
-            zones.update(self._identify_indoor_zones(category_regions, detected_objects, scene_type))
-        elif scene_type in ["city_street", "parking_lot", "park_area"]:
-            # Outdoor general scenes
-            zones.update(self._identify_outdoor_general_zones(category_regions, detected_objects, scene_type))
-        elif "aerial" in scene_type or viewpoint == "aerial":
-            # Aerial viewpoint scenes
-            zones.update(self._identify_aerial_view_zones(category_regions, detected_objects, scene_type))
-        elif "asian" in scene_type:
-            # Asian cultural context scenes
-            zones.update(self._identify_asian_cultural_zones(category_regions, detected_objects, scene_type))
-        elif scene_type == "urban_intersection":
-            # Specific urban intersection logic
-            zones.update(self._identify_intersection_zones(category_regions, detected_objects, viewpoint))
-        elif scene_type == "financial_district":
-            # Financial district specific logic
-            zones.update(self._identify_financial_district_zones(category_regions, detected_objects))
-        elif scene_type == "upscale_dining":
-            # Upscale dining specific logic
-            zones.update(self._identify_upscale_dining_zones(category_regions, detected_objects))
-        elif scene_type == "tourist_landmark" or "landmark" in scene_type:
-            # 處理地標場景類型
-            landmark_objects = [obj for obj in detected_objects if obj.get("is_landmark", False)]
-            if landmark_objects:
-                landmark_zones = self._identify_landmark_zones(landmark_objects)
-                zones.update(landmark_zones)
-        else:
-            # Default zone identification for other scene types
-            zones.update(self._identify_default_zones(category_regions, detected_objects))
-        # 檢查是否有地標物體但場景類型不是地標類型
-        if scene_type != "tourist_landmark" and "landmark" not in scene_type:
-            landmark_objects = [obj for obj in detected_objects if obj.get("is_landmark", False)]
-            if landmark_objects:
-                # 添加地標功能區，但不覆蓋已有的功能區
-                landmark_zones = self._identify_landmark_zones(landmark_objects)
-                # 確保地標區域不會覆蓋已識別的其他重要功能區
-                for zone_id, zone_info in landmark_zones.items():
-                    if zone_id not in zones:
-                        zones[zone_id] = zone_info
-        # MODIFIED: Enhanced fallback strategy - try simplified identification if no zones found
-        if not zones:
-            zones.update(self._identify_default_zones(category_regions, detected_objects))
-            # Final fallback: create basic zones from high-confidence objects
-            if not zones:
-                zones.update(self._create_basic_zones_from_objects(detected_objects, scene_type))
-        return zones
-    def _identify_core_objects_for_scene(self, detected_objects: List[Dict], scene_type: str) -> List[Dict]:
         """
-        Identify core objects that define a particular scene type.
         Args:
-            detected_objects: List of detected objects
-            scene_type: Scene type
         Returns:
-            List of core objects for the scene
         """
-        core_objects = []
-        scene_core_mapping = {
-            "bedroom": [59],  # bed
-            "kitchen": [68, 69, 71, 72],  # microwave, oven, sink, refrigerator
-            "living_room": [57, 58, 62],  # sofa, chair, tv
-            "dining_area": [60, 46, 47],  # dining table, fork, knife
-            "office_workspace": [63, 64, 66, 73]  # laptop, mouse, keyboard, book
-        }
-        if scene_type in scene_core_mapping:
-            core_class_ids = scene_core_mapping[scene_type]
-            for obj in detected_objects:
-                if obj["class_id"] in core_class_ids and obj.get("confidence", 0) >= 0.4:
-                    core_objects.append(obj)
-        return core_objects
-    def _get_object_categories(self, detected_objects: List[Dict]) -> set:
-        """Get unique object categories from detected objects."""
-        object_categories = set()
-        for obj in detected_objects:
-            category = self._categorize_object(obj)
-            if category:
-                object_categories.add(category)
-        return object_categories
-    def _create_basic_zones_from_objects(self, detected_objects: List[Dict], scene_type: str) -> Dict:
         """
-        Create basic functional zones from individual high-confidence objects.
-        This is a fallback when standard zone identification fails.
         Args:
-            detected_objects: List of detected objects
-            scene_type: Scene type
         Returns:
-            Dictionary of basic zones
         """
-        zones = {}
-        # Focus on high-confidence objects
-        high_conf_objects = [obj for obj in detected_objects if obj.get("confidence", 0) >= 0.6]
-        if not high_conf_objects:
-            high_conf_objects = detected_objects  # Fallback to all objects
-        # Create zones based on individual important objects
-        for i, obj in enumerate(high_conf_objects[:3]):  # Limit to top 3 objects
-            class_name = obj["class_name"]
-            region = obj.get("region", "center")
-            # Create descriptive zone based on object type
-            zone_description = self._get_basic_zone_description(class_name, scene_type)
-            if zone_description:
-                zones[f"functional_area_{i+1}"] = {
-                    "region": region,
-                    "objects": [class_name],
-                    "description": zone_description
-                }
-        return zones
-    def _get_basic_zone_description(self, class_name: str, scene_type: str) -> str:
-        """Generate basic zone description based on object and scene type."""
-        # Object-specific descriptions
-        descriptions = {
-            "bed": "Sleeping and rest area",
-            "sofa": "Seating and relaxation area",
-            "chair": "Seating area",
-            "dining table": "Dining and meal area",
-            "tv": "Entertainment and media area",
-            "laptop": "Work and computing area",
-            "potted plant": "Decorative and green space area",
-            "refrigerator": "Food storage and kitchen area",
-            "car": "Vehicle and transportation area",
-            "person": "Activity and social area"
-        }
-        return descriptions.get(class_name, f"Functional area with {class_name}")
     def _categorize_object(self, obj: Dict) -> str:
         """
-        Categorize detected objects into functional categories for zone identification.
-        """
-        class_id = obj.get("class_id", -1)
-        class_name = obj.get("class_name", "").lower()
-        # Use existing category mapping if available
-        if hasattr(self, 'OBJECT_CATEGORIES') and self.OBJECT_CATEGORIES:
-            for category, ids in self.OBJECT_CATEGORIES.items():
-                if class_id in ids:
-                    return category
-        # Fallback categorization based on class names for common COCO classes
-        furniture_items = ["chair", "couch", "bed", "dining table", "toilet"]
-        plant_items = ["potted plant"]
-        electronic_items = ["tv", "laptop", "mouse", "remote", "keyboard", "cell phone"]
-        vehicle_items = ["bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat"]
-        person_items = ["person"]
-        kitchen_items = ["bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
-                        "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog",
-                        "pizza", "donut", "cake", "refrigerator", "oven", "toaster", "sink", "microwave"]
-        sports_items = ["frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
-                    "baseball glove", "skateboard", "surfboard", "tennis racket"]
-        personal_items = ["handbag", "tie", "suitcase", "umbrella", "backpack"]
-        if any(item in class_name for item in furniture_items):
-            return "furniture"
-        elif any(item in class_name for item in plant_items):
-            return "plant"
-        elif any(item in class_name for item in electronic_items):
-            return "electronics"
-        elif any(item in class_name for item in vehicle_items):
-            return "vehicle"
-        elif any(item in class_name for item in person_items):
-            return "person"
-        elif any(item in class_name for item in kitchen_items):
-            return "kitchen_items"
-        elif any(item in class_name for item in sports_items):
-            return "sports"
-        elif any(item in class_name for item in personal_items):
-            return "personal_items"
-        else:
-            return "misc"
-    def _evaluate_zone_identification_feasibility(self, detected_objects: List[Dict], scene_type: str) -> bool:
-        """
-        基於物件關聯性和分布特徵的彈性可行性評估
         """
-        if len(detected_objects) < 2:
-            return False
-        # 計算不同置信度層級的物件分布
-        high_conf_objects = [obj for obj in detected_objects if obj.get("confidence", 0) >= 0.6]
-        medium_conf_objects = [obj for obj in detected_objects if obj.get("confidence", 0) >= 0.4]
-        # 基礎條件：至少需要一定數量的可信物件
-        if len(medium_conf_objects) < 2:
-            return False
-        # evalure relationships
-        functional_relationships = self._calculate_functional_relationships(detected_objects)
-        # 評估space的分布多樣性
-        spatial_diversity = self._calculate_spatial_diversity(detected_objects)
-        # 綜合評分機制
-        feasibility_score = 0
-        # 物件數量的貢獻值（權重30%）
-        object_count_score = min(len(detected_objects) / 5.0, 1.0) * 0.3
-        # 信心度質量貢獻（權重25%）
-        confidence_score = len(high_conf_objects) / max(len(detected_objects), 1) * 0.25
-        # 功能關聯性貢獻（權重25%）
-        relationship_score = functional_relationships * 0.25
-        # space多樣性貢獻（權重20%）
-        diversity_score = spatial_diversity * 0.20
-        feasibility_score = object_count_score + confidence_score + relationship_score + diversity_score
-        # 動態閾值：基於場景複雜度調整
-        complexity_threshold = self._get_complexity_threshold(scene_type)
-        return feasibility_score >= complexity_threshold
-    def _calculate_functional_relationships(self, detected_objects: List[Dict]) -> float:
-        """
-        計算物件間的功能關聯性評分
-        基於常見的物件組合模式評估功能相關性
-        """
-        relationship_pairs = {
-            # 家具組合關係
-            frozenset([56, 60]): 1.0,  # 椅子+桌子 (dining/work area)
-            frozenset([57, 62]): 0.9,  # 沙發+電視 (living area)
-            frozenset([59, 58]): 0.7,  # 床+植物 (bedroom decor)
-            # 工作相關組合
-            frozenset([63, 66]): 0.9,  # 筆電+鍵盤 (workspace)
-            frozenset([63, 64]): 0.8,  # 筆電+滑鼠 (workspace)
-            frozenset([60, 63]): 0.8,  # 桌子+筆電 (workspace)
-            # 廚房相關組合
-            frozenset([68, 72]): 0.9,  # 微波爐+冰箱 (kitchen)
-            frozenset([69, 71]): 0.8,  # 烤箱+水槽 (kitchen)
-            # 用餐相關組合
-            frozenset([60, 40]): 0.8,  # 桌子+酒杯 (dining)
-            frozenset([60, 41]): 0.8,  # 桌子+杯子 (dining)
-            frozenset([56, 40]): 0.7,  # 椅子+酒杯 (dining)
-            # 交通相關組合
-            frozenset([2, 9]): 0.8,   # 汽車+交通燈 (traffic)
-            frozenset([0, 9]): 0.7,   # 行人+交通燈 (crosswalk)
-        }
-        detected_class_ids = set(obj["class_id"] for obj in detected_objects)
-        max_possible_score = 0
-        actual_score = 0
-        for pair, score in relationship_pairs.items():
-            max_possible_score += score
-            if pair.issubset(detected_class_ids):
-                actual_score += score
-        return actual_score / max_possible_score if max_possible_score > 0 else 0
-    def _calculate_spatial_diversity(self, detected_objects: List[Dict]) -> float:
-        """
-        計算物件空間分布的多樣性
-        評估物件是否分散在不同區域，避免所有物件集中在單一區域
-        """
-        regions = set(obj.get("region", "center") for obj in detected_objects)
-        unique_regions = len(regions)
-        return min(unique_regions / 2.0, 1.0)
-    def _get_complexity_threshold(self, scene_type: str) -> float:
-        """
-        可根據場景類型返回適當的複雜度閾值
-        平衡不同場景的區域劃分需求
-        """
-        # 較簡單場景需要較高分數才進行區域劃分
-        simple_scenes = ["bedroom", "bathroom", "closet"]
-        # 較複雜場景可以較低分數進行區域劃分
-        complex_scenes = ["living_room", "kitchen", "office_workspace", "dining_area"]
-        if scene_type in simple_scenes:
-            return 0.65  # 較高閾值，避免過度細分
-        elif scene_type in complex_scenes:
-            return 0.45  # 較低閾值，允許合理劃分
-        else:
-            return 0.55  # 中等閾值，平衡策略
-    def _identify_indoor_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
-        """
-        平衡化的室內功能區域識別
-        採用通用的物件關聯性分析，避免場景特定的硬編碼
-        """
-        zones = {}
-        # 辨識到主要功能區域（基於物件關聯性而非場景類型）
-        primary_zone = self._identify_primary_functional_area(detected_objects)
-        if primary_zone:
-            zones["primary_area"] = primary_zone
-        # 只有明確證據且物件數量足夠時創建次要功能區域
-        if len(zones) >= 1 and len(detected_objects) >= 6:
-            secondary_zone = self._identify_secondary_functional_area(detected_objects, zones)
-            if secondary_zone:
-                zones["secondary_area"] = secondary_zone
-        return zones
-    def _identify_primary_functional_area(self, detected_objects: List[Dict]) -> Dict:
-        """
-        辨識主要功能區域，基於最強的物件關聯性組合
-        採用通用邏輯處理各種室內場景
-        """
-        # 用餐區域檢測（桌椅組合）
-        dining_area = self._detect_functional_combination(
-            detected_objects,
-            primary_objects=[60],  # dining table
-            supporting_objects=[56, 40, 41, 42, 43],  # chair, wine glass, cup, fork, knife
-            min_supporting=2,
-            description_template="Dining area with table and seating arrangement"
-        )
-        if dining_area:
-            return dining_area
-        # 休息區域檢測（沙發電視組合或床）
-        seating_area = self._detect_functional_combination(
-            detected_objects,
-            primary_objects=[57, 59],  # sofa, bed
-            supporting_objects=[62, 58, 56],  # tv, potted plant, chair
-            min_supporting=1,
-            description_template="Seating and relaxation area"
-        )
-        if seating_area:
-            return seating_area
-        # 工作區域檢測（電子設備與家具組合）
-        work_area = self._detect_functional_combination(
-            detected_objects,
-            primary_objects=[63, 66],  # laptop, keyboard
-            supporting_objects=[60, 56, 64],  # dining table, chair, mouse
-            min_supporting=2,
-            description_template="Workspace area with electronics and furniture"
-        )
-        if work_area:
-            return work_area
-        return None
-    def _identify_secondary_functional_area(self, detected_objects: List[Dict], existing_zones: Dict) -> Dict:
         """
-        識別次要功能區域，避免與主要區域重疊
-        """
-        # 獲取已使用的區域
-        used_regions = set(zone["region"] for zone in existing_zones.values())
-        # 裝飾區域檢測（植物集中區域）
-        decorative_area = self._detect_functional_combination(
-            detected_objects,
-            primary_objects=[58],  # potted plant
-            supporting_objects=[75],  # vase
-            min_supporting=0,
-            min_primary=3,  # 至少需要3個植物
-            description_template="Decorative area with plants and ornamental items",
-            exclude_regions=used_regions
-        )
-        if decorative_area:
-            return decorative_area
-        # 儲存區域檢測（廚房電器組合）
-        storage_area = self._detect_functional_combination(
-            detected_objects,
-            primary_objects=[72, 68, 69],  # refrigerator, microwave, oven
-            supporting_objects=[71],  # sink
-            min_supporting=0,
-            min_primary=2,
-            description_template="Kitchen appliance and storage area",
-            exclude_regions=used_regions
-        )
-        if storage_area:
-            return storage_area
-        return None
-    def _detect_functional_combination(self, detected_objects: List[Dict], primary_objects: List[int],
-                                    supporting_objects: List[int], min_supporting: int,
-                                    description_template: str, min_primary: int = 1,
-                                    exclude_regions: set = None) -> Dict:
-        """
-        通用的功能組合檢測方法
-        基於主要物件和支持物件的組合判斷功能區域
         Args:
-            detected_objects: 檢測到的物件列表
-            primary_objects: 主要物件的class_id列表
-            supporting_objects: 支持物件的class_id列表
-            min_supporting: 最少需要的支持物件數量
-            description_template: 描述模板
-            min_primary: 最少需要的主要物件數量
-            exclude_regions: 需要排除的區域集合
         Returns:
-            Dict: 功能區域資訊，如果不符合條件則返回None
         """
-        if exclude_regions is None:
-            exclude_regions = set()
-        # 收集主要物件
-        primary_objs = [obj for obj in detected_objects
-                    if obj["class_id"] in primary_objects and obj.get("confidence", 0) >= 0.4]
-        # 收集支持物件
-        supporting_objs = [obj for obj in detected_objects
-                        if obj["class_id"] in supporting_objects and obj.get("confidence", 0) >= 0.4]
-        # 檢查是否滿足最少數量要求
-        if len(primary_objs) < min_primary or len(supporting_objs) < min_supporting:
-            return None
-        # 按區域組織物件
-        region_combinations = {}
-        all_relevant_objs = primary_objs + supporting_objs
-        for obj in all_relevant_objs:
-            region = obj["region"]
-            # 排除指定區域
-            if region in exclude_regions:
-                continue
-            if region not in region_combinations:
-                region_combinations[region] = {"primary": [], "supporting": [], "all": []}
-            region_combinations[region]["all"].append(obj)
-            if obj["class_id"] in primary_objects:
-                region_combinations[region]["primary"].append(obj)
-            else:
-                region_combinations[region]["supporting"].append(obj)
-        # 找到最佳區域組合
-        best_region = None
-        best_score = 0
-        for region, objs in region_combinations.items():
-            # 計算該區域的評分
-            primary_count = len(objs["primary"])
-            supporting_count = len(objs["supporting"])
-            # 必須滿足最低要求
-            if primary_count < min_primary or supporting_count < min_supporting:
-                continue
-            # 計算組合評分（主要物件權重較高）
-            score = primary_count * 2 + supporting_count
-            if score > best_score:
-                best_score = score
-                best_region = region
-        if best_region is None:
-            return None
-        best_combination = region_combinations[best_region]
-        all_objects = [obj["class_name"] for obj in best_combination["all"]]
-        return {
-            "region": best_region,
-            "objects": all_objects,
-            "description": description_template
-        }
-    def _identify_intersection_zones(self, category_regions: Dict, detected_objects: List[Dict], viewpoint: str) -> Dict:
-        """
-        Identify functional zones for urban intersections with enhanced spatial awareness.
-        Args:
-            category_regions: Objects grouped by category and region
-            detected_objects: List of detected objects
-            viewpoint: Detected viewpoint
-        Returns:
-            Dict: Refined intersection functional zones
-        """
-        zones = {}
-        # Get pedestrians, vehicles and traffic signals
-        pedestrian_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
-        vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [1, 2, 3, 5, 7]]  # bicycle, car, motorcycle, bus, truck
-        traffic_light_objs = [obj for obj in detected_objects if obj["class_id"] == 9]
-        # Create distribution maps for better spatial understanding
-        regions_distribution = self._create_distribution_map(detected_objects)
-        # Analyze pedestrian crossing patterns
-        crossing_zones = self._analyze_crossing_patterns(pedestrian_objs, traffic_light_objs, regions_distribution)
-        zones.update(crossing_zones)
-        # Analyze vehicle traffic zones with directional awareness
-        traffic_zones = self._analyze_traffic_zones(vehicle_objs, regions_distribution)
-        zones.update(traffic_zones)
-        # Identify traffic control zones based on signal placement
-        if traffic_light_objs:
-            # Group traffic lights by region for better organization
-            signal_regions = {}
-            for obj in traffic_light_objs:
-                region = obj["region"]
-                if region not in signal_regions:
-                    signal_regions[region] = []
-                signal_regions[region].append(obj)
-            # Create traffic control zones for each region with signals
-            for idx, (region, signals) in enumerate(signal_regions.items()):
-                # Check if this region has a directional name
-                direction = self._get_directional_description(region)
-                zones[f"traffic_control_zone_{idx+1}"] = {
-                    "region": region,
-                    "objects": ["traffic light"] * len(signals),
-                    "description": f"Traffic control area with {len(signals)} traffic signals" +
-                                (f" in {direction} area" if direction else "")
-                }
-        return zones
-    def _identify_landmark_zones(self, landmark_objects: List[Dict]) -> Dict:
         """
-        識別與地標相關的功能區域
         Args:
-            landmark_objects: 被識別為地標的物體列表
         Returns:
-            Dict: 地標相關的功能區域
         """
-        landmark_zones = {}
-        if not landmark_objects:
-            print("Warning: No landmark objects provided to _identify_landmark_zones")
-            return landmark_zones
         try:
-            for i, landmark in enumerate(landmark_objects):
-                if not isinstance(landmark, dict):
-                    print(f"Warning: Landmark object at index {i} is not a dictionary: {type(landmark)}")
-                    continue
-                landmark_id = landmark.get("landmark_id")
-                if not landmark_id:
-                    print(f"Warning: Missing landmark_id for landmark at index {i}")
-                    landmark_id = f"unknown_landmark_{i}"
-                landmark_name = landmark.get("class_name", "Landmark")
-                landmark_type = landmark.get("landmark_type", "architectural")
-                landmark_region = landmark.get("region", "middle_center")
-                # 為地標創建主要觀景區
-                zone_id = f"landmark_zone_{i+1}"
-                zone_name = f"{landmark_name} Viewing Area"
-                # 根據地標類型調整描述
-                if landmark_type == "natural":
-                    zone_description = f"Scenic viewpoint for observing {landmark_name}, a notable natural landmark in {landmark.get('location', 'this area')}."
-                    primary_function = "Nature observation and photography"
-                elif landmark_type == "monument":
-                    zone_description = f"Viewing area around {landmark_name}, a significant monument in {landmark.get('location', 'this area')}."
-                    primary_function = "Historical appreciation and cultural tourism"
-                else:  # architectural
-                    zone_description = f"Area centered around {landmark_name}, where visitors can observe and appreciate this iconic structure in {landmark.get('location', 'this area')}."
-                    primary_function = "Architectural tourism and photography"
-                # 確定與地標相關的物體
-                related_objects = ["person", "camera", "cell phone", "backpack"]
-                # 創建功能區域
-                landmark_zones[zone_id] = {
-                    "name": zone_name,
-                    "description": zone_description,
-                    "objects": ["landmark"] + [obj for obj in related_objects if obj in [o.get("class_name") for o in landmark_objects]],
-                    "region": landmark_region,
-                    "primary_function": primary_function
-                }
-                # 如果有建造年份信息，加到描述中
-                if "year_built" in landmark:
-                    landmark_zones[zone_id]["description"] += f" Built in {landmark['year_built']}."
-                # 如果有建築風格信息，加到描述中
-                if "architectural_style" in landmark:
-                    landmark_zones[zone_id]["description"] += f" Features {landmark['architectural_style']} architectural style."
-                # 如果有重要性信息，加到描述中
-                if "significance" in landmark:
-                    landmark_zones[zone_id]["description"] += f" {landmark['significance']}."
-                try:
-                    # 創建照相區
-                    photo_region = landmark_region  # 默認與地標在同一區域
-                    # 根據地標位置調整照相區位置（地標前方通常是照相區）
-                    region_mapping = {
-                        "top_left": "bottom_right",
-                        "top_center": "bottom_center",
-                        "top_right": "bottom_left",
-                        "middle_left": "middle_right",
-                        "middle_center": "bottom_center",
-                        "middle_right": "middle_left",
-                        "bottom_left": "top_right",
-                        "bottom_center": "top_center",
-                        "bottom_right": "top_left"
-                    }
-                    if landmark_region in region_mapping:
-                        photo_region = region_mapping[landmark_region]
-                    landmark_zones[f"photo_spot_{i+1}"] = {
-                        "name": f"{landmark_name} Photography Spot",
-                        "description": f"Popular position for photographing {landmark_name} with optimal viewing angle.",
-                        "objects": ["camera", "person", "cell phone"],
-                        "region": photo_region,
-                        "primary_function": "Tourist photography"
-                    }
-                except Exception as e:
-                    print(f"Error creating photo spot zone: {e}")
-                try:
-                    # 如果是著名地標，可能有紀念品販售區
-                    if landmark.get("confidence", 0) > 0.7:  # 高置信度地標更可能有紀念品區
-                        # 根據地標位置找到適合的紀念品區位置（通常在地標附近但不直接在地標上）
-                        adjacent_regions = {
-                            "top_left": ["top_center", "middle_left"],
-                            "top_center": ["top_left", "top_right"],
-                            "top_right": ["top_center", "middle_right"],
-                            "middle_left": ["top_left", "bottom_left"],
-                            "middle_center": ["middle_left", "middle_right"],
-                            "middle_right": ["top_right", "bottom_right"],
-                            "bottom_left": ["middle_left", "bottom_center"],
-                            "bottom_center": ["bottom_left", "bottom_right"],
-                            "bottom_right": ["bottom_center", "middle_right"]
-                        }
-                        if landmark_region in adjacent_regions:
-                            souvenir_region = adjacent_regions[landmark_region][0]  # 選擇第一個相鄰區域
-                            landmark_zones[f"souvenir_area_{i+1}"] = {
-                                "name": f"{landmark_name} Souvenir Area",
-                                "description": f"Area where visitors can purchase souvenirs and memorabilia related to {landmark_name}.",
-                                "objects": ["person", "handbag", "backpack"],
-                                "region": souvenir_region,
-                                "primary_function": "Tourism commerce"
-                            }
-                except Exception as e:
-                    print(f"Error creating souvenir area zone: {e}")
         except Exception as e:
-            print(f"Error in _identify_landmark_zones: {e}")
-            import traceback
-            traceback.print_exc()
-        return landmark_zones
-    def _analyze_crossing_patterns(self, pedestrians: List[Dict], traffic_lights: List[Dict],
-                                region_distribution: Dict) -> Dict:
         """
-        Analyze pedestrian crossing patterns to identify crosswalk zones.
         Args:
-            pedestrians: List of pedestrian objects
-            traffic_lights: List of traffic light objects
-            region_distribution: Distribution of objects by region
         Returns:
-            Dict: Identified crossing zones
         """
-        crossing_zones = {}
-        if not pedestrians:
-            return crossing_zones
-        # Group pedestrians by region
-        pedestrian_regions = {}
-        for p in pedestrians:
-            region = p["region"]
-            if region not in pedestrian_regions:
-                pedestrian_regions[region] = []
-            pedestrian_regions[region].append(p)
-        # Sort regions by pedestrian count to find main crossing areas
-        sorted_regions = sorted(pedestrian_regions.items(), key=lambda x: len(x[1]), reverse=True)
-        # Create crossing zones for regions with pedestrians
-        for idx, (region, peds) in enumerate(sorted_regions[:2]):  # Focus on top 2 regions
-            # Check if there are traffic lights nearby to indicate a crosswalk
-            has_nearby_signals = any(t["region"] == region for t in traffic_lights)
-            # Create crossing zone with descriptive naming
-            zone_name = f"crossing_zone_{idx+1}"
-            direction = self._get_directional_description(region)
-            description = f"Pedestrian crossing area with {len(peds)} "
-            description += "person" if len(peds) == 1 else "people"
-            if direction:
-                description += f" in {direction} direction"
-            if has_nearby_signals:
-                description += " near traffic signals"
-            crossing_zones[zone_name] = {
-                "region": region,
-                "objects": ["pedestrian"] * len(peds),
-                "description": description
-            }
-        return crossing_zones
-    def _analyze_traffic_zones(self, vehicles: List[Dict], region_distribution: Dict) -> Dict:
         """
-        Analyze vehicle distribution to identify traffic zones with directional awareness.
         Args:
-            vehicles: List of vehicle objects
-            region_distribution: Distribution of objects by region
         Returns:
-            Dict: Identified traffic zones
         """
-        traffic_zones = {}
-        if not vehicles:
-            return traffic_zones
-        # 把運輸工具歸成一區
-        vehicle_regions = {}
-        for v in vehicles:
-            region = v["region"]
-            if region not in vehicle_regions:
-                vehicle_regions[region] = []
-            vehicle_regions[region].append(v)
-        # Create traffic zones for regions with vehicles
-        main_traffic_region = max(vehicle_regions.items(), key=lambda x: len(x[1]), default=(None, []))
-        if main_traffic_region[0] is not None:
-            region = main_traffic_region[0]
-            vehicles_in_region = main_traffic_region[1]
-            # Get a list of vehicle types for description
-            vehicle_types = [v["class_name"] for v in vehicles_in_region]
-            unique_types = list(set(vehicle_types))
-            # Get directional description
-            direction = self._get_directional_description(region)
-            # Create descriptive zone
-            traffic_zones["vehicle_zone"] = {
-                "region": region,
-                "objects": vehicle_types,
-                "description": f"Vehicle traffic area with {', '.join(unique_types[:3])}" +
-                            (f" in {direction} area" if direction else "")
-            }
-            # If vehicles are distributed across multiple regions, create secondary zones
-            if len(vehicle_regions) > 1:
-                # Get second most populated region
-                sorted_regions = sorted(vehicle_regions.items(), key=lambda x: len(x[1]), reverse=True)
-                if len(sorted_regions) > 1:
-                    second_region, second_vehicles = sorted_regions[1]
-                    direction = self._get_directional_description(second_region)
-                    vehicle_types = [v["class_name"] for v in second_vehicles]
-                    unique_types = list(set(vehicle_types))
-                    traffic_zones["secondary_vehicle_zone"] = {
-                        "region": second_region,
-                        "objects": vehicle_types,
-                        "description": f"Secondary traffic area with {', '.join(unique_types[:2])}" +
-                                    (f" in {direction} direction" if direction else "")
-                    }
-        return traffic_zones
-    def _get_directional_description(self, region: str) -> str:
         """
-        把方向轉換成方位(東西南北)
         Args:
-            region: Region name from the grid
         Returns:
-            str: Directional description
         """
-        if "top" in region and "left" in region:
-            return "northwest"
-        elif "top" in region and "right" in region:
-            return "northeast"
-        elif "bottom" in region and "left" in region:
-            return "southwest"
-        elif "bottom" in region and "right" in region:
-            return "southeast"
-        elif "top" in region:
-            return "north"
-        elif "bottom" in region:
-            return "south"
-        elif "left" in region:
-            return "west"
-        elif "right" in region:
-            return "east"
-        else:
-            return "central"
-    def _create_distribution_map(self, detected_objects: List[Dict]) -> Dict:
         """
-        Create a distribution map of objects across regions for spatial analysis.
         Args:
-            detected_objects: List of detected objects
         Returns:
-            Dict: Distribution map of objects by region and class
         """
-        distribution = {}
-        # Initialize all regions
-        for region in self.regions.keys():
-            distribution[region] = {
-                "total": 0,
-                "objects": {},
-                "density": 0
-            }
-        # Populate the distribution
-        for obj in detected_objects:
-            region = obj["region"]
-            class_id = obj["class_id"]
-            class_name = obj["class_name"]
-            distribution[region]["total"] += 1
-            if class_id not in distribution[region]["objects"]:
-                distribution[region]["objects"][class_id] = {
-                    "name": class_name,
-                    "count": 0,
-                    "positions": []
-                }
-            distribution[region]["objects"][class_id]["count"] += 1
-            # Store position for spatial relationship analysis
-            if "normalized_center" in obj:
-                distribution[region]["objects"][class_id]["positions"].append(obj["normalized_center"])
-        # Calculate object density for each region
-        for region, data in distribution.items():
-            # Assuming all regions are equal size in the grid
-            data["density"] = data["total"] / 1
-        return distribution
-    def _identify_asian_cultural_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
         """
-        Identify functional zones for scenes with Asian cultural context.
         Args:
-            category_regions: Objects grouped by category and region
-            detected_objects: List of detected objects
-            scene_type: Specific scene type
         Returns:
-            Dict: Asian cultural functional zones
-        """
-        zones = {}
-        # Identify storefront zone
-        storefront_items = []
-        storefront_regions = {}
-        # Since storefronts aren't directly detectable, infer from context
-        # For example, look for regions with signs, people, and smaller objects
-        sign_regions = set()
-        for obj in detected_objects:
-            if obj["class_id"] == 0:  # Person
-                region = obj["region"]
-                if region not in storefront_regions:
-                    storefront_regions[region] = []
-                storefront_regions[region].append(obj)
-                # Add regions with people as potential storefront areas
-                sign_regions.add(region)
-        # Use the areas with most people as storefront zones
-        if storefront_regions:
-            main_storefront_regions = sorted(storefront_regions.items(),
-                                        key=lambda x: len(x[1]),
-                                        reverse=True)[:2]  # Top 2 regions
-            for idx, (region, objs) in enumerate(main_storefront_regions):
-                zones[f"commercial_zone_{idx+1}"] = {
-                    "region": region,
-                    "objects": [obj["class_name"] for obj in objs],
-                    "description": f"Asian commercial storefront with pedestrian activity"
-                }
-        # Identify pedestrian pathway - enhanced to better detect linear pathways
-        pathway_items = []
-        pathway_regions = {}
-        # Extract people for pathway analysis
-        people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
-        # Analyze if people form a line (typical of shopping streets)
-        people_positions = [obj["normalized_center"] for obj in people_objs]
-        structured_path = False
-        if len(people_positions) >= 3:
-            # Check if people are arranged along a similar y-coordinate (horizontal path)
-            y_coords = [pos[1] for pos in people_positions]
-            y_mean = sum(y_coords) / len(y_coords)
-            y_variance = sum((y - y_mean)**2 for y in y_coords) / len(y_coords)
-            horizontal_path = y_variance < 0.05  # Low variance indicates horizontal alignment
-            # Check if people are arranged along a similar x-coordinate (vertical path)
-            x_coords = [pos[0] for pos in people_positions]
-            x_mean = sum(x_coords) / len(x_coords)
-            x_variance = sum((x - x_mean)**2 for x in x_coords) / len(x_coords)
-            vertical_path = x_variance < 0.05  # Low variance indicates vertical alignment
-            structured_path = horizontal_path or vertical_path
-            path_direction = "horizontal" if horizontal_path else "vertical" if vertical_path else "meandering"
-        # Collect pathway objects (people, bicycles, motorcycles in middle area)
-        for obj in detected_objects:
-            if obj["class_id"] in [0, 1, 3]:  # Person, bicycle, motorcycle
-                y_pos = obj["normalized_center"][1]
-                # Group by vertical position (middle of image likely pathway)
-                if 0.25 <= y_pos <= 0.75:
-                    region = obj["region"]
-                    if region not in pathway_regions:
-                        pathway_regions[region] = []
-                    pathway_regions[region].append(obj)
-                    pathway_items.append(obj["class_name"])
-        if pathway_items:
-            path_desc = "Pedestrian walkway with people moving through the commercial area"
-            if structured_path:
-                path_desc = f"{path_direction.capitalize()} pedestrian walkway with organized foot traffic"
-            zones["pedestrian_pathway"] = {
-                "region": "middle_center",  # Assumption: pathway often in middle
-                "objects": list(set(pathway_items)),
-                "description": path_desc
-            }
-        # Identify vendor zone (small stalls/shops - inferred from context)
-        has_small_objects = any(obj["class_id"] in [24, 26, 39, 41] for obj in detected_objects)  # bags, bottles, cups
-        has_people = any(obj["class_id"] == 0 for obj in detected_objects)
-        if has_small_objects and has_people:
-            # Likely vendor areas are where people and small objects cluster
-            small_obj_regions = {}
-            for obj in detected_objects:
-                if obj["class_id"] in [24, 26, 39, 41, 67]:  # bags, bottles, cups, phones
-                    region = obj["region"]
-                    if region not in small_obj_regions:
-                        small_obj_regions[region] = []
-                    small_obj_regions[region].append(obj)
-            if small_obj_regions:
-                main_vendor_region = max(small_obj_regions.items(),
-                                    key=lambda x: len(x[1]),
-                                    default=(None, []))
-                if main_vendor_region[0] is not None:
-                    vendor_items = [obj["class_name"] for obj in main_vendor_region[1]]
-                    zones["vendor_zone"] = {
-                        "region": main_vendor_region[0],
-                        "objects": list(set(vendor_items)),
-                        "description": "Vendor or market stall area with small merchandise"
-                    }
-        # For night markets, identify illuminated zones
-        if scene_type == "asian_night_market":
-            # Night markets typically have bright spots for food stalls
-            # This would be enhanced with lighting analysis integration
-            zones["food_stall_zone"] = {
-                "region": "middle_center",
-                "objects": ["inferred food stalls"],
-                "description": "Food stall area typical of Asian night markets"
-            }
-        return zones
-    def _identify_upscale_dining_zones(self, category_regions: Dict, detected_objects: List[Dict]) -> Dict:
         """
-        Identify functional zones for upscale dining settings.
-        Args:
-            category_regions: Objects grouped by category and region
-            detected_objects: List of detected objects
-        Returns:
-            Dict: Upscale dining functional zones
-        """
-        zones = {}
-        # Identify dining table zone
-        dining_items = []
-        dining_regions = {}
-        for obj in detected_objects:
-            if obj["class_id"] in [40, 41, 42, 43, 44, 45, 60]:  # Wine glass, cup, fork, knife, spoon, bowl, table
-                region = obj["region"]
-                if region not in dining_regions:
-                    dining_regions[region] = []
-                dining_regions[region].append(obj)
-                dining_items.append(obj["class_name"])
-        if dining_items:
-            main_dining_region = max(dining_regions.items(),
-                                key=lambda x: len(x[1]),
-                                default=(None, []))
-            if main_dining_region[0] is not None:
-                zones["formal_dining_zone"] = {
-                    "region": main_dining_region[0],
-                    "objects": list(set(dining_items)),
-                    "description": f"Formal dining area with {', '.join(list(set(dining_items))[:3])}"
-                }
-        # Identify decorative zone with enhanced detection
-        decor_items = []
-        decor_regions = {}
-        # Look for decorative elements (vases, wine glasses, unused dishes)
-        for obj in detected_objects:
-            if obj["class_id"] in [75, 40]:  # Vase, wine glass
-                region = obj["region"]
-                if region not in decor_regions:
-                    decor_regions[region] = []
-                decor_regions[region].append(obj)
-                decor_items.append(obj["class_name"])
-        if decor_items:
-            main_decor_region = max(decor_regions.items(),
-                                key=lambda x: len(x[1]),
-                                default=(None, []))
-            if main_decor_region[0] is not None:
-                zones["decorative_zone"] = {
-                    "region": main_decor_region[0],
-                    "objects": list(set(decor_items)),
-                    "description": f"Decorative area with {', '.join(list(set(decor_items)))}"
-                }
-        # Identify seating arrangement zone
-        chairs = [obj for obj in detected_objects if obj["class_id"] == 56]  # chairs
-        if len(chairs) >= 2:
-            chair_regions = {}
-            for obj in chairs:
-                region = obj["region"]
-                if region not in chair_regions:
-                    chair_regions[region] = []
-                chair_regions[region].append(obj)
-            if chair_regions:
-                main_seating_region = max(chair_regions.items(),
-                                    key=lambda x: len(x[1]),
-                                    default=(None, []))
-                if main_seating_region[0] is not None:
-                    zones["dining_seating_zone"] = {
-                        "region": main_seating_region[0],
-                        "objects": ["chair"] * len(main_seating_region[1]),
-                        "description": f"Formal dining seating arrangement with {len(main_seating_region[1])} chairs"
-                    }
-        # Identify serving area (if different from dining area)
-        serving_items = []
-        serving_regions = {}
-        # Serving areas might have bottles, bowls, containers
-        for obj in detected_objects:
-            if obj["class_id"] in [39, 45]:  # Bottle, bowl
-                # Check if it's in a different region from the main dining table
-                if "formal_dining_zone" in zones and obj["region"] != zones["formal_dining_zone"]["region"]:
-                    region = obj["region"]
-                    if region not in serving_regions:
-                        serving_regions[region] = []
-                    serving_regions[region].append(obj)
-                    serving_items.append(obj["class_name"])
-        if serving_items:
-            main_serving_region = max(serving_regions.items(),
-                                key=lambda x: len(x[1]),
-                                default=(None, []))
-            if main_serving_region[0] is not None:
-                zones["serving_zone"] = {
-                    "region": main_serving_region[0],
-                    "objects": list(set(serving_items)),
-                    "description": f"Serving or sideboard area with {', '.join(list(set(serving_items)))}"
-                }
-        return zones
-    def _identify_financial_district_zones(self, category_regions: Dict, detected_objects: List[Dict]) -> Dict:
         """
-        Identify functional zones for financial district scenes.
         Args:
-            category_regions: Objects grouped by category and region
-            detected_objects: List of detected objects
         Returns:
-            Dict: Financial district functional zones
         """
-        zones = {}
-        # Identify traffic zone
-        traffic_items = []
-        traffic_regions = {}
-        for obj in detected_objects:
-            if obj["class_id"] in [1, 2, 3, 5, 6, 7, 9]:  # Various vehicles and traffic lights
-                region = obj["region"]
-                if region not in traffic_regions:
-                    traffic_regions[region] = []
-                traffic_regions[region].append(obj)
-                traffic_items.append(obj["class_name"])
-        if traffic_items:
-            main_traffic_region = max(traffic_regions.items(),
-                                key=lambda x: len(x[1]),
-                                default=(None, []))
-            if main_traffic_region[0] is not None:
-                zones["traffic_zone"] = {
-                    "region": main_traffic_region[0],
-                    "objects": list(set(traffic_items)),
-                    "description": f"Urban traffic area with {', '.join(list(set(traffic_items))[:3])}"
-                }
-        # Building zones on the sides (inferred from scene context)
-        # Enhanced to check if there are actual regions that might contain buildings
-        # Check for regions without vehicles or pedestrians - likely building areas
-        left_side_regions = ["top_left", "middle_left", "bottom_left"]
-        right_side_regions = ["top_right", "middle_right", "bottom_right"]
-        # Check left side
-        left_building_evidence = True
-        for region in left_side_regions:
-            # If many vehicles or people in this region, less likely to be buildings
-            vehicle_in_region = any(obj["region"] == region and obj["class_id"] in [1, 2, 3, 5, 7]
-                                for obj in detected_objects)
-            people_in_region = any(obj["region"] == region and obj["class_id"] == 0
-                                for obj in detected_objects)
-            if vehicle_in_region or people_in_region:
-                left_building_evidence = False
-                break
-        # Check right side
-        right_building_evidence = True
-        for region in right_side_regions:
-            # If many vehicles or people in this region, less likely to be buildings
-            vehicle_in_region = any(obj["region"] == region and obj["class_id"] in [1, 2, 3, 5, 7]
-                                for obj in detected_objects)
-            people_in_region = any(obj["region"] == region and obj["class_id"] == 0
-                                for obj in detected_objects)
-            if vehicle_in_region or people_in_region:
-                right_building_evidence = False
-                break
-        # Add building zones if evidence supports them
-        if left_building_evidence:
-            zones["building_zone_left"] = {
-                "region": "middle_left",
-                "objects": ["building"],  # Inferred
-                "description": "Tall buildings line the left side of the street"
-            }
-        if right_building_evidence:
-            zones["building_zone_right"] = {
-                "region": "middle_right",
-                "objects": ["building"],  # Inferred
-                "description": "Tall buildings line the right side of the street"
-            }
-        # Identify pedestrian zone if people are present
-        people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
-        if people_objs:
-            people_regions = {}
-            for obj in people_objs:
-                region = obj["region"]
-                if region not in people_regions:
-                    people_regions[region] = []
-                people_regions[region].append(obj)
-            if people_regions:
-                main_pedestrian_region = max(people_regions.items(),
-                                        key=lambda x: len(x[1]),
-                                        default=(None, []))
-                if main_pedestrian_region[0] is not None:
-                    zones["pedestrian_zone"] = {
-                        "region": main_pedestrian_region[0],
-                        "objects": ["person"] * len(main_pedestrian_region[1]),
-                        "description": f"Pedestrian area with {len(main_pedestrian_region[1])} people navigating the financial district"
-                    }
-        return zones
-    def _identify_aerial_view_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
         """
-        Identify functional zones for scenes viewed from an aerial perspective.
-        Works on movement/distribution patterns of people and vehicles rather
-        than on fixed zones, then adds intersection- or plaza-specific entries
-        depending on substrings of scene_type.
         Args:
-            category_regions: Objects grouped by category and region
-                (not read anywhere in this method)
-            detected_objects: List of detected objects; each entry is read via
-                "class_id", "normalized_center", "class_name" and "region"
-            scene_type: Specific scene type
         Returns:
-            Dict: Aerial view functional zones, keyed by zone name; every value
-                is a dict with "region", "objects" and "description"
         """
-        zones = {}
-        # For aerial views, we focus on patterns and flows rather than specific zones
-        # Identify pedestrian patterns
-        # class_id 0 entries are reported as "person" in the zones built below
-        people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
-        if people_objs:
-            # Convert positions to arrays for pattern analysis
-            positions = np.array([obj["normalized_center"] for obj in people_objs])
-            if len(positions) >= 3:
-                # Calculate distribution metrics
-                x_coords = positions[:, 0]
-                y_coords = positions[:, 1]
-                # NOTE(review): x_mean / y_mean are computed but never read below
-                x_mean = np.mean(x_coords)
-                y_mean = np.mean(y_coords)
-                x_std = np.std(x_coords)
-                y_std = np.std(y_coords)
-                # Determine if people are organized in a linear pattern
-                if x_std < 0.1 or y_std < 0.1:
-                    # Linear distribution along one axis
-                    # Small spread in x means the points line up along y ("vertical")
-                    pattern_direction = "vertical" if x_std < y_std else "horizontal"
-                    zones["pedestrian_pattern"] = {
-                        "region": "central",
-                        "objects": ["person"] * len(people_objs),
-                        "description": f"Aerial view shows a {pattern_direction} pedestrian movement pattern"
-                    }
-                else:
-                    # More dispersed pattern
-                    zones["pedestrian_distribution"] = {
-                        "region": "wide",
-                        "objects": ["person"] * len(people_objs),
-                        "description": f"Aerial view shows pedestrians distributed across the area"
-                    }
-        # Identify vehicle patterns for traffic analysis
-        # presumably these ids are the vehicle classes of the detector's label set - verify against the class map
-        vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [1, 2, 3, 5, 6, 7]]
-        if vehicle_objs:
-            # Convert positions to arrays for pattern analysis
-            positions = np.array([obj["normalized_center"] for obj in vehicle_objs])
-            if len(positions) >= 2:
-                # Calculate distribution metrics
-                x_coords = positions[:, 0]
-                y_coords = positions[:, 1]
-                # NOTE(review): x_mean / y_mean are computed but never read below
-                x_mean = np.mean(x_coords)
-                y_mean = np.mean(y_coords)
-                x_std = np.std(x_coords)
-                y_std = np.std(y_coords)
-                # Determine if vehicles are organized in lanes
-                # One axis must have less than half the spread of the other to count as a lane
-                if x_std < y_std * 0.5:
-                    # Vehicles aligned vertically - indicates north-south traffic
-                    zones["vertical_traffic_flow"] = {
-                        "region": "central_vertical",
-                        # Only the first five vehicles are listed
-                        "objects": [obj["class_name"] for obj in vehicle_objs[:5]],
-                        "description": "North-south traffic flow visible from aerial view"
-                    }
-                elif y_std < x_std * 0.5:
-                    # Vehicles aligned horizontally - indicates east-west traffic
-                    zones["horizontal_traffic_flow"] = {
-                        "region": "central_horizontal",
-                        "objects": [obj["class_name"] for obj in vehicle_objs[:5]],
-                        "description": "East-west traffic flow visible from aerial view"
-                    }
-                else:
-                    # Vehicles in multiple directions - indicates intersection
-                    zones["intersection_traffic"] = {
-                        "region": "central",
-                        "objects": [obj["class_name"] for obj in vehicle_objs[:5]],
-                        "description": "Multi-directional traffic at intersection visible from aerial view"
-                    }
-        # For intersection specific aerial views, identify crossing patterns
-        if "intersection" in scene_type:
-            # Check for traffic signals
-            traffic_light_objs = [obj for obj in detected_objects if obj["class_id"] == 9]
-            if traffic_light_objs:
-                zones["traffic_control_pattern"] = {
-                    "region": "intersection",
-                    "objects": ["traffic light"] * len(traffic_light_objs),
-                    "description": f"Intersection traffic control with {len(traffic_light_objs)} signals visible from above"
-                }
-            # Crosswalks are inferred from context in aerial views
-            # This zone is added for every intersection scene, with or without any detections
-            zones["crossing_pattern"] = {
-                "region": "central",
-                "objects": ["inferred crosswalk"],
-                "description": "Crossing pattern visible from aerial perspective"
-            }
-        # For plaza aerial views, identify gathering patterns
-        if "plaza" in scene_type:
-            # Plazas typically have central open area with people
-            if people_objs:
-                # Check if people are clustered in central region
-                # Any region name containing "middle" counts as central here
-                central_people = [obj for obj in people_objs
-                                if "middle" in obj["region"]]
-                if central_people:
-                    zones["central_gathering"] = {
-                        "region": "middle_center",
-                        "objects": ["person"] * len(central_people),
-                        "description": f"Central plaza gathering area with {len(central_people)} people viewed from above"
-                    }
-        return zones
-    def _identify_outdoor_general_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
-        """
-        Identify functional zones for general outdoor scenes.
-        Builds up to two pedestrian zones and one vehicle zone for any scene,
-        plus a recreational zone for "park_area" and a parking zone for
-        "parking_lot".
-        Args:
-            category_regions: Objects grouped by category and region
-                (not read anywhere in this method)
-            detected_objects: List of detected objects; each entry is read via
-                "class_id", "region", "class_name" and "normalized_center"
-            scene_type: Specific outdoor scene type
-        Returns:
-            Dict: Outdoor functional zones, keyed by zone name; every value
-                is a dict with "region", "objects" and "description"
-        """
-        zones = {}
-        # Identify pedestrian zones
-        # class_id 0 entries are reported as "person" in the zones built below
-        people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
-        if people_objs:
-            # Group people by the region string stored on each object
-            people_regions = {}
-            for obj in people_objs:
-                region = obj["region"]
-                if region not in people_regions:
-                    people_regions[region] = []
-                people_regions[region].append(obj)
-            if people_regions:
-                # Find main pedestrian areas
-                main_people_regions = sorted(people_regions.items(),
-                                        key=lambda x: len(x[1]),
-                                        reverse=True)[:2]  # Top 2 regions
-                # Zones are named pedestrian_zone_1 / pedestrian_zone_2 in descending size order
-                for idx, (region, objs) in enumerate(main_people_regions):
-                    if len(objs) > 0:
-                        zones[f"pedestrian_zone_{idx+1}"] = {
-                            "region": region,
-                            "objects": ["person"] * len(objs),
-                            "description": f"Pedestrian area with {len(objs)} {'people' if len(objs) > 1 else 'person'}"
-                        }
-        # Identify vehicle zones for streets and parking lots
-        # presumably these ids are the vehicle classes of the detector's label set - verify against the class map
-        vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [1, 2, 3, 5, 6, 7]]
-        if vehicle_objs:
-            vehicle_regions = {}
-            for obj in vehicle_objs:
-                region = obj["region"]
-                if region not in vehicle_regions:
-                    vehicle_regions[region] = []
-                vehicle_regions[region].append(obj)
-            if vehicle_regions:
-                # Only the single region holding the most vehicles becomes a zone
-                main_vehicle_region = max(vehicle_regions.items(),
-                                    key=lambda x: len(x[1]),
-                                    default=(None, []))
-                if main_vehicle_region[0] is not None:
-                    vehicle_types = [obj["class_name"] for obj in main_vehicle_region[1]]
-                    zones["vehicle_zone"] = {
-                        "region": main_vehicle_region[0],
-                        "objects": vehicle_types,
-                        # NOTE(review): set() has no guaranteed iteration order, so which three
-                        # type names appear (and their order) may differ between runs
-                        "description": f"Traffic area with {', '.join(list(set(vehicle_types))[:3])}"
-                    }
-        # For park areas, identify recreational zones
-        if scene_type == "park_area":
-            # Look for recreational objects (sports balls, kites, etc.)
-            rec_items = []
-            rec_regions = {}
-            for obj in detected_objects:
-                if obj["class_id"] in [32, 33, 34, 35, 38]:  # sports ball, kite, baseball bat, glove, tennis racket
-                    region = obj["region"]
-                    if region not in rec_regions:
-                        rec_regions[region] = []
-                    rec_regions[region].append(obj)
-                    rec_items.append(obj["class_name"])
-            if rec_items:
-                main_rec_region = max(rec_regions.items(),
-                                key=lambda x: len(x[1]),
-                                default=(None, []))
-                if main_rec_region[0] is not None:
-                    # The zone lists every recreational item found in the scene,
-                    # not only those inside the main region
-                    zones["recreational_zone"] = {
-                        "region": main_rec_region[0],
-                        "objects": list(set(rec_items)),
-                        "description": f"Recreational area with {', '.join(list(set(rec_items)))}"
-                    }
-        # For parking lots, identify parking zones
-        if scene_type == "parking_lot":
-            # Look for parked cars with consistent spacing
-            car_objs = [obj for obj in detected_objects if obj["class_id"] == 2]  # cars
-            if len(car_objs) >= 3:
-                # Check if cars are arranged in patterns (simplified)
-                car_positions = [obj["normalized_center"] for obj in car_objs]
-                # Check for row patterns by analyzing vertical positions
-                y_coords = [pos[1] for pos in car_positions]
-                y_clusters = {}
-                # Simplified clustering - group cars by similar y-coordinates
-                # Each cluster is keyed by the y of its first member; a car joins the
-                # first existing cluster within 0.1, so the result depends on input order
-                for i, y in enumerate(y_coords):
-                    assigned = False
-                    for cluster_y in y_clusters.keys():
-                        if abs(y - cluster_y) < 0.1:  # Within 10% of image height
-                            y_clusters[cluster_y].append(i)
-                            assigned = True
-                            break
-                    if not assigned:
-                        y_clusters[y] = [i]
-                # If we have row patterns
-                # A "row" is any cluster containing at least two cars
-                if max(len(indices) for indices in y_clusters.values()) >= 2:
-                    zones["parking_row"] = {
-                        "region": "central",
-                        "objects": ["car"] * len(car_objs),
-                        "description": f"Organized parking area with vehicles arranged in rows"
-                    }
-                else:
-                    zones["parking_area"] = {
-                        "region": "wide",
-                        "objects": ["car"] * len(car_objs),
-                        "description": f"Parking area with {len(car_objs)} vehicles"
-                    }
-        return zones
-    def _identify_default_zones(self, category_regions: Dict, detected_objects: List[Dict]) -> Dict:
-        """
-        Identify general functional zones when no specific scene type is matched.
-        For each known category, the region holding the most objects becomes a
-        zone if it contains at least two objects; a "people_zone" is added when
-        at least two people are detected.
-        Args:
-            category_regions: Objects grouped by category and region
-                (category -> region -> list of object dicts)
-            detected_objects: List of detected objects
-        Returns:
-            Dict: Default functional zones, keyed by zone name; every value
-                is a dict with "region", "objects" and "description"
-        """
-        zones = {}
-        # Group objects by category and find main concentrations
-        for category, regions in category_regions.items():
-            if not regions:
-                continue
-            # Find region with most objects in this category
-            main_region = max(regions.items(),
-                        key=lambda x: len(x[1]),
-                        default=(None, []))
-            if main_region[0] is None or len(main_region[1]) < 2:
-                continue
-            # Create zone based on object category
-            zone_objects = [obj["class_name"] for obj in main_region[1]]
-            # Skip if too few objects
-            # NOTE(review): zone_objects has the same length as main_region[1], which was
-            # already checked above, so this guard cannot trigger
-            if len(zone_objects) < 2:
-                continue
-            # Create appropriate zone name and description based on category
-            # Categories not listed in this chain produce no zone
-            if category == "furniture":
-                zones["furniture_zone"] = {
-                    "region": main_region[0],
-                    "objects": zone_objects,
-                    "description": f"Area with furniture including {', '.join(zone_objects[:3])}"
-                }
-            elif category == "electronics":
-                zones["electronics_zone"] = {
-                    "region": main_region[0],
-                    "objects": zone_objects,
-                    "description": f"Area with electronic devices including {', '.join(zone_objects[:3])}"
-                }
-            elif category == "kitchen_items":
-                zones["dining_zone"] = {
-                    "region": main_region[0],
-                    "objects": zone_objects,
-                    "description": f"Dining or food area with {', '.join(zone_objects[:3])}"
-                }
-            elif category == "vehicles":
-                zones["vehicle_zone"] = {
-                    "region": main_region[0],
-                    "objects": zone_objects,
-                    "description": f"Area with vehicles including {', '.join(zone_objects[:3])}"
-                }
-            elif category == "personal_items":
-                zones["personal_items_zone"] = {
-                    "region": main_region[0],
-                    "objects": zone_objects,
-                    "description": f"Area with personal items including {', '.join(zone_objects[:3])}"
-                }
-        # Check for people groups
-        # class_id 0 entries are reported as "person" in the zone built below
-        people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
-        if len(people_objs) >= 2:
-            people_regions = {}
-            for obj in people_objs:
-                region = obj["region"]
-                if region not in people_regions:
-                    people_regions[region] = []
-                people_regions[region].append(obj)
-            if people_regions:
-                # Only the region with the most people is reported; it may hold a single person
-                main_people_region = max(people_regions.items(),
-                                    key=lambda x: len(x[1]),
-                                    default=(None, []))
-                if main_people_region[0] is not None:
-                    zones["people_zone"] = {
-                        "region": main_people_region[0],
-                        "objects": ["person"] * len(main_people_region[1]),
-                        "description": f"Area with {len(main_people_region[1])} people"
-                    }
-        return zones
-    def _find_main_region(self, region_objects_dict: Dict) -> str:
-        """
-        Find the main region with the most objects.
-        Args:
-            region_objects_dict: Mapping of region name to the list of objects in it
-        Returns:
-            The key whose list is longest, or "unknown" for an empty/None mapping
-        """
-        if not region_objects_dict:
-            return "unknown"
-        # The default only applies to an empty iterable, which the guard above already handles
-        return max(region_objects_dict.items(),
-                key=lambda x: len(x[1]),
-                default=("unknown", []))[0]

 import os
 import numpy as np
+import logging
+import traceback
 from typing import Dict, List, Tuple, Any, Optional
+from region_analyzer import RegionAnalyzer
+from object_extractor import ObjectExtractor
+from scene_viewpoint_analyzer import SceneViewpointAnalyzer
+from zone_evaluator import ZoneEvaluator
+from scene_zone_identifier import SceneZoneIdentifier
+from functional_zone_identifier import FunctionalZoneIdentifier
+logger = logging.getLogger(__name__)
 class SpatialAnalyzer:
     """
+    分析圖像中物件間空間關係的主要類別
+    處理區域分配、物件定位和功能區域識別
+    使用Facade模式整合多個子組件，保持外部接口的穩定性
     """
     def __init__(self, class_names: Dict[int, str] = None, object_categories=None):
         """
+        初始化空間分析器，包含圖像區域定義
         Args:
+            class_names: 類別ID到類別名稱的映射字典
+            object_categories: 物件類別分組字典
         """
+        try:
+            # 初始化所有子組件
+            self.region_analyzer = RegionAnalyzer()
+            self.object_extractor = ObjectExtractor(class_names, object_categories)
+            self.scene_viewpoint_analyzer = SceneViewpointAnalyzer()
+            self.zone_evaluator = ZoneEvaluator()
+            self.scene_zone_identifier = SceneZoneIdentifier()
+            self.functional_zone_identifier = FunctionalZoneIdentifier(
+                zone_evaluator=self.zone_evaluator,
+                scene_zone_identifier=self.scene_zone_identifier,
+                scene_viewpoint_analyzer=self.scene_viewpoint_analyzer
+            )
+            self.class_names = class_names
+            self.OBJECT_CATEGORIES = object_categories or {}
+            self.enhance_descriptor = None
+            # 接近分析的距離閾值（標準化）
+            self.proximity_threshold = 0.2
+            logger.info("SpatialAnalyzer initialized successfully with all sub-components")
+        except Exception as e:
+            logger.error(f"Failed to initialize SpatialAnalyzer: {str(e)}")
+            logger.error(traceback.format_exc())
+            raise
+    def update_class_names(self, class_names: Dict[int, str]):
         """
+        更新類別名稱映射並傳遞給 ObjectExtractor
         Args:
+            class_names: 新的類別名稱映射字典
         """
+        try:
+            self.class_names = class_names
+            if hasattr(self, 'object_extractor') and self.object_extractor:
+                self.object_extractor.update_class_names(class_names)
+                logger.info(f"Updated class names in SpatialAnalyzer and ObjectExtractor")
+        except Exception as e:
+            logger.error(f"Failed to update class names in SpatialAnalyzer: {str(e)}")
+    def _determine_region(self, x: float, y: float) -> str:
         """
+        判斷點位於哪個區域
         Args:
+            x: 標準化x座標 (0-1)
+            y: 標準化y座標 (0-1)
         Returns:
+            區域名稱
         """
+        try:
+            return self.region_analyzer.determine_region(x, y)
+        except Exception as e:
+            logger.error(f"Error in _determine_region: {str(e)}")
+            logger.error(traceback.format_exc())
+            return "unknown"
+    def _analyze_regions(self, detected_objects: List[Dict]) -> Dict:
         """
+        分析物件在各區域的分布情況
         Args:
+            detected_objects: 包含位置資訊的檢測物件列表
         Returns:
+            包含區域分析結果的字典
         """
+        try:
+            return self.region_analyzer.analyze_regions(detected_objects)
+        except Exception as e:
+            logger.error(f"Error in _analyze_regions: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {
+                "counts": {},
+                "main_focus": [],
+                "objects_by_region": {}
+            }
+    def _extract_detected_objects(self, detection_result: Any, confidence_threshold: float = 0.25) -> List[Dict]:
         """
+        從檢測結果中提取物件資訊，包含位置資訊
         Args:
+            detection_result: YOLOv8檢測結果
+            confidence_threshold: 最小信心度閾值
         Returns:
+            包含檢測物件資訊的字典列表
         """
+        try:
+            return self.object_extractor.extract_detected_objects(
+                detection_result,
+                confidence_threshold,
+                region_analyzer=self.region_analyzer
+            )
+        except Exception as e:
+            logger.error(f"Error in _extract_detected_objects: {str(e)}")
+            logger.error(traceback.format_exc())
+            return []
+    def _detect_scene_viewpoint(self, detected_objects: List[Dict]) -> Dict:
         """
+        檢測場景視角並識別特殊場景模式
         Args:
+            detected_objects: 檢測到的物件列表
         Returns:
+            包含視角和場景模式資訊的字典
         """
+        try:
+            # 委託���新的場景視角分析器
+            return self.scene_viewpoint_analyzer.detect_scene_viewpoint(detected_objects)
+        except Exception as e:
+            logger.error(f"Error in _detect_scene_viewpoint: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {"viewpoint": "eye_level", "patterns": []}
+    def _identify_functional_zones(self, detected_objects: List[Dict], scene_type: str) -> Dict:
         """
+        識別場景內的功能區域，具有針對不同視角和文化背景的改進檢測能力
         Args:
+            detected_objects: 檢測到的物件列表
+            scene_type: 識別出的場景類型
         Returns:
+            包含功能區域及其描述的字典
         """
+        try:
+            return self.functional_zone_identifier.identify_functional_zones(detected_objects, scene_type)
+        except Exception as e:
+            logger.error(f"Error in _identify_functional_zones: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}
     def _categorize_object(self, obj: Dict) -> str:
         """
+        將檢測到的物件分類到功能類別中，用於區域識別
+        確保所有返回值都使用自然語言格式，避免底線或技術性標識符
         """
+        try:
+            class_id = obj.get("class_id", -1)
+            class_name = obj.get("class_name", "").lower().strip()
+            # 優先處理 traffic light
+            # 只要 class_id == 9 或 class_name 包含 "traffic light"，就分類為 "traffic light"
+            if class_id == 9 or "traffic light" in class_name:
+                return "traffic light"
+            # 如果有自訂的 OBJECT_CATEGORIES 映射，優先使用它
+            if hasattr(self, 'OBJECT_CATEGORIES') and self.OBJECT_CATEGORIES:
+                for category, ids in self.OBJECT_CATEGORIES.items():
+                    if class_id in ids:
+                        # 確保返回的類別名稱使用自然語言格式
+                        return self._clean_category_name(category)
+            # COCO class default name
+            furniture_items = ["chair", "couch", "bed", "dining table", "toilet"]
+            plant_items = ["potted plant"]
+            electronic_items = ["tv", "laptop", "mouse", "remote", "keyboard", "cell phone"]
+            vehicle_items = ["bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat"]
+            person_items = ["person"]
+            kitchen_items = [
+                "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
+                "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog",
+                "pizza", "donut", "cake", "refrigerator", "oven", "toaster", "sink", "microwave"
+            ]
+            sports_items = [
+                "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
+                "baseball glove", "skateboard", "surfboard", "tennis racket"
+            ]
+            personal_items = ["handbag", "tie", "suitcase", "umbrella", "backpack"]
+            # fallback natural language
+            if any(item in class_name for item in furniture_items):
+                return "furniture"
+            elif any(item in class_name for item in plant_items):
+                return "plant"
+            elif any(item in class_name for item in electronic_items):
+                return "electronics"
+            elif any(item in class_name for item in vehicle_items):
+                return "vehicle"
+            elif any(item in class_name for item in person_items):
+                return "person"
+            elif any(item in class_name for item in kitchen_items):
+                return "kitchen items"  # 移除底線
+            elif any(item in class_name for item in sports_items):
+                return "sports"
+            elif any(item in class_name for item in personal_items):
+                return "personal items"  # 移除底線
+            else:
+                return "misc"
+        except Exception as e:
+            logger.error(f"Error categorizing object: {str(e)}")
+            logger.error(traceback.format_exc())
+            return "misc"
+    def _clean_category_name(self, category: str) -> str:
         """
+        清理類別名稱，移除底線並轉換為較自然的格式
         Args:
+            category: 原始類別名稱
         Returns:
+            str: 清理後的類別名稱
         """
+        try:
+            if not category:
+                return "misc"
+            # 將底線替換為空格
+            cleaned = category.replace('_', ' ')
+            # 處理常見的技術性命名模式
+            replacements = {
+                'kitchen items': 'kitchen items',
+                'personal items': 'personal items',
+                'traffic light': 'traffic light',
+                'misc items': 'misc'
+            }
+            # 應用特定的替換規則
+            for old_term, new_term in replacements.items():
+                if cleaned == old_term:
+                    return new_term
+            return cleaned.strip()
+        except Exception as e:
+            logger.warning(f"Error cleaning category name '{category}': {str(e)}")
+            return "misc"
+    def _get_object_categories(self, detected_objects: List[Dict]) -> set:
         """
+        從檢測到的物件中獲取唯一的物件類別
         Args:
+            detected_objects: 檢測到的物件列表
         Returns:
+            唯一物件類別的集合
         """
         try:
+            return self.object_extractor.get_object_categories(detected_objects)
         except Exception as e:
+            logger.error(f"Error in _get_object_categories: {str(e)}")
+            logger.error(traceback.format_exc())
+            return set()
+    def _identify_core_objects_for_scene(self, detected_objects: List[Dict], scene_type: str) -> List[Dict]:
         """
+        識別定義特定場景類型的核心物件
         Args:
+            detected_objects: 檢測到的物件列表
+            scene_type: 場景類型
         Returns:
+            場景的核心物件列表
         """
+        try:
+            return self.object_extractor.identify_core_objects_for_scene(detected_objects, scene_type)
+        except Exception as e:
+            logger.error(f"Error in _identify_core_objects_for_scene: {str(e)}")
+            logger.error(traceback.format_exc())
+            return []
+    def _evaluate_zone_identification_feasibility(self, detected_objects: List[Dict], scene_type: str) -> bool:
         """
+        基於物件關聯性和分布特徵的彈性可行性評估
         Args:
+            detected_objects: 檢測到的物件列表
+            scene_type: 場景類型
         Returns:
+            是否適合進行區域識別
         """
+        try:
+            return self.zone_evaluator.evaluate_zone_identification_feasibility(detected_objects, scene_type)
+        except Exception as e:
+            logger.error(f"Error in _evaluate_zone_identification_feasibility: {str(e)}")
+            logger.error(traceback.format_exc())
+            return False
+    def _calculate_functional_relationships(self, detected_objects: List[Dict]) -> float:
         """
+        計算物件間的功能關聯性評分
         Args:
+            detected_objects: 檢測到的物件列表
         Returns:
+            功能關聯性評分 (0.0-1.0)
         """
+        try:
+            return self.zone_evaluator.calculate_functional_relationships(detected_objects)
+        except Exception as e:
+            logger.error(f"Error in _calculate_functional_relationships: {str(e)}")
+            logger.error(traceback.format_exc())
+            return 0.0
+    def _calculate_spatial_diversity(self, detected_objects: List[Dict]) -> float:
         """
+        計算物件空間分布的多樣性
         Args:
+            detected_objects: 檢測到的物件列表
         Returns:
+            空間多樣性評分 (0.0-1.0)
         """
+        try:
+            return self.zone_evaluator.calculate_spatial_diversity(detected_objects)
+        except Exception as e:
+            logger.error(f"Error in _calculate_spatial_diversity: {str(e)}")
+            logger.error(traceback.format_exc())
+            return 0.0
+    def _get_complexity_threshold(self, scene_type: str) -> float:
         """
+        根據場景類型返回適當的複雜度閾值
         Args:
+            scene_type: 場景類型
         Returns:
+            複雜度閾值 (0.0-1.0)
         """
+        try:
+            return self.zone_evaluator.get_complexity_threshold(scene_type)
+        except Exception as e:
+            logger.error(f"Error in _get_complexity_threshold: {str(e)}")
+            logger.error(traceback.format_exc())
+            return 0.55
+    def _create_distribution_map(self, detected_objects: List[Dict]) -> Dict:
         """
+        創建物件在各區域分布的詳細地圖，用於空間分析
         Args:
+            detected_objects: 檢測到的物件列表
         Returns:
+            包含各區域分布詳情的字典
         """
+        try:
+            return self.region_analyzer.create_distribution_map(detected_objects)
+        except Exception as e:
+            logger.error(f"Error in _create_distribution_map: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {}
+    def _find_main_region(self, region_objects_dict: Dict) -> str:
         """
+        找到物件最多的主要區域
         Args:
+            region_objects_dict: 區域物件字典
         Returns:
+            主要區域名稱
         """
+        try:
+            if not region_objects_dict:
+                return "unknown"
+            return max(region_objects_dict.items(),
+                    key=lambda x: len(x[1]),
+                    default=("unknown", []))[0]
+        except Exception as e:
+            logger.error(f"Error in _find_main_region: {str(e)}")
+            logger.error(traceback.format_exc())
+            return "unknown"
+    def _detect_cross_pattern(self, positions):
+        """檢測位置中的十字交叉模式 - 委託給SceneViewpointAnalyzer"""
+        try:
+            return self.scene_viewpoint_analyzer._detect_cross_pattern(positions)
+        except Exception as e:
+            logger.error(f"Error in _detect_cross_pattern: {str(e)}")
+            return False
+    def _analyze_movement_directions(self, positions):
+        """分析位置中的移動方向 - 委託給SceneViewpointAnalyzer"""
+        try:
+            return self.scene_viewpoint_analyzer._analyze_movement_directions(positions)
+        except Exception as e:
+            logger.error(f"Error in _analyze_movement_directions: {str(e)}")
+            return []
+    def _get_directional_description(self, region: str) -> str:
+        """將區域名稱轉換為方位描述 - 委託給RegionAnalyzer"""
+        try:
+            return self.region_analyzer.get_directional_description(region)
+        except Exception as e:
+            logger.error(f"Error in _get_directional_description: {str(e)}")
+            return "central"
+    @property
+    def regions(self):
+        """提供對區域定義的向後兼容訪問"""
+        return self.region_analyzer.regions

template_manager.py ADDED Viewed

	@@ -0,0 +1,2150 @@

+import logging
+import traceback
+import re
+import random
+from typing import Dict, List, Optional, Any
+import json
+from scene_detail_templates import SCENE_DETAIL_TEMPLATES
+from object_template_fillers import OBJECT_TEMPLATE_FILLERS
+from viewpoint_templates import VIEWPOINT_TEMPLATES
+from cultural_templates import CULTURAL_TEMPLATES
+from lighting_conditions import LIGHTING_CONDITIONS
+from confidence_templates import CONFIDENCE_TEMPLATES
+class TemplateLoadingError(Exception):
+    """模板載入或處理相關錯誤的自訂例外"""
+    pass
+class TemplateFillError(Exception):
+    pass
+class TemplateManager:
+    """
+    模板管理器 - 負責描述模板的載入、管理和填充
+    此class 管理所有用於場景描述生成的模板資源，提供模板填充功能，
+    並根據場景類型、物體檢測結果和上下文的資訊給出適當的描述內容。
+    """
+    def __init__(self, custom_templates_db: Optional[Dict] = None):
+        """
+        初始化模板管理器
+        Args:
+            custom_templates_db: 可選的自定義模板數據庫，如果提供則會與默認模板合併
+        """
+        self.logger = logging.getLogger(self.__class__.__name__)
+        self.template_registry = {}
+        try:
+            # 載入模板數據庫
+            self.templates = self._load_templates()
+            # 如果提供了自定義模板，則進行合併
+            if custom_templates_db:
+                self._merge_custom_templates(custom_templates_db)
+            # 驗證模板完整性
+            self._validate_templates()
+            self.logger.info("TemplateManager initialized successfully with %d template categories",
+                        len(self.templates))
+        except Exception as e:
+            error_msg = f"Failed to initialize TemplateManager: {str(e)}"
+            self.logger.error(f"{error_msg}\n{traceback.format_exc()}")
+            # 初始化基本的空模板
+            self.templates = self._initialize_fallback_templates()
+    def _load_templates(self) -> Dict:
+        """
+        載入所有描述模板
+        Returns:
+            Dict: 包含所有模板類別的字典
+        """
+        try:
+            templates = {}
+            # 載入場景詳細描述模板
+            self.logger.debug("Loading scene detail templates")
+            try:
+                templates["scene_detail_templates"] = SCENE_DETAIL_TEMPLATES
+            except NameError:
+                self.logger.warning("SCENE_DETAIL_TEMPLATES not defined, using empty dict")
+                templates["scene_detail_templates"] = {}
+            # 載入物體模板填充器
+            self.logger.debug("Loading object template fillers")
+            try:
+                templates["object_template_fillers"] = OBJECT_TEMPLATE_FILLERS
+            except NameError:
+                self.logger.warning("OBJECT_TEMPLATE_FILLERS not defined, using empty dict")
+                templates["object_template_fillers"] = {}
+            # 載入視角模板
+            self.logger.debug("Loading viewpoint templates")
+            try:
+                templates["viewpoint_templates"] = VIEWPOINT_TEMPLATES
+            except NameError:
+                self.logger.warning("VIEWPOINT_TEMPLATES not defined, using empty dict")
+                templates["viewpoint_templates"] = {}
+            # 載入文化模板
+            self.logger.debug("Loading cultural templates")
+            try:
+                templates["cultural_templates"] = CULTURAL_TEMPLATES
+            except NameError:
+                self.logger.warning("CULTURAL_TEMPLATES not defined, using empty dict")
+                templates["cultural_templates"] = {}
+            # 從照明條件模組載入照明模板
+            self.logger.debug("Loading lighting templates")
+            try:
+                templates["lighting_templates"] = self._extract_lighting_templates()
+            except Exception as e:
+                self.logger.warning(f"Failed to extract lighting templates: {str(e)}")
+                templates["lighting_templates"] = {}
+            # 載入信心度模板
+            self.logger.debug("Loading confidence templates")
+            try:
+                templates["confidence_templates"] = CONFIDENCE_TEMPLATES
+            except NameError:
+                self.logger.warning("CONFIDENCE_TEMPLATES not defined, using empty dict")
+                templates["confidence_templates"] = {}
+            # 初始化默認模板（當成備份）
+            self._initialize_default_templates(templates)
+            self.logger.info("Successfully loaded %d template categories", len(templates))
+            return templates
+        except Exception as e:
+            error_msg = f"Unexpected error during template loading: {str(e)}"
+            self.logger.error(f"{error_msg}\n{traceback.format_exc()}")
+            # 返回基本模板
+            return self._initialize_fallback_templates()
+    def _initialize_template_registry(self) -> Dict[str, Dict[str, Any]]:
+        """
+        初始化模板，包含各種場景類型的結構化模板
+        Returns:
+            Dict[str, Dict[str, Any]]: 模板註冊表字典
+        """
+        try:
+            template_registry = {
+                "indoor_detailed": {
+                    "scene_type": "indoor",
+                    "complexity": "high",
+                    "structure": [
+                        {
+                            "type": "opening",
+                            "content": "This indoor scene presents a comprehensive view of a well-organized living space."
+                        },
+                        {
+                            "type": "zone_analysis",
+                            "priority": "functional_areas",
+                            "detail_level": "detailed"
+                        },
+                        {
+                            "type": "object_summary",
+                            "grouping": "by_category",
+                            "include_counts": True
+                        },
+                        {
+                            "type": "conclusion",
+                            "style": "analytical"
+                        }
+                    ]
+                },
+                "indoor_moderate": {
+                    "scene_type": "indoor",
+                    "complexity": "medium",
+                    "structure": [
+                        {
+                            "type": "opening",
+                            "content": "The indoor environment displays organized functional areas."
+                        },
+                        {
+                            "type": "zone_analysis",
+                            "priority": "main_areas",
+                            "detail_level": "moderate"
+                        },
+                        {
+                            "type": "object_summary",
+                            "grouping": "by_function",
+                            "include_counts": False
+                        },
+                        {
+                            "type": "conclusion",
+                            "style": "descriptive"
+                        }
+                    ]
+                },
+                "indoor_simple": {
+                    "scene_type": "indoor",
+                    "complexity": "low",
+                    "structure": [
+                        {
+                            "type": "opening",
+                            "content": "An indoor space with visible furniture and household items."
+                        },
+                        {
+                            "type": "zone_analysis",
+                            "priority": "basic_areas",
+                            "detail_level": "simple"
+                        },
+                        {
+                            "type": "object_summary",
+                            "grouping": "general",
+                            "include_counts": False
+                        }
+                    ]
+                },
+                "outdoor_detailed": {
+                    "scene_type": "outdoor",
+                    "complexity": "high",
+                    "structure": [
+                        {
+                            "type": "opening",
+                            "content": "This outdoor scene captures a dynamic urban environment with multiple activity zones."
+                        },
+                        {
+                            "type": "zone_analysis",
+                            "priority": "activity_areas",
+                            "detail_level": "detailed"
+                        },
+                        {
+                            "type": "object_summary",
+                            "grouping": "by_location",
+                            "include_counts": True
+                        },
+                        {
+                            "type": "conclusion",
+                            "style": "environmental"
+                        }
+                    ]
+                },
+                "outdoor_moderate": {
+                    "scene_type": "outdoor",
+                    "complexity": "medium",
+                    "structure": [
+                        {
+                            "type": "opening",
+                            "content": "The outdoor scene shows organized public spaces and pedestrian areas."
+                        },
+                        {
+                            "type": "zone_analysis",
+                            "priority": "public_areas",
+                            "detail_level": "moderate"
+                        },
+                        {
+                            "type": "object_summary",
+                            "grouping": "by_type",
+                            "include_counts": False
+                        },
+                        {
+                            "type": "conclusion",
+                            "style": "observational"
+                        }
+                    ]
+                },
+                "outdoor_simple": {
+                    "scene_type": "outdoor",
+                    "complexity": "low",
+                    "structure": [
+                        {
+                            "type": "opening",
+                            "content": "An outdoor area with pedestrians and urban elements."
+                        },
+                        {
+                            "type": "zone_analysis",
+                            "priority": "basic_areas",
+                            "detail_level": "simple"
+                        },
+                        {
+                            "type": "object_summary",
+                            "grouping": "general",
+                            "include_counts": False
+                        }
+                    ]
+                },
+                "commercial_detailed": {
+                    "scene_type": "commercial",
+                    "complexity": "high",
+                    "structure": [
+                        {
+                            "type": "opening",
+                            "content": "This commercial environment demonstrates organized retail and customer service areas."
+                        },
+                        {
+                            "type": "zone_analysis",
+                            "priority": "service_areas",
+                            "detail_level": "detailed"
+                        },
+                        {
+                            "type": "object_summary",
+                            "grouping": "by_function",
+                            "include_counts": True
+                        },
+                        {
+                            "type": "conclusion",
+                            "style": "business"
+                        }
+                    ]
+                },
+                "transportation_detailed": {
+                    "scene_type": "transportation",
+                    "complexity": "high",
+                    "structure": [
+                        {
+                            "type": "opening",
+                            "content": "This transportation hub features organized passenger facilities and transit infrastructure."
+                        },
+                        {
+                            "type": "zone_analysis",
+                            "priority": "transit_areas",
+                            "detail_level": "detailed"
+                        },
+                        {
+                            "type": "object_summary",
+                            "grouping": "by_transit_function",
+                            "include_counts": True
+                        },
+                        {
+                            "type": "conclusion",
+                            "style": "infrastructure"
+                        }
+                    ]
+                },
+                "default": {
+                    "scene_type": "general",
+                    "complexity": "medium",
+                    "structure": [
+                        {
+                            "type": "opening",
+                            "content": "The scene displays various elements organized across functional areas."
+                        },
+                        {
+                            "type": "zone_analysis",
+                            "priority": "general_areas",
+                            "detail_level": "moderate"
+                        },
+                        {
+                            "type": "object_summary",
+                            "grouping": "general",
+                            "include_counts": False
+                        },
+                        {
+                            "type": "conclusion",
+                            "style": "general"
+                        }
+                    ]
+                }
+            }
+            self.logger.debug(f"Initialized template registry with {len(template_registry)} templates")
+            return template_registry
+        except Exception as e:
+            error_msg = f"Error initializing template registry: {str(e)}"
+            self.logger.error(f"{error_msg}\n{traceback.format_exc()}")
+            # 返回最基本的註冊表
+            return {
+                "default": {
+                    "scene_type": "general",
+                    "complexity": "low",
+                    "structure": [
+                        {
+                            "type": "opening",
+                            "content": "Scene analysis completed with identified objects and areas."
+                        }
+                    ]
+                }
+            }
+    def get_template_by_scene_type(self, scene_type: str, detected_objects: List[Dict],
+                              functional_zones: Dict) -> str:
+        """
+        根據場景類型選擇合適的模��並進行標準化處理
+        Args:
+            scene_type: 場景類型
+            detected_objects: 檢測到的物件列表
+            functional_zones: 功能區域字典
+        Returns:
+            str: 標準化後的模板字符串
+        """
+        try:
+            # 獲取場景的物件統計信息
+            object_stats = self._analyze_scene_composition(detected_objects)
+            zone_count = len(functional_zones) if functional_zones else 0
+            # 根據場景複雜度和類型選擇模板
+            if scene_type in self.templates:
+                scene_templates = self.templates[scene_type]
+                # 根據複雜度選擇合適的模板變體
+                if zone_count >= 3 and object_stats.get("total_objects", 0) >= 10:
+                    template_key = "complex"
+                elif zone_count >= 2 or object_stats.get("total_objects", 0) >= 5:
+                    template_key = "moderate"
+                else:
+                    template_key = "simple"
+                if template_key in scene_templates:
+                    raw_template = scene_templates[template_key]
+                else:
+                    raw_template = scene_templates.get("default", scene_templates[list(scene_templates.keys())[0]])
+            else:
+                # 如果沒有特定場景的模板，使用通用模板
+                raw_template = self._get_generic_template(object_stats, zone_count)
+            # 標準化模板中的佔位符和格式
+            standardized_template = self._standardize_template_format(raw_template)
+            return standardized_template
+        except Exception as e:
+            logger.error(f"Error selecting template for scene type '{scene_type}': {str(e)}")
+            return self._get_fallback_template()
+    def _analyze_scene_composition(self, detected_objects: List[Dict]) -> Dict:
+        """
+        分析場景組成以確定模板複雜度
+        Args:
+            detected_objects: 檢測到的物件列表
+        Returns:
+            Dict: 場景組成統計信息
+        """
+        try:
+            total_objects = len(detected_objects)
+            # 統計不同類型的物件
+            object_categories = {}
+            for obj in detected_objects:
+                class_name = obj.get("class_name", "unknown")
+                object_categories[class_name] = object_categories.get(class_name, 0) + 1
+            # 計算場景多樣性
+            unique_categories = len(object_categories)
+            return {
+                "total_objects": total_objects,
+                "unique_categories": unique_categories,
+                "category_distribution": object_categories,
+                "complexity_score": min(total_objects * 0.3 + unique_categories * 0.7, 10)
+            }
+        except Exception as e:
+            logger.warning(f"Error analyzing scene composition: {str(e)}")
+            return {"total_objects": 0, "unique_categories": 0, "complexity_score": 0}
+    def _get_generic_template(self, object_stats: Dict, zone_count: int) -> str:
+        """
+        獲取通用模板
+        Args:
+            object_stats: 物件統計信息
+            zone_count: 功能區域數量
+        Returns:
+            str: 通用模板字符串
+        """
+        try:
+            complexity_score = object_stats.get("complexity_score", 0)
+            if complexity_score >= 7 or zone_count >= 3:
+                return "This scene presents a comprehensive view featuring {functional_area} with {primary_objects}. The spatial organization demonstrates {spatial_arrangement} across multiple {activity_areas}, creating a dynamic environment with diverse elements and clear functional zones."
+            elif complexity_score >= 4 or zone_count >= 2:
+                return "The scene displays {functional_area} containing {primary_objects}. The arrangement shows {spatial_organization} with distinct areas serving different purposes within the overall space."
+            else:
+                return "A {scene_description} featuring {primary_objects} arranged in {basic_layout} within the visible area."
+        except Exception as e:
+            logger.warning(f"Error getting generic template: {str(e)}")
+            return self._get_fallback_template()
+    def _get_fallback_template(self) -> str:
+        """
+        獲取備用模板
+        Returns:
+            str: 備用模板字符串
+        """
+        return "A scene featuring various elements and organized areas of activity within the visible space."
+    def _standardize_template_format(self, template: str) -> str:
+        """
+        標準化模板格式，確保佔位符和表達方式符合自然語言要求
+        Args:
+            template: 原始模板字符串
+        Returns:
+            str: 標準化後的模板字符串
+        """
+        try:
+            if not template:
+                return self._get_fallback_template()
+            import re
+            standardized = template
+            # 標準化佔位符格式，移除技術性標記
+            placeholder_mapping = {
+                r'\{zone_\d+\}': '{functional_area}',
+                r'\{object_group_\d+\}': '{primary_objects}',
+                r'\{region_\d+\}': '{spatial_area}',
+                r'\{category_\d+\}': '{object_category}',
+                r'\{area_\d+\}': '{activity_area}',
+                r'\{section_\d+\}': '{scene_section}'
+            }
+            for pattern, replacement in placeholder_mapping.items():
+                standardized = re.sub(pattern, replacement, standardized)
+            # 標準化常見的技術性術語
+            term_replacements = {
+                'functional_zones': 'areas of activity',
+                'object_detection': 'visible elements',
+                'category_regions': 'organized sections',
+                'spatial_distribution': 'arrangement throughout the space',
+                'viewpoint_analysis': 'perspective view'
+            }
+            for tech_term, natural_term in term_replacements.items():
+                standardized = standardized.replace(tech_term, natural_term)
+            # 確保模板語法的自然性
+            standardized = self._improve_template_readability(standardized)
+            return standardized
+        except Exception as e:
+            logger.warning(f"Error standardizing template format: {str(e)}")
+            return template if template else self._get_fallback_template()
+    def _improve_template_readability(self, template: str) -> str:
+        """
+        改善模板的可讀性和自然性
+        Args:
+            template: 模板字符串
+        Returns:
+            str: 改善後的模板字符串
+        """
+        try:
+            import re
+            # 移除多餘的空格和換行
+            improved = re.sub(r'\s+', ' ', template).strip()
+            # 改善句子連接
+            improved = improved.replace(' . ', '. ')
+            improved = improved.replace(' , ', ', ')
+            improved = improved.replace(' ; ', '; ')
+            # 確保適當的句號結尾
+            if improved and not improved.endswith(('.', '!', '?')):
+                improved += '.'
+            # 改善常見的表達問題
+            readability_fixes = [
+                (r'\bthe the\b', 'the'),
+                (r'\ba a\b', 'a'),
+                (r'\ban an\b', 'an'),
+                (r'\bwith with\b', 'with'),
+                (r'\bin in\b', 'in'),
+                (r'\bof of\b', 'of'),
+                (r'\band and\b', 'and')
+            ]
+            for pattern, replacement in readability_fixes:
+                improved = re.sub(pattern, replacement, improved, flags=re.IGNORECASE)
+            return improved
+        except Exception as e:
+            logger.warning(f"Error improving template readability: {str(e)}")
+            return template
+    def _extract_lighting_templates(self) -> Dict:
+        """
+        從照明條件模組提取照明描述模板
+        Returns:
+            Dict: 照明模板字典
+        """
+        try:
+            lighting_templates = {}
+            # 從 LIGHTING_CONDITIONS 提取時間描述
+            time_descriptions = LIGHTING_CONDITIONS.get("time_descriptions", {})
+            for time_key, time_data in time_descriptions.items():
+                if isinstance(time_data, dict) and "general" in time_data:
+                    lighting_templates[time_key] = time_data["general"]
+                else:
+                    # 如果數據結構不符合預期，使用備用描述
+                    lighting_templates[time_key] = f"The scene is captured during {time_key.replace('_', ' ')}."
+            # 確保至少有基本的照明模板
+            if not lighting_templates:
+                self.logger.warning("No lighting templates found, using defaults")
+                lighting_templates = self._get_default_lighting_templates()
+            self.logger.debug("Extracted %d lighting templates", len(lighting_templates))
+            return lighting_templates
+        except Exception as e:
+            self.logger.warning(f"Error extracting lighting templates: {str(e)}, using defaults")
+            return self._get_default_lighting_templates()
+    def _get_default_lighting_templates(self) -> Dict:
+        """獲取默認照明模板"""
+        return {
+            "day_clear": "The scene is captured during clear daylight conditions.",
+            "day_overcast": "The scene is captured during overcast daylight.",
+            "night": "The scene is captured at night with artificial lighting.",
+            "dawn": "The scene is captured during dawn with soft natural lighting.",
+            "dusk": "The scene is captured during dusk with diminishing natural light.",
+            "unknown": "The lighting conditions are not clearly identifiable."
+        }
+    def _initialize_default_templates(self, templates: Dict):
+        """
+        初始化默認模板作為備份機制
+        Args:
+            templates: 要檢查和補充的模板字典
+        """
+        try:
+            # 置信度模板備份
+            if "confidence_templates" not in templates or not templates["confidence_templates"]:
+                templates["confidence_templates"] = {
+                    "high": "{description} {details}",
+                    "medium": "This appears to be {description} {details}",
+                    "low": "This might be {description}, but the confidence is low. {details}"
+                }
+            # 場景詳細模板備份
+            if "scene_detail_templates" not in templates or not templates["scene_detail_templates"]:
+                templates["scene_detail_templates"] = {
+                    "default": ["A scene with various elements and objects."]
+                }
+            # 物體填充模板備份
+            if "object_template_fillers" not in templates or not templates["object_template_fillers"]:
+                templates["object_template_fillers"] = {
+                    "default": ["various items", "different objects", "multiple elements"]
+                }
+            # 視角模板備份
+            if "viewpoint_templates" not in templates or not templates["viewpoint_templates"]:
+                templates["viewpoint_templates"] = {
+                    "eye_level": {
+                        "prefix": "From eye level, ",
+                        "observation": "the scene is viewed straight ahead.",
+                        "short_desc": "at eye level"
+                    },
+                    "aerial": {
+                        "prefix": "From above, ",
+                        "observation": "the scene is viewed from a bird's-eye perspective.",
+                        "short_desc": "from above"
+                    },
+                    "low_angle": {
+                        "prefix": "From a low angle, ",
+                        "observation": "the scene is viewed from below looking upward.",
+                        "short_desc": "from below"
+                    },
+                    "elevated": {
+                        "prefix": "From an elevated position, ",
+                        "observation": "the scene is viewed from a higher vantage point.",
+                        "short_desc": "from an elevated position"
+                    }
+                }
+            # 文化模板備份
+            if "cultural_templates" not in templates or not templates["cultural_templates"]:
+                templates["cultural_templates"] = {
+                    "asian": {
+                        "elements": ["traditional architectural elements", "cultural signage", "Asian design features"],
+                        "description": "The scene displays distinctive Asian cultural characteristics with {elements}."
+                    },
+                    "european": {
+                        "elements": ["classical architecture", "European design elements", "historic features"],
+                        "description": "The scene exhibits European architectural and cultural elements including {elements}."
+                    }
+                }
+            self.logger.debug("Default templates initialized as backup")
+        except Exception as e:
+            self.logger.error(f"Error initializing default templates: {str(e)}")
+    def _merge_custom_templates(self, custom_templates: Dict):
+        """
+        合併自定義模板到現有模板庫
+        Args:
+            custom_templates: 自定義模板字典
+        """
+        try:
+            for template_category, custom_content in custom_templates.items():
+                if template_category in self.templates:
+                    if isinstance(self.templates[template_category], dict) and isinstance(custom_content, dict):
+                        self.templates[template_category].update(custom_content)
+                        self.logger.debug(f"Merged custom templates for category: {template_category}")
+                    else:
+                        self.templates[template_category] = custom_content
+                        self.logger.debug(f"Replaced templates for category: {template_category}")
+                else:
+                    self.templates[template_category] = custom_content
+                    self.logger.debug(f"Added new template category: {template_category}")
+            self.logger.info("Successfully merged custom templates")
+        except Exception as e:
+            self.logger.warning(f"Error merging custom templates: {str(e)}")
+    def _validate_templates(self):
+        """
+        驗證模板完整性和有效性
+        """
+        try:
+            required_categories = [
+                "scene_detail_templates",
+                "object_template_fillers",
+                "viewpoint_templates",
+                "cultural_templates",
+                "lighting_templates",
+                "confidence_templates"
+            ]
+            missing_categories = []
+            for category in required_categories:
+                if category not in self.templates:
+                    missing_categories.append(category)
+                elif not self.templates[category]:
+                    self.logger.warning(f"Template category '{category}' is empty")
+            if missing_categories:
+                error_msg = f"Missing required template categories: {missing_categories}"
+                self.logger.warning(error_msg)
+                # 為缺失的類別創建空模板
+                for category in missing_categories:
+                    self.templates[category] = {}
+            # 驗證視角模板結構
+            self._validate_viewpoint_templates()
+            # 驗證文化模板結構
+            self._validate_cultural_templates()
+            self.logger.debug("Template validation completed successfully")
+        except Exception as e:
+            error_msg = f"Template validation failed: {str(e)}"
+            self.logger.error(f"{error_msg}\n{traceback.format_exc()}")
+    def _validate_viewpoint_templates(self):
+        """驗證視角模板結構"""
+        viewpoint_templates = self.templates.get("viewpoint_templates", {})
+        for viewpoint, template_data in viewpoint_templates.items():
+            if not isinstance(template_data, dict):
+                self.logger.warning(f"Invalid viewpoint template structure for '{viewpoint}'")
+                continue
+            required_keys = ["prefix", "observation"]
+            for key in required_keys:
+                if key not in template_data:
+                    self.logger.warning(f"Missing '{key}' in viewpoint template '{viewpoint}'")
+    def _validate_cultural_templates(self):
+        """驗證文化模板結構"""
+        cultural_templates = self.templates.get("cultural_templates", {})
+        for culture, template_data in cultural_templates.items():
+            if not isinstance(template_data, dict):
+                self.logger.warning(f"Invalid cultural template structure for '{culture}'")
+                continue
+            if "elements" not in template_data or "description" not in template_data:
+                self.logger.warning(f"Missing required keys in cultural template '{culture}'")
+    def get_template(self, category: str, key: Optional[str] = None) -> Any:
+        """
+        獲取指定類別的模板
+        Args:
+            category: 模板類別名稱
+            key: 可選的具體模板鍵值
+        Returns:
+            Any: 請求的模板內容，如果不存在則返回空字典或空字符串
+        """
+        try:
+            if category not in self.templates:
+                self.logger.warning(f"Template category '{category}' not found")
+                return {} if key is None else ""
+            if key is None:
+                return self.templates[category]
+            category_templates = self.templates[category]
+            if not isinstance(category_templates, dict):
+                self.logger.warning(f"Template category '{category}' is not a dictionary")
+                return ""
+            if key not in category_templates:
+                self.logger.warning(f"Template key '{key}' not found in category '{category}'")
+                return ""
+            return category_templates[key]
+        except Exception as e:
+            error_msg = f"Error retrieving template {category}.{key}: {str(e)}"
+            self.logger.error(error_msg)
+            return {} if key is None else ""
+    def fill_template(self, template: str, detected_objects: List[Dict], scene_type: str,
+             places365_info: Optional[Dict] = None,
+             object_statistics: Optional[Dict] = None) -> str:
+        """
+        填充模板中的佔位符，增強容錯處理
+        Args:
+            template: 包含佔位符的模板字符串
+            detected_objects: 檢測到的物體列表
+            scene_type: 場景類型
+            places365_info: Places365場景分類信息
+            object_statistics: 物體統計信息
+        Returns:
+            str: 填充後的模板字符串，確保語法正確
+        """
+        try:
+            self.logger.debug(f"Filling template for scene_type: {scene_type}")
+            if not template or not template.strip():
+                return "A scene with various elements."
+            # 預處理模板，移除可能的問題模式
+            template = self._preprocess_template(template)
+            # 查找模板中的佔位符
+            placeholders = re.findall(r'\{([^}]+)\}', template)
+            filled_template = template
+            # 獲取模板填充器
+            fillers = self.templates.get("object_template_fillers", {})
+            # 基於物體統計信息生成替換內容
+            statistics_based_replacements = self._generate_statistics_replacements(object_statistics)
+            # 生成默認替換內容
+            default_replacements = self._generate_default_replacements()
+            # 添加Places365上下文信息
+            places365_replacements = self._generate_places365_replacements(places365_info)
+            # 添加功能區域信息到場景數據中以便後續使用
+            scene_functional_zones = None
+            if hasattr(self, '_current_functional_zones'):
+                scene_functional_zones = self._current_functional_zones
+            # 合併所有替換內容（優先順序是統計信息 > Places365 > 默認）
+            all_replacements = {**default_replacements, **places365_replacements, **statistics_based_replacements}
+            # 填充每個佔位符
+            for placeholder in placeholders:
+                try:
+                    replacement = self._get_placeholder_replacement(
+                        placeholder, fillers, all_replacements, detected_objects, scene_type
+                    )
+                    # 確保替換內容不為空且有意義
+                    if not replacement or not replacement.strip():
+                        replacement = self._get_emergency_replacement(placeholder)
+                    filled_template = filled_template.replace(f"{{{placeholder}}}", replacement)
+                except Exception as placeholder_error:
+                    self.logger.warning(f"Failed to replace placeholder '{placeholder}': {str(placeholder_error)}")
+                    # 使用緊急替換值
+                    emergency_replacement = self._get_emergency_replacement(placeholder)
+                    filled_template = filled_template.replace(f"{{{placeholder}}}", emergency_replacement)
+            # 修復可能的語法問題
+            filled_template = self._postprocess_filled_template(filled_template)
+            self.logger.debug("Template filling completed successfully")
+            return filled_template
+        except Exception as e:
+            error_msg = f"Error filling template: {str(e)}"
+            self.logger.error(f"{error_msg}\n{traceback.format_exc()}")
+            # 返回安全的備用內容
+            return self._generate_fallback_description(scene_type, detected_objects)
+    def _preprocess_template(self, template: str) -> str:
+        """
+        預處理模板，修復常見問題
+        Args:
+            template: 原始模板字符串
+        Returns:
+            str: 預處理後的模板
+        """
+        try:
+            # 移除可能導致問題的模式
+            template = re.sub(r'\{[^}]*\}\s*,\s*\{[^}]*\}', '{combined_elements}', template)
+            # 確保模板不以逗號開始
+            template = re.sub(r'^[,\s]*', '', template)
+            return template.strip()
+        except Exception as e:
+            self.logger.warning(f"Error preprocessing template: {str(e)}")
+            return template
+    def _get_emergency_replacement(self, placeholder: str) -> str:
+        """
+        獲取緊急替換值，確保不會產生語法錯誤
+        Args:
+            placeholder: 佔位符名稱
+        Returns:
+            str: 安全的替換值
+        """
+        emergency_replacements = {
+            "crossing_pattern": "pedestrian walkways",
+            "pedestrian_behavior": "people moving through the area",
+            "traffic_pattern": "vehicle movement",
+            "scene_setting": "this location",
+            "urban_elements": "city features",
+            "street_elements": "urban components"
+        }
+        if placeholder in emergency_replacements:
+            return emergency_replacements[placeholder]
+        # 基於佔位符名稱生成合理的替換
+        cleaned = placeholder.replace('_', ' ')
+        if len(cleaned.split()) > 1:
+            return cleaned
+        else:
+            return f"various {cleaned}"
+    def _postprocess_filled_template(self, filled_template: str) -> str:
+        """
+        後處理填充完成的模板，修復語法問題
+        Args:
+            filled_template: 填充後的模板字符串
+        Returns:
+            str: 修復後的模板字符串
+        """
+        try:
+            # 修復 "In , " 模式
+            filled_template = re.sub(r'\bIn\s*,\s*', 'In this scene, ', filled_template)
+            filled_template = re.sub(r'\bAt\s*,\s*', 'At this location, ', filled_template)
+            filled_template = re.sub(r'\bWithin\s*,\s*', 'Within this area, ', filled_template)
+            # 修復連續逗號
+            filled_template = re.sub(r',\s*,', ',', filled_template)
+            # 修復開頭的逗號
+            filled_template = re.sub(r'^[,\s]*', '', filled_template)
+            # 確保首字母大寫
+            if filled_template and not filled_template[0].isupper():
+                filled_template = filled_template[0].upper() + filled_template[1:]
+            # 確保以句號結尾
+            if filled_template and not filled_template.endswith(('.', '!', '?')):
+                filled_template += '.'
+            return filled_template.strip()
+        except Exception as e:
+            self.logger.warning(f"Error postprocessing filled template: {str(e)}")
+            return filled_template
+    def _generate_fallback_description(self, scene_type: str, detected_objects: List[Dict]) -> str:
+        """
+        生成備用描述，當模板填充完全失敗時使用
+        Args:
+            scene_type: 場景類型
+            detected_objects: 檢測到的物體列表
+        Returns:
+            str: 備用描述
+        """
+        try:
+            object_count = len(detected_objects)
+            if object_count == 0:
+                return f"A {scene_type.replace('_', ' ')} scene."
+            elif object_count == 1:
+                return f"A {scene_type.replace('_', ' ')} scene with one visible element."
+            else:
+                return f"A {scene_type.replace('_', ' ')} scene with {object_count} visible elements."
+        except Exception as e:
+            self.logger.warning(f"Error generating fallback description: {str(e)}")
+            return "A scene with various elements."
+    def _generate_statistics_replacements(self, object_statistics: Optional[Dict]) -> Dict[str, str]:
+        """
+        基於物體統計信息生成模板替換內容
+        Args:
+            object_statistics: 物體統計信息
+        Returns:
+            Dict[str, str]: 統計信息基礎的替換內容
+        """
+        replacements = {}
+        if not object_statistics:
+            return replacements
+        try:
+            # 處理植物元素
+            if "potted plant" in object_statistics:
+                count = object_statistics["potted plant"]["count"]
+                if count == 1:
+                    replacements["plant_elements"] = "a potted plant"
+                elif count <= 3:
+                    replacements["plant_elements"] = f"{count} potted plants"
+                else:
+                    replacements["plant_elements"] = f"multiple potted plants ({count} total)"
+            # 處理座位
+            if "chair" in object_statistics:
+                count = object_statistics["chair"]["count"]
+                if count == 1:
+                    replacements["seating"] = "a chair"
+                elif count <= 4:
+                    replacements["seating"] = f"{count} chairs"
+                else:
+                    replacements["seating"] = f"numerous chairs ({count} total)"
+            # 處理人員
+            if "person" in object_statistics:
+                count = object_statistics["person"]["count"]
+                if count == 1:
+                    replacements["people_and_vehicles"] = "a person"
+                    replacements["pedestrian_flow"] = "an individual walking"
+                elif count <= 5:
+                    replacements["people_and_vehicles"] = f"{count} people"
+                    replacements["pedestrian_flow"] = f"{count} people walking"
+                else:
+                    replacements["people_and_vehicles"] = f"many people ({count} individuals)"
+                    replacements["pedestrian_flow"] = f"a crowd of {count} people"
+            # 處理桌子設置
+            if "dining table" in object_statistics:
+                count = object_statistics["dining table"]["count"]
+                if count == 1:
+                    replacements["table_setup"] = "a dining table"
+                    replacements["table_description"] = "a dining surface"
+                else:
+                    replacements["table_setup"] = f"{count} dining tables"
+                    replacements["table_description"] = f"{count} dining surfaces"
+            self.logger.debug(f"Generated {len(replacements)} statistics-based replacements")
+        except Exception as e:
+            self.logger.warning(f"Error generating statistics replacements: {str(e)}")
+        return replacements
+    def _generate_places365_replacements(self, places365_info: Optional[Dict]) -> Dict[str, str]:
+        """
+        基於Places365信息生成模板替換內容
+        Args:
+            places365_info: Places365場景分類信息
+        Returns:
+            Dict[str, str]: Places365基礎的替換內容
+        """
+        replacements = {}
+        if not places365_info or places365_info.get('confidence', 0) <= 0.35:
+            replacements["places365_context"] = ""
+            replacements["places365_atmosphere"] = ""
+            return replacements
+        try:
+            scene_label = places365_info.get('scene_label', '').replace('_', ' ')
+            attributes = places365_info.get('attributes', [])
+            # 生成場景上下文
+            if scene_label:
+                replacements["places365_context"] = f"characteristic of a {scene_label}"
+            else:
+                replacements["places365_context"] = ""
+            # 生成氛圍描述
+            if 'natural_lighting' in attributes:
+                replacements["places365_atmosphere"] = "with natural illumination"
+            elif 'artificial_lighting' in attributes:
+                replacements["places365_atmosphere"] = "under artificial lighting"
+            else:
+                replacements["places365_atmosphere"] = ""
+            self.logger.debug("Generated Places365-based replacements")
+        except Exception as e:
+            self.logger.warning(f"Error generating Places365 replacements: {str(e)}")
+            replacements["places365_context"] = ""
+            replacements["places365_atmosphere"] = ""
+        return replacements
+    def _generate_default_replacements(self) -> Dict[str, str]:
+        """
+        生成默認的模板替換內容
+        Returns:
+            Dict[str, str]: 默認替換內容
+        """
+        return {
+            "scene_introduction": "this scene",
+            "location_prefix": "this location",
+            "setting_description": "this setting",
+            "area_description": "this area",
+            "environment_description": "this environment",
+            "spatial_introduction": "this space",
+            # 室內相關
+            "furniture": "various furniture pieces",
+            "seating": "comfortable seating",
+            "electronics": "entertainment devices",
+            "bed_type": "a bed",
+            "bed_location": "room",
+            "bed_description": "sleeping arrangements",
+            "extras": "personal items",
+            "table_setup": "a dining table and chairs",
+            "table_description": "a dining surface",
+            "dining_items": "dining furniture and tableware",
+            "appliances": "kitchen appliances",
+            "kitchen_items": "cooking utensils and dishware",
+            "cooking_equipment": "cooking equipment",
+            "office_equipment": "work-related furniture and devices",
+            "desk_setup": "a desk and chair",
+            "computer_equipment": "electronic devices",
+            # 室外/城市相關
+            "traffic_description": "vehicles and pedestrians",
+            "people_and_vehicles": "people and various vehicles",
+            "street_elements": "urban infrastructure",
+            "park_features": "benches and greenery",
+            "outdoor_elements": "natural features",
+            "park_description": "outdoor amenities",
+            "store_elements": "merchandise displays",
+            "shopping_activity": "customers browse and shop",
+            "store_items": "products for sale",
+            # 高級餐廳相關
+            "design_elements": "elegant decor",
+            "lighting": "stylish lighting fixtures",
+            # 亞洲商業街相
+            "storefront_features": "compact shops",
+            "pedestrian_flow": "people walking",
+            "asian_elements": "distinctive cultural elements",
+            "cultural_elements": "traditional design features",
+            "signage": "colorful signs",
+            "street_activities": "busy urban activity",
+            # 金融區相關
+            "buildings": "tall buildings",
+            "traffic_elements": "vehicles",
+            "skyscrapers": "high-rise buildings",
+            "road_features": "wide streets",
+            "architectural_elements": "modern architecture",
+            "city_landmarks": "prominent structures",
+            # 十字路口相關
+            "crossing_pattern": "clearly marked pedestrian crossings",
+            "pedestrian_behavior": "careful pedestrian movement",
+            "pedestrian_density": "multiple groups of pedestrians",
+            "traffic_pattern": "well-regulated traffic flow",
+            "pedestrian_flow": "steady pedestrian movement",
+            "traffic_description": "active urban traffic",
+            "people_and_vehicles": "pedestrians and vehicles",
+            "street_elements": "urban infrastructure elements",
+            # 交通相關
+            "transit_vehicles": "public transportation vehicles",
+            "passenger_activity": "commuter movement",
+            "transportation_modes": "various transit options",
+            "passenger_needs": "waiting areas",
+            "transit_infrastructure": "transit facilities",
+            "passenger_movement": "commuter flow",
+            # 購物區相關
+            "retail_elements": "shops and displays",
+            "store_types": "various retail establishments",
+            "walkway_features": "pedestrian pathways",
+            "commercial_signage": "store signs",
+            "consumer_behavior": "shopping activities",
+            # 空中視角相關
+            "commercial_layout": "organized retail areas",
+            "pedestrian_pattern": "people movement patterns",
+            "gathering_features": "public gathering spaces",
+            "movement_pattern": "crowd flow patterns",
+            "urban_elements": "city infrastructure",
+            "public_activity": "social interaction",
+            # 文化特定元素
+            "stall_elements": "vendor booths",
+            "lighting_features": "decorative lights",
+            "food_elements": "food offerings",
+            "vendor_stalls": "market stalls",
+            "nighttime_activity": "evening commerce",
+            "cultural_lighting": "traditional lighting",
+            "night_market_sounds": "lively market sounds",
+            "evening_crowd_behavior": "nighttime social activity",
+            "architectural_elements": "cultural buildings",
+            "religious_structures": "sacred buildings",
+            "decorative_features": "ornamental designs",
+            "cultural_practices": "traditional activities",
+            "temple_architecture": "religious structures",
+            "sensory_elements": "atmospheric elements",
+            "visitor_activities": "cultural experiences",
+            "ritual_activities": "ceremonial practices",
+            "cultural_symbols": "meaningful symbols",
+            "architectural_style": "historical buildings",
+            "historic_elements": "traditional architecture",
+            "urban_design": "city planning elements",
+            "social_behaviors": "public interactions",
+            "european_features": "European architectural details",
+            "tourist_activities": "visitor activities",
+            "local_customs": "regional practices",
+            # 時間特定元素
+            "lighting_effects": "artificial lighting",
+            "shadow_patterns": "light and shadow",
+            "urban_features": "city elements",
+            "illuminated_elements": "lit structures",
+            "evening_activities": "nighttime activities",
+            "light_sources": "lighting points",
+            "lit_areas": "illuminated spaces",
+            "shadowed_zones": "darker areas",
+            "illuminated_signage": "bright signs",
+            "colorful_lighting": "multicolored lights",
+            "neon_elements": "neon signs",
+            "night_crowd_behavior": "evening social patterns",
+            "light_displays": "lighting installations",
+            "building_features": "architectural elements",
+            "nightlife_activities": "evening entertainment",
+            "lighting_modifier": "bright",
+            # 混合環境元素
+            "transitional_elements": "connecting features",
+            "indoor_features": "interior elements",
+            "outdoor_setting": "exterior spaces",
+            "interior_amenities": "inside comforts",
+            "exterior_features": "outside elements",
+            "inside_elements": "interior design",
+            "outside_spaces": "outdoor areas",
+            "dual_environment_benefits": "combined settings",
+            "passenger_activities": "waiting behaviors",
+            "transportation_types": "transit vehicles",
+            "sheltered_elements": "covered areas",
+            "exposed_areas": "open sections",
+            "waiting_behaviors": "passenger activities",
+            "indoor_facilities": "inside services",
+            "platform_features": "transit platform elements",
+            "transit_routines": "transportation procedures",
+            # 專門場所元素
+            "seating_arrangement": "spectator seating",
+            "playing_surface": "athletic field",
+            "sporting_activities": "sports events",
+            "spectator_facilities": "viewer accommodations",
+            "competition_space": "sports arena",
+            "sports_events": "athletic competitions",
+            "viewing_areas": "audience sections",
+            "field_elements": "field markings and equipment",
+            "game_activities": "competitive play",
+            "construction_equipment": "building machinery",
+            "building_materials": "construction supplies",
+            "construction_activities": "building work",
+            "work_elements": "construction tools",
+            "structural_components": "building structures",
+            "site_equipment": "construction gear",
+            "raw_materials": "building supplies",
+            "construction_process": "building phases",
+            "medical_elements": "healthcare equipment",
+            "clinical_activities": "medical procedures",
+            "facility_design": "healthcare layout",
+            "healthcare_features": "medical facilities",
+            "patient_interactions": "care activities",
+            "equipment_types": "medical devices",
+            "care_procedures": "health services",
+            "treatment_spaces": "clinical areas",
+            "educational_furniture": "learning furniture",
+            "learning_activities": "educational practices",
+            "instructional_design": "teaching layout",
+            "classroom_elements": "school equipment",
+            "teaching_methods": "educational approaches",
+            "student_engagement": "learning participation",
+            "learning_spaces": "educational areas",
+            "educational_tools": "teaching resources",
+            "knowledge_transfer": "learning exchanges"
+        }
+    def _generate_objects_summary(self, detected_objects: List[Dict]) -> str:
+        """
+        基於檢測物件生成自然語言摘要，按重要性排序
+        Args:
+            detected_objects: 檢測到的物件列表
+        Returns:
+            str: 物件摘要描述
+        """
+        try:
+            # detected_objects 裡有幾個 traffic light)
+            tl_count = len([obj for obj in detected_objects if obj.get("class_name","") == "traffic light"])
+            # print(f"[DEBUG] _generate_objects_summary 傳入的 detected_objects 中 traffic light: {tl_count} 個")
+            for obj in detected_objects:
+                if obj.get("class_name","") == "traffic light":
+                    print(f"    - conf={obj.get('confidence',0):.4f}, bbox={obj.get('bbox')}, region={obj.get('region')}")
+            if not detected_objects:
+                return "various elements"
+            # calculate object statistic
+            object_counts = {}
+            total_confidence = 0
+            for obj in detected_objects:
+                class_name = obj.get("class_name", "unknown")
+                confidence = obj.get("confidence", 0.5)
+                if class_name not in object_counts:
+                    object_counts[class_name] = {"count": 0, "total_confidence": 0}
+                object_counts[class_name]["count"] += 1
+                object_counts[class_name]["total_confidence"] += confidence
+                total_confidence += confidence
+            # 計算平均置信度並排序
+            sorted_objects = []
+            for class_name, stats in object_counts.items():
+                avg_confidence = stats["total_confidence"] / stats["count"]
+                count = stats["count"]
+                # 重要性評分：結合數量和置信度
+                importance_score = (count * 0.6) + (avg_confidence * 0.4)
+                sorted_objects.append((class_name, count, importance_score))
+            # 按重要性排序，取前5個最重要的物件
+            sorted_objects.sort(key=lambda x: x[2], reverse=True)
+            top_objects = sorted_objects[:5]
+            # 生成自然語言描述
+            descriptions = []
+            for class_name, count, _ in top_objects:
+                clean_name = class_name.replace('_', ' ')
+                if count == 1:
+                    article = "an" if clean_name[0].lower() in 'aeiou' else "a"
+                    descriptions.append(f"{article} {clean_name}")
+                else:
+                    descriptions.append(f"{count} {clean_name}s")
+            # 組合描述
+            if len(descriptions) == 1:
+                return descriptions[0]
+            elif len(descriptions) == 2:
+                return f"{descriptions[0]} and {descriptions[1]}"
+            else:
+                return ", ".join(descriptions[:-1]) + f", and {descriptions[-1]}"
+        except Exception as e:
+            self.logger.warning(f"Error generating objects summary: {str(e)}")
+            return "various elements"
+    def _get_placeholder_replacement(self, placeholder: str, fillers: Dict,
+                           all_replacements: Dict, detected_objects: List[Dict],
+                           scene_type: str) -> str:
+        """
+        獲取特定佔位符的替換內容，確保永遠不返回空值
+        """
+        try:
+            # 優先處理動態內容生成的佔位符
+            dynamic_placeholders = [
+                'primary_objects', 'detected_objects_summary', 'main_objects',
+                'functional_area', 'functional_zones_description', 'scene_elements'
+            ]
+            if placeholder in dynamic_placeholders:
+                dynamic_content = self._generate_objects_summary(detected_objects)
+                if dynamic_content and dynamic_content.strip():
+                    return dynamic_content.strip()
+            # 檢查預定義替換內容
+            if placeholder in all_replacements:
+                replacement = all_replacements[placeholder]
+                if replacement and replacement.strip():
+                    return replacement.strip()
+            # 檢查物體模板填充器
+            if placeholder in fillers:
+                options = fillers[placeholder]
+                if options and isinstance(options, list):
+                    valid_options = [opt.strip() for opt in options if opt and str(opt).strip()]
+                    if valid_options:
+                        num_items = min(len(valid_options), random.randint(1, 3))
+                        selected_items = random.sample(valid_options, num_items)
+                        if len(selected_items) == 1:
+                            return selected_items[0]
+                        elif len(selected_items) == 2:
+                            return f"{selected_items[0]} and {selected_items[1]}"
+                        else:
+                            return ", ".join(selected_items[:-1]) + f", and {selected_items[-1]}"
+            # 基於檢測對象生成動態內容
+            scene_specific_replacement = self._generate_scene_specific_content(
+                placeholder, detected_objects, scene_type
+            )
+            if scene_specific_replacement and scene_specific_replacement.strip():
+                return scene_specific_replacement.strip()
+            # 通用備用字典 - 擴展版本
+            fallback_replacements = {
+                # 交通和城市相關
+                "crossing_pattern": "pedestrian crosswalks",
+                "pedestrian_behavior": "people moving carefully",
+                "traffic_pattern": "vehicle movement",
+                "urban_elements": "city infrastructure",
+                "street_elements": "urban features",
+                "intersection_features": "traffic management systems",
+                "pedestrian_density": "groups of people",
+                "pedestrian_flow": "pedestrian movement",
+                "traffic_description": "vehicle traffic",
+                "people_and_vehicles": "pedestrians and cars",
+                # 場景設置相關
+                "scene_setting": "this urban environment",
+                "location_context": "the area",
+                "spatial_context": "the scene",
+                "environmental_context": "this location",
+                # 常見的家具和設備
+                "furniture": "various furniture pieces",
+                "seating": "seating arrangements",
+                "electronics": "electronic devices",
+                "appliances": "household appliances",
+                # 活動和行為
+                "activities": "various activities",
+                "interactions": "people interacting",
+                "movement": "movement patterns",
+                # 照明和氛圍
+                "lighting_conditions": "ambient lighting",
+                "atmosphere": "the overall atmosphere",
+                "ambiance": "environmental ambiance",
+                # 空間描述
+                "spatial_arrangement": "spatial organization",
+                "layout": "the layout",
+                "composition": "visual composition",
+                # 物體和元素
+                "objects": "various objects",
+                "elements": "scene elements",
+                "features": "notable features",
+                "details": "observable details"
+            }
+            if placeholder in fallback_replacements:
+                return fallback_replacements[placeholder]
+            # 基於場景類型的智能默認值
+            scene_based_defaults = self._get_scene_based_default(placeholder, scene_type)
+            if scene_based_defaults:
+                return scene_based_defaults
+            # 最終備用：將下劃線轉換為有意義的短語
+            cleaned_placeholder = placeholder.replace('_', ' ')
+            # 對常見模式提供更好的默認值
+            if placeholder.endswith('_pattern'):
+                return f"{cleaned_placeholder.replace(' pattern', '')} arrangement"
+            elif placeholder.endswith('_behavior'):
+                return f"{cleaned_placeholder.replace(' behavior', '')} activity"
+            elif placeholder.endswith('_description'):
+                return f"{cleaned_placeholder.replace(' description', '')} elements"
+            elif placeholder.endswith('_elements'):
+                return cleaned_placeholder
+            elif placeholder.endswith('_features'):
+                return cleaned_placeholder
+            else:
+                return cleaned_placeholder if cleaned_placeholder != placeholder else "various elements"
+        except Exception as e:
+            self.logger.warning(f"Error getting replacement for placeholder '{placeholder}': {str(e)}")
+            # 確保即使在異常情況下也返回有意義的內容
+            return placeholder.replace('_', ' ') if placeholder else "scene elements"
+    def _get_scene_based_default(self, placeholder: str, scene_type: str) -> Optional[str]:
+        """
+        基於場景類型提供智能默認值
+        Args:
+            placeholder: 佔位符名稱
+            scene_type: 場景類型
+        Returns:
+            Optional[str]: 場景特定的默認值或None
+        """
+        try:
+            # 針對不同場景類型的特定默認值
+            scene_defaults = {
+                "urban_intersection": {
+                    "crossing_pattern": "marked crosswalks",
+                    "pedestrian_behavior": "pedestrians crossing carefully",
+                    "traffic_pattern": "controlled traffic flow"
+                },
+                "city_street": {
+                    "traffic_description": "urban vehicle traffic",
+                    "street_elements": "city infrastructure",
+                    "people_and_vehicles": "pedestrians and vehicles"
+                },
+                "living_room": {
+                    "furniture": "comfortable living room furniture",
+                    "seating": "sofas and chairs",
+                    "electronics": "entertainment equipment"
+                },
+                "kitchen": {
+                    "appliances": "kitchen appliances",
+                    "cooking_equipment": "cooking tools and equipment"
+                },
+                "office_workspace": {
+                    "office_equipment": "work furniture and devices",
+                    "desk_setup": "desk and office chair"
+                }
+            }
+            if scene_type in scene_defaults and placeholder in scene_defaults[scene_type]:
+                return scene_defaults[scene_type][placeholder]
+            return None
+        except Exception as e:
+            self.logger.warning(f"Error getting scene-based default for '{placeholder}' in '{scene_type}': {str(e)}")
+            return None
+    def _generate_scene_specific_content(self, placeholder: str, detected_objects: List[Dict],
+                                       scene_type: str) -> Optional[str]:
+        """
+        基於場景特定邏輯生成佔位符內容
+        Args:
+            placeholder: 佔位符名稱
+            detected_objects: 檢測到的物體列表
+            scene_type: 場景類型
+        Returns:
+            Optional[str]: 生成的內容或None
+        """
+        try:
+            if placeholder == "furniture":
+                # 提取家具物品
+                furniture_ids = [56, 57, 58, 59, 60, 61]  # 家具類別ID
+                furniture_objects = [obj for obj in detected_objects if obj.get("class_id") in furniture_ids]
+                if furniture_objects:
+                    furniture_names = [obj.get("class_name", "furniture") for obj in furniture_objects[:3]]
+                    unique_names = list(set(furniture_names))
+                    return ", ".join(unique_names) if len(unique_names) > 1 else unique_names[0]
+                return "various furniture items"
+            elif placeholder == "electronics":
+                # 提取電子設備
+                electronics_ids = [62, 63, 64, 65, 66, 67, 68, 69, 70]  # 電子設備類別ID
+                electronics_objects = [obj for obj in detected_objects if obj.get("class_id") in electronics_ids]
+                if electronics_objects:
+                    electronics_names = [obj.get("class_name", "electronic device") for obj in electronics_objects[:3]]
+                    unique_names = list(set(electronics_names))
+                    return ", ".join(unique_names) if len(unique_names) > 1 else unique_names[0]
+                return "electronic devices"
+            elif placeholder == "people_count":
+                # 計算人數
+                people_count = len([obj for obj in detected_objects if obj.get("class_id") == 0])
+                if people_count == 0:
+                    return "no people"
+                elif people_count == 1:
+                    return "one person"
+                elif people_count < 5:
+                    return f"{people_count} people"
+                else:
+                    return "several people"
+            elif placeholder == "seating":
+                # 提取座位物品
+                seating_ids = [56, 57]  # chair, sofa
+                seating_objects = [obj for obj in detected_objects if obj.get("class_id") in seating_ids]
+                if seating_objects:
+                    seating_names = [obj.get("class_name", "seating") for obj in seating_objects[:2]]
+                    unique_names = list(set(seating_names))
+                    return ", ".join(unique_names) if len(unique_names) > 1 else unique_names[0]
+                return "seating arrangements"
+            # 如果沒有匹配的特定邏輯，返回None
+            return None
+        except Exception as e:
+            self.logger.warning(f"Error generating scene-specific content for '{placeholder}': {str(e)}")
+            return None
+    def get_confidence_template(self, confidence_level: str) -> str:
+        """
+        獲取指定信心度級別的模板
+        Args:
+            confidence_level: 信心度級別 ('high', 'medium', 'low')
+        Returns:
+            str: 信心度模板字符串
+        """
+        try:
+            confidence_templates = self.templates.get("confidence_templates", {})
+            if confidence_level in confidence_templates:
+                return confidence_templates[confidence_level]
+            # 備用模板
+            fallback_templates = {
+                "high": "{description} {details}",
+                "medium": "This appears to be {description} {details}",
+                "low": "This might be {description}, but the confidence is low. {details}"
+            }
+            return fallback_templates.get(confidence_level, "{description} {details}")
+        except Exception as e:
+            self.logger.warning(f"Error getting confidence template for '{confidence_level}': {str(e)}")
+            return "{description} {details}"
+    def get_lighting_template(self, lighting_type: str) -> str:
+        """
+        獲取指定照明類型的模板
+        Args:
+            lighting_type: 照明類型
+        Returns:
+            str: 照明描述模板
+        """
+        try:
+            lighting_templates = self.templates.get("lighting_templates", {})
+            if lighting_type in lighting_templates:
+                return lighting_templates[lighting_type]
+            # 備用模板
+            return f"The scene is captured with {lighting_type.replace('_', ' ')} lighting conditions."
+        except Exception as e:
+            self.logger.warning(f"Error getting lighting template for '{lighting_type}': {str(e)}")
+            return "The lighting conditions are not clearly identifiable."
+    def get_viewpoint_template(self, viewpoint: str) -> Dict[str, str]:
+        """
+        獲取指定視角的模板
+        Args:
+            viewpoint: 視角類型
+        Returns:
+            Dict[str, str]: 包含prefix、observation等鍵的視角模板字典
+        """
+        try:
+            viewpoint_templates = self.templates.get("viewpoint_templates", {})
+            if viewpoint in viewpoint_templates:
+                return viewpoint_templates[viewpoint]
+            # 備用模板
+            fallback_templates = {
+                "eye_level": {
+                    "prefix": "From eye level, ",
+                    "observation": "the scene is viewed straight ahead.",
+                    "short_desc": "at eye level"
+                },
+                "aerial": {
+                    "prefix": "From above, ",
+                    "observation": "the scene is viewed from a bird's-eye perspective.",
+                    "short_desc": "from above"
+                },
+                "low_angle": {
+                    "prefix": "From a low angle, ",
+                    "observation": "the scene is viewed from below looking upward.",
+                    "short_desc": "from below"
+                },
+                "elevated": {
+                    "prefix": "From an elevated position, ",
+                    "observation": "the scene is viewed from a higher vantage point.",
+                    "short_desc": "from an elevated position"
+                }
+            }
+            return fallback_templates.get(viewpoint, fallback_templates["eye_level"])
+        except Exception as e:
+            self.logger.warning(f"Error getting viewpoint template for '{viewpoint}': {str(e)}")
+            return {
+                "prefix": "",
+                "observation": "the scene is viewed normally.",
+                "short_desc": "normally"
+            }
+    def get_cultural_template(self, cultural_context: str) -> Dict[str, Any]:
+        """
+        獲取指定文化語境的模板
+        Args:
+            cultural_context: 文化語境
+        Returns:
+            Dict[str, Any]: 文化模板字典
+        """
+        try:
+            cultural_templates = self.templates.get("cultural_templates", {})
+            if cultural_context in cultural_templates:
+                return cultural_templates[cultural_context]
+            # 備用模板
+            return {
+                "elements": ["cultural elements"],
+                "description": f"The scene displays {cultural_context} cultural characteristics."
+            }
+        except Exception as e:
+            self.logger.warning(f"Error getting cultural template for '{cultural_context}': {str(e)}")
+            return {
+                "elements": ["various elements"],
+                "description": "The scene displays cultural characteristics."
+            }
+    def get_scene_detail_templates(self, scene_type: str, viewpoint: Optional[str] = None) -> List[str]:
+        """
+        獲取場景詳細描述模板
+        Args:
+            scene_type: 場景類型
+            viewpoint: 可選的視角類型
+        Returns:
+            List[str]: 場景描述模板列表
+        """
+        try:
+            scene_templates = self.templates.get("scene_detail_templates", {})
+            # 首先嘗試獲取特定視角的模板
+            if viewpoint:
+                viewpoint_key = f"{scene_type}_{viewpoint}"
+                if viewpoint_key in scene_templates:
+                    return scene_templates[viewpoint_key]
+            # 然後嘗試獲取場景類型的通用模板
+            if scene_type in scene_templates:
+                return scene_templates[scene_type]
+            # 最後使用默認模板
+            if "default" in scene_templates:
+                return scene_templates["default"]
+            # 備用模板
+            return ["A scene with various elements and objects."]
+        except Exception as e:
+            self.logger.warning(f"Error getting scene detail templates for '{scene_type}': {str(e)}")
+            return ["A scene with various elements and objects."]
+    def reload_templates(self):
+        """
+        重新載入所有模板
+        """
+        try:
+            self.template_manager.reload_templates()
+            self.logger.info("Templates reloaded successfully")
+        except Exception as e:
+            self.logger.error(f"Error reloading templates: {str(e)}")
+    def get_template_categories(self) -> List[str]:
+        """
+        獲取所有可用的模板類別名稱
+        Returns:
+            List[str]: 模板類別名稱列表
+        """
+        return list(self.templates.keys())
+    def template_exists(self, category: str, key: Optional[str] = None) -> bool:
+        """
+        檢查模板是否存在
+        Args:
+            category: 模板類別
+            key: 可選的模板鍵值
+        Returns:
+            bool: 模板是否存在
+        """
+        try:
+            if category not in self.templates:
+                return False
+            if key is None:
+                return True
+            category_templates = self.templates[category]
+            if isinstance(category_templates, dict):
+                return key in category_templates
+            return False
+        except Exception as e:
+            self.logger.warning(f"Error checking template existence for {category}.{key}: {str(e)}")
+            return False
+    def apply_template(self, template: Union[str, Dict[str, Any]], scene_data: Dict[str, Any]) -> str:
+        """
+        應用選定的模板來生成場景描述
+        Args:
+            template: 模板字符串或模板內容字典
+            scene_data: 場景分析的資料字典
+        Returns:
+            str: 最終生成的場景描述
+        """
+        try:
+            # 如果傳入的是字符串模板，直接使用填充邏輯
+            if isinstance(template, str):
+                self.logger.debug("Processing string template directly")
+                # 提取場景數據
+                detected_objects = scene_data.get("detected_objects", [])
+                scene_type = scene_data.get("scene_type", "general")
+                places365_info = scene_data.get("places365_info")
+                object_statistics = scene_data.get("object_statistics")
+                functional_zones = scene_data.get("functional_zones", {})
+                # 暫存功能區域資訊供填充邏輯使用
+                self._current_functional_zones = functional_zones
+                # 使用現有的填充邏輯
+                filled_description = self.fill_template(
+                    template,
+                    detected_objects,
+                    scene_type,
+                    places365_info,
+                    object_statistics
+                )
+                # 清理暫存資訊
+                if hasattr(self, '_current_functional_zones'):
+                    delattr(self, '_current_functional_zones')
+                return filled_description
+            # 如果傳入的是字典結構模板
+            elif isinstance(template, dict):
+                self.logger.debug("Processing structured template")
+                return self._process_structured_template(template, scene_data)
+            # 如果是模板名稱字符串且需要從registry獲取
+            elif hasattr(self, 'template_registry') and template in self.template_registry:
+                template_dict = self.template_registry[template]
+                return self._process_structured_template(template_dict, scene_data)
+            else:
+                self.logger.warning(f"Invalid template format or template not found: {type(template)}")
+                return self._generate_fallback_scene_description(scene_data)
+        except Exception as e:
+            self.logger.error(f"Error applying template: {str(e)}")
+            return self._generate_fallback_scene_description(scene_data)
+    def _process_structured_template(self, template: Dict[str, Any], scene_data: Dict[str, Any]) -> str:
+        """
+        處理結構化模板字典
+        Args:
+            template: 結構化模板字典
+            scene_data: 場景分析資料
+        Returns:
+            str: 生成的場景描述
+        """
+        try:
+            # 提取 scene_data 中各區塊資料
+            zone_data = scene_data.get("functional_zones", scene_data.get("zones", {}))
+            object_data = scene_data.get("detected_objects", [])
+            scene_context = scene_data.get("scene_context", "")
+            # 獲取模板結構
+            structure = template.get("structure", [])
+            if not structure:
+                self.logger.warning("Template has no structure defined")
+                return self._generate_fallback_scene_description(scene_data)
+            description_parts = []
+            # 按照模板結構生成描述
+            for section in structure:
+                section_type = section.get("type", "")
+                content = section.get("content", "")
+                if section_type == "opening":
+                    description_parts.append(content)
+                elif section_type == "zone_analysis":
+                    zone_descriptions = self._generate_zone_descriptions(zone_data, section)
+                    if zone_descriptions:
+                        description_parts.extend(zone_descriptions)
+                elif section_type == "object_summary":
+                    object_summary = self._generate_object_summary(object_data, section)
+                    if object_summary:
+                        description_parts.append(object_summary)
+                elif section_type == "conclusion":
+                    conclusion = self._generate_conclusion(template, zone_data, object_data)
+                    if conclusion:
+                        description_parts.append(conclusion)
+            # 合併並標準化輸出
+            final_description = self._standardize_final_description(" ".join(description_parts))
+            self.logger.info("Successfully applied structured template")
+            return final_description
+        except Exception as e:
+            self.logger.error(f"Error processing structured template: {str(e)}")
+            return self._generate_fallback_scene_description(scene_data)
+    def _generate_fallback_scene_description(self, scene_data: Dict[str, Any]) -> str:
+        """
+        生成備用場景描述
+        Args:
+            scene_data: 場景分析資料
+        Returns:
+            str: 備用場景描述
+        """
+        try:
+            detected_objects = scene_data.get("detected_objects", [])
+            zones = scene_data.get("functional_zones", scene_data.get("zones", {}))
+            scene_type = scene_data.get("scene_type", "general")
+            object_count = len(detected_objects)
+            zone_count = len(zones)
+            if zone_count > 0 and object_count > 0:
+                return f"Scene analysis completed with {zone_count} functional areas containing {object_count} identified objects."
+            elif object_count > 0:
+                return f"Scene analysis identified {object_count} objects in this {scene_type.replace('_', ' ')} environment."
+            else:
+                return f"Scene analysis completed for this {scene_type.replace('_', ' ')} environment."
+        except Exception as e:
+            self.logger.warning(f"Error generating fallback description: {str(e)}")
+            return "Scene analysis completed with detected objects and functional areas."
+    def _generate_zone_descriptions(self, zone_data: Dict[str, Any], section: Dict[str, Any]) -> List[str]:
+        """
+        生成功能區域描述
+        """
+        try:
+            descriptions = []
+            if not zone_data:
+                return descriptions
+            # 直接處理區域資料（zone_data 本身就是區域字典）
+            sorted_zones = sorted(zone_data.items(),
+                                key=lambda x: len(x[1].get("objects", [])),
+                                reverse=True)
+            for zone_name, zone_info in sorted_zones:
+                description = zone_info.get("description", "")
+                objects = zone_info.get("objects", [])
+                if objects:
+                    # 使用現有描述或生成基於物件的描述
+                    if description and not any(tech in description.lower() for tech in ['zone', 'area', 'region']):
+                        zone_desc = description
+                    else:
+                        # 生成更自然的區域描述
+                        clean_zone_name = zone_name.replace('_', ' ').replace(' area', '').replace(' zone', '')
+                        object_list = ', '.join(objects[:3])
+                        if 'crossing' in zone_name or 'pedestrian' in zone_name:
+                            zone_desc = f"In the central crossing area, there are {object_list}."
+                        elif 'vehicle' in zone_name or 'traffic' in zone_name:
+                            zone_desc = f"The vehicle movement area includes {object_list}."
+                        elif 'control' in zone_name:
+                            zone_desc = f"Traffic control elements include {object_list}."
+                        else:
+                            zone_desc = f"The {clean_zone_name} contains {object_list}."
+                        if len(objects) > 3:
+                            zone_desc += f" Along with {len(objects) - 3} additional elements."
+                    descriptions.append(zone_desc)
+            return descriptions
+        except Exception as e:
+            logger.error(f"Error generating zone descriptions: {str(e)}")
+            return []
+    def _generate_object_summary(self, object_data: List[Dict], section: Dict[str, Any]) -> str:
+        """
+        生成物件摘要描述
+        """
+        try:
+            if not object_data:
+                return ""
+            # 統計物件類型並計算重要性
+            object_stats = {}
+            for obj in object_data:
+                class_name = obj.get("class_name", "unknown")
+                confidence = obj.get("confidence", 0.5)
+                if class_name not in object_stats:
+                    object_stats[class_name] = {"count": 0, "total_confidence": 0}
+                object_stats[class_name]["count"] += 1
+                object_stats[class_name]["total_confidence"] += confidence
+            # 按重要性排序（結合數量和置信度）
+            sorted_objects = []
+            for class_name, stats in object_stats.items():
+                count = stats["count"]
+                avg_confidence = stats["total_confidence"] / count
+                importance = count * 0.6 + avg_confidence * 0.4
+                sorted_objects.append((class_name, count, importance))
+            sorted_objects.sort(key=lambda x: x[2], reverse=True)
+            # 生成自然語言描述
+            descriptions = []
+            for class_name, count, _ in sorted_objects[:5]:
+                clean_name = class_name.replace('_', ' ')
+                if count == 1:
+                    article = "an" if clean_name[0].lower() in 'aeiou' else "a"
+                    descriptions.append(f"{article} {clean_name}")
+                else:
+                    descriptions.append(f"{count} {clean_name}s")
+            if len(descriptions) == 1:
+                return f"The scene features {descriptions[0]}."
+            elif len(descriptions) == 2:
+                return f"The scene features {descriptions[0]} and {descriptions[1]}."
+            else:
+                main_items = ", ".join(descriptions[:-1])
+                return f"The scene features {main_items}, and {descriptions[-1]}."
+        except Exception as e:
+            self.logger.error(f"Error generating object summary: {str(e)}")
+            return ""
+    def _generate_conclusion(self, template: Dict[str, Any], zone_data: Dict[str, Any],
+                            object_data: List[Dict]) -> str:
+        """
+        生成結論描述
+        """
+        try:
+            scene_type = template.get("scene_type", "general")
+            zones_count = len(zone_data)
+            objects_count = len(object_data)
+            if scene_type == "indoor":
+                conclusion = f"This indoor environment demonstrates clear functional organization with {zones_count} distinct areas and {objects_count} identified objects."
+            elif scene_type == "outdoor":
+                conclusion = f"This outdoor scene shows dynamic activity patterns across {zones_count} functional zones with {objects_count} detected elements."
+            else:
+                conclusion = f"The scene analysis reveals {zones_count} functional areas containing {objects_count} identifiable objects."
+            return conclusion
+        except Exception as e:
+            logger.error(f"Error generating conclusion: {str(e)}")
+            return ""
+    def _standardize_final_description(self, description: str) -> str:
+        """
+        對最終描述進行標準化處理
+        Args:
+            description: 原始描述文本
+        Returns:
+            str: 標準化後的描述文本
+        """
+        try:
+            # 移除多餘空格
+            description = " ".join(description.split())
+            # 確保句子間有適當間距
+            description = description.replace(". ", ". ")
+            # 移除任何殘留的技術性標識符
+            technical_patterns = [
+                r'zone_\d+', r'area_\d+', r'region_\d+',
+                r'_zone', r'_area', r'_region'
+            ]
+            for pattern in technical_patterns:
+                description = re.sub(pattern, '', description, flags=re.IGNORECASE)
+            return description.strip()
+        except Exception as e:
+            logger.error(f"Error standardizing final description: {str(e)}")
+            return description

text_formatter.py ADDED Viewed

	@@ -0,0 +1,545 @@

+import logging
+import traceback
+import re
+from typing import Dict, List, Optional
+from landmark_data import ALL_LANDMARKS
+class TextFormattingError(Exception):
+    """文本格式化過程中的自定義異常"""
+    pass
+class TextFormatter:
+    """
+    文本格式化器 - 負責文本拼接、格式化和最終輸出優化
+    該類別處理所有與文本格式化相關的邏輯，包括智能文本拼接、
+    標點符號處理、大小寫規範化以及地標引用的過濾功能。
+    """
+    def __init__(self):
+        """
+        初始化文本格式化器
+        """
+        self.logger = logging.getLogger(self.__class__.__name__)
+        try:
+            # 載入地標數據用於引用過濾
+            self.landmark_data = self._load_landmark_data()
+            self.logger.info("TextFormatter initialized successfully")
+        except Exception as e:
+            error_msg = f"Failed to initialize TextFormatter: {str(e)}"
+            self.logger.error(f"{error_msg}\n{traceback.format_exc()}")
+            raise TextFormattingError(error_msg) from e
+    def _load_landmark_data(self) -> Dict:
+        """
+        載入地標數據
+        Returns:
+            Dict: 地標數據字典
+        """
+        try:
+            return ALL_LANDMARKS
+        except ImportError:
+            self.logger.warning("Failed to import landmark data, landmark filtering will be disabled")
+            return {}
+        except Exception as e:
+            self.logger.warning(f"Error loading landmark data: {str(e)}")
+            return {}
+    def smart_append(self, current_text: str, new_fragment: str) -> str:
+        """
+        將新文本片段附加到現有文本，處理標點符號和大小寫
+        Args:
+            current_text: 要加到的現有文本
+            new_fragment: 要加的新文本片段
+        Returns:
+            str: 合併後的文本，具有適當的格式化
+        """
+        try:
+            # 處理空值情況
+            if not new_fragment:
+                return current_text
+            if not current_text:
+                # 確保第一個字符大寫
+                return new_fragment[0].upper() + new_fragment[1:] if new_fragment else ""
+            # 清理現有文本
+            current_text = current_text.rstrip()
+            # 檢查結尾標點符號
+            ends_with_sentence = current_text.endswith(('.', '!', '?'))
+            ends_with_comma = current_text.endswith(',')
+            # 特別處理 "A xxx A yyy" 模式
+            if (current_text.startswith("A ") or current_text.startswith("An ")) and \
+               (new_fragment.startswith("A ") or new_fragment.startswith("An ")):
+                return current_text + ". " + new_fragment
+            # 檢查新片段是否包含地標名稱（通常為專有名詞）
+            has_landmark_name = any(word[0].isupper() for word in new_fragment.split()
+                                  if len(word) > 2 and not word.startswith(("A ", "An ", "The ")))
+            # 決定如何連接文本
+            if ends_with_sentence:
+                # 句子後，以大寫開始並添加適當間距
+                joined_text = current_text + " " + (new_fragment[0].upper() + new_fragment[1:])
+            elif ends_with_comma:
+                # 逗號後，要保持流暢性，除非是專有名詞或特殊情況
+                if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper() or has_landmark_name:
+                    joined_text = current_text + " " + new_fragment
+                else:
+                    joined_text = current_text + " " + new_fragment[0].lower() + new_fragment[1:]
+            elif "scene is" in new_fragment.lower() or "scene includes" in new_fragment.lower():
+                # 加關於場景的新句子時，使用句號
+                joined_text = current_text + ". " + new_fragment
+            else:
+                # 其他情況，根據內容決定
+                if self._is_related_phrases(current_text, new_fragment):
+                    if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper() or has_landmark_name:
+                        joined_text = current_text + ", " + new_fragment
+                    else:
+                        joined_text = current_text + ", " + new_fragment[0].lower() + new_fragment[1:]
+                else:
+                    # 對不相關的短語使用句號
+                    joined_text = current_text + ". " + (new_fragment[0].upper() + new_fragment[1:])
+            return joined_text
+        except Exception as e:
+            self.logger.warning(f"Error in smart_append: {str(e)}")
+            # 備用簡單拼接
+            return f"{current_text} {new_fragment}" if current_text else new_fragment
+    def _is_related_phrases(self, text1: str, text2: str) -> bool:
+        """
+        判斷兩個短語是否相關，應該用逗號
+        Args:
+            text1: 第一個文本片段
+            text2: 要加的第二個文本片段
+        Returns:
+            bool: 短語是否相關
+        """
+        try:
+            # 檢查兩個短語是否都以 "A" 或 "An" 開始 - 這些是獨立的描述
+            if (text1.startswith("A ") or text1.startswith("An ")) and \
+               (text2.startswith("A ") or text2.startswith("An ")):
+                return False  # 這些是獨立的描述，不是相關短語
+            # 檢查第二個短語是否以連接詞開始
+            connecting_words = ["which", "where", "who", "whom", "whose", "with", "without",
+                              "this", "these", "that", "those", "and", "or", "but"]
+            first_word = text2.split()[0].lower() if text2 else ""
+            if first_word in connecting_words:
+                return True
+            # 檢查第一個短語是否以暗示連續性的內容結尾
+            ending_patterns = ["such as", "including", "like", "especially", "particularly",
+                             "for example", "for instance", "namely", "specifically"]
+            for pattern in ending_patterns:
+                if text1.lower().endswith(pattern):
+                    return True
+            # 檢查兩個短語是否都關於場景
+            if "scene" in text1.lower() and "scene" in text2.lower():
+                return False  # 關於場景的獨立陳述應該是分開的句子
+            return False
+        except Exception as e:
+            self.logger.warning(f"Error checking phrase relationship: {str(e)}")
+            return False
+    def format_final_description(self, text: str) -> str:
+        """
+        格式化最終描述文本，確保正確的標點符號、大小寫和間距
+        Args:
+            text: 要格式化的文本
+        Returns:
+            str: 格式化後的文本
+        """
+        try:
+            if not text or not text.strip():
+                return ""
+            # 首先修剪前導/尾隨空白
+            text = text.strip()
+            # 1. 處理連續的 "A/An" 段落（可能將它們分成句子）
+            text = re.sub(r'(A\s+[^.!?]+?[\w\.])\s+(A\s+)', r'\1. \2', text, flags=re.IGNORECASE)
+            text = re.sub(r'(An\s+[^.!?]+?[\w\.])\s+(An?\s+)', r'\1. \2', text, flags=re.IGNORECASE)
+            # 2. 確保整個文本的第一個字符大寫
+            if text:
+                text = text[0].upper() + text[1:]
+            # 3. 規範化空白：多個空格變為一個
+            text = re.sub(r'\s{2,}', ' ', text)
+            # 4. 句子結尾標點符號後大寫
+            def capitalize_after_punctuation(match):
+                return match.group(1) + match.group(2).upper()
+            text = re.sub(r'([.!?]\s+)([a-z])', capitalize_after_punctuation, text)
+            # 5. 處理逗號後的大小寫
+            def fix_capitalization_after_comma(match):
+                leading_comma_space = match.group(1)  # (,\s+)
+                word_after_comma = match.group(2)     # ([A-Z][a-zA-Z]*)
+                proper_nouns_exceptions = ["I", "I'm", "I've", "I'd", "I'll",
+                                         "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
+                                         "January", "February", "March", "April", "May", "June", "July",
+                                         "August", "September", "October", "November", "December"]
+                if word_after_comma in proper_nouns_exceptions:
+                    return match.group(0)
+                # 如果詞看起來像專有名詞（已經大寫且不是常用詞），保持不變
+                if len(word_after_comma) > 2 and word_after_comma[0].isupper() and word_after_comma.lower() not in ["this", "that", "these", "those", "they", "their", "then", "thus"]:
+                    return match.group(0)  # 如果看起來已經是專有名詞則保持不變
+                return leading_comma_space + word_after_comma[0].lower() + word_after_comma[1:]
+            text = re.sub(r'(,\s+)([A-Z][a-zA-Z\'\-]+)', fix_capitalization_after_comma, text)
+            # 6. 修正標點符號周圍的間距
+            text = re.sub(r'\s*([.,;:!?])\s*', r'\1 ', text)  # 確保標點符號後有一個空格，前面沒有
+            text = text.replace(' .', '.').replace(' ,', ',')  # 清理標點符號前可能的空格
+            # 7. 合併多個句子結尾標點符號
+            text = re.sub(r'[.!?]{2,}', '.', text)  # 將多個轉換為單個句號
+            text = re.sub(r',+', ',', text)  # 多個逗號變為一個
+            # 8. 確保文本以單個句子結尾標點符號結尾
+            text = text.strip()  # 檢查最後一個字符前移除尾隨空白
+            if text and not text[-1] in '.!?':
+                text += '.'
+            # 9. 處理空的佔位符和前導標點符號
+            text = re.sub(r'\bIn\s*,\s*', 'In this scene, ', text)  # 修復 "In , " 問題
+            text = re.sub(r'\s*,\s*([A-Z])', r'. \1', text)  # 修復逗號後直接跟大寫字母的問題
+            text = re.sub(r'^[.,;:!?\s]+', '', text)  # 移除前導標點符號
+            # 10. 第一個字母大寫的最終檢查
+            if text:
+                text = text[0].upper() + text[1:]
+            # 11. 移除最終標點符號前的空格（如果規則7意外添加）
+            text = re.sub(r'\s+([.!?])$', r'\1', text)
+            return text.strip()  # 最終修剪
+        except Exception as e:
+            self.logger.warning(f"Error formatting final description: {str(e)}")
+            # 備用基本格式化
+            if text:
+                text = text.strip()
+                if text and not text.endswith(('.', '!', '?')):
+                    text += '.'
+                if text:
+                    text = text[0].upper() + text[1:]
+                return text
+            return ""
+    def filter_landmark_references(self, text: str, enable_landmark: bool = True) -> str:
+        """
+        Remove landmark names, their locations and landmark-style phrasing from text.
+        When landmark support is enabled (or the text is empty) the text is returned
+        untouched. Otherwise every name/alias found in ``self.landmark_data`` becomes
+        "tall structure", every known location becomes "the urban area", and a set of
+        generic landmark phrases is rewritten to neutral wording.
+        Args:
+            text: Text to filter.
+            enable_landmark: Whether landmark references are allowed to stay.
+        Returns:
+            str: The filtered text; the text as it stood when an error occurred otherwise.
+        """
+        try:
+            if enable_landmark or not text:
+                return text
+            # Collect every known landmark name/alias and location string.
+            landmark_names = set()
+            locations = set()
+            for info in self.landmark_data.values():
+                # .get() so that one entry without a name does not abort all filtering.
+                landmark_names.add(info.get("name"))
+                landmark_names.update(info.get("aliases", []))
+                location = info.get("location")
+                if location:
+                    locations.add(location)
+                    # Also register the first two comma-separated parts (e.g. city, country).
+                    locations.update(part.strip() for part in location.split(",")[:2])
+            def whole_phrase(phrase: str) -> str:
+                # Lookarounds rather than \b so phrases ending in punctuation still match
+                # and "Paris" is not replaced inside "Parisian".
+                return r'(?<!\w)' + re.escape(phrase) + r'(?!\w)'
+            def longest_first(phrases):
+                # Skip empty/very short entries; longest first so "Eiffel Tower" wins over "Eiffel".
+                usable = {p for p in phrases if p and len(p) > 2}
+                return sorted(usable, key=lambda p: (-len(p), p))
+            for name in longest_first(landmark_names):
+                text = re.sub(whole_phrase(name), "tall structure", text, flags=re.IGNORECASE)
+            # "in X" / "of X" need no special casing: replacing X alone yields the same
+            # wording and keeps the capitalization of a leading "In"/"Of".
+            for location in longest_first(locations):
+                text = re.sub(whole_phrase(location), "the urban area", text, flags=re.IGNORECASE)
+            # Generic landmark phrasing. Keywords are matched case-insensitively through
+            # scoped (?i:...) groups, while proper nouns must really start with a capital;
+            # a global IGNORECASE flag would let [A-Z] match ordinary lowercase words.
+            name_word = r"(?!(?:The|This|That|A|An)\b)[A-Z][a-zA-Z'-]*"
+            proper_noun = name_word + r"(?:,?\s+" + name_word + r")*"
+            landmark_patterns = [
+                (r'\b(?i:a (?:tourist|popular|famous) landmark)\b', r'an urban structure'),
+                (r'\b(?i:an iconic structure in )' + proper_noun, r'an urban structure in the area'),
+                (r'\b(?i:a famous (?:monument|tower|landmark) in )' + proper_noun, r'an urban structure in the area'),
+                (r'\b(?i:(?:centered|built|located|positioned) around the )(?:' + name_word + r'\s+)+(?i:tower|monument|landmark)\b', r'located in this area'),
+                (r'\b((?i:sightseeing|guided tours|cultural tourism)) (?i:at|around|near) (?:(?i:this landmark)\b|(?i:the )' + proper_noun + r')', r'\1 in this area'),
+                (r'\b(?i:this (?:famous|iconic|historic|well-known) (?:landmark|monument|tower|structure))\b', r'this urban structure'),
+                (r'\b(?:' + name_word + r'\s+)+(?i:tower)\b', r'tall structure'),
+                (r'\b(?i:a (tower|structure) in )' + proper_noun, r'a \1 in the area'),
+                (r'(?i:landmark scene)', r'urban scene'),
+                (r'(?i:tourist destination)', r'urban area'),
+                (r'(?i:tourist attraction)', r'urban area')
+            ]
+            for pattern, replacement in landmark_patterns:
+                text = re.sub(pattern, replacement, text)
+            return text
+        except Exception as e:
+            self.logger.warning(f"Error filtering landmark references: {str(e)}")
+            return text
+    def optimize_text_flow(self, text: str) -> str:
+        """
+        Improve readability by running the text through the flow-optimization steps.
+        The steps run in order: duplicate-phrase removal, connector clean-up and
+        sentence-length balancing. If a step raises, the text produced so far is
+        returned and a warning is logged.
+        Args:
+            text: Text to optimize.
+        Returns:
+            str: The optimized text (empty/None input is returned unchanged).
+        """
+        if not text:
+            return text
+        try:
+            # Steps are resolved by name one at a time so each runs on the previous result.
+            for step_name in ("_remove_duplicate_phrases",
+                              "_optimize_connectors",
+                              "_balance_sentence_length"):
+                text = getattr(self, step_name)(text)
+        except Exception as e:
+            self.logger.warning(f"Error optimizing text flow: {str(e)}")
+        return text
+    def _remove_duplicate_phrases(self, text: str) -> str:
+        """
+        Drop sentences that repeat an earlier sentence.
+        Sentences are compared through ``self._sentences_similar`` on a lower-cased,
+        whitespace-collapsed copy; the first occurrence is kept. Each kept sentence
+        retains its own terminal punctuation, and a sentence without any gets a
+        period.
+        Args:
+            text: Input text.
+        Returns:
+            str: Text without repeated sentences ("" when nothing remains); the
+            input itself if an error occurs.
+        """
+        try:
+            kept_sentences = []
+            seen_normalized = []
+            # Split only where terminal punctuation is followed by whitespace, so
+            # decimals such as "3.5" stay intact and "!" / "?" are not lost.
+            for chunk in re.split(r'(?<=[.!?])\s+', text.strip()):
+                chunk = chunk.strip()
+                body = chunk.rstrip('.!?')
+                terminator = chunk[len(body):] or '.'
+                body = body.strip()
+                if not body:
+                    continue
+                # Normalize for comparison (case and extra whitespace are ignored).
+                normalized = re.sub(r'\s+', ' ', body.lower())
+                if any(self._sentences_similar(normalized, earlier) for earlier in seen_normalized):
+                    continue
+                kept_sentences.append(body + terminator)
+                seen_normalized.append(normalized)
+            return ' '.join(kept_sentences)
+        except Exception as e:
+            self.logger.warning(f"Error removing duplicate phrases: {str(e)}")
+            return text
+    def _sentences_similar(self, sent1: str, sent2: str) -> bool:
+        """
+        Decide whether two sentences are near-duplicates.
+        Similarity is the Jaccard index of the two whitespace-separated word sets
+        (shared words divided by all distinct words); the sentences count as
+        similar only when it is strictly greater than 0.8.
+        Args:
+            sent1: First sentence.
+            sent2: Second sentence.
+        Returns:
+            bool: True when the sentences are similar; False otherwise, including
+            when either sentence has no words or an error occurs.
+        """
+        try:
+            vocab_a, vocab_b = set(sent1.split()), set(sent2.split())
+            if not (vocab_a and vocab_b):
+                return False
+            shared_words = vocab_a.intersection(vocab_b)
+            all_words = vocab_a.union(vocab_b)
+            jaccard = len(shared_words) / len(all_words) if all_words else 0
+            return jaccard > 0.8
+        except Exception as e:
+            self.logger.warning(f"Error checking sentence similarity: {str(e)}")
+            return False
+    def _optimize_connectors(self, text: str) -> str:
+        """
+        Clean up connector usage.
+        Any run of a repeated "and" or "with" (two or more in a row) collapses to a
+        single connector that keeps the spelling/capitalization of the first one,
+        and "a, and b, and c" style lists become "a, b, and c".
+        Args:
+            text: Input text.
+        Returns:
+            str: Text with connectors cleaned up; the input itself if an error occurs.
+        """
+        try:
+            # One pass handles runs of any length and keeps the first word's case
+            # (the back-reference is matched case-insensitively).
+            text = re.sub(r'\b(and|with)(?:\s+\1)+\b', r'\1', text, flags=re.IGNORECASE)
+            # Reduce over-use of "and" inside a three-item list.
+            text = re.sub(r'(\w+),\s+and\s+(\w+),\s+and\s+(\w+)', r'\1, \2, and \3', text)
+            return text
+        except Exception as e:
+            self.logger.warning(f"Error optimizing connectors: {str(e)}")
+            return text
+    def _balance_sentence_length(self, text: str) -> str:
+        """
+        Break up over-long sentences.
+        The text is split into sentences where terminal punctuation is followed by
+        whitespace. A sentence longer than 150 characters (punctuation excluded) is
+        cut at the middle ", and/but/or/while/when/where" candidate; the second half
+        is capitalized. Sentences are re-joined with single spaces, and a trailing
+        fragment without terminal punctuation is kept as it is.
+        Args:
+            text: Input text.
+        Returns:
+            str: Text with balanced sentence lengths; the input itself if an error occurs.
+        """
+        try:
+            rebuilt = []
+            # Splitting on whitespace *after* punctuation keeps decimals ("3.5") whole,
+            # avoids doubled spaces and does not lose text that lacks a final period.
+            for chunk in re.split(r'(?<=[.!?])\s+', text.strip()):
+                if not chunk:
+                    continue
+                sentence = chunk.rstrip('.!?')
+                punctuation = chunk[len(sentence):]
+                if len(sentence) > 150:
+                    # Candidate cut points: a comma followed by a whole conjunction word.
+                    split_points = [m.start() for m in re.finditer(r',\s+(?:and|but|or|while|when|where)\b', sentence)]
+                    if split_points:
+                        mid_point = split_points[len(split_points) // 2]
+                        first_part = sentence[:mid_point].strip()
+                        # +1 skips the comma itself.
+                        second_part = sentence[mid_point + 1:].strip()
+                        if second_part and not second_part[0].isupper():
+                            second_part = second_part[0].upper() + second_part[1:]
+                        chunk = first_part + ". " + second_part + punctuation
+                rebuilt.append(chunk)
+            return " ".join(rebuilt)
+        except Exception as e:
+            self.logger.warning(f"Error balancing sentence length: {str(e)}")
+            return text
+    def validate_text_quality(self, text: str) -> Dict[str, bool]:
+        """
+        Run basic quality checks on a piece of text.
+        Args:
+            text: Text to validate (may be empty or None).
+        Returns:
+            Dict[str, bool]: One flag per check:
+                has_content: the text contains non-whitespace characters.
+                proper_capitalization: the very first character is uppercase.
+                ends_with_punctuation: the stripped text ends with '.', '!' or '?'.
+                no_double_spaces: no two consecutive spaces occur.
+                no_leading_punctuation: the stripped text does not start with punctuation.
+                reasonable_length: the length is between 20 and 1000 characters.
+            ``{"error": True}`` is returned only if the checks themselves fail.
+        """
+        try:
+            # Strip once; an all-whitespace string must not raise on the [-1] lookup.
+            stripped = text.strip() if text else ""
+            quality_checks = {
+                "has_content": bool(stripped),
+                "proper_capitalization": bool(text) and text[0].isupper(),
+                "ends_with_punctuation": bool(stripped) and stripped[-1] in '.!?',
+                "no_double_spaces": "  " not in text if text else True,
+                "no_leading_punctuation": not re.match(r'^[.,;:!?]', stripped),
+                "reasonable_length": 20 <= len(text) <= 1000 if text else False
+            }
+            return quality_checks
+        except Exception as e:
+            self.logger.warning(f"Error validating text quality: {str(e)}")
+            return {"error": True}
+    def get_text_statistics(self, text: str) -> Dict[str, int]:
+        """
+        Count characters, words and sentences in a piece of text.
+        Words are whitespace-separated tokens; the sentence count is the number of
+        runs of '.', '!' or '?' found in the text.
+        Args:
+            text: Text to analyze.
+        Returns:
+            Dict[str, int]: Keys "characters", "words" and "sentences"; all zero for
+            empty input or when an error occurs.
+        """
+        try:
+            if text:
+                terminator_runs = sum(1 for _ in re.finditer(r'[.!?]+', text))
+                return {
+                    "characters": len(text),
+                    "words": len(text.split()),
+                    "sentences": terminator_runs
+                }
+        except Exception as e:
+            self.logger.warning(f"Error getting text statistics: {str(e)}")
+        return {"characters": 0, "words": 0, "sentences": 0}

text_quality_validator.py ADDED Viewed

	@@ -0,0 +1,452 @@

+import re
+import logging
+import traceback
+from typing import Dict, List, Any, Optional, Set, Tuple
+class TextQualityValidator:
+    """
+    Validates generated text for quality and factual accuracy.
+    Provides fact checking, perspective consistency and scene-type consistency
+    checks for descriptions produced from an original scene description.
+    """
+    def __init__(self):
+        """Set up the logger and load the validation rule tables."""
+        # Dedicated logger; a stream handler is attached only if none exists yet.
+        self.logger = logging.getLogger(self.__class__.__name__)
+        if not self.logger.handlers:
+            handler = logging.StreamHandler()
+            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+            handler.setFormatter(formatter)
+            self.logger.addHandler(handler)
+            self.logger.setLevel(logging.INFO)
+        # Load the validation rules.
+        self._initialize_validation_rules()
+        self.logger.info("TextQualityValidator initialized successfully")
+    def _initialize_validation_rules(self):
+        """
+        Populate the vocabularies and patterns used by the checks.
+        Raises:
+            Exception: If the rule tables cannot be built (original error chained).
+        """
+        try:
+            # Location and cultural terms that must be backed by the source material.
+            self.location_terms = ["plaza", "square", "market", "mall", "avenue", "boulevard"]
+            self.cultural_terms = ["european", "asian", "american", "african", "western", "eastern"]
+            # Perspective type -> phrases that signal it.
+            self.perspective_terms = {
+                "aerial": ["aerial", "bird's-eye", "overhead", "top-down", "above", "looking down"],
+                "ground": ["street-level", "ground level", "eye-level", "standing"],
+                "indoor": ["inside", "interior", "indoor", "within"],
+                "close-up": ["close-up", "detailed view", "close shot"]
+            }
+            # Perspective type -> sentence prefix used to restore a missing perspective.
+            self.perspective_prefixes = {
+                "aerial": "From an aerial perspective, ",
+                "ground": "From street level, ",
+                "indoor": "In this indoor setting, ",
+                "close-up": "In this close-up view, "
+            }
+            # "<number> <noun>" detection patterns: (regex, number group ref, noun group ref).
+            self.number_patterns = [
+                (r'(\d+)\s+(people|person|pedestrians|individuals)', r'\1', r'\2'),
+                (r'(\d+)\s+(cars|vehicles|automobiles)', r'\1', r'\2'),
+                (r'(\d+)\s+(buildings|structures)', r'\1', r'\2'),
+                (r'(\d+)\s+(plants|potted plants|flowers)', r'\1', r'\2'),
+                (r'(\d+)\s+(beds|furniture|tables|chairs)', r'\1', r'\2')
+            ]
+            # Scene words that may not be introduced by the generated text.
+            self.prohibited_scene_words = ["plaza", "square", "european", "asian", "american"]
+            self.logger.info("Validation rules initialized successfully")
+        except Exception as e:
+            error_msg = f"Failed to initialize validation rules: {str(e)}"
+            self.logger.error(error_msg)
+            self.logger.error(traceback.format_exc())
+            raise Exception(error_msg) from e
+    def verify_factual_accuracy(self,
+                               original_desc: str,
+                               generated_desc: str,
+                               object_list: str) -> str:
+        """
+        Verify the factual accuracy of a generated description.
+        Args:
+            original_desc: Original scene description.
+            generated_desc: Generated description to verify.
+            object_list: Detected objects, as text.
+        Returns:
+            str: The verified (possibly corrected) description; ``generated_desc``
+            unchanged if verification fails.
+        """
+        try:
+            self.logger.debug("Starting factual accuracy verification")
+            # The original description plus the object list form the authorized vocabulary.
+            authorized_content = original_desc.lower() + " " + object_list.lower()
+            # Replace location/cultural terms that the source does not back up.
+            verified_desc = self._check_unauthorized_terms(generated_desc, authorized_content)
+            # Vary words that are repeated too often.
+            verified_desc = self._detect_repetitive_patterns(verified_desc)
+            self.logger.debug("Factual accuracy verification completed")
+            return verified_desc
+        except Exception as e:
+            error_msg = f"Factual accuracy verification failed: {str(e)}"
+            self.logger.error(error_msg)
+            self.logger.error(traceback.format_exc())
+            return generated_desc  # fall back to the generated description on error
+    def _check_unauthorized_terms(self, generated_desc: str, authorized_content: str) -> str:
+        """
+        Replace location/cultural terms that do not occur in the authorized content.
+        Location terms become "area" and cultural terms become "scene". Terms are
+        compared as whole words (an optional plural "s" is accepted in the
+        authorized content), so "small" does not authorize "mall".
+        """
+        for term in self.location_terms + self.cultural_terms:
+            term_pattern = re.compile(r'\b' + re.escape(term) + r'\b', re.IGNORECASE)
+            if not term_pattern.search(generated_desc):
+                continue
+            if re.search(r'\b' + re.escape(term) + r's?\b', authorized_content, re.IGNORECASE):
+                continue
+            replacement = "area" if term in self.location_terms else "scene"
+            generated_desc = term_pattern.sub(replacement, generated_desc)
+        return generated_desc
+    def _detect_repetitive_patterns(self, generated_desc: str) -> str:
+        """
+        Vary over-used words and flag repetitive sentence openings.
+        For each tracked word used more than once, the first occurrence is kept and
+        the following ones are replaced, in order, with the listed alternatives
+        (occurrences beyond the list stay untouched). Upper-case and title-case
+        spellings are carried over to the replacement.
+        """
+        # Tracked word -> alternatives for its 2nd, 3rd, ... occurrence.
+        replacement_dict = {
+            'visible': ['present', 'evident', 'apparent', 'observable'],
+            'positioned': ['arranged', 'placed', 'set', 'organized'],
+            'located': ['found', 'placed', 'situated', 'established'],
+            'situated': ['placed', 'positioned', 'arranged', 'set'],
+            'appears': ['seems', 'looks', 'presents', 'exhibits'],
+            'features': ['includes', 'contains', 'displays', 'showcases']
+        }
+        for word, alternatives in replacement_dict.items():
+            word_pattern = re.compile(r'\b' + word + r'\b', re.IGNORECASE)
+            word_matches = list(word_pattern.finditer(generated_desc))
+            if len(word_matches) < 2:
+                continue
+            self.logger.warning(f'Text quality issue detected: Multiple uses of "{word}" detected')
+            # Rebuild the text in one pass: every offset refers to the same string,
+            # so replacements of a different length cannot shift later matches.
+            pieces = []
+            cursor = 0
+            for index, match in enumerate(word_matches):
+                found = match.group()
+                replacement = found
+                if 1 <= index <= len(alternatives):
+                    replacement = alternatives[index - 1]
+                    # Keep the capitalization style of the replaced word.
+                    if found.isupper():
+                        replacement = replacement.upper()
+                    elif found.istitle():
+                        replacement = replacement.capitalize()
+                pieces.append(generated_desc[cursor:match.start()])
+                pieces.append(replacement)
+                cursor = match.end()
+            pieces.append(generated_desc[cursor:])
+            generated_desc = "".join(pieces)
+        # Repeated "This <word> ..." openings are only reported, not rewritten.
+        if re.search(r'\bThis\s+(\w+)\s+.*?\bThis\s+\1\b', generated_desc, re.IGNORECASE | re.DOTALL):
+            self.logger.warning("Text quality issue detected: Repetitive sentence structure detected")
+        return generated_desc
+    def fact_check_description(self,
+                             original_desc: str,
+                             enhanced_desc: str,
+                             scene_type: str,
+                             detected_objects: List[str]) -> str:
+        """
+        Run the full fact check on an enhanced description.
+        Args:
+            original_desc: Original scene description.
+            enhanced_desc: Enhanced description to check.
+            scene_type: Scene type.
+            detected_objects: Names of detected objects (accepted but not consulted
+                by the current checks).
+        Returns:
+            str: The fact-checked description. ``original_desc`` is returned when the
+            enhanced description is empty or shorter than 30 characters; the
+            description as it stood is returned if a check fails.
+        """
+        try:
+            self.logger.debug("Starting comprehensive fact checking")
+            # An empty or very short enhancement is not worth checking.
+            if not enhanced_desc or len(enhanced_desc) < 30:
+                return original_desc
+            # 1. Numeric consistency
+            enhanced_desc = self._check_numerical_consistency(original_desc, enhanced_desc)
+            # 2. Perspective consistency
+            enhanced_desc = self._check_perspective_consistency(original_desc, enhanced_desc)
+            # 3. Scene-type consistency
+            enhanced_desc = self._check_scene_type_consistency(enhanced_desc, scene_type)
+            # 4. Length limit
+            enhanced_desc = self._ensure_appropriate_length(enhanced_desc)
+            self.logger.debug("Comprehensive fact checking completed")
+            return enhanced_desc
+        except Exception as e:
+            error_msg = f"Fact checking failed: {str(e)}"
+            self.logger.error(error_msg)
+            self.logger.error(traceback.format_exc())
+            return enhanced_desc  # fall back to the enhanced description on error
+    def _check_numerical_consistency(self, original_desc: str, enhanced_desc: str) -> str:
+        """
+        Make "<number> <noun>" facts of the original description hold in the enhanced one.
+        A count that is missing is inserted (after a leading "This "/"The ", otherwise
+        as a new first sentence); when the first matching count differs, every
+        matching count for that noun is set to the original number.
+        """
+        for pattern, _num_group, _word_group in self.number_patterns:
+            for match in re.finditer(pattern, original_desc, re.IGNORECASE):
+                number = match.group(1)
+                noun = match.group(2)
+                # Accept the noun as written plus its naive singular/plural variants.
+                variants = '|'.join(re.escape(v) for v in (noun, noun.rstrip('s'), noun + 's'))
+                enhanced_pattern = re.compile(r'(\d+)(\s+(?:' + variants + r'))\b', re.IGNORECASE)
+                enhanced_matches = list(enhanced_pattern.finditer(enhanced_desc))
+                if not enhanced_matches:
+                    # The noun is reused exactly as the original wrote it next to this
+                    # number, so no re-pluralization (e.g. "peoples") is attempted.
+                    count_phrase = f"{number} {noun}"
+                    if enhanced_desc.startswith("This "):
+                        enhanced_desc = enhanced_desc.replace("This ", f"This scene with {count_phrase} ", 1)
+                    elif enhanced_desc.startswith("The "):
+                        enhanced_desc = enhanced_desc.replace("The ", f"The scene with {count_phrase} ", 1)
+                    else:
+                        enhanced_desc = f"The scene includes {count_phrase}. " + enhanced_desc
+                elif enhanced_matches[0].group(1) != number:
+                    # Present but with a different number: correct the number and keep
+                    # the original spacing and noun spelling.
+                    enhanced_desc = enhanced_pattern.sub(lambda m: number + m.group(2), enhanced_desc)
+        return enhanced_desc
+    def _check_perspective_consistency(self, original_desc: str, enhanced_desc: str) -> str:
+        """
+        Restore the original perspective when the enhanced description dropped it.
+        The first perspective type whose terms occur in the original description is
+        used; if none of its terms occur in the enhanced description, the matching
+        prefix is prepended and a leading capital is lower-cased.
+        """
+        # Determine the perspective of the original description.
+        original_perspective = None
+        for persp, terms in self.perspective_terms.items():
+            if any(term in original_desc.lower() for term in terms):
+                original_perspective = persp
+                break
+        # Nothing to restore, or nothing to prefix.
+        if not original_perspective or not enhanced_desc:
+            return enhanced_desc
+        enhanced_has_perspective = any(term in enhanced_desc.lower() for term in self.perspective_terms[original_perspective])
+        if not enhanced_has_perspective:
+            prefix = self.perspective_prefixes.get(original_perspective, "")
+            if prefix:
+                if enhanced_desc[0].isupper():
+                    enhanced_desc = prefix + enhanced_desc[0].lower() + enhanced_desc[1:]
+                else:
+                    enhanced_desc = prefix + enhanced_desc
+        return enhanced_desc
+    def _check_scene_type_consistency(self, enhanced_desc: str, scene_type: str) -> str:
+        """
+        Make sure the scene type is mentioned in the enhanced description.
+        Underscores in the scene type are shown as spaces. Nothing changes when the
+        type is empty, "unknown" or already present.
+        """
+        if not scene_type:
+            return enhanced_desc
+        readable_type = scene_type.replace('_', ' ')
+        if readable_type.lower() == "unknown" or readable_type.lower() in enhanced_desc.lower():
+            return enhanced_desc
+        if enhanced_desc.startswith(("This ", "The ")):
+            if "scene" in enhanced_desc[:15].lower():
+                # Avoid "This <type> scene ...": swap the generic word for the type.
+                enhanced_desc = enhanced_desc.replace("scene", readable_type.lower(), 1)
+            else:
+                # Only the opener that actually starts the text is extended.
+                opener = "This " if enhanced_desc.startswith("This ") else "The "
+                enhanced_desc = enhanced_desc.replace(opener, f"{opener}{readable_type} ", 1)
+        else:
+            enhanced_desc = f"This {readable_type} " + enhanced_desc
+        return enhanced_desc
+    def _ensure_appropriate_length(self, enhanced_desc: str) -> str:
+        """
+        Limit the description to 200 words.
+        Longer text is cut at the last sentence end within the first 200 words, or
+        closed with a period when no sentence end is found there.
+        """
+        words = enhanced_desc.split()
+        if len(words) > 200:
+            # Look for a sentence end close to the word limit.
+            truncated = ' '.join(words[:200])
+            last_period = max(truncated.rfind('.'), truncated.rfind('!'), truncated.rfind('?'))
+            if last_period > 0:
+                enhanced_desc = truncated[:last_period+1]
+            else:
+                enhanced_desc = truncated + '.'
+        return enhanced_desc
+    def ensure_scene_type_consistency(self,
+                                    description: str,
+                                    scene_type: str,
+                                    original_desc: str) -> str:
+        """
+        Make the scene type mentioned in a description agree with the given scene type.
+        Args:
+            description: Description to check.
+            scene_type: Required scene type (underscores are shown as spaces).
+            original_desc: Original description, used as reference.
+        Returns:
+            str: The description with a consistent scene type; the description as it
+            stood if the check fails.
+        """
+        try:
+            self.logger.debug("Ensuring scene type consistency")
+            scene_type = scene_type.replace('_', ' ')
+            # Replace prohibited scene words that neither the original nor the type backs up.
+            for word in self.prohibited_scene_words:
+                if word in description.lower() and word not in original_desc.lower() and word not in scene_type.lower():
+                    pattern = re.compile(r'\b' + word + r'\b', re.IGNORECASE)
+                    # A callable keeps the scene type literal (no backslash escapes).
+                    description = pattern.sub(lambda _m: scene_type, description)
+            # Make sure the scene type is mentioned.
+            if scene_type.lower() not in description.lower():
+                # Prefer swapping the first generic scene word that occurs as a whole word.
+                for general_term in ["scene", "area", "place", "location"]:
+                    pattern = re.compile(r'\b' + general_term + r'\b', re.IGNORECASE)
+                    if pattern.search(description):
+                        description = pattern.sub(lambda _m: scene_type, description, count=1)
+                        break
+                else:
+                    # No generic word found: add the scene type at the beginning.
+                    if description.startswith("The "):
+                        description = description.replace("The ", f"The {scene_type} ", 1)
+                    elif description.startswith("This "):
+                        description = description.replace("This ", f"This {scene_type} ", 1)
+                    else:
+                        description = f"This {scene_type} " + description
+            self.logger.debug("Scene type consistency ensured")
+            return description
+        except Exception as e:
+            error_msg = f"Scene type consistency check failed: {str(e)}"
+            self.logger.error(error_msg)
+            self.logger.error(traceback.format_exc())
+            return description
+    def extract_perspective_from_description(self, description: str) -> str:
+        """
+        Extract the perspective phrase from an original description.
+        Args:
+            description: Original scene description.
+        Returns:
+            str: The first known perspective term found, or "" when there is none
+            or extraction fails.
+        """
+        try:
+            for persp_type, terms in self.perspective_terms.items():
+                for term in terms:
+                    if term.lower() in description.lower():
+                        self.logger.debug(f"Perspective detected: {term}")
+                        return term
+            return ""
+        except Exception as e:
+            self.logger.error(f"Perspective extraction failed: {str(e)}")
+            return ""
+    def extract_objects_from_description(self, description: str) -> List[str]:
+        """
+        Extract counted object mentions from an original description.
+        Args:
+            description: Original scene description.
+        Returns:
+            List[str]: "<number> <noun>" strings for every number pattern match;
+            an empty list if extraction fails.
+        """
+        try:
+            extracted_objects = []
+            for pattern in self.number_patterns:
+                matches = re.finditer(pattern[0], description, re.IGNORECASE)
+                for match in matches:
+                    number = match.group(1)
+                    object_type = match.group(2)
+                    extracted_objects.append(f"{number} {object_type}")
+            self.logger.debug(f"Extracted {len(extracted_objects)} objects from description")
+            return extracted_objects
+        except Exception as e:
+            self.logger.error(f"Object extraction failed: {str(e)}")
+            return []
+    def validate_response_completeness(self, response: str) -> Tuple[bool, str]:
+        """
+        Check whether a response looks complete.
+        Args:
+            response: Response to validate.
+        Returns:
+            Tuple[bool, str]: (is complete, explanation).
+        """
+        try:
+            # Length check
+            if len(response) < 100:
+                return False, "Response too short"
+            # Sentence ending check (only applied to responses under 200 characters)
+            if len(response) < 200 and "." not in response[-30:]:
+                return False, "No proper sentence ending"
+            # Dangling phrase check
+            incomplete_phrases = ["in the", "with the", "and the"]
+            if any(response.endswith(phrase) for phrase in incomplete_phrases):
+                return False, "Ends with incomplete phrase"
+            return True, "Response is complete"
+        except Exception as e:
+            self.logger.error(f"Response completeness validation failed: {str(e)}")
+            return False, "Validation error"
+    def get_validator_info(self) -> Dict[str, Any]:
+        """
+        Report the validator configuration.
+        Returns:
+            Dict[str, Any]: Sizes of the rule tables plus an initialization status.
+        """
+        return {
+            "location_terms_count": len(self.location_terms),
+            "cultural_terms_count": len(self.cultural_terms),
+            "perspective_types_count": len(self.perspective_terms),
+            "number_patterns_count": len(self.number_patterns),
+            "prohibited_words_count": len(self.prohibited_scene_words),
+            "initialization_status": "success"
+        }

viewpoint_detector.py ADDED Viewed

	@@ -0,0 +1,437 @@

+import logging
+import traceback
+from typing import Dict, List, Tuple, Optional
+import numpy as np
+class ViewpointDetectionError(Exception):
+    """Signals a failure that occurred while detecting the image viewpoint."""
+class ViewpointDetector:
+    """
+    視角檢測器 - 分析物體分布模式以識別圖像視角類型
+    此class負責通過分析檢測到的物體在圖像中的空間分布、大小變化和位置模式，
+    來確定圖像的拍攝視角。特別針對行人密集的十字路口場景進行了優化。
+    """
+    def __init__(self,
+                 aerial_threshold: float = 0.7,
+                 aerial_size_variance_threshold: float = 0.15,
+                 low_angle_threshold: float = 0.3,
+                 vertical_size_ratio_threshold: float = 1.8,
+                 elevated_threshold: float = 0.6,
+                 elevated_top_threshold: float = 0.3,
+                 crosswalk_position_tolerance: float = 0.1,
+                 crosswalk_axis_tolerance: float = 0.15,
+                 min_people_for_crosswalk: int = 8,
+                 min_people_for_aerial: int = 10):
+        """
+        Create a viewpoint detector and store its tuning parameters.
+        Args:
+            aerial_threshold: Object density threshold for aerial view detection.
+            aerial_size_variance_threshold: Size variance threshold for aerial views.
+            low_angle_threshold: Bottom distribution threshold for low-angle views.
+            vertical_size_ratio_threshold: Vertical size ratio threshold.
+            elevated_threshold: Object distribution threshold for elevated views.
+            elevated_top_threshold: Top-of-image object threshold for elevated views.
+            crosswalk_position_tolerance: Position tolerance for crosswalk detection.
+            crosswalk_axis_tolerance: Axis tolerance for crosswalk detection.
+            min_people_for_crosswalk: Minimum number of people to test for a crosswalk.
+            min_people_for_aerial: Minimum number of people to test for an aerial view.
+        """
+        self.logger = logging.getLogger(type(self).__name__)
+        # All detection settings live in one dict, keyed by parameter name.
+        self.viewpoint_params = dict(
+            aerial_threshold=aerial_threshold,
+            aerial_size_variance_threshold=aerial_size_variance_threshold,
+            low_angle_threshold=low_angle_threshold,
+            vertical_size_ratio_threshold=vertical_size_ratio_threshold,
+            elevated_threshold=elevated_threshold,
+            elevated_top_threshold=elevated_top_threshold,
+            crosswalk_position_tolerance=crosswalk_position_tolerance,
+            crosswalk_axis_tolerance=crosswalk_axis_tolerance,
+            min_people_for_crosswalk=min_people_for_crosswalk,
+            min_people_for_aerial=min_people_for_aerial,
+        )
+        self.logger.info("ViewpointDetector initialized with parameters: %s", self.viewpoint_params)
+    def detect_viewpoint(self, detected_objects: List[Dict]) -> str:
+        """
+        Determine the viewpoint type of an image from its detected objects.
+        The crosswalk check runs first, then the pedestrian-distribution check; a
+        positive result from either yields "aerial". Otherwise the result of the
+        standard viewpoint detection is returned.
+        Args:
+            detected_objects: Detected objects; each should carry position/size data.
+        Returns:
+            str: Viewpoint type ('aerial', 'low_angle', 'elevated', 'eye_level').
+            "eye_level" is the default for empty input or when detection fails.
+        """
+        try:
+            if not detected_objects:
+                self.logger.warning("No detected objects provided for viewpoint detection")
+                return "eye_level"
+            self.logger.info(f"Starting viewpoint detection with {len(detected_objects)} objects")
+            # Aerial indicators, in priority order: (method name, log message).
+            aerial_checks = (
+                ("_detect_crosswalk_pattern", "Crosswalk pattern detected - returning aerial viewpoint"),
+                ("_detect_aerial_from_pedestrian_distribution", "Aerial viewpoint detected from pedestrian distribution"),
+            )
+            for check_name, message in aerial_checks:
+                if getattr(self, check_name)(detected_objects):
+                    self.logger.info(message)
+                    return "aerial"
+            # No aerial indicator: fall through to the standard detection.
+            return self._detect_standard_viewpoint(detected_objects)
+        except Exception as e:
+            error_msg = f"Error during viewpoint detection: {str(e)}"
+            self.logger.error(f"{error_msg}\n{traceback.format_exc()}")
+            return "eye_level"  # default value
+    def _detect_crosswalk_pattern(self, detected_objects: List[Dict]) -> bool:
+        """
+        Detect an intersection / zebra-crossing pattern among the detected people.
+        Only objects with ``class_id == 0`` are considered. At least
+        ``min_people_for_crosswalk`` of them are required, and at least four must
+        carry a "normalized_center"; those centers are then tested for a cross
+        pattern and, failing that, for linear crosswalk clusters.
+        Args:
+            detected_objects: Detected objects.
+        Returns:
+            bool: True when a crosswalk pattern is found; False otherwise or on error.
+        """
+        try:
+            pedestrians = [obj for obj in detected_objects if obj.get("class_id") == 0]
+            if len(pedestrians) < self.viewpoint_params["min_people_for_crosswalk"]:
+                return False
+            # Positions of the pedestrians that have one.
+            centers = [person["normalized_center"] for person in pedestrians
+                       if "normalized_center" in person]
+            if len(centers) < 4:
+                return False
+            # Pattern tests in priority order: (method name, debug message).
+            pattern_tests = (
+                ("_detect_cross_pattern", "Cross pattern detected in pedestrian positions"),
+                ("_detect_linear_crosswalk_clusters", "Linear crosswalk clusters detected"),
+            )
+            for test_name, note in pattern_tests:
+                if getattr(self, test_name)(centers):
+                    self.logger.debug(note)
+                    return True
+            return False
+        except Exception as e:
+            self.logger.warning(f"Error in crosswalk pattern detection: {str(e)}")
+            return False
+    def _detect_cross_pattern(self, positions: List[Tuple[float, float]]) -> bool:
+        """
+        檢測十字形分布模式
+        Args:
+            positions: 物體位置列表 [(x, y), ...]
+        Returns:
+            bool: 是否檢測到十字形模式
+        """
+        try:
+            x_coords = [pos[0] for pos in positions]
+            y_coords = [pos[1] for pos in positions]
+            x_range = max(x_coords) - min(x_coords)
+            y_range = max(y_coords) - min(y_coords)
+            # 檢查 x 和 y 方向都有較大範圍且範圍相似
+            if x_range <= 0.5 or y_range <= 0.5:
+                return False
+            if not (0.7 < (x_range / y_range) < 1.3):
+                return False
+            # 計算到中心點的距離並檢查軸線分布
+            center_x = np.mean(x_coords)
+            center_y = np.mean(y_coords)
+            close_to_axis_count = 0
+            axis_tolerance = self.viewpoint_params["crosswalk_axis_tolerance"]
+            for x, y in positions:
+                x_distance_to_center = abs(x - center_x)
+                y_distance_to_center = abs(y - center_y)
+                # 檢查是否接近水平或垂直軸線
+                if x_distance_to_center < axis_tolerance or y_distance_to_center < axis_tolerance:
+                    close_to_axis_count += 1
+            # 如果足夠多的點接近軸線，認為是十字路口
+            axis_ratio = close_to_axis_count / len(positions)
+            return axis_ratio >= 0.6
+        except Exception as e:
+            self.logger.warning(f"Error detecting cross pattern: {str(e)}")
+            return False
+    def _detect_linear_crosswalk_clusters(self, positions: List[Tuple[float, float]]) -> bool:
+        """
+        檢測線性聚類分布（交叉的斑馬線）
+        Args:
+            positions: 物體位置列表
+        Returns:
+            bool: 是否檢測到線性交叉模式
+        """
+        try:
+            x_coords = [pos[0] for pos in positions]
+            y_coords = [pos[1] for pos in positions]
+            # 檢測 x 和 y 方向的聚類
+            x_clusters = self._detect_linear_clusters(x_coords)
+            y_clusters = self._detect_linear_clusters(y_coords)
+            # 如果在 x 和 y 方向上都有多個聚類，可能是交叉的斑馬線
+            return len(x_clusters) >= 2 and len(y_clusters) >= 2
+        except Exception as e:
+            self.logger.warning(f"Error detecting linear crosswalk clusters: {str(e)}")
+            return False
+    def _detect_linear_clusters(self, coords: List[float], threshold: float = 0.05) -> List[List[float]]:
+        """
+        檢測坐標中的線性聚類
+        Args:
+            coords: 一維坐標列表
+            threshold: 聚類閾值
+        Returns:
+            List[List[float]]: 聚類列表
+        """
+        if not coords:
+            return []
+        try:
+            sorted_coords = sorted(coords)
+            clusters = []
+            current_cluster = [sorted_coords[0]]
+            for i in range(1, len(sorted_coords)):
+                if sorted_coords[i] - sorted_coords[i-1] < threshold:
+                    current_cluster.append(sorted_coords[i])
+                else:
+                    if len(current_cluster) >= 2:
+                        clusters.append(current_cluster)
+                    current_cluster = [sorted_coords[i]]
+            # 添加最後一個聚類
+            if len(current_cluster) >= 2:
+                clusters.append(current_cluster)
+            return clusters
+        except Exception as e:
+            self.logger.warning(f"Error in linear cluster detection: {str(e)}")
+            return []
+    def _detect_aerial_from_pedestrian_distribution(self, detected_objects: List[Dict]) -> bool:
+        """
+        基於行人分布檢測空中視角
+        Args:
+            detected_objects: 檢測到的物體列表
+        Returns:
+            bool: 是否為空中視角
+        """
+        try:
+            people_objs = [obj for obj in detected_objects if obj.get("class_id") == 0]
+            if len(people_objs) < self.viewpoint_params["min_people_for_aerial"]:
+                return False
+            # 統計不同區域的行人數量
+            people_region_counts = {}
+            for obj in people_objs:
+                region = obj.get("region", "unknown")
+                people_region_counts[region] = people_region_counts.get(region, 0) + 1
+            # 檢查行人是否分布在多個區域
+            regions_with_multiple_people = sum(1 for count in people_region_counts.values() if count >= 2)
+            if regions_with_multiple_people < 4:
+                return False
+            # 檢查行人分布的均勻性
+            region_counts = list(people_region_counts.values())
+            if not region_counts:
+                return False
+            region_counts_variance = np.var(region_counts)
+            region_counts_mean = np.mean(region_counts)
+            if region_counts_mean > 0:
+                variation_coefficient = region_counts_variance / region_counts_mean
+                return variation_coefficient < 0.5
+            return False
+        except Exception as e:
+            self.logger.warning(f"Error in aerial detection from pedestrian distribution: {str(e)}")
+            return False
+    def _detect_standard_viewpoint(self, detected_objects: List[Dict]) -> str:
+        """
+        標準視角檢測流程
+        Args:
+            detected_objects: 檢測到的物體列表
+        Returns:
+            str: 檢測到的視角類型
+        """
+        try:
+            # 計算基本統計指標
+            metrics = self._calculate_viewpoint_metrics(detected_objects)
+            # 基於計算的指標判斷視角類型
+            if self._is_aerial_viewpoint(metrics):
+                return "aerial"
+            elif self._is_low_angle_viewpoint(metrics):
+                return "low_angle"
+            elif self._is_elevated_viewpoint(metrics):
+                return "elevated"
+            else:
+                return "eye_level"
+        except Exception as e:
+            self.logger.warning(f"Error in standard viewpoint detection: {str(e)}")
+            return "eye_level"
+    def _calculate_viewpoint_metrics(self, detected_objects: List[Dict]) -> Dict:
+        """
+        計算視角檢測所需的各項指標
+        Args:
+            detected_objects: 檢測到的物體列表
+        Returns:
+            Dict: 包含各項指標的字典
+        """
+        total_objects = len(detected_objects)
+        top_region_count = 0
+        bottom_region_count = 0
+        sizes = []
+        height_width_ratios = []
+        try:
+            for obj in detected_objects:
+                # 統計頂部和底部區域的物體數量
+                region = obj.get("region", "")
+                if "top" in region:
+                    top_region_count += 1
+                elif "bottom" in region:
+                    bottom_region_count += 1
+                # 收集大小信息
+                if "normalized_area" in obj:
+                    sizes.append(obj["normalized_area"])
+                # 計算高寬比
+                if "normalized_size" in obj:
+                    width, height = obj["normalized_size"]
+                    if width > 0:
+                        height_width_ratios.append(height / width)
+            # 計算比例
+            top_ratio = top_region_count / total_objects if total_objects > 0 else 0
+            bottom_ratio = bottom_region_count / total_objects if total_objects > 0 else 0
+            # 計算大小變異係數
+            size_variance_coefficient = 0
+            if sizes and len(sizes) > 1:
+                mean_size = np.mean(sizes)
+                if mean_size > 0:
+                    size_variance = np.var(sizes)
+                    size_variance_coefficient = size_variance / (mean_size ** 2)
+            # 計算平均高寬比
+            avg_height_width_ratio = np.mean(height_width_ratios) if height_width_ratios else 1.0
+            metrics = {
+                "top_ratio": top_ratio,
+                "bottom_ratio": bottom_ratio,
+                "size_variance_coefficient": size_variance_coefficient,
+                "avg_height_width_ratio": avg_height_width_ratio,
+                "total_objects": total_objects
+            }
+            self.logger.debug(f"Calculated viewpoint metrics: {metrics}")
+            return metrics
+        except Exception as e:
+            self.logger.error(f"Error calculating viewpoint metrics: {str(e)}")
+            return {
+                "top_ratio": 0,
+                "bottom_ratio": 0,
+                "size_variance_coefficient": 0,
+                "avg_height_width_ratio": 1.0,
+                "total_objects": total_objects
+            }
+    def _is_aerial_viewpoint(self, metrics: Dict) -> bool:
+        """判斷是否為空中視角"""
+        return (metrics["size_variance_coefficient"] < self.viewpoint_params["aerial_size_variance_threshold"] and
+                metrics["bottom_ratio"] < 0.3 and
+                metrics["top_ratio"] > self.viewpoint_params["aerial_threshold"])
+    def _is_low_angle_viewpoint(self, metrics: Dict) -> bool:
+        """判斷是否為低角度視角"""
+        return (metrics["avg_height_width_ratio"] > self.viewpoint_params["vertical_size_ratio_threshold"] and
+                metrics["top_ratio"] > self.viewpoint_params["low_angle_threshold"])
+    def _is_elevated_viewpoint(self, metrics: Dict) -> bool:
+        """判斷是否為高位視角"""
+        return (metrics["bottom_ratio"] > self.viewpoint_params["elevated_threshold"] and
+                metrics["top_ratio"] < self.viewpoint_params["elevated_top_threshold"])
+    def get_viewpoint_confidence(self, detected_objects: List[Dict]) -> Tuple[str, float]:
+        """
+        獲取視角檢測結果及其信心度
+        Args:
+            detected_objects: 檢測到的物體列表
+        Returns:
+            Tuple[str, float]: (視角類型, 信心度)
+        """
+        try:
+            viewpoint = self.detect_viewpoint(detected_objects)
+            # 基於檢測條件計算信心度
+            if viewpoint == "aerial" and self._detect_crosswalk_pattern(detected_objects):
+                confidence = 0.95  # 十字路口模式有很高信心度
+            elif viewpoint == "aerial":
+                confidence = 0.8
+            elif viewpoint == "eye_level":
+                confidence = 0.7  # 默認視角信心度較低
+            else:
+                confidence = 0.85
+            self.logger.info(f"Viewpoint detection result: {viewpoint} (confidence: {confidence:.2f})")
+            return viewpoint, confidence
+        except Exception as e:
+            self.logger.warning("Using fallback viewpoint due to detection error")
+            return "eye_level", 0.3

visualization_helper.py CHANGED Viewed

@@ -16,7 +16,6 @@ class VisualizationHelper:
                             filter_classes: Optional[List[int]] = None) -> Optional[Image.Image]:
         """
         Visualize detection results on a single image
         Args:
             image: Image path or numpy array
             result: Detection result object

                             filter_classes: Optional[List[int]] = None) -> Optional[Image.Image]:
         """
         Visualize detection results on a single image
         Args:
             image: Image path or numpy array
             result: Detection result object

zone_evaluator.py ADDED Viewed

	@@ -0,0 +1,272 @@

+import logging
+import traceback
+import numpy as np
+from typing import Dict, List, Any, Optional
+logger = logging.getLogger(__name__)
+class ZoneEvaluator:
+    """
+    負責功能區域辨識的可行性評估和物件關聯性計算
+    評估是否應該進行區域劃分以及計算物件間的功能關聯性
+    """
+    def __init__(self):
+        """初始化區域評估器"""
+        try:
+            # 定義物件間的功能關聯性評分表
+            # 分數越高表示兩個物件在功能上越相關，更可能出現在同一功能區域
+            self.relationship_pairs = {
+                # 家具組合關係 - 這些組合通常出現在特定功能區域
+                frozenset([56, 60]): 1.0,  # 椅子+桌子 (dining/work area)
+                frozenset([57, 62]): 0.9,  # 沙發+電視 (living area)
+                frozenset([59, 58]): 0.7,  # 床+植物 (bedroom decor)
+                # 工作相關組合 - 工作環境的典型配置
+                frozenset([63, 66]): 0.9,  # 筆電+鍵盤 (workspace)
+                frozenset([63, 64]): 0.8,  # 筆電+滑鼠 (workspace)
+                frozenset([60, 63]): 0.8,  # 桌子+筆電 (workspace)
+                # 廚房相關組合 - 廚房設備的常見的物品
+                frozenset([68, 72]): 0.9,  # 微波爐+冰箱 (kitchen)
+                frozenset([69, 71]): 0.8,  # 烤箱+水槽 (kitchen)
+                # 用餐相關組合 - 餐廳或用餐區域的典型物品
+                frozenset([60, 40]): 0.8,  # 桌子+酒杯 (dining)
+                frozenset([60, 41]): 0.8,  # 桌子+杯子 (dining)
+                frozenset([56, 40]): 0.7,  # 椅子+酒杯 (dining)
+                # 交通相關組合 - 城市交通的環境
+                frozenset([2, 9]): 0.8,   # 汽車+交通燈 (traffic)
+                frozenset([0, 9]): 0.7,   # 行人+交通燈 (crosswalk)
+            }
+            logger.info("ZoneEvaluator initialized with predefined relationship pairs")
+        except Exception as e:
+            logger.error(f"Failed to initialize ZoneEvaluator: {str(e)}")
+            logger.error(traceback.format_exc())
+            raise
+    def evaluate_zone_identification_feasibility(self, detected_objects: List[Dict], scene_type: str) -> bool:
+        """
+        基於物件關聯性和分布特徵的彈性可行性評估
+        決定是否應該進行功能區域劃分
+        Args:
+            detected_objects: 檢測到的物件列表
+            scene_type: 場景類型
+        Returns:
+            是否適合進行區域識別
+        """
+        try:
+            if len(detected_objects) < 2:
+                logger.info("Insufficient objects for zone identification (minimum 2 required)")
+                return False
+            # 計算不同置信度層級的物件分布
+            # 高信心度物件更可靠，用於核心區域判斷
+            high_conf_objects = [obj for obj in detected_objects if obj.get("confidence", 0) >= 0.6]
+            # 中等置信度物件提供補充資訊
+            medium_conf_objects = [obj for obj in detected_objects if obj.get("confidence", 0) >= 0.4]
+            # 基礎條件：至少需要一定數量的可信物件才值得進行區域分析
+            if len(medium_conf_objects) < 2:
+                logger.info("Insufficient medium confidence objects for zone identification")
+                return False
+            # 評估物件間的功能關聯性，關聯性高的物件更適合劃分功能區域
+            functional_relationships = self.calculate_functional_relationships(detected_objects)
+            # 評估空間分布多樣性 - 物件分散在多個區域才有劃分的意義
+            spatial_diversity = self.calculate_spatial_diversity(detected_objects)
+            # 綜合評分機制，用各項指標加權計算最終可行性評分
+            feasibility_score = 0
+            # 物件數量的貢獻（權重30%）- 更多物件提供更多劃分依據
+            object_count_score = min(len(detected_objects) / 5.0, 1.0) * 0.3
+            # 信心度質量貢獻（權重25%）- 高置信度物件比例影響可靠性
+            confidence_score = len(high_conf_objects) / max(len(detected_objects), 1) * 0.25
+            # 功能關聯性貢獻（權重25%）- 有功能關聯的物件更適合劃分區域
+            relationship_score = functional_relationships * 0.25
+            # 空間多樣性貢獻（權重20%）- 分散的物件才需要區域劃分
+            diversity_score = spatial_diversity * 0.20
+            feasibility_score = object_count_score + confidence_score + relationship_score + diversity_score
+            # 動態閾值：根據場景複雜度調整可行性標準
+            complexity_threshold = self.get_complexity_threshold(scene_type)
+            is_feasible = feasibility_score >= complexity_threshold
+            logger.info(f"Zone identification feasibility: {is_feasible} (score: {feasibility_score:.3f}, threshold: {complexity_threshold:.3f})")
+            logger.debug(f"Score breakdown - objects: {object_count_score:.3f}, confidence: {confidence_score:.3f}, relationships: {relationship_score:.3f}, diversity: {diversity_score:.3f}")
+            return is_feasible
+        except Exception as e:
+            logger.error(f"Error evaluating zone identification feasibility: {str(e)}")
+            logger.error(traceback.format_exc())
+            return False
+    def calculate_functional_relationships(self, detected_objects: List[Dict]) -> float:
+        """
+        計算物件間的功能關聯性評分
+        基於常見的物件組合模式評估功能相關性
+        Args:
+            detected_objects: 檢測到的物件列表
+        Returns:
+            功能關聯性評分 (0.0-1.0)
+        """
+        try:
+            detected_class_ids = set(obj.get("class_id") for obj in detected_objects)
+            max_possible_score = 0
+            actual_score = 0
+            # 遍歷所有預定義的關聯性組合，計算實際場景中的關聯性評分
+            for pair, score in self.relationship_pairs.items():
+                max_possible_score += score
+                # 如果檢測到的物件中包含這個關聯組合，累加其評分
+                if pair.issubset(detected_class_ids):
+                    actual_score += score
+                    logger.debug(f"Found functional relationship: {pair} with score {score}")
+            # 標準化評分：實際評分除以最大可能評分
+            relationship_score = actual_score / max_possible_score if max_possible_score > 0 else 0
+            logger.info(f"Functional relationships calculated: {relationship_score:.3f} (found {actual_score:.1f}/{max_possible_score:.1f} possible relationships)")
+            return relationship_score
+        except Exception as e:
+            logger.error(f"Error calculating functional relationships: {str(e)}")
+            logger.error(traceback.format_exc())
+            return 0
+    def calculate_spatial_diversity(self, detected_objects: List[Dict]) -> float:
+        """
+        計算物件空間分布的多樣性
+        評估物件是否分散在不同區域，避免所有物件集中在單一區域
+        Args:
+            detected_objects: 檢測到的物件列表
+        Returns:
+            空間多樣性評分 (0.0-1.0)
+        """
+        try:
+            # 收集所有物件所在的不同區域
+            regions = set(obj.get("region", "center") for obj in detected_objects)
+            unique_regions = len(regions)
+            # 標準化多樣性評分：假設理想情況是物件分散在2個以上區域
+            # 更多區域意味著更高的空間多樣性，更適合進行區域劃分
+            diversity_score = min(unique_regions / 2.0, 1.0)
+            logger.info(f"Spatial diversity calculated: {diversity_score:.3f} (objects distributed across {unique_regions} regions)")
+            return diversity_score
+        except Exception as e:
+            logger.error(f"Error calculating spatial diversity: {str(e)}")
+            logger.error(traceback.format_exc())
+            return 0
+    def get_complexity_threshold(self, scene_type: str) -> float:
+        """
+        根據場景類型返回適當的複雜度閾值
+        平衡不同場景的區域劃分需求
+        Args:
+            scene_type: 場景類型
+        Returns:
+            複雜度閾值 (0.0-1.0)
+        """
+        try:
+            # 較簡單場景需要較高分數才進行區域劃分
+            # 這些場景通常功能較為單純，不太需要細分
+            simple_scenes = ["bedroom", "bathroom", "closet"]
+            # 較複雜場景可以較低分數進行區域劃分
+            # 這些場景通常有多種功能，適合劃分不同區域
+            complex_scenes = ["living_room", "kitchen", "office_workspace", "dining_area"]
+            if scene_type in simple_scenes:
+                threshold = 0.65  # 較高閾值，避免過度細分
+                logger.debug(f"Using high threshold {threshold} for simple scene: {scene_type}")
+            elif scene_type in complex_scenes:
+                threshold = 0.45  # 較低閾值，允許合理劃分
+                logger.debug(f"Using low threshold {threshold} for complex scene: {scene_type}")
+            else:
+                threshold = 0.55  # 中等閾值，平衡策略
+                logger.debug(f"Using medium threshold {threshold} for scene: {scene_type}")
+            return threshold
+        except Exception as e:
+            logger.error(f"Error getting complexity threshold for scene '{scene_type}': {str(e)}")
+            logger.error(traceback.format_exc())
+            return 0.55  # 預設中等閾值
+    def analyze_object_clustering(self, detected_objects: List[Dict]) -> Dict:
+        """
+        分析物件的聚集模式
+        識別物件是否形成明顯的聚集群組，這有助於功���區域的劃分
+        Args:
+            detected_objects: 檢測到的物件列表
+        Returns:
+            包含聚集分析結果的字典
+        """
+        try:
+            clustering_result = {
+                "has_clusters": False,
+                "cluster_count": 0,
+                "cluster_regions": [],
+                "clustering_score": 0.0
+            }
+            if len(detected_objects) < 3:
+                logger.info("Insufficient objects for clustering analysis")
+                return clustering_result
+            # 統計每個區域的物件數量
+            region_counts = {}
+            for obj in detected_objects:
+                region = obj.get("region", "unknown")
+                region_counts[region] = region_counts.get(region, 0) + 1
+            # 找出有顯著物件聚集的區域（物件數量 >= 2）
+            significant_regions = [region for region, count in region_counts.items() if count >= 2]
+            # 計算聚集：聚集區域數量與總區域數量的比例
+            total_regions_with_objects = len([count for count in region_counts.values() if count > 0])
+            clustering_score = len(significant_regions) / max(total_regions_with_objects, 1)
+            clustering_result.update({
+                "has_clusters": len(significant_regions) >= 2,
+                "cluster_count": len(significant_regions),
+                "cluster_regions": significant_regions,
+                "clustering_score": clustering_score
+            })
+            logger.info(f"Object clustering analysis: {len(significant_regions)} clusters found in regions {significant_regions}")
+            return clustering_result
+        except Exception as e:
+            logger.error(f"Error analyzing object clustering: {str(e)}")
+            logger.error(traceback.format_exc())
+            return {
+                "has_clusters": False,
+                "cluster_count": 0,
+                "cluster_regions": [],
+                "clustering_score": 0.0
+            }