Spaces:

DawnC
/

VisionScout

Running on Zero

File size: 59,486 Bytes

import logging
import traceback
from typing import Dict, List, Tuple, Optional, Any
import numpy as np

class ObjectDescriptionError(Exception):
    """物件描述生成過程中的自定義異常"""
    pass


class ObjectDescriptionGenerator:
    """
    物件描述生成器 - 負責將檢測到的物件轉換為自然語言描述

    該類別處理物件相關的所有描述生成邏輯，包括重要物件的識別、
    空間位置描述、物件列表格式化以及描述文本的優化。
    """

    def __init__(self,
                 min_prominence_score: float = 0.1,
                 max_categories_to_return: int = 5,
                 max_total_objects: int = 7,
                 confidence_threshold_for_description: float = 0.25,
                 region_analyzer: Optional[Any] = None):
        """
        初始化物件描述生成器

        Args:
            min_prominence_score: 物件顯著性的最低分數閾值
            max_categories_to_return: 返回的物件類別最大數量
            max_total_objects: 返回的物件總數上限
            confidence_threshold_for_description: 用於描述的置信度閾值
        """
        self.logger = logging.getLogger(self.__class__.__name__)

        self.min_prominence_score = min_prominence_score
        self.max_categories_to_return = max_categories_to_return
        self.max_total_objects = max_total_objects
        self.confidence_threshold_for_description = confidence_threshold_for_description
        self.region_analyzer = region_analyzer

        self.logger.info("ObjectDescriptionGenerator initialized with prominence_score=%.2f, "
                        "max_categories=%d, max_objects=%d, confidence_threshold=%.2f",
                        min_prominence_score, max_categories_to_return,
                        max_total_objects, confidence_threshold_for_description)

    def get_prominent_objects(self, detected_objects: List[Dict],
                          min_prominence_score: float = 0.5,
                          max_categories_to_return: Optional[int] = None) -> List[Dict]:
        """
        獲取最重要的物件，基於置信度、大小和位置計算重要性評分

        Args:
            detected_objects: 檢測到的物件列表
            min_prominence_score: 最小重要性分數閾值，範圍 0.0-1.0
            max_categories_to_return: 可選的最大返回類別數量限制

        Returns:
            List[Dict]: 按重要性排序的物件列表
        """
        try:
            if not detected_objects:
                return []

            prominent_objects = []

            for obj in detected_objects:
                # 計算重要性評分
                prominence_score = self._calculate_prominence_score(obj)

                # 只保留超過閾值的物件
                if prominence_score >= min_prominence_score:
                    obj_copy = obj.copy()
                    obj_copy['prominence_score'] = prominence_score
                    prominent_objects.append(obj_copy)

            # 按重要性評分排序（從高到低）
            prominent_objects.sort(key=lambda x: x.get('prominence_score', 0), reverse=True)

            # 如果指定了最大類別數量限制，進行過濾
            if max_categories_to_return is not None and max_categories_to_return > 0:
                categories_seen = set()
                filtered_objects = []

                for obj in prominent_objects:
                    class_name = obj.get("class_name", "unknown")

                    # 如果是新類別且未達到限制
                    if class_name not in categories_seen:
                        if len(categories_seen) < max_categories_to_return:
                            categories_seen.add(class_name)
                            filtered_objects.append(obj)
                    else:
                        # 已見過的類別，直接添加
                        filtered_objects.append(obj)

                return filtered_objects

            return prominent_objects

        except Exception as e:
            self.logger.error(f"Error calculating prominent objects: {str(e)}")
            return []

    def set_region_analyzer(self, region_analyzer: Any) -> None:
        """
        設置RegionAnalyzer，用於標準化空間描述生成

        Args:
            region_analyzer: RegionAnalyzer實例
        """
        try:
            self.region_analyzer = region_analyzer
            self.logger.info("RegionAnalyzer instance set for ObjectDescriptionGenerator")
        except Exception as e:
            self.logger.warning(f"Error setting RegionAnalyzer: {str(e)}")

    def _get_standardized_spatial_description(self, obj: Dict) -> str:
        """
        使用RegionAnalyzer生成標準化空間描述的內部方法

        Args:
            obj: 物件字典

        Returns:
            str: 標準化空間描述，失敗時返回空字串
        """
        try:
            if hasattr(self, 'region_analyzer') and self.region_analyzer:
                region = obj.get("region", "")
                object_type = obj.get("class_name", "")

                if hasattr(self.region_analyzer, 'get_contextual_spatial_description'):
                    return self.region_analyzer.get_contextual_spatial_description(region, object_type)
                elif hasattr(self.region_analyzer, 'get_spatial_description_phrase'):
                    return self.region_analyzer.get_spatial_description_phrase(region)

            return ""

        except Exception as e:
            self.logger.warning(f"Error getting standardized spatial description: {str(e)}")
            if object_type:
                return f"visible in the scene"
            return "present in the view"

    def _calculate_prominence_score(self, obj: Dict) -> float:
        """
        計算物件的重要性評分

        Args:
            obj: 物件字典，包含檢測信息

        Returns:
            float: 重要性評分 (0.0-1.0)
        """
        try:
            # 基礎置信度評分 (權重: 40%)
            confidence = obj.get("confidence", 0.5)
            confidence_score = confidence * 0.4

            # 大小評分 (權重: 30%)
            normalized_area = obj.get("normalized_area", 0.1)
            # 使用對數縮放避免過大物件主導評分
            size_score = min(np.log(normalized_area * 10 + 1) / np.log(11), 1.0) * 0.3

            # 位置評分 (權重: 20%)
            # 中心區域的物件通常更重要
            center_x, center_y = obj.get("normalized_center", [0.5, 0.5])
            distance_from_center = np.sqrt((center_x - 0.5)**2 + (center_y - 0.5)**2)
            position_score = (1 - min(distance_from_center * 2, 1.0)) * 0.2

            # 類別重要性評分 (權重: 10%)
            class_importance = self._get_class_importance(obj.get("class_name", "unknown"))
            class_score = class_importance * 0.1

            total_score = confidence_score + size_score + position_score + class_score

            # 確保評分在有效範圍內
            return max(0.0, min(1.0, total_score))

        except Exception as e:
            self.logger.warning(f"Error calculating prominence score for object: {str(e)}")
            return 0.5  # 返回中等評分作為備用

    def _get_class_importance(self, class_name: str) -> float:
        """
        根據物件類別返回重要性係數

        Args:
            class_name: 物件類別名稱

        Returns:
            float: 類別重要性係數 (0.0-1.0)
        """
        # 高重要性物件（人、車輛、建築）
        high_importance = ["person", "car", "truck", "bus", "motorcycle", "bicycle", "building"]

        # 中等重要性物件（家具、電器）
        medium_importance = ["chair", "couch", "tv", "laptop", "refrigerator", "dining table", "bed"]

        # 低重要性物件（小物品、配件）
        low_importance = ["handbag", "backpack", "umbrella", "cell phone", "remote", "mouse"]

        class_name_lower = class_name.lower()

        if any(item in class_name_lower for item in high_importance):
            return 1.0
        elif any(item in class_name_lower for item in medium_importance):
            return 0.7
        elif any(item in class_name_lower for item in low_importance):
            return 0.4
        else:
            return 0.6  # 預設中等重要性

    def format_object_list_for_description(self,
                                          objects: List[Dict],
                                          use_indefinite_article_for_one: bool = False,
                                          count_threshold_for_generalization: int = -1,
                                          max_types_to_list: int = 5) -> str:
        """
        將物件列表格式化為人類可讀的字符串，包含計數信息

        Args:
            objects: 物件字典列表，每個應包含 'class_name'
            use_indefinite_article_for_one: 單個物件是否使用 "a/an"，否則使用 "one"
            count_threshold_for_generalization: 超過此計數時使用通用術語，-1表示精確計數
            max_types_to_list: 列表中包含的不同物件類型最大數量

        Returns:
            str: 格式化的物件描述字符串
        """
        try:
            if not objects:
                return "no specific objects clearly identified"

            counts: Dict[str, int] = {}
            for obj in objects:
                name = obj.get("class_name", "unknown object")
                if name == "unknown object" or not name:
                    continue
                counts[name] = counts.get(name, 0) + 1

            if not counts:
                return "no specific objects clearly identified"

            descriptions = []
            # 按計數降序然後按名稱升序排序，限制物件類型數量
            sorted_counts = sorted(counts.items(), key=lambda item: (-item[1], item[0]))[:max_types_to_list]

            for name, count in sorted_counts:
                if count == 1:
                    if use_indefinite_article_for_one:
                        if name[0].lower() in 'aeiou':
                            descriptions.append(f"an {name}")
                        else:
                            descriptions.append(f"a {name}")
                    else:
                        descriptions.append(f"one {name}")
                else:
                    # 處理複數形式
                    plural_name = name
                    if name.endswith("y") and not name.lower().endswith(("ay", "ey", "iy", "oy", "uy")):
                        plural_name = name[:-1] + "ies"
                    elif name.endswith(("s", "sh", "ch", "x", "z")):
                        plural_name = name + "es"
                    elif not name.endswith("s"):
                        plural_name = name + "s"

                    if count_threshold_for_generalization != -1 and count > count_threshold_for_generalization:
                        if count <= count_threshold_for_generalization + 3:
                            descriptions.append(f"several {plural_name}")
                        else:
                            descriptions.append(f"many {plural_name}")
                    else:
                        descriptions.append(f"{count} {plural_name}")

            if not descriptions:
                return "no specific objects clearly identified"

            if len(descriptions) == 1:
                return descriptions[0]
            elif len(descriptions) == 2:
                return f"{descriptions[0]} and {descriptions[1]}"
            else:
                # 使用牛津逗號格式
                return ", ".join(descriptions[:-1]) + f", and {descriptions[-1]}"

        except Exception as e:
            self.logger.warning(f"Error formatting object list: {str(e)}")
            return "various objects"

    def get_spatial_description(self, obj: Dict, image_width: Optional[int] = None,
                           image_height: Optional[int] = None,
                           region_analyzer: Optional[Any] = None) -> str:
        """
        為物件生成空間位置描述

        Args:
            obj: 物件字典
            image_width: 可選的圖像寬度
            image_height: 可選的圖像高度
            region_analyzer: 可選的RegionAnalyzer實例，用於生成標準化描述

        Returns:
            str: 空間描述字符串，空值region時返回空字串
        """
        try:
            region = obj.get("region") or ""

            # 處理空值或無效region，直接返回空字串避免不完整描述
            if not region.strip() or region == "unknown":
                # 根據物件類型提供合適的預設位置描述
                if object_type and any(vehicle in object_type.lower() for vehicle in ["car", "truck", "bus"]):
                    return "positioned in the scene"
                elif object_type and "person" in object_type.lower():
                    return "present in the area"
                else:
                    return "located in the scene"

            # 如果提供了RegionAnalyzer實例，使用其標準化方法
            if region_analyzer and hasattr(region_analyzer, 'get_spatial_description_phrase'):
                object_type = obj.get("class_name", "")
                if hasattr(region_analyzer, 'get_contextual_spatial_description'):
                    spatial_desc = region_analyzer.get_contextual_spatial_description(region, object_type)
                else:
                    spatial_desc = region_analyzer.get_spatial_description_phrase(region)

                if spatial_desc:
                    return spatial_desc

            # 備用邏輯：使用改進的內建映射
            clean_region = region.replace('_', ' ').strip().lower()

            region_map = {
                "top left": "in the upper left area",
                "top center": "in the upper area",
                "top right": "in the upper right area",
                "middle left": "on the left side",
                "middle center": "in the center",
                "center": "in the center",
                "middle right": "on the right side",
                "bottom left": "in the lower left area",
                "bottom center": "in the lower area",
                "bottom right": "in the lower right area"
            }

            # 直接映射匹配
            if clean_region in region_map:
                return region_map[clean_region]

            # 模糊匹配處理
            if "top" in clean_region and "left" in clean_region:
                return "in the upper left area"
            elif "top" in clean_region and "right" in clean_region:
                return "in the upper right area"
            elif "bottom" in clean_region and "left" in clean_region:
                return "in the lower left area"
            elif "bottom" in clean_region and "right" in clean_region:
                return "in the lower right area"
            elif "top" in clean_region:
                return "in the upper area"
            elif "bottom" in clean_region:
                return "in the lower area"
            elif "left" in clean_region:
                return "on the left side"
            elif "right" in clean_region:
                return "on the right side"
            elif "center" in clean_region or "middle" in clean_region:
                return "in the center"

            # 如果region無法識別，使用normalized_center作為最後備用
            norm_center = obj.get("normalized_center")
            if norm_center and image_width and image_height:
                x_norm, y_norm = norm_center
                h_pos = "left" if x_norm < 0.4 else "right" if x_norm > 0.6 else "center"
                v_pos = "upper" if y_norm < 0.4 else "lower" if y_norm > 0.6 else "center"

                if h_pos == "center" and v_pos == "center":
                    return "in the center"
                return f"in the {v_pos} {h_pos} area"

            # 如果所有方法都失敗，返回空字串
            return ""

        except Exception as e:
            self.logger.warning(f"Error generating spatial description: {str(e)}")
            return ""

    def optimize_object_description(self, description: str) -> str:
        """
        優化物件描述，避免重複列舉相同物件

        Args:
            description: 原始描述文本

        Returns:
            str: 優化後的描述文本
        """
        try:
            import re

            # 處理床鋪重複描述
            if "bed in the room" in description:
                description = description.replace("a bed in the room", "a bed")

            # 處理重複的物件列表
            object_lists = re.findall(r'with ([^\.]+?)(?:\.|\band\b)', description)

            for obj_list in object_lists:
                # 計算每個物件出現次數
                items = re.findall(r'([a-zA-Z\s]+)(?:,|\band\b|$)', obj_list)
                item_counts = {}

                for item in items:
                    item = item.strip()
                    if item and item not in ["and", "with"]:
                        if item not in item_counts:
                            item_counts[item] = 0
                        item_counts[item] += 1

                # 生成優化後的物件列表
                if item_counts:
                    new_items = []
                    for item, count in item_counts.items():
                        if count > 1:
                            new_items.append(f"{count} {item}s")
                        else:
                            new_items.append(item)

                    # 格式化新列表
                    if len(new_items) == 1:
                        new_list = new_items[0]
                    elif len(new_items) == 2:
                        new_list = f"{new_items[0]} and {new_items[1]}"
                    else:
                        new_list = ", ".join(new_items[:-1]) + f", and {new_items[-1]}"

                    # 替換原始列表
                    description = description.replace(obj_list, new_list)

            return description

        except Exception as e:
            self.logger.warning(f"Error optimizing object description: {str(e)}")
            return description

    def generate_dynamic_everyday_description(self,
                                            detected_objects: List[Dict],
                                            lighting_info: Optional[Dict] = None,
                                            viewpoint: str = "eye_level",
                                            spatial_analysis: Optional[Dict] = None,
                                            image_dimensions: Optional[Tuple[int, int]] = None,
                                            places365_info: Optional[Dict] = None,
                                            object_statistics: Optional[Dict] = None) -> str:
        """
        為日常場景動態生成描述，基於所有相關的檢測物件、計數和上下文

        Args:
            detected_objects: 檢測到的物件列表
            lighting_info: 照明信息
            viewpoint: 視角類型
            spatial_analysis: 空間分析結果
            image_dimensions: 圖像尺寸
            places365_info: Places365場景分類信息
            object_statistics: 物件統計信息

        Returns:
            str: 動態生成的場景描述
        """
        try:
            description_segments = []
            image_width, image_height = image_dimensions if image_dimensions else (None, None)

            self.logger.debug(f"Generating dynamic description for {len(detected_objects)} objects, "
                            f"viewpoint: {viewpoint}, lighting: {lighting_info is not None}")

            # 1. 整體氛圍（照明和視角）
            ambiance_parts = []
            if lighting_info:
                time_of_day = lighting_info.get("time_of_day", "unknown lighting")
                is_indoor = lighting_info.get("is_indoor")
                ambiance_statement = "This is"
                if is_indoor is True:
                    ambiance_statement += " an indoor scene"
                elif is_indoor is False:
                    ambiance_statement += " an outdoor scene"
                else:
                    ambiance_statement += " a scene"

                # remove underline
                readable_lighting = f"with {time_of_day.replace('_', ' ')} lighting conditions"
                ambiance_statement += f", likely {readable_lighting}."
                ambiance_parts.append(ambiance_statement)

            if viewpoint and viewpoint != "eye_level":
                if not ambiance_parts:
                    ambiance_parts.append(f"From {viewpoint.replace('_', ' ')}, the general layout of the scene is observed.")
                else:
                    ambiance_parts[-1] = ambiance_parts[-1].rstrip('.') + f", viewed from {viewpoint.replace('_', ' ')}."

            if ambiance_parts:
                description_segments.append(" ".join(ambiance_parts))

            # 2. 描述所有檢測到的物件，按類別分組，使用準確計數和位置
            if not detected_objects:
                if not description_segments:
                    description_segments.append("A general scene is visible, but no specific objects were clearly identified.")
                else:
                    description_segments.append("Within this setting, no specific objects were clearly identified.")
            else:
                objects_by_class: Dict[str, List[Dict]] = {}

                # 使用置信度過濾
                confident_objects = [obj for obj in detected_objects
                                   if obj.get("confidence", 0) >= self.confidence_threshold_for_description]
                
                # print(f"DEBUG: After confidence filtering (threshold={self.confidence_threshold_for_description}):")                
                # for class_name in ["car", "traffic light", "person", "handbag"]:                   
                #     class_objects = [obj for obj in confident_objects if obj.get("class_name") == class_name]
                #     print(f"DEBUG: {class_name}: {len(class_objects)} confident objects")

                if not confident_objects:
                    no_confident_obj_msg = "While some elements might be present, no objects were identified with sufficient confidence for a detailed description."
                    if not description_segments:
                        description_segments.append(no_confident_obj_msg)
                    else:
                        description_segments.append(no_confident_obj_msg.lower().capitalize())
                else:
                    if object_statistics:
                        # 使用預計算的統計信息，採用動態的信心度
                        for class_name, stats in object_statistics.items():
                            count = stats.get("count", 0)
                            avg_confidence = stats.get("avg_confidence", 0)

                            # 動態調整置信度閾值
                            dynamic_threshold = self.confidence_threshold_for_description
                            if class_name in ["potted plant", "vase", "clock", "book"]:
                                dynamic_threshold = max(0.15, self.confidence_threshold_for_description * 0.6)
                            elif count >= 3:
                                dynamic_threshold = max(0.2, self.confidence_threshold_for_description * 0.8)

                            if count > 0 and avg_confidence >= dynamic_threshold:
                                matching_objects = [obj for obj in confident_objects if obj.get("class_name") == class_name]
                                if not matching_objects:
                                    matching_objects = [obj for obj in detected_objects
                                                      if obj.get("class_name") == class_name and obj.get("confidence", 0) >= dynamic_threshold]

                                if matching_objects:
                                    actual_count = min(stats["count"], len(matching_objects))
                                    objects_by_class[class_name] = matching_objects[:actual_count]
                    else:
                        # 備用邏輯，同樣使用動態閾值
                        for obj in confident_objects:
                            name = obj.get("class_name", "unknown object")
                            if name == "unknown object" or not name:
                                continue
                            if name not in objects_by_class:
                                objects_by_class[name] = []
                            objects_by_class[name].append(obj)
                            # print(f"DEBUG: Before spatial deduplication:")
                            # for class_name in ["car", "traffic light", "person", "handbag"]:
                            #     if class_name in objects_by_class:                                    
                            #         print(f"DEBUG: {class_name}: {len(objects_by_class[class_name])} objects before dedup")

                    if not objects_by_class:
                        description_segments.append("No common objects were confidently identified for detailed description.")
                    else:
                        # 物件組排序函數
                        def sort_key_object_groups(item_tuple: Tuple[str, List[Dict]]):
                            class_name_key, obj_group_list = item_tuple
                            priority = 3
                            count = len(obj_group_list)

                            # 確保類別名稱已標準化
                            normalized_class_name = self._normalize_object_class_name(class_name_key)

                            # 動態優先級
                            if normalized_class_name == "person":
                                priority = 0
                            elif normalized_class_name in ["dining table", "chair", "sofa", "bed"]:
                                priority = 1
                            elif normalized_class_name in ["car", "bus", "truck", "traffic light"]:
                                priority = 2
                            elif count >= 3:
                                priority = max(1, priority - 1)
                            elif normalized_class_name in ["potted plant", "vase", "clock", "book"] and count >= 2:
                                priority = 2

                            avg_area = sum(o.get("normalized_area", 0.0) for o in obj_group_list) / len(obj_group_list) if obj_group_list else 0
                            quantity_bonus = min(count / 5.0, 1.0)

                            return (priority, -len(obj_group_list), -avg_area, -quantity_bonus)

                        # remove duplicate
                        deduplicated_objects_by_class = {}
                        processed_positions = []

                        for class_name, group_of_objects in objects_by_class.items():
                            unique_objects = []

                            for obj in group_of_objects:
                                obj_position = obj.get("normalized_center", [0.5, 0.5])
                                is_duplicate = False

                                for processed_pos in processed_positions:
                                    position_distance = abs(obj_position[0] - processed_pos[0]) + abs(obj_position[1] - processed_pos[1])
                                    if position_distance < 0.15:
                                        is_duplicate = True
                                        break

                                if not is_duplicate:
                                    unique_objects.append(obj)
                                    processed_positions.append(obj_position)

                            if unique_objects:
                                deduplicated_objects_by_class[class_name] = unique_objects

                        objects_by_class = deduplicated_objects_by_class
                        
                        # print(f"DEBUG: After spatial deduplication:")
                        # for class_name in ["car", "traffic light", "person", "handbag"]:
                        #     if class_name in objects_by_class:                                
                        #         print(f"DEBUG: {class_name}: {len(objects_by_class[class_name])} objects after dedup")
                            
                        sorted_object_groups = sorted(objects_by_class.items(), key=sort_key_object_groups)

                        object_clauses = []

                        for class_name, group_of_objects in sorted_object_groups:
                            count = len(group_of_objects)
                            
                            # if class_name in ["car", "traffic light", "person", "handbag"]:                                
                            #     print(f"DEBUG: Final count for {class_name}: {count}")
                                
                            if count == 0:
                                continue

                            # 標準化class name
                            normalized_class_name = self._normalize_object_class_name(class_name)

                            # 使用統計信息確保準確的數量描述
                            if object_statistics and class_name in object_statistics:
                                actual_count = object_statistics[class_name]["count"]
                                formatted_name_with_exact_count = self._format_object_count_description(
                                    normalized_class_name, actual_count
                                )
                            else:
                                formatted_name_with_exact_count = self._format_object_count_description(
                                    normalized_class_name, count
                                )

                            if formatted_name_with_exact_count == "no specific objects clearly identified" or not formatted_name_with_exact_count:
                                continue

                            # 確定群組的集體位置
                            location_description_suffix = ""
                            if count == 1:
                                spatial_desc = self.get_spatial_description(group_of_objects[0], image_width, image_height, self.region_analyzer)
                                if spatial_desc:
                                    location_description_suffix = f"is {spatial_desc}"
                                else:
                                    distinct_regions = sorted(list(set(obj.get("region", "") for obj in group_of_objects if obj.get("region"))))
                                    valid_regions = [r for r in distinct_regions if r and r != "unknown" and r.strip()]
                                    if not valid_regions:
                                        location_description_suffix = "is positioned in the scene"
                                    elif len(valid_regions) == 1:
                                        spatial_desc = self.get_spatial_description_phrase(valid_regions[0])
                                        location_description_suffix = f"is primarily {spatial_desc}" if spatial_desc else "is positioned in the scene"
                                    elif len(valid_regions) == 2:
                                        clean_region1 = valid_regions[0].replace('_', ' ')
                                        clean_region2 = valid_regions[1].replace('_', ' ')
                                        location_description_suffix = f"is mainly across the {clean_region1} and {clean_region2} areas"
                                    else:
                                        location_description_suffix = "is distributed in various parts of the scene"
                            else:
                                distinct_regions = sorted(list(set(obj.get("region", "") for obj in group_of_objects if obj.get("region"))))
                                valid_regions = [r for r in distinct_regions if r and r != "unknown" and r.strip()]
                                if not valid_regions:
                                    location_description_suffix = "are visible in the scene"
                                elif len(valid_regions) == 1:
                                    clean_region = valid_regions[0].replace('_', ' ')
                                    location_description_suffix = f"are primarily in the {clean_region} area"
                                elif len(valid_regions) == 2:
                                    clean_region1 = valid_regions[0].replace('_', ' ')
                                    clean_region2 = valid_regions[1].replace('_', ' ')
                                    location_description_suffix = f"are mainly across the {clean_region1} and {clean_region2} areas"
                                else:
                                    location_description_suffix = "are distributed in various parts of the scene"

                            # 首字母大寫
                            formatted_name_capitalized = formatted_name_with_exact_count[0].upper() + formatted_name_with_exact_count[1:]
                            object_clauses.append(f"{formatted_name_capitalized} {location_description_suffix}")

                        if object_clauses:
                            if not description_segments:
                                if object_clauses:
                                    first_clause = object_clauses.pop(0)
                                    description_segments.append(first_clause + ".")
                            else:
                                if object_clauses:
                                    description_segments.append("The scene features:")

                            if object_clauses:
                                joined_object_clauses = ". ".join(object_clauses)
                                if joined_object_clauses and not joined_object_clauses.endswith("."):
                                    joined_object_clauses += "."
                                description_segments.append(joined_object_clauses)

                        elif not description_segments:
                            return "The image depicts a scene, but specific objects could not be described with confidence or detail."

            # 最終組裝和格式化
            raw_description = ""
            for i, segment in enumerate(filter(None, description_segments)):
                segment = segment.strip()
                if not segment:
                    continue

                if not raw_description:
                    raw_description = segment
                else:
                    if not raw_description.endswith(('.', '!', '?')):
                        raw_description += "."
                    raw_description += " " + (segment[0].upper() + segment[1:] if len(segment) > 1 else segment.upper())

            if raw_description and not raw_description.endswith(('.', '!', '?')):
                raw_description += "."

            if not raw_description or len(raw_description.strip()) < 20:
                if 'confident_objects' in locals() and confident_objects:
                    return "The scene contains several detected objects, but a detailed textual description could not be fully constructed."
                else:
                    return "A general scene is depicted with no objects identified with high confidence."

            return raw_description

        except Exception as e:
            error_msg = f"Error generating dynamic everyday description: {str(e)}"
            self.logger.error(f"{error_msg}\n{traceback.format_exc()}")
            raise ObjectDescriptionError(error_msg) from e

    def _format_object_count_description(self, class_name: str, count: int) -> str:
        """
        格式化物件數量描述，提供多樣化的表達方式

        Args:
            class_name: 標準化後的類別名稱
            count: 物件數量

        Returns:
            str: 格式化的數量描述
        """
        try:
            if count <= 0:
                return ""

            # 單數情況
            if count == 1:
                article = "an" if class_name[0].lower() in 'aeiou' else "a"
                return f"{article} {class_name}"

            # 複數情況
            plural_form = self._get_plural_form(class_name)

            # 根據數量選擇不同的表達方式
            if count == 2:
                return f"two {plural_form}"
            elif count == 3:
                return f"three {plural_form}"
            elif count <= 5:
                return f"{count} {plural_form}"
            elif count <= 10:
                return f"several {plural_form}"
            else:
                return f"numerous {plural_form}"

        except Exception as e:
            self.logger.warning(f"Error formatting object count for '{class_name}': {str(e)}")
            return f"{count} {class_name}s" if count > 1 else class_name

    def _get_plural_form(self, word: str) -> str:
        """
        獲取詞彙的複數形式

        Args:
            word: 單數詞彙

        Returns:
            str: 複數形式
        """
        try:
            # 特殊複數形式
            irregular_plurals = {
                'person': 'people',
                'child': 'children',
                'foot': 'feet',
                'tooth': 'teeth',
                'mouse': 'mice',
                'man': 'men',
                'woman': 'women'
            }

            if word.lower() in irregular_plurals:
                return irregular_plurals[word.lower()]

            # 規則複數形式
            if word.endswith(('s', 'sh', 'ch', 'x', 'z')):
                return word + 'es'
            elif word.endswith('y') and word[-2] not in 'aeiou':
                return word[:-1] + 'ies'
            elif word.endswith('f'):
                return word[:-1] + 'ves'
            elif word.endswith('fe'):
                return word[:-2] + 'ves'
            else:
                return word + 's'

        except Exception as e:
            self.logger.warning(f"Error getting plural form for '{word}': {str(e)}")
            return word + 's'

    def _normalize_object_class_name(self, class_name: str) -> str:
        """
        標準化物件類別名稱，確保輸出自然語言格式

        Args:
            class_name: 原始類別名稱

        Returns:
            str: 標準化後的類別名稱
        """
        try:
            if not class_name or not isinstance(class_name, str):
                return "object"

            # 移除可能的技術性前綴或後綴
            import re
            normalized = re.sub(r'^(class_|id_|type_)', '', class_name.lower())
            normalized = re.sub(r'(_class|_id|_type)$', '', normalized)

            # 將下劃線和連字符替換為空格
            normalized = normalized.replace('_', ' ').replace('-', ' ')

            # 移除多餘空格
            normalized = ' '.join(normalized.split())

            # 特殊類別名稱的標準化映射
            class_name_mapping = {
                'traffic light': 'traffic light',
                'stop sign': 'stop sign',
                'fire hydrant': 'fire hydrant',
                'dining table': 'dining table',
                'potted plant': 'potted plant',
                'tv monitor': 'television',
                'cell phone': 'mobile phone',
                'wine glass': 'wine glass',
                'hot dog': 'hot dog',
                'teddy bear': 'teddy bear',
                'hair drier': 'hair dryer',
                'toothbrush': 'toothbrush'
            }

            return class_name_mapping.get(normalized, normalized)

        except Exception as e:
            self.logger.warning(f"Error normalizing class name '{class_name}': {str(e)}")
            return class_name if isinstance(class_name, str) else "object"

    def generate_basic_details(self, scene_type: str, detected_objects: List[Dict]) -> str:
        """
        當模板不可用時生成基本詳細信息

        Args:
            scene_type: 識別的場景類型
            detected_objects: 檢測到的物件列表

        Returns:
            str: 基本場景詳細信息
        """
        try:
            # 處理特定場景類型的自定義邏輯
            if scene_type == "living_room":
                tv_objs = [obj for obj in detected_objects if obj.get("class_id") == 62]  # TV
                sofa_objs = [obj for obj in detected_objects if obj.get("class_id") == 57]  # Sofa

                if tv_objs and sofa_objs:
                    tv_region = tv_objs[0].get("region", "center")
                    sofa_region = sofa_objs[0].get("region", "center")

                    arrangement = f"The TV is in the {tv_region.replace('_', ' ')} of the image, "
                    arrangement += f"while the sofa is in the {sofa_region.replace('_', ' ')}. "

                    return f"{arrangement}This appears to be a space designed for relaxation and entertainment."

            elif scene_type == "bedroom":
                bed_objs = [obj for obj in detected_objects if obj.get("class_id") == 59]  # Bed

                if bed_objs:
                    bed_region = bed_objs[0].get("region", "center")
                    extra_items = []

                    for obj in detected_objects:
                        if obj.get("class_id") == 74:  # Clock
                            extra_items.append("clock")
                        elif obj.get("class_id") == 73:  # Book
                            extra_items.append("book")

                    extras = ""
                    if extra_items:
                        extras = f" There is also a {' and a '.join(extra_items)} visible."

                    return f"The bed is located in the {bed_region.replace('_', ' ')} of the image.{extras}"

            elif scene_type in ["dining_area", "kitchen"]:
                # 計算食物和餐飲相關物品
                food_items = []
                for obj in detected_objects:
                    if obj.get("class_id") in [39, 41, 42, 43, 44, 45]:  # 廚房物品
                        food_items.append(obj.get("class_name", "kitchen item"))

                food_str = ""
                if food_items:
                    unique_items = list(set(food_items))
                    if len(unique_items) <= 3:
                        food_str = f" with {', '.join(unique_items)}"
                    else:
                        food_str = f" with {', '.join(unique_items[:3])} and other items"

                return f"{food_str}."

            elif scene_type == "city_street":
                # 計算人員和車輛
                people_count = len([obj for obj in detected_objects if obj.get("class_id") == 0])
                vehicle_count = len([obj for obj in detected_objects
                                   if obj.get("class_id") in [1, 2, 3, 5, 7]])  # Bicycle, car, motorbike, bus, truck

                traffic_desc = ""
                if people_count > 0 and vehicle_count > 0:
                    traffic_desc = f" with {people_count} {'people' if people_count > 1 else 'person'} and "
                    traffic_desc += f"{vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"
                elif people_count > 0:
                    traffic_desc = f" with {people_count} {'people' if people_count > 1 else 'person'}"
                elif vehicle_count > 0:
                    traffic_desc = f" with {vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"

                return f"{traffic_desc}."

            elif scene_type == "asian_commercial_street":
                # 尋找關鍵城市元素
                people_count = len([obj for obj in detected_objects if obj.get("class_id") == 0])
                vehicle_count = len([obj for obj in detected_objects if obj.get("class_id") in [1, 2, 3]])

                # 分析行人分布
                people_positions = []
                for obj in detected_objects:
                    if obj.get("class_id") == 0:  # Person
                        people_positions.append(obj.get("normalized_center", (0.5, 0.5)))

                # 檢查人員是否沿線分布（表示步行路徑）
                structured_path = False
                if len(people_positions) >= 3:
                    # 簡化檢查 - 查看多個人員的y坐標是否相似
                    y_coords = [pos[1] for pos in people_positions]
                    y_mean = sum(y_coords) / len(y_coords)
                    y_variance = sum((y - y_mean)**2 for y in y_coords) / len(y_coords)
                    if y_variance < 0.05:  # 低變異數表示線性排列
                        structured_path = True

                street_desc = "A commercial street with "
                if people_count > 0:
                    street_desc += f"{people_count} {'pedestrians' if people_count > 1 else 'pedestrian'}"
                    if vehicle_count > 0:
                        street_desc += f" and {vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"
                elif vehicle_count > 0:
                    street_desc += f"{vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"
                else:
                    street_desc += "various commercial elements"

                if structured_path:
                    street_desc += ". The pedestrians appear to be following a defined walking path"

                # 添加文化元素
                street_desc += ". The signage and architectural elements suggest an Asian urban setting."

                return street_desc

            # 默認通用描述
            return "The scene contains various elements characteristic of this environment."

        except Exception as e:
            self.logger.warning(f"Error generating basic details for scene_type '{scene_type}': {str(e)}")
            return "The scene contains various elements characteristic of this environment."

    def generate_placeholder_content(self, placeholder: str, detected_objects: List[Dict], scene_type: str) -> str:
        """
        為模板佔位符生成內容

        Args:
            placeholder: 模板佔位符
            detected_objects: 檢測到的物件列表
            scene_type: 場景類型

        Returns:
            str: 生成的佔位符內容
        """
        try:
            # 處理不同類型的佔位符與自定義邏輯
            if placeholder == "furniture":
                # 提取家具物品
                furniture_ids = [56, 57, 58, 59, 60, 61]  # 家具類別ID示例
                furniture_objects = [obj for obj in detected_objects if obj.get("class_id") in furniture_ids]

                if furniture_objects:
                    furniture_names = []
                    for obj in furniture_objects[:3]:
                        raw_name = obj.get("class_name", "furniture")
                        normalized_name = self._normalize_object_class_name(raw_name)
                        furniture_names.append(normalized_name)

                    unique_names = list(set(furniture_names))
                    if len(unique_names) == 1:
                        return unique_names[0]
                    elif len(unique_names) == 2:
                        return f"{unique_names[0]} and {unique_names[1]}"
                    else:
                        return ", ".join(unique_names[:-1]) + f", and {unique_names[-1]}"
                return "various furniture items"

            elif placeholder == "electronics":
                # 提取電子物品
                electronics_ids = [62, 63, 64, 65, 66, 67, 68, 69, 70]  # 電子設備類別ID示例
                electronics_objects = [obj for obj in detected_objects if obj.get("class_id") in electronics_ids]

                if electronics_objects:
                    electronics_names = [obj.get("class_name", "electronic device") for obj in electronics_objects[:3]]
                    return ", ".join(set(electronics_names))
                return "electronic devices"

            elif placeholder == "people_count":
                # 計算人數
                people_count = len([obj for obj in detected_objects if obj.get("class_id") == 0])

                if people_count == 0:
                    return "no people"
                elif people_count == 1:
                    return "one person"
                elif people_count < 5:
                    return f"{people_count} people"
                else:
                    return "several people"

            elif placeholder == "seating":
                # 提取座位物品
                seating_ids = [56, 57]  # chair, sofa
                seating_objects = [obj for obj in detected_objects if obj.get("class_id") in seating_ids]

                if seating_objects:
                    seating_names = [obj.get("class_name", "seating") for obj in seating_objects[:2]]
                    return ", ".join(set(seating_names))
                return "seating arrangements"

            # 默認情況 - 空字符串
            return ""

        except Exception as e:
            self.logger.warning(f"Error generating placeholder content for '{placeholder}': {str(e)}")
            return ""

    def describe_functional_zones(self, functional_zones: Dict) -> str:
        """
        生成場景功能區域的描述，優化處理行人區域、人數統計和物品重複問題

        Args:
            functional_zones: 識別出的功能區域字典

        Returns:
            str: 功能區域描述
        """
        try:
            if not functional_zones:
                return ""

            # 處理不同類型的 functional_zones 參數
            if isinstance(functional_zones, list):
                # 如果是列表，轉換為字典格式
                zones_dict = {}
                for i, zone in enumerate(functional_zones):
                    if isinstance(zone, dict) and 'name' in zone:
                        zone_name = self._normalize_zone_name(zone['name'])
                    else:
                        zone_name = f"functional area {i+1}"
                    zones_dict[zone_name] = zone if isinstance(zone, dict) else {"description": str(zone)}
                functional_zones = zones_dict
            elif not isinstance(functional_zones, dict):
                return ""

            # 標準化所有區域鍵名，移除內部標識符格式
            normalized_zones = {}
            for zone_key, zone_data in functional_zones.items():
                normalized_key = self._normalize_zone_name(zone_key)
                normalized_zones[normalized_key] = zone_data
            functional_zones = normalized_zones

            # 計算場景中的總人數
            total_people_count = 0
            people_by_zone = {}

            # 計算每個區域的人數並累計總人數
            for zone_name, zone_info in functional_zones.items():
                if "objects" in zone_info:
                    zone_people_count = zone_info["objects"].count("person")
                    people_by_zone[zone_name] = zone_people_count
                    total_people_count += zone_people_count

            # 分類區域為行人區域和其他區域
            pedestrian_zones = []
            other_zones = []

            for zone_name, zone_info in functional_zones.items():
                # 檢查是否是行人相關區域
                if any(keyword in zone_name.lower() for keyword in ["pedestrian", "crossing", "people"]):
                    pedestrian_zones.append((zone_name, zone_info))
                else:
                    other_zones.append((zone_name, zone_info))

            # 獲取最重要的行人區域和其他區域
            main_pedestrian_zones = sorted(pedestrian_zones,
                                        key=lambda z: people_by_zone.get(z[0], 0),
                                        reverse=True)[:1]  # 最多1個主要行人區域

            top_other_zones = sorted(other_zones,
                                key=lambda z: len(z[1].get("objects", [])),
                                reverse=True)[:2]  # 最多2個其他區域

            # 合併區域
            top_zones = main_pedestrian_zones + top_other_zones

            if not top_zones:
                return ""

            # 生成匯總描述
            summary = ""
            max_mentioned_people = 0  # 追蹤已經提到的最大人數

            # 如果總人數顯著且還沒在主描述中提到，添加總人數描述
            if total_people_count > 5:
                summary = f"The scene contains a significant number of pedestrians ({total_people_count} people). "
                max_mentioned_people = total_people_count  # 更新已提到的最大人數

            # 處理每個區域的描述，確保人數信息的一致性
            processed_zones = []

            for zone_name, zone_info in top_zones:
                zone_desc = zone_info.get("description", "a functional zone")
                zone_people_count = people_by_zone.get(zone_name, 0)

                # 檢查描述中是否包含人數資訊
                contains_people_info = "with" in zone_desc and ("person" in zone_desc.lower() or "people" in zone_desc.lower())

                # 如果描述包含人數信息，且人數較小（小於已提到的最大人數），則修改描述
                if contains_people_info and zone_people_count < max_mentioned_people:
                    parts = zone_desc.split("with")
                    if len(parts) > 1:
                        # 移除人數部分
                        zone_desc = parts[0].strip() + " area"

                processed_zones.append((zone_name, {"description": zone_desc}))

            # 根據處理後的區域數量生成最終描述
            final_desc = ""

            if len(processed_zones) == 1:
                _, zone_info = processed_zones[0]
                zone_desc = zone_info["description"]
                final_desc = summary + f"The scene includes {zone_desc}."
            elif len(processed_zones) == 2:
                _, zone1_info = processed_zones[0]
                _, zone2_info = processed_zones[1]
                zone1_desc = zone1_info["description"]
                zone2_desc = zone2_info["description"]
                final_desc = summary + f"The scene is divided into two main areas: {zone1_desc} and {zone2_desc}."
            else:
                zones_desc = ["The scene contains multiple functional areas including"]
                zone_descriptions = [z[1]["description"] for z in processed_zones]

                # 格式化最終的多區域描述
                if len(zone_descriptions) == 3:
                    formatted_desc = f"{zone_descriptions[0]}, {zone_descriptions[1]}, and {zone_descriptions[2]}"
                else:
                    formatted_desc = ", ".join(zone_descriptions[:-1]) + f", and {zone_descriptions[-1]}"

                final_desc = summary + f"{zones_desc[0]} {formatted_desc}."

            return self.optimize_object_description(final_desc)

        except Exception as e:
            self.logger.warning(f"Error describing functional zones: {str(e)}")
            return ""

    def _normalize_zone_name(self, zone_name: str) -> str:
        """
        將內部區域鍵名標準化為自然語言描述

        Args:
            zone_name: 原始區域名稱

        Returns:
            str: 標準化後的區域名稱
        """
        try:
            if not zone_name or not isinstance(zone_name, str):
                return "functional area"

            # 移除數字後綴（如 crossing_zone_1 -> crossing_zone）
            import re
            base_name = re.sub(r'_\d+$', '', zone_name)

            # 將下劃線替換為空格
            normalized = base_name.replace('_', ' ')

            # 標準化常見的區域類型名稱
            zone_type_mapping = {
                'crossing zone': 'pedestrian crossing area',
                'vehicle zone': 'vehicle movement area',
                'pedestrian zone': 'pedestrian activity area',
                'traffic zone': 'traffic flow area',
                'waiting zone': 'waiting area',
                'seating zone': 'seating area',
                'dining zone': 'dining area',
                'furniture zone': 'furniture arrangement area',
                'electronics zone': 'electronics area',
                'people zone': 'social activity area',
                'functional area': 'activity area'
            }

            # 檢查是否有對應的標準化名稱
            for pattern, replacement in zone_type_mapping.items():
                if pattern in normalized.lower():
                    return replacement

            # 如果沒有特定映射，使用通用格式
            if 'zone' in normalized.lower():
                normalized = normalized.replace('zone', 'area')
            elif not any(keyword in normalized.lower() for keyword in ['area', 'space', 'region']):
                normalized += ' area'

            return normalized.strip()

        except Exception as e:
            self.logger.warning(f"Error normalizing zone name '{zone_name}': {str(e)}")
            return "activity area"

    def get_configuration(self) -> Dict[str, Any]:
        """
        獲取當前配置參數

        Returns:
            Dict[str, Any]: 配置參數字典
        """
        return {
            "min_prominence_score": self.min_prominence_score,
            "max_categories_to_return": self.max_categories_to_return,
            "max_total_objects": self.max_total_objects,
            "confidence_threshold_for_description": self.confidence_threshold_for_description
        }

    def update_configuration(self, **kwargs):
        """
        更新配置參數

        Args:
            **kwargs: 要更新的配置參數
        """
        try:
            for key, value in kwargs.items():
                if hasattr(self, key):
                    old_value = getattr(self, key)
                    setattr(self, key, value)
                    self.logger.info(f"Updated {key}: {old_value} -> {value}")
                else:
                    self.logger.warning(f"Unknown configuration parameter: {key}")

        except Exception as e:
            self.logger.error(f"Error updating configuration: {str(e)}")
            raise ObjectDescriptionError(f"Failed to update configuration: {str(e)}") from e