import logging import traceback import re from typing import Dict, List, Tuple, Optional, Any import numpy as np class ObjectDescriptionError(Exception): """物件描述生成過程中的自定義異常""" pass class ObjectDescriptionGenerator: """ 物件描述生成器 - 負責將檢測到的物件轉換為自然語言描述 該類別處理物件相關的所有描述生成邏輯,包括重要物件的識別、 空間位置描述、物件列表格式化以及描述文本的優化。 """ def __init__(self, min_prominence_score: float = 0.1, max_categories_to_return: int = 5, max_total_objects: int = 7, confidence_threshold_for_description: float = 0.25, region_analyzer: Optional[Any] = None): """ 初始化物件描述生成器 Args: min_prominence_score: 物件顯著性的最低分數閾值 max_categories_to_return: 返回的物件類別最大數量 max_total_objects: 返回的物件總數上限 confidence_threshold_for_description: 用於描述的置信度閾值 """ self.logger = logging.getLogger(self.__class__.__name__) self.min_prominence_score = min_prominence_score self.max_categories_to_return = max_categories_to_return self.max_total_objects = max_total_objects self.confidence_threshold_for_description = confidence_threshold_for_description self.region_analyzer = region_analyzer self.logger.info("ObjectDescriptionGenerator initialized with prominence_score=%.2f, " "max_categories=%d, max_objects=%d, confidence_threshold=%.2f", min_prominence_score, max_categories_to_return, max_total_objects, confidence_threshold_for_description) def get_prominent_objects(self, detected_objects: List[Dict], min_prominence_score: float = 0.5, max_categories_to_return: Optional[int] = None) -> List[Dict]: """ 獲取最重要的物件,基於置信度、大小和位置計算重要性評分 Args: detected_objects: 檢測到的物件列表 min_prominence_score: 最小重要性分數閾值,範圍 0.0-1.0 max_categories_to_return: 可選的最大返回類別數量限制 Returns: List[Dict]: 按重要性排序的物件列表 """ try: if not detected_objects: return [] prominent_objects = [] for obj in detected_objects: # 計算重要性評分 prominence_score = self._calculate_prominence_score(obj) # 只保留超過閾值的物件 if prominence_score >= min_prominence_score: obj_copy = obj.copy() obj_copy['prominence_score'] = prominence_score prominent_objects.append(obj_copy) # 按重要性評分排序(從高到低) prominent_objects.sort(key=lambda x: x.get('prominence_score', 0), reverse=True) # 如果指定了最大類別數量限制,進行過濾 if max_categories_to_return is not None and max_categories_to_return > 0: categories_seen = set() filtered_objects = [] for obj in prominent_objects: class_name = obj.get("class_name", "unknown") # 如果是新類別且未達到限制 if class_name not in categories_seen: if len(categories_seen) < max_categories_to_return: categories_seen.add(class_name) filtered_objects.append(obj) else: # 已見過的類別,直接添加 filtered_objects.append(obj) return filtered_objects return prominent_objects except Exception as e: self.logger.error(f"Error calculating prominent objects: {str(e)}") return [] def set_region_analyzer(self, region_analyzer: Any) -> None: """ 設置RegionAnalyzer,用於標準化空間描述生成 Args: region_analyzer: RegionAnalyzer實例 """ try: self.region_analyzer = region_analyzer self.logger.info("RegionAnalyzer instance set for ObjectDescriptionGenerator") except Exception as e: self.logger.warning(f"Error setting RegionAnalyzer: {str(e)}") def _get_standardized_spatial_description(self, obj: Dict) -> str: """ 使用RegionAnalyzer生成標準化空間描述的內部方法 Args: obj: 物件字典 Returns: str: 標準化空間描述,失敗時返回空字串 """ try: if hasattr(self, 'region_analyzer') and self.region_analyzer: region = obj.get("region", "") object_type = obj.get("class_name", "") if hasattr(self.region_analyzer, 'get_contextual_spatial_description'): return self.region_analyzer.get_contextual_spatial_description(region, object_type) elif hasattr(self.region_analyzer, 'get_spatial_description_phrase'): return self.region_analyzer.get_spatial_description_phrase(region) return "" except Exception as e: self.logger.warning(f"Error getting standardized spatial description: {str(e)}") if object_type: return f"visible in the scene" return "present in the view" def _calculate_prominence_score(self, obj: Dict) -> float: """ 計算物件的重要性評分 Args: obj: 物件字典,包含檢測信息 Returns: float: 重要性評分 (0.0-1.0) """ try: # 基礎置信度評分 (權重: 40%) confidence = obj.get("confidence", 0.5) confidence_score = confidence * 0.4 # 大小評分 (權重: 30%) normalized_area = obj.get("normalized_area", 0.1) # 使用對數縮放避免過大物件主導評分 size_score = min(np.log(normalized_area * 10 + 1) / np.log(11), 1.0) * 0.3 # 位置評分 (權重: 20%) # 中心區域的物件通常更重要 center_x, center_y = obj.get("normalized_center", [0.5, 0.5]) distance_from_center = np.sqrt((center_x - 0.5)**2 + (center_y - 0.5)**2) position_score = (1 - min(distance_from_center * 2, 1.0)) * 0.2 # 類別重要性評分 (權重: 10%) class_importance = self._get_class_importance(obj.get("class_name", "unknown")) class_score = class_importance * 0.1 total_score = confidence_score + size_score + position_score + class_score # 確保評分在有效範圍內 return max(0.0, min(1.0, total_score)) except Exception as e: self.logger.warning(f"Error calculating prominence score for object: {str(e)}") return 0.5 # 返回中等評分作為備用 def _get_class_importance(self, class_name: str) -> float: """ 根據物件類別返回重要性係數 Args: class_name: 物件類別名稱 Returns: float: 類別重要性係數 (0.0-1.0) """ # 高重要性物件(人、車輛、建築) high_importance = ["person", "car", "truck", "bus", "motorcycle", "bicycle", "building"] # 中等重要性物件(家具、電器) medium_importance = ["chair", "couch", "tv", "laptop", "refrigerator", "dining table", "bed"] # 低重要性物件(小物品、配件) low_importance = ["handbag", "backpack", "umbrella", "cell phone", "remote", "mouse"] class_name_lower = class_name.lower() if any(item in class_name_lower for item in high_importance): return 1.0 elif any(item in class_name_lower for item in medium_importance): return 0.7 elif any(item in class_name_lower for item in low_importance): return 0.4 else: return 0.6 # 預設中等重要性 def format_object_list_for_description(self, objects: List[Dict], use_indefinite_article_for_one: bool = False, count_threshold_for_generalization: int = -1, max_types_to_list: int = 5) -> str: """ 將物件列表格式化為人類可讀的字符串,包含計數信息 Args: objects: 物件字典列表,每個應包含 'class_name' use_indefinite_article_for_one: 單個物件是否使用 "a/an",否則使用 "one" count_threshold_for_generalization: 超過此計數時使用通用術語,-1表示精確計數 max_types_to_list: 列表中包含的不同物件類型最大數量 Returns: str: 格式化的物件描述字符串 """ try: if not objects: return "no specific objects clearly identified" counts: Dict[str, int] = {} for obj in objects: name = obj.get("class_name", "unknown object") if name == "unknown object" or not name: continue counts[name] = counts.get(name, 0) + 1 if not counts: return "no specific objects clearly identified" descriptions = [] # 按計數降序然後按名稱升序排序,限制物件類型數量 sorted_counts = sorted(counts.items(), key=lambda item: (-item[1], item[0]))[:max_types_to_list] for name, count in sorted_counts: if count == 1: if use_indefinite_article_for_one: if name[0].lower() in 'aeiou': descriptions.append(f"an {name}") else: descriptions.append(f"a {name}") else: descriptions.append(f"one {name}") else: # 處理複數形式 plural_name = name if name.endswith("y") and not name.lower().endswith(("ay", "ey", "iy", "oy", "uy")): plural_name = name[:-1] + "ies" elif name.endswith(("s", "sh", "ch", "x", "z")): plural_name = name + "es" elif not name.endswith("s"): plural_name = name + "s" if count_threshold_for_generalization != -1 and count > count_threshold_for_generalization: if count <= count_threshold_for_generalization + 3: descriptions.append(f"several {plural_name}") else: descriptions.append(f"many {plural_name}") else: descriptions.append(f"{count} {plural_name}") if not descriptions: return "no specific objects clearly identified" if len(descriptions) == 1: return descriptions[0] elif len(descriptions) == 2: return f"{descriptions[0]} and {descriptions[1]}" else: # 使用牛津逗號格式 return ", ".join(descriptions[:-1]) + f", and {descriptions[-1]}" except Exception as e: self.logger.warning(f"Error formatting object list: {str(e)}") return "various objects" def get_spatial_description(self, obj: Dict, image_width: Optional[int] = None, image_height: Optional[int] = None, region_analyzer: Optional[Any] = None) -> str: """ 為物件生成空間位置描述 Args: obj: 物件字典 image_width: 可選的圖像寬度 image_height: 可選的圖像高度 region_analyzer: 可選的RegionAnalyzer實例,用於生成標準化描述 Returns: str: 空間描述字符串,空值region時返回空字串 """ try: region = obj.get("region") or "" # 處理空值或無效region,直接返回空字串避免不完整描述 if not region.strip() or region == "unknown": # 根據物件類型提供合適的預設位置描述 if object_type and any(vehicle in object_type.lower() for vehicle in ["car", "truck", "bus"]): return "positioned in the scene" elif object_type and "person" in object_type.lower(): return "present in the area" else: return "located in the scene" # 如果提供了RegionAnalyzer實例,使用其標準化方法 if region_analyzer and hasattr(region_analyzer, 'get_spatial_description_phrase'): object_type = obj.get("class_name", "") if hasattr(region_analyzer, 'get_contextual_spatial_description'): spatial_desc = region_analyzer.get_contextual_spatial_description(region, object_type) else: spatial_desc = region_analyzer.get_spatial_description_phrase(region) if spatial_desc: return spatial_desc # 備用邏輯:使用改進的內建映射 clean_region = region.replace('_', ' ').strip().lower() region_map = { "top left": "in the upper left area", "top center": "in the upper area", "top right": "in the upper right area", "middle left": "on the left side", "middle center": "in the center", "center": "in the center", "middle right": "on the right side", "bottom left": "in the lower left area", "bottom center": "in the lower area", "bottom right": "in the lower right area" } # 直接映射匹配 if clean_region in region_map: return region_map[clean_region] # 模糊匹配處理 if "top" in clean_region and "left" in clean_region: return "in the upper left area" elif "top" in clean_region and "right" in clean_region: return "in the upper right area" elif "bottom" in clean_region and "left" in clean_region: return "in the lower left area" elif "bottom" in clean_region and "right" in clean_region: return "in the lower right area" elif "top" in clean_region: return "in the upper area" elif "bottom" in clean_region: return "in the lower area" elif "left" in clean_region: return "on the left side" elif "right" in clean_region: return "on the right side" elif "center" in clean_region or "middle" in clean_region: return "in the center" # 如果region無法識別,使用normalized_center作為最後備用 norm_center = obj.get("normalized_center") if norm_center and image_width and image_height: x_norm, y_norm = norm_center h_pos = "left" if x_norm < 0.4 else "right" if x_norm > 0.6 else "center" v_pos = "upper" if y_norm < 0.4 else "lower" if y_norm > 0.6 else "center" if h_pos == "center" and v_pos == "center": return "in the center" return f"in the {v_pos} {h_pos} area" # 如果所有方法都失敗,返回空字串 return "" except Exception as e: self.logger.warning(f"Error generating spatial description: {str(e)}") return "" def optimize_object_description(self, description: str) -> str: """ 優化物件描述,避免重複列舉相同物件 Args: description: 原始描述文本 Returns: str: 優化後的描述文本 """ try: import re # 處理床鋪重複描述 if "bed in the room" in description: description = description.replace("a bed in the room", "a bed") # 處理重複的物件列表 object_lists = re.findall(r'with ([^\.]+?)(?:\.|\band\b)', description) for obj_list in object_lists: # 計算每個物件出現次數 items = re.findall(r'([a-zA-Z\s]+)(?:,|\band\b|$)', obj_list) item_counts = {} for item in items: item = item.strip() if item and item not in ["and", "with"]: if item not in item_counts: item_counts[item] = 0 item_counts[item] += 1 # 生成優化後的物件列表 if item_counts: new_items = [] for item, count in item_counts.items(): if count > 1: new_items.append(f"{count} {item}s") else: new_items.append(item) # 格式化新列表 if len(new_items) == 1: new_list = new_items[0] elif len(new_items) == 2: new_list = f"{new_items[0]} and {new_items[1]}" else: new_list = ", ".join(new_items[:-1]) + f", and {new_items[-1]}" # 替換原始列表 description = description.replace(obj_list, new_list) return description except Exception as e: self.logger.warning(f"Error optimizing object description: {str(e)}") return description def generate_dynamic_everyday_description(self, detected_objects: List[Dict], lighting_info: Optional[Dict] = None, viewpoint: str = "eye_level", spatial_analysis: Optional[Dict] = None, image_dimensions: Optional[Tuple[int, int]] = None, places365_info: Optional[Dict] = None, object_statistics: Optional[Dict] = None) -> str: """ 為日常場景動態生成描述,基於所有相關的檢測物件、計數和上下文 Args: detected_objects: 檢測到的物件列表 lighting_info: 照明信息 viewpoint: 視角類型 spatial_analysis: 空間分析結果 image_dimensions: 圖像尺寸 places365_info: Places365場景分類信息 object_statistics: 物件統計信息 Returns: str: 動態生成的場景描述 """ try: description_segments = [] image_width, image_height = image_dimensions if image_dimensions else (None, None) self.logger.debug(f"Generating dynamic description for {len(detected_objects)} objects, " f"viewpoint: {viewpoint}, lighting: {lighting_info is not None}") # 1. 整體氛圍(照明和視角) ambiance_parts = [] if lighting_info: time_of_day = lighting_info.get("time_of_day", "unknown lighting") is_indoor = lighting_info.get("is_indoor") ambiance_statement = "This is" if is_indoor is True: ambiance_statement += " an indoor scene" elif is_indoor is False: ambiance_statement += " an outdoor scene" else: ambiance_statement += " a scene" # remove underline readable_lighting = f"with {time_of_day.replace('_', ' ')} lighting conditions" ambiance_statement += f", likely {readable_lighting}." ambiance_parts.append(ambiance_statement) if viewpoint and viewpoint != "eye_level": if not ambiance_parts: ambiance_parts.append(f"From {viewpoint.replace('_', ' ')}, the general layout of the scene is observed.") else: ambiance_parts[-1] = ambiance_parts[-1].rstrip('.') + f", viewed from {viewpoint.replace('_', ' ')}." if ambiance_parts: description_segments.append(" ".join(ambiance_parts)) # 2. 描述所有檢測到的物件,按類別分組,使用準確計數和位置 if not detected_objects: if not description_segments: description_segments.append("A general scene is visible, but no specific objects were clearly identified.") else: description_segments.append("Within this setting, no specific objects were clearly identified.") else: objects_by_class: Dict[str, List[Dict]] = {} # 使用置信度過濾 confident_objects = [obj for obj in detected_objects if obj.get("confidence", 0) >= self.confidence_threshold_for_description] print(f"DEBUG: After confidence filtering (threshold={self.confidence_threshold_for_description}):") for class_name in ["car", "traffic light", "person", "handbag"]: class_objects = [obj for obj in confident_objects if obj.get("class_name") == class_name] print(f"DEBUG: {class_name}: {len(class_objects)} confident objects") if not confident_objects: no_confident_obj_msg = "While some elements might be present, no objects were identified with sufficient confidence for a detailed description." if not description_segments: description_segments.append(no_confident_obj_msg) else: description_segments.append(no_confident_obj_msg.lower().capitalize()) else: if object_statistics: # 使用預計算的統計信息,採用動態的信心度 for class_name, stats in object_statistics.items(): count = stats.get("count", 0) avg_confidence = stats.get("avg_confidence", 0) # 動態調整置信度閾值 dynamic_threshold = self.confidence_threshold_for_description if class_name in ["potted plant", "vase", "clock", "book"]: dynamic_threshold = max(0.15, self.confidence_threshold_for_description * 0.6) elif count >= 3: dynamic_threshold = max(0.2, self.confidence_threshold_for_description * 0.8) if count > 0 and avg_confidence >= dynamic_threshold: matching_objects = [obj for obj in confident_objects if obj.get("class_name") == class_name] if not matching_objects: matching_objects = [obj for obj in detected_objects if obj.get("class_name") == class_name and obj.get("confidence", 0) >= dynamic_threshold] if matching_objects: actual_count = min(stats["count"], len(matching_objects)) objects_by_class[class_name] = matching_objects[:actual_count] else: # 備用邏輯,同樣使用動態閾值 for obj in confident_objects: name = obj.get("class_name", "unknown object") if name == "unknown object" or not name: continue if name not in objects_by_class: objects_by_class[name] = [] objects_by_class[name].append(obj) print(f"DEBUG: Before spatial deduplication:") for class_name in ["car", "traffic light", "person", "handbag"]: if class_name in objects_by_class: print(f"DEBUG: {class_name}: {len(objects_by_class[class_name])} objects before dedup") if not objects_by_class: description_segments.append("No common objects were confidently identified for detailed description.") else: # 物件組排序函數 def sort_key_object_groups(item_tuple: Tuple[str, List[Dict]]): class_name_key, obj_group_list = item_tuple priority = 3 count = len(obj_group_list) # 確保類別名稱已標準化 normalized_class_name = self._normalize_object_class_name(class_name_key) # 動態優先級 if normalized_class_name == "person": priority = 0 elif normalized_class_name in ["dining table", "chair", "sofa", "bed"]: priority = 1 elif normalized_class_name in ["car", "bus", "truck", "traffic light"]: priority = 2 elif count >= 3: priority = max(1, priority - 1) elif normalized_class_name in ["potted plant", "vase", "clock", "book"] and count >= 2: priority = 2 avg_area = sum(o.get("normalized_area", 0.0) for o in obj_group_list) / len(obj_group_list) if obj_group_list else 0 quantity_bonus = min(count / 5.0, 1.0) return (priority, -len(obj_group_list), -avg_area, -quantity_bonus) # remove duplicate deduplicated_objects_by_class = {} processed_positions = [] for class_name, group_of_objects in objects_by_class.items(): unique_objects = [] for obj in group_of_objects: obj_position = obj.get("normalized_center", [0.5, 0.5]) is_duplicate = False for processed_pos in processed_positions: position_distance = abs(obj_position[0] - processed_pos[0]) + abs(obj_position[1] - processed_pos[1]) if position_distance < 0.15: is_duplicate = True break if not is_duplicate: unique_objects.append(obj) processed_positions.append(obj_position) if unique_objects: deduplicated_objects_by_class[class_name] = unique_objects objects_by_class = deduplicated_objects_by_class print(f"DEBUG: After spatial deduplication:") for class_name in ["car", "traffic light", "person", "handbag"]: if class_name in objects_by_class: print(f"DEBUG: {class_name}: {len(objects_by_class[class_name])} objects after dedup") sorted_object_groups = sorted(objects_by_class.items(), key=sort_key_object_groups) object_clauses = [] for class_name, group_of_objects in sorted_object_groups: count = len(group_of_objects) if class_name in ["car", "traffic light", "person", "handbag"]: print(f"DEBUG: Final count for {class_name}: {count}") if count == 0: continue # 標準化class name normalized_class_name = self._normalize_object_class_name(class_name) # 使用統計信息確保準確的數量描述 if object_statistics and class_name in object_statistics: actual_count = object_statistics[class_name]["count"] formatted_name_with_exact_count = self._format_object_count_description( normalized_class_name, actual_count, scene_type=scene_type ) else: formatted_name_with_exact_count = self._format_object_count_description( normalized_class_name, count, scene_type=scene_type ) if formatted_name_with_exact_count == "no specific objects clearly identified" or not formatted_name_with_exact_count: continue # 確定群組的集體位置 location_description_suffix = "" if count == 1: spatial_desc = self.get_spatial_description(group_of_objects[0], image_width, image_height, self.region_analyzer) if spatial_desc: location_description_suffix = f"is {spatial_desc}" else: distinct_regions = sorted(list(set(obj.get("region", "") for obj in group_of_objects if obj.get("region")))) valid_regions = [r for r in distinct_regions if r and r != "unknown" and r.strip()] if not valid_regions: location_description_suffix = "is positioned in the scene" elif len(valid_regions) == 1: spatial_desc = self.get_spatial_description_phrase(valid_regions[0]) location_description_suffix = f"is primarily {spatial_desc}" if spatial_desc else "is positioned in the scene" elif len(valid_regions) == 2: clean_region1 = valid_regions[0].replace('_', ' ') clean_region2 = valid_regions[1].replace('_', ' ') location_description_suffix = f"is mainly across the {clean_region1} and {clean_region2} areas" else: location_description_suffix = "is distributed in various parts of the scene" else: distinct_regions = sorted(list(set(obj.get("region", "") for obj in group_of_objects if obj.get("region")))) valid_regions = [r for r in distinct_regions if r and r != "unknown" and r.strip()] if not valid_regions: location_description_suffix = "are visible in the scene" elif len(valid_regions) == 1: clean_region = valid_regions[0].replace('_', ' ') location_description_suffix = f"are primarily in the {clean_region} area" elif len(valid_regions) == 2: clean_region1 = valid_regions[0].replace('_', ' ') clean_region2 = valid_regions[1].replace('_', ' ') location_description_suffix = f"are mainly across the {clean_region1} and {clean_region2} areas" else: location_description_suffix = "are distributed in various parts of the scene" # 首字母大寫 formatted_name_capitalized = formatted_name_with_exact_count[0].upper() + formatted_name_with_exact_count[1:] object_clauses.append(f"{formatted_name_capitalized} {location_description_suffix}") if object_clauses: if not description_segments: if object_clauses: first_clause = object_clauses.pop(0) description_segments.append(first_clause + ".") else: if object_clauses: description_segments.append("The scene features:") if object_clauses: joined_object_clauses = ". ".join(object_clauses) if joined_object_clauses and not joined_object_clauses.endswith("."): joined_object_clauses += "." description_segments.append(joined_object_clauses) elif not description_segments: return "The image depicts a scene, but specific objects could not be described with confidence or detail." # 最終組裝和格式化 raw_description = "" for i, segment in enumerate(filter(None, description_segments)): segment = segment.strip() if not segment: continue if not raw_description: raw_description = segment else: if not raw_description.endswith(('.', '!', '?')): raw_description += "." raw_description += " " + (segment[0].upper() + segment[1:] if len(segment) > 1 else segment.upper()) if raw_description and not raw_description.endswith(('.', '!', '?')): raw_description += "." # 移除重複性和不適當的描述詞彙 raw_description = self._remove_repetitive_descriptors(raw_description) if not raw_description or len(raw_description.strip()) < 20: if 'confident_objects' in locals() and confident_objects: return "The scene contains several detected objects, but a detailed textual description could not be fully constructed." else: return "A general scene is depicted with no objects identified with high confidence." return raw_description except Exception as e: error_msg = f"Error generating dynamic everyday description: {str(e)}" self.logger.error(f"{error_msg}\n{traceback.format_exc()}") raise ObjectDescriptionError(error_msg) from e def _remove_repetitive_descriptors(self, description: str) -> str: """ 移除描述中的重複性和不適當的描述詞彙,特別是 "identical" 等詞彙 Args: description: 原始描述文本 Returns: str: 清理後的描述文本 """ try: import re # 定義需要移除或替換的模式 cleanup_patterns = [ # 移除 "identical" 描述模式 (r'\b(\d+)\s+identical\s+([a-zA-Z\s]+)', r'\1 \2'), (r'\b(two|three|four|five|six|seven|eight|nine|ten|eleven|twelve)\s+identical\s+([a-zA-Z\s]+)', r'\1 \2'), (r'\bidentical\s+([a-zA-Z\s]+)', r'\1'), # 改善 "comprehensive arrangement" 等過於技術性的表達 (r'\bcomprehensive arrangement of\b', 'arrangement of'), (r'\bcomprehensive view featuring\b', 'scene featuring'), (r'\bcomprehensive display of\b', 'display of'), # 簡化過度描述性的短語 (r'\bpositioning around\s+(\d+)\s+identical\b', r'positioning around \1'), (r'\barranged around\s+(\d+)\s+identical\b', r'arranged around \1'), ] processed_description = description for pattern, replacement in cleanup_patterns: processed_description = re.sub(pattern, replacement, processed_description, flags=re.IGNORECASE) # 進一步清理可能的多餘空格 processed_description = re.sub(r'\s+', ' ', processed_description).strip() self.logger.debug(f"Cleaned description: removed repetitive descriptors") return processed_description except Exception as e: self.logger.warning(f"Error removing repetitive descriptors: {str(e)}") return description def _format_object_count_description(self, class_name: str, count: int, scene_type: Optional[str] = None, detected_objects: Optional[List[Dict]] = None, avg_confidence: float = 0.0) -> str: """ 格式化物件數量描述的核心方法,整合空間排列、材質推斷和場景語境 這個方法是整個物件描述系統的核心,它將多個子功能整合在一起: 1. 數字到文字的轉換(避免阿拉伯數字) 2. 基於場景的材質推斷 3. 空間排列模式的描述 4. 語境化的物件描述 Args: class_name: 標準化後的類別名稱 count: 物件數量 scene_type: 場景類型,用於語境化描述 detected_objects: 該類型的所有檢測物件,用於空間分析 avg_confidence: 平均檢測置信度,影響材質推斷的可信度 Returns: str: 完整的格式化數量描述 """ try: if count <= 0: return "" # 獲取基礎的複數形式 plural_form = self._get_plural_form(class_name) # 單數情況的處理 if count == 1: return self._format_single_object_description(class_name, scene_type, detected_objects, avg_confidence) # 複數情況的處理 return self._format_multiple_objects_description(class_name, count, plural_form, scene_type, detected_objects, avg_confidence) except Exception as e: self.logger.warning(f"Error formatting object count for '{class_name}': {str(e)}") return f"{count} {class_name}s" if count > 1 else class_name def _format_single_object_description(self, class_name: str, scene_type: Optional[str], detected_objects: Optional[List[Dict]], avg_confidence: float) -> str: """ 處理單個物件的描述生成 對於單個物件,我們重點在於通過材質推斷和位置描述來豐富描述內容, 避免簡單的 "a chair" 這樣的描述,而是生成 "a wooden dining chair" 這樣的表達 Args: class_name: 物件類別名稱 scene_type: 場景類型 detected_objects: 檢測物件列表 avg_confidence: 平均置信度 Returns: str: 單個物件的完整描述 """ article = "an" if class_name[0].lower() in 'aeiou' else "a" # 獲取材質描述符 material_descriptor = self._get_material_descriptor(class_name, scene_type, avg_confidence) # 獲取位置或特徵描述符 feature_descriptor = self._get_single_object_feature(class_name, scene_type, detected_objects) # 組合描述 descriptors = [] if material_descriptor: descriptors.append(material_descriptor) if feature_descriptor: descriptors.append(feature_descriptor) if descriptors: return f"{article} {' '.join(descriptors)} {class_name}" else: return f"{article} {class_name}" def _format_multiple_objects_description(self, class_name: str, count: int, plural_form: str, scene_type: Optional[str], detected_objects: Optional[List[Dict]], avg_confidence: float) -> str: """ 處理多個物件的描述生成 對於多個物件,我們的重點是: 1. 將數字轉換為文字表達 2. 分析空間排列模式 3. 添加適當的材質或功能描述 4. 生成自然流暢的描述 Args: class_name: 物件類別名稱 count: 物件數量 plural_form: 複數形式 scene_type: 場景類型 detected_objects: 檢測物件列表 avg_confidence: 平均置信度 Returns: str: 多個物件的完整描述 """ # 數字到文字的轉換映射 number_words = { 2: "two", 3: "three", 4: "four", 5: "five", 6: "six", 7: "seven", 8: "eight", 9: "nine", 10: "ten", 11: "eleven", 12: "twelve" } # 確定基礎數量表達 if count in number_words: count_expression = number_words[count] elif count <= 20: count_expression = "several" else: count_expression = "numerous" # 獲取材質或功能描述符 material_descriptor = self._get_material_descriptor(class_name, scene_type, avg_confidence) # 獲取空間排列描述 spatial_descriptor = self._get_spatial_arrangement_descriptor(class_name, scene_type, detected_objects, count) # 組合最終描述 descriptors = [] if material_descriptor: descriptors.append(material_descriptor) # 構建基礎描述 base_description = f"{count_expression} {' '.join(descriptors)} {plural_form}".strip() # 添加空間排列信息 if spatial_descriptor: return f"{base_description} {spatial_descriptor}" else: return base_description def _get_material_descriptor(self, class_name: str, scene_type: Optional[str], avg_confidence: float) -> Optional[str]: """ 基於場景語境和置信度進行材質推斷 這個方法實現了智能的材質推斷,它不依賴複雜的圖像分析, 而是基於常識和場景邏輯來推斷最可能的材質描述 Args: class_name: 物件類別名稱 scene_type: 場景類型 avg_confidence: 檢測置信度,影響推斷的保守程度 Returns: Optional[str]: 材質描述符,如果無法推斷則返回None """ # 只有在置信度足夠高時才進行材質推斷 if avg_confidence < 0.5: return None # 餐廳和用餐相關場景 if scene_type and scene_type in ["dining_area", "restaurant", "upscale_dining", "cafe"]: material_mapping = { "chair": "wooden" if avg_confidence > 0.7 else None, "dining table": "wooden", "couch": "upholstered", "vase": "decorative" } return material_mapping.get(class_name) # 辦公場景 elif scene_type and scene_type in ["office_workspace", "meeting_room", "conference_room"]: material_mapping = { "chair": "office", "dining table": "conference", # 在辦公環境中,餐桌通常是會議桌 "laptop": "modern", "book": "reference" } return material_mapping.get(class_name) # 客廳場景 elif scene_type and scene_type in ["living_room"]: material_mapping = { "couch": "comfortable", "chair": "accent", "tv": "large", "vase": "decorative" } return material_mapping.get(class_name) # 室外場景 elif scene_type and scene_type in ["city_street", "park_area", "parking_lot"]: material_mapping = { "car": "parked", "person": "walking", "bicycle": "stationed" } return material_mapping.get(class_name) # 如果沒有特定的場景映射,返回通用描述符 generic_mapping = { "chair": "comfortable", "dining table": "sturdy", "car": "parked", "person": "present" } return generic_mapping.get(class_name) def _get_spatial_arrangement_descriptor(self, class_name: str, scene_type: Optional[str], detected_objects: Optional[List[Dict]], count: int) -> Optional[str]: """ 分析物件的空間排列模式並生成相應描述 這個方法通過分析物件的位置分布來判斷排列模式, 然後根據物件類型和場景生成適當的空間描述 Args: class_name: 物件類別名稱 scene_type: 場景類型 detected_objects: 該類型的所有檢測物件 count: 物件數量 Returns: Optional[str]: 空間排列描述,如果無法分析則返回None """ if not detected_objects or len(detected_objects) < 2: return None try: # 提取物件的標準化位置 positions = [] for obj in detected_objects: center = obj.get("normalized_center", [0.5, 0.5]) if isinstance(center, (list, tuple)) and len(center) >= 2: positions.append(center) if len(positions) < 2: return None # 分析排列模式 arrangement_pattern = self._analyze_arrangement_pattern(positions) # 根據物件類型和場景生成描述 return self._generate_arrangement_description(class_name, scene_type, arrangement_pattern, count) except Exception as e: self.logger.warning(f"Error analyzing spatial arrangement: {str(e)}") return None def _analyze_arrangement_pattern(self, positions: List[List[float]]) -> str: """ 分析位置點的排列模式 這個方法使用簡單的幾何分析來判斷物件的排列類型, 幫助我們理解物件在空間中的組織方式 Args: positions: 標準化的位置座標列表 Returns: str: 排列模式類型(linear, clustered, scattered, circular等) """ import numpy as np if len(positions) < 2: return "single" # 轉換為numpy陣列便於計算 pos_array = np.array(positions) # 計算位置的分布特徵 x_coords = pos_array[:, 0] y_coords = pos_array[:, 1] # 分析x和y方向的變異程度 x_variance = np.var(x_coords) y_variance = np.var(y_coords) # 計算物件間的平均距離 distances = [] for i in range(len(positions)): for j in range(i + 1, len(positions)): dist = np.sqrt((positions[i][0] - positions[j][0])**2 + (positions[i][1] - positions[j][1])**2) distances.append(dist) avg_distance = np.mean(distances) if distances else 0 distance_variance = np.var(distances) if distances else 0 # 判斷排列模式 if len(positions) >= 4 and self._is_circular_pattern(positions): return "circular" elif x_variance < 0.05 or y_variance < 0.05: # 一個方向變異很小 return "linear" elif avg_distance < 0.3 and distance_variance < 0.02: # 物件聚集且距離相近 return "clustered" elif avg_distance > 0.6: # 物件分散 return "scattered" elif distance_variance < 0.03: # 距離一致,可能是規則排列 return "regular" else: return "distributed" def _is_circular_pattern(self, positions: List[List[float]]) -> bool: """ 檢查位置是否形成圓形或環形排列 Args: positions: 位置座標列表 Returns: bool: 是否為圓形排列 """ import numpy as np if len(positions) < 4: return False try: pos_array = np.array(positions) # 計算中心點 center_x = np.mean(pos_array[:, 0]) center_y = np.mean(pos_array[:, 1]) # 計算每個點到中心的距離 distances_to_center = [] for pos in positions: dist = np.sqrt((pos[0] - center_x)**2 + (pos[1] - center_y)**2) distances_to_center.append(dist) # 如果所有距離都相近,可能是圓形排列 distance_variance = np.var(distances_to_center) return distance_variance < 0.05 and np.mean(distances_to_center) > 0.2 except: return False def _generate_arrangement_description(self, class_name: str, scene_type: Optional[str], arrangement_pattern: str, count: int) -> Optional[str]: """ 根據物件類型、場景和排列模式生成空間描述 這個方法將抽象的排列模式轉換為自然語言描述, 並根據具體的物件類型和場景語境進行定制 Args: class_name: 物件類別名稱 scene_type: 場景類型 arrangement_pattern: 排列模式 count: 物件數量 Returns: Optional[str]: 生成的空間排列描述 """ # 基於物件類型的描述模板 arrangement_templates = { "chair": { "linear": "arranged in a row", "clustered": "grouped together for conversation", "circular": "arranged around the table", "scattered": "positioned throughout the space", "regular": "evenly spaced", "distributed": "thoughtfully positioned" }, "dining table": { "linear": "aligned to create a unified dining space", "clustered": "grouped to form intimate dining areas", "scattered": "distributed to optimize space flow", "regular": "systematically positioned", "distributed": "strategically placed" }, "car": { "linear": "parked in sequence", "clustered": "grouped in the parking area", "scattered": "distributed throughout the lot", "regular": "neatly parked", "distributed": "positioned across the area" }, "person": { "linear": "moving in a line", "clustered": "gathered together", "circular": "forming a circle", "scattered": "spread across the area", "distributed": "positioned throughout the scene" } } # 獲取對應的描述模板 if class_name in arrangement_templates: template_dict = arrangement_templates[class_name] base_description = template_dict.get(arrangement_pattern, "positioned in the scene") else: # 通用的排列描述 generic_templates = { "linear": "arranged in a line", "clustered": "grouped together", "circular": "arranged in a circular pattern", "scattered": "distributed across the space", "regular": "evenly positioned", "distributed": "thoughtfully placed" } base_description = generic_templates.get(arrangement_pattern, "positioned in the scene") return base_description def _get_single_object_feature(self, class_name: str, scene_type: Optional[str], detected_objects: Optional[List[Dict]]) -> Optional[str]: """ 為單個物件生成特徵描述符 當只有一個物件時,我們可以提供更具體的位置或功能描述 Args: class_name: 物件類別名稱 scene_type: 場景類型 detected_objects: 檢測物件(單個) Returns: Optional[str]: 特徵描述符 """ if not detected_objects or len(detected_objects) != 1: return None obj = detected_objects[0] region = obj.get("region", "").lower() # 基於位置的描述 if "center" in region: if class_name == "dining table": return "central" elif class_name == "chair": return "centrally placed" elif "corner" in region or "left" in region or "right" in region: return "positioned" # 基於場景的功能描述 if scene_type and scene_type in ["dining_area", "restaurant"]: if class_name == "chair": return "dining" elif class_name == "vase": return "decorative" return None def _get_plural_form(self, word: str) -> str: """ 獲取詞彙的複數形式 Args: word: 單數詞彙 Returns: str: 複數形式 """ try: # 特殊複數形式 irregular_plurals = { 'person': 'people', 'child': 'children', 'foot': 'feet', 'tooth': 'teeth', 'mouse': 'mice', 'man': 'men', 'woman': 'women' } if word.lower() in irregular_plurals: return irregular_plurals[word.lower()] # 規則複數形式 if word.endswith(('s', 'sh', 'ch', 'x', 'z')): return word + 'es' elif word.endswith('y') and word[-2] not in 'aeiou': return word[:-1] + 'ies' elif word.endswith('f'): return word[:-1] + 'ves' elif word.endswith('fe'): return word[:-2] + 'ves' else: return word + 's' except Exception as e: self.logger.warning(f"Error getting plural form for '{word}': {str(e)}") return word + 's' def _normalize_object_class_name(self, class_name: str) -> str: """ 標準化物件類別名稱,確保輸出自然語言格式 Args: class_name: 原始類別名稱 Returns: str: 標準化後的類別名稱 """ try: if not class_name or not isinstance(class_name, str): return "object" # 移除可能的技術性前綴或後綴 import re normalized = re.sub(r'^(class_|id_|type_)', '', class_name.lower()) normalized = re.sub(r'(_class|_id|_type)$', '', normalized) # 將下劃線和連字符替換為空格 normalized = normalized.replace('_', ' ').replace('-', ' ') # 移除多餘空格 normalized = ' '.join(normalized.split()) # 特殊類別名稱的標準化映射 class_name_mapping = { 'traffic light': 'traffic light', 'stop sign': 'stop sign', 'fire hydrant': 'fire hydrant', 'dining table': 'dining table', 'potted plant': 'potted plant', 'tv monitor': 'television', 'cell phone': 'mobile phone', 'wine glass': 'wine glass', 'hot dog': 'hot dog', 'teddy bear': 'teddy bear', 'hair drier': 'hair dryer', 'toothbrush': 'toothbrush' } return class_name_mapping.get(normalized, normalized) except Exception as e: self.logger.warning(f"Error normalizing class name '{class_name}': {str(e)}") return class_name if isinstance(class_name, str) else "object" def generate_basic_details(self, scene_type: str, detected_objects: List[Dict]) -> str: """ 當模板不可用時生成基本詳細信息 Args: scene_type: 識別的場景類型 detected_objects: 檢測到的物件列表 Returns: str: 基本場景詳細信息 """ try: # 處理特定場景類型的自定義邏輯 if scene_type == "living_room": tv_objs = [obj for obj in detected_objects if obj.get("class_id") == 62] # TV sofa_objs = [obj for obj in detected_objects if obj.get("class_id") == 57] # Sofa if tv_objs and sofa_objs: tv_region = tv_objs[0].get("region", "center") sofa_region = sofa_objs[0].get("region", "center") arrangement = f"The TV is in the {tv_region.replace('_', ' ')} of the image, " arrangement += f"while the sofa is in the {sofa_region.replace('_', ' ')}. " return f"{arrangement}This appears to be a space designed for relaxation and entertainment." elif scene_type == "bedroom": bed_objs = [obj for obj in detected_objects if obj.get("class_id") == 59] # Bed if bed_objs: bed_region = bed_objs[0].get("region", "center") extra_items = [] for obj in detected_objects: if obj.get("class_id") == 74: # Clock extra_items.append("clock") elif obj.get("class_id") == 73: # Book extra_items.append("book") extras = "" if extra_items: extras = f" There is also a {' and a '.join(extra_items)} visible." return f"The bed is located in the {bed_region.replace('_', ' ')} of the image.{extras}" elif scene_type in ["dining_area", "kitchen"]: # 計算食物和餐飲相關物品 food_items = [] for obj in detected_objects: if obj.get("class_id") in [39, 41, 42, 43, 44, 45]: # 廚房物品 food_items.append(obj.get("class_name", "kitchen item")) food_str = "" if food_items: unique_items = list(set(food_items)) if len(unique_items) <= 3: food_str = f" with {', '.join(unique_items)}" else: food_str = f" with {', '.join(unique_items[:3])} and other items" return f"{food_str}." elif scene_type == "city_street": # 計算人員和車輛 people_count = len([obj for obj in detected_objects if obj.get("class_id") == 0]) vehicle_count = len([obj for obj in detected_objects if obj.get("class_id") in [1, 2, 3, 5, 7]]) # Bicycle, car, motorbike, bus, truck traffic_desc = "" if people_count > 0 and vehicle_count > 0: traffic_desc = f" with {people_count} {'people' if people_count > 1 else 'person'} and " traffic_desc += f"{vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}" elif people_count > 0: traffic_desc = f" with {people_count} {'people' if people_count > 1 else 'person'}" elif vehicle_count > 0: traffic_desc = f" with {vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}" return f"{traffic_desc}." elif scene_type == "asian_commercial_street": # 尋找關鍵城市元素 people_count = len([obj for obj in detected_objects if obj.get("class_id") == 0]) vehicle_count = len([obj for obj in detected_objects if obj.get("class_id") in [1, 2, 3]]) # 分析行人分布 people_positions = [] for obj in detected_objects: if obj.get("class_id") == 0: # Person people_positions.append(obj.get("normalized_center", (0.5, 0.5))) # 檢查人員是否沿線分布(表示步行路徑) structured_path = False if len(people_positions) >= 3: # 簡化檢查 - 查看多個人員的y坐標是否相似 y_coords = [pos[1] for pos in people_positions] y_mean = sum(y_coords) / len(y_coords) y_variance = sum((y - y_mean)**2 for y in y_coords) / len(y_coords) if y_variance < 0.05: # 低變異數表示線性排列 structured_path = True street_desc = "A commercial street with " if people_count > 0: street_desc += f"{people_count} {'pedestrians' if people_count > 1 else 'pedestrian'}" if vehicle_count > 0: street_desc += f" and {vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}" elif vehicle_count > 0: street_desc += f"{vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}" else: street_desc += "various commercial elements" if structured_path: street_desc += ". The pedestrians appear to be following a defined walking path" # 添加文化元素 street_desc += ". The signage and architectural elements suggest an Asian urban setting." return street_desc # 默認通用描述 return "The scene contains various elements characteristic of this environment." except Exception as e: self.logger.warning(f"Error generating basic details for scene_type '{scene_type}': {str(e)}") return "The scene contains various elements characteristic of this environment." def generate_placeholder_content(self, placeholder: str, detected_objects: List[Dict], scene_type: str) -> str: """ 為模板佔位符生成內容 Args: placeholder: 模板佔位符 detected_objects: 檢測到的物件列表 scene_type: 場景類型 Returns: str: 生成的佔位符內容 """ try: # 處理不同類型的佔位符與自定義邏輯 if placeholder == "furniture": # 提取家具物品 furniture_ids = [56, 57, 58, 59, 60, 61] # 家具類別ID示例 furniture_objects = [obj for obj in detected_objects if obj.get("class_id") in furniture_ids] if furniture_objects: furniture_names = [] for obj in furniture_objects[:3]: raw_name = obj.get("class_name", "furniture") normalized_name = self._normalize_object_class_name(raw_name) furniture_names.append(normalized_name) unique_names = list(set(furniture_names)) if len(unique_names) == 1: return unique_names[0] elif len(unique_names) == 2: return f"{unique_names[0]} and {unique_names[1]}" else: return ", ".join(unique_names[:-1]) + f", and {unique_names[-1]}" return "various furniture items" elif placeholder == "electronics": # 提取電子物品 electronics_ids = [62, 63, 64, 65, 66, 67, 68, 69, 70] # 電子設備類別ID示例 electronics_objects = [obj for obj in detected_objects if obj.get("class_id") in electronics_ids] if electronics_objects: electronics_names = [obj.get("class_name", "electronic device") for obj in electronics_objects[:3]] return ", ".join(set(electronics_names)) return "electronic devices" elif placeholder == "people_count": # 計算人數 people_count = len([obj for obj in detected_objects if obj.get("class_id") == 0]) if people_count == 0: return "no people" elif people_count == 1: return "one person" elif people_count < 5: return f"{people_count} people" else: return "several people" elif placeholder == "seating": # 提取座位物品 seating_ids = [56, 57] # chair, sofa seating_objects = [obj for obj in detected_objects if obj.get("class_id") in seating_ids] if seating_objects: seating_names = [obj.get("class_name", "seating") for obj in seating_objects[:2]] return ", ".join(set(seating_names)) return "seating arrangements" # 默認情況 - 空字符串 return "" except Exception as e: self.logger.warning(f"Error generating placeholder content for '{placeholder}': {str(e)}") return "" def describe_functional_zones(self, functional_zones: Dict) -> str: """ 生成場景功能區域的描述,優化處理行人區域、人數統計和物品重複問題 Args: functional_zones: 識別出的功能區域字典 Returns: str: 功能區域描述 """ try: if not functional_zones: return "" # 處理不同類型的 functional_zones 參數 if isinstance(functional_zones, list): # 如果是列表,轉換為字典格式 zones_dict = {} for i, zone in enumerate(functional_zones): if isinstance(zone, dict) and 'name' in zone: zone_name = self._normalize_zone_name(zone['name']) else: zone_name = f"functional area {i+1}" zones_dict[zone_name] = zone if isinstance(zone, dict) else {"description": str(zone)} functional_zones = zones_dict elif not isinstance(functional_zones, dict): return "" # 標準化所有區域鍵名,移除內部標識符格式 normalized_zones = {} for zone_key, zone_data in functional_zones.items(): normalized_key = self._normalize_zone_name(zone_key) normalized_zones[normalized_key] = zone_data functional_zones = normalized_zones # 計算場景中的總人數 total_people_count = 0 people_by_zone = {} # 計算每個區域的人數並累計總人數 for zone_name, zone_info in functional_zones.items(): if "objects" in zone_info: zone_people_count = zone_info["objects"].count("person") people_by_zone[zone_name] = zone_people_count total_people_count += zone_people_count # 分類區域為行人區域和其他區域 pedestrian_zones = [] other_zones = [] for zone_name, zone_info in functional_zones.items(): # 檢查是否是行人相關區域 if any(keyword in zone_name.lower() for keyword in ["pedestrian", "crossing", "people"]): pedestrian_zones.append((zone_name, zone_info)) else: other_zones.append((zone_name, zone_info)) # 獲取最重要的行人區域和其他區域 main_pedestrian_zones = sorted(pedestrian_zones, key=lambda z: people_by_zone.get(z[0], 0), reverse=True)[:1] # 最多1個主要行人區域 top_other_zones = sorted(other_zones, key=lambda z: len(z[1].get("objects", [])), reverse=True)[:2] # 最多2個其他區域 # 合併區域 top_zones = main_pedestrian_zones + top_other_zones if not top_zones: return "" # 生成匯總描述 summary = "" max_mentioned_people = 0 # 追蹤已經提到的最大人數 # 如果總人數顯著且還沒在主描述中提到,添加總人數描述 if total_people_count > 5: summary = f"The scene contains a significant number of pedestrians ({total_people_count} people). " max_mentioned_people = total_people_count # 更新已提到的最大人數 # 處理每個區域的描述,確保人數信息的一致性 processed_zones = [] for zone_name, zone_info in top_zones: zone_desc = zone_info.get("description", "a functional zone") zone_people_count = people_by_zone.get(zone_name, 0) # 檢查描述中是否包含人數資訊 contains_people_info = "with" in zone_desc and ("person" in zone_desc.lower() or "people" in zone_desc.lower()) # 如果描述包含人數信息,且人數較小(小於已提到的最大人數),則修改描述 if contains_people_info and zone_people_count < max_mentioned_people: parts = zone_desc.split("with") if len(parts) > 1: # 移除人數部分 zone_desc = parts[0].strip() + " area" processed_zones.append((zone_name, {"description": zone_desc})) # 根據處理後的區域數量生成最終描述 final_desc = "" if len(processed_zones) == 1: _, zone_info = processed_zones[0] zone_desc = zone_info["description"] final_desc = summary + f"The scene includes {zone_desc}." elif len(processed_zones) == 2: _, zone1_info = processed_zones[0] _, zone2_info = processed_zones[1] zone1_desc = zone1_info["description"] zone2_desc = zone2_info["description"] final_desc = summary + f"The scene is divided into two main areas: {zone1_desc} and {zone2_desc}." else: zones_desc = ["The scene contains multiple functional areas including"] zone_descriptions = [z[1]["description"] for z in processed_zones] # 格式化最終的多區域描述 if len(zone_descriptions) == 3: formatted_desc = f"{zone_descriptions[0]}, {zone_descriptions[1]}, and {zone_descriptions[2]}" else: formatted_desc = ", ".join(zone_descriptions[:-1]) + f", and {zone_descriptions[-1]}" final_desc = summary + f"{zones_desc[0]} {formatted_desc}." return self.optimize_object_description(final_desc) except Exception as e: self.logger.warning(f"Error describing functional zones: {str(e)}") return "" def _normalize_zone_name(self, zone_name: str) -> str: """ 將內部區域鍵名標準化為自然語言描述 Args: zone_name: 原始區域名稱 Returns: str: 標準化後的區域名稱 """ try: if not zone_name or not isinstance(zone_name, str): return "functional area" # 移除數字後綴(如 crossing_zone_1 -> crossing_zone) import re base_name = re.sub(r'_\d+$', '', zone_name) # 將下劃線替換為空格 normalized = base_name.replace('_', ' ') # 標準化常見的區域類型名稱 zone_type_mapping = { 'crossing zone': 'pedestrian crossing area', 'vehicle zone': 'vehicle movement area', 'pedestrian zone': 'pedestrian activity area', 'traffic zone': 'traffic flow area', 'waiting zone': 'waiting area', 'seating zone': 'seating area', 'dining zone': 'dining area', 'furniture zone': 'furniture arrangement area', 'electronics zone': 'electronics area', 'people zone': 'social activity area', 'functional area': 'activity area' } # 檢查是否有對應的標準化名稱 for pattern, replacement in zone_type_mapping.items(): if pattern in normalized.lower(): return replacement # 如果沒有特定映射,使用通用格式 if 'zone' in normalized.lower(): normalized = normalized.replace('zone', 'area') elif not any(keyword in normalized.lower() for keyword in ['area', 'space', 'region']): normalized += ' area' return normalized.strip() except Exception as e: self.logger.warning(f"Error normalizing zone name '{zone_name}': {str(e)}") return "activity area" def get_configuration(self) -> Dict[str, Any]: """ 獲取當前配置參數 Returns: Dict[str, Any]: 配置參數字典 """ return { "min_prominence_score": self.min_prominence_score, "max_categories_to_return": self.max_categories_to_return, "max_total_objects": self.max_total_objects, "confidence_threshold_for_description": self.confidence_threshold_for_description } def update_configuration(self, **kwargs): """ 更新配置參數 Args: **kwargs: 要更新的配置參數 """ try: for key, value in kwargs.items(): if hasattr(self, key): old_value = getattr(self, key) setattr(self, key, value) self.logger.info(f"Updated {key}: {old_value} -> {value}") else: self.logger.warning(f"Unknown configuration parameter: {key}") except Exception as e: self.logger.error(f"Error updating configuration: {str(e)}") raise ObjectDescriptionError(f"Failed to update configuration: {str(e)}") from e