import os
import re
import json
import logging
import random
import numpy as np
from typing import Dict, List, Tuple, Any, Optional

from scene_type import SCENE_TYPES
from scene_detail_templates import SCENE_DETAIL_TEMPLATES
from object_template_fillers import OBJECT_TEMPLATE_FILLERS
from lighting_conditions import LIGHTING_CONDITIONS
from viewpoint_templates import VIEWPOINT_TEMPLATES
from cultural_templates import CULTURAL_TEMPLATES
from confidence_templates import CONFIDENCE_TEMPLATES
from landmark_data import ALL_LANDMARKS
from region_analyzer import RegionAnalyzer
from viewpoint_detector import ViewpointDetector, ViewpointDetectionError
from template_manager import TemplateManager, TemplateLoadingError, TemplateFillError
from object_description_generator import ObjectDescriptionGenerator, ObjectDescriptionError
from cultural_context_analyzer import CulturalContextAnalyzer, CulturalContextError
from text_formatter import TextFormatter, TextFormattingError


class EnhancedSceneDescriberError(Exception):
    """Raised when scene-description generation cannot be set up or reloaded."""
    pass


class EnhancedSceneDescriber:
    """
    Facade that produces natural-language scene descriptions.

    The class owns one instance each of ViewpointDetector, RegionAnalyzer,
    TemplateManager, ObjectDescriptionGenerator, CulturalContextAnalyzer and
    TextFormatter, and coordinates them in ``generate_description``. Most other
    public methods are thin delegating wrappers around those components.
    """

    # Scene types that always take the landmark description path.
    _LANDMARK_SCENE_TYPES = ["tourist_landmark", "natural_landmark", "historical_monument"]

    def __init__(self,
                 templates_db: Optional[Dict] = None,
                 scene_types: Optional[Dict] = None,
                 spatial_analyzer_instance: Optional[Any] = None):
        """
        Initialize the describer and all of its sub-components.

        Args:
            templates_db: Optional custom template database, forwarded to TemplateManager.
            scene_types: Optional scene-type definitions; SCENE_TYPES is used when falsy.
            spatial_analyzer_instance: Stored as-is for backward compatibility.

        Raises:
            EnhancedSceneDescriberError: If any step of the initialization raises.
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.setLevel(logging.INFO)

        # Attach a stream handler only when no handler is reachable for this logger.
        if not self.logger.hasHandlers():
            handler = logging.StreamHandler()
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)

        try:
            # Load scene type definitions.
            self.scene_types = scene_types or self._load_default_scene_types()

            # Build sub-components.
            self._initialize_components(templates_db)

            # Kept only for compatibility with callers that pass it in.
            self.spatial_analyzer_instance = spatial_analyzer_instance

            self.logger.info("EnhancedSceneDescriber initialized successfully with %d scene types",
                             len(self.scene_types))
        except Exception as e:
            error_msg = f"Failed to initialize EnhancedSceneDescriber: {str(e)}"
            self.logger.error(f"{error_msg}\n{e.__class__.__name__}: {str(e)}")
            raise EnhancedSceneDescriberError(error_msg) from e

    def _load_default_scene_types(self) -> Dict:
        """
        Return the default scene-type definitions.

        Returns:
            Dict: The imported SCENE_TYPES mapping, or an empty dict on error.
        """
        try:
            return SCENE_TYPES
        except Exception as e:
            self.logger.error(f"Failed to import SCENE_TYPES: {str(e)}")
            return {}

    def _initialize_components(self, templates_db: Optional[Dict]):
        """
        Create every sub-component used by the describer.

        On any failure the error is logged and ``_initialize_fallback_components``
        is called instead of raising.

        Args:
            templates_db: Optional template database forwarded to TemplateManager.
        """
        try:
            self.viewpoint_detector = ViewpointDetector()
            self.region_analyzer = RegionAnalyzer()
            self.template_manager = TemplateManager(custom_templates_db=templates_db)
            # The object description generator shares the region analyzer.
            self.object_description_generator = ObjectDescriptionGenerator(
                region_analyzer=self.region_analyzer
            )
            self.cultural_context_analyzer = CulturalContextAnalyzer()
            self.text_formatter = TextFormatter()

            self.logger.debug("All components initialized successfully")
        except Exception as e:
            error_msg = f"Component initialization failed: {str(e)}"
            self.logger.error(error_msg)
            # Degrade to the basic components rather than raising.
            self._initialize_fallback_components()

    @staticmethod
    def _functional_zones_as_dict(functional_zones: Any) -> Dict:
        """
        Return functional zones in the dict form passed to ``describe_functional_zones``.

        A dict is returned unchanged; any other iterable is converted to
        ``{"area_<i>": {"description": item}}``.
        """
        if isinstance(functional_zones, dict):
            return functional_zones
        return {f"area_{i}": {"description": desc} for i, desc in enumerate(functional_zones)}

    def generate_description(self,
                             scene_type: str,
                             detected_objects: List[Dict],
                             confidence: float,
                             lighting_info: Dict,
                             functional_zones: List[str],
                             enable_landmark: bool = True,
                             scene_scores: Optional[Dict] = None,
                             spatial_analysis: Optional[Dict] = None,
                             image_dimensions: Optional[Tuple[int, int]] = None,
                             places365_info: Optional[Dict] = None,
                             object_statistics: Optional[Dict] = None) -> str:
        """
        Build the full natural-language description for a scene.

        Flow: unknown or low-confidence scenes get a generic description;
        landmark scenes are routed to ``_generate_landmark_description``;
        otherwise a base description and a filled template are combined and
        enriched with people, cultural, lighting, viewpoint and
        functional-zone sentences, then de-duplicated and formatted.

        Args:
            scene_type: Identified scene type ("unknown" forces the generic path).
            detected_objects: Detected object dicts (``class_name``, ``class_id``,
                ``is_landmark`` ... are read via ``.get``).
            confidence: Scene classification confidence; below 0.4 forces the generic path.
            lighting_info: Lighting information; ``time_of_day`` is read when present.
            functional_zones: Functional zones as a list of strings or a dict.
            enable_landmark: When False, landmark objects and references are removed.
            scene_scores: Accepted for interface compatibility; not read here.
            spatial_analysis: Forwarded to the template manager inside ``scene_data``.
            image_dimensions: Accepted for interface compatibility; not read here.
            places365_info: Optional Places365 result; forwarded inside ``scene_data``.
            object_statistics: Optional object statistics; forwarded inside ``scene_data``.

        Returns:
            str: The formatted description. Never raises; on error a generic
            description (or a fixed sentence) is returned.
        """
        try:
            traffic_list = [obj for obj in detected_objects if obj.get("class_name", "") == "traffic light"]
            self.logger.debug(f"Initial traffic light count in generate_description: {len(traffic_list)}")

            if scene_type == "unknown" or confidence < 0.4:
                generic_desc = self._generate_generic_description(detected_objects, lighting_info)
                return self.text_formatter.format_final_description(generic_desc)

            current_detected_objects = detected_objects
            if not enable_landmark:
                current_detected_objects = [obj for obj in detected_objects
                                            if not obj.get("is_landmark", False)]

            # Places365 context is only assembled for the debug log below.
            places365_context = ""
            if places365_info and places365_info.get('confidence', 0) > 0.3:
                scene_label = places365_info.get('scene_label', '')
                attributes = places365_info.get('attributes', [])
                is_indoor = places365_info.get('is_indoor', None)

                if scene_label:
                    places365_context = f"Scene context: {scene_label}"
                    if attributes:
                        places365_context += f" with characteristics: {', '.join(attributes[:3])}"
                    if is_indoor is not None:
                        indoor_outdoor = "indoor" if is_indoor else "outdoor"
                        places365_context += f" ({indoor_outdoor} environment)"

                self.logger.debug(f"Enhanced description incorporating Places365 context: {places365_context}")

            landmark_objects_in_scene = [obj for obj in current_detected_objects
                                         if obj.get("is_landmark", False)]
            has_landmark_in_scene = len(landmark_objects_in_scene) > 0

            if enable_landmark and (scene_type in self._LANDMARK_SCENE_TYPES or has_landmark_in_scene):
                landmark_desc = self._generate_landmark_description(
                    scene_type,
                    current_detected_objects,
                    confidence,
                    lighting_info,
                    functional_zones,
                    landmark_objects_in_scene
                )
                return self.text_formatter.format_final_description(landmark_desc)

            viewpoint = self.viewpoint_detector.detect_viewpoint(current_detected_objects)
            current_scene_type = scene_type

            if viewpoint == "aerial":
                lowered_type = current_scene_type.lower()
                if "intersection" in lowered_type or self._is_intersection(current_detected_objects):
                    current_scene_type = "aerial_view_intersection"
                elif any(keyword in lowered_type for keyword in ["commercial", "shopping", "retail"]):
                    current_scene_type = "aerial_view_commercial_area"
                elif any(keyword in lowered_type for keyword in ["plaza", "square"]):
                    current_scene_type = "aerial_view_plaza"
                else:
                    current_scene_type = "aerial_view_general"

            raw_scene_type = current_scene_type
            current_scene_type = self._sanitize_scene_type_for_description(raw_scene_type)

            # Sanitizing turns "_" into spaces, so the sanitized name no longer matches
            # underscore-style keys such as the "aerial_view_*" values assigned above.
            # Fall back to the unsanitized key for scene_types lookups.
            # NOTE(review): the template manager and cultural analyzer below still receive
            # the sanitized name, as before - confirm that is what they expect.
            if current_scene_type in self.scene_types:
                scene_entry = self.scene_types[current_scene_type]
            elif raw_scene_type in self.scene_types:
                scene_entry = self.scene_types[raw_scene_type]
            else:
                scene_entry = None

            # Cultural context is only considered for non-aerial views.
            cultural_context = None
            if viewpoint != "aerial":
                cultural_context = self.cultural_context_analyzer.detect_cultural_context(
                    current_scene_type, current_detected_objects
                )

            # Base description.
            aerial_default = "An aerial view showing the layout and movement patterns from above"
            base_description = "A scene"
            if viewpoint == "aerial":
                if scene_entry is not None:
                    base_description = scene_entry.get("description", aerial_default)
                else:
                    base_description = aerial_default
            elif scene_entry is not None:
                base_description = scene_entry.get("description", "A scene")

            selected_template = self.template_manager.get_template_by_scene_type(
                scene_type=current_scene_type,
                detected_objects=current_detected_objects,
                functional_zones=functional_zones or []
            )

            # Dict form of the zones handed to the template manager.
            processed_functional_zones = {}
            if functional_zones:
                if isinstance(functional_zones, dict):
                    processed_functional_zones = functional_zones
                elif isinstance(functional_zones, list):
                    processed_functional_zones = {
                        f"zone_{i}": {"description": zone_desc}
                        for i, zone_desc in enumerate(functional_zones)
                    }

            scene_data = {
                "detected_objects": current_detected_objects,
                "functional_zones": processed_functional_zones,
                "scene_type": current_scene_type,
                "object_statistics": object_statistics or {},
                "lighting_info": lighting_info,
                "spatial_analysis": spatial_analysis,
                "places365_info": places365_info
            }

            # Core scene details from the selected template.
            core_scene_details = self.template_manager.apply_template(selected_template, scene_data)

            # Combine base description and core details.
            description = base_description
            if core_scene_details and core_scene_details.strip():
                cleaned_scene_details = self._validate_and_clean_scene_details(core_scene_details)
                # A bare "A scene" base adds nothing; prefer the longer template output.
                if base_description.lower() == "a scene" and len(cleaned_scene_details) > len(base_description):
                    description = cleaned_scene_details
                else:
                    description = self.text_formatter.smart_append(description, cleaned_scene_details)
            elif not core_scene_details and not description:
                description = self._generate_generic_description(current_detected_objects, lighting_info)

            # Secondary description from the scene-type definition.
            if scene_entry is not None and "secondary_description" in scene_entry:
                secondary_desc = scene_entry["secondary_description"]
                if secondary_desc:
                    description = self.text_formatter.smart_append(description, secondary_desc)

            # People (class_id 0) - only mentioned when the text does not already do so.
            people_objs = [obj for obj in current_detected_objects if obj.get("class_id") == 0]
            if people_objs:
                people_count = len(people_objs)
                if people_count == 1:
                    people_phrase = "a single person"
                elif 1 < people_count <= 3:
                    people_phrase = f"{people_count} people"
                elif 3 < people_count <= 7:
                    people_phrase = "several people"
                else:
                    people_phrase = "multiple people"

                if not any(p_word in description.lower() for p_word in ["person", "people", "pedestrian"]):
                    description = self.text_formatter.smart_append(
                        description, f"The scene includes {people_phrase}."
                    )

            # Cultural elements (non-aerial views only).
            if cultural_context and viewpoint != "aerial":
                cultural_elements = self.cultural_context_analyzer.generate_cultural_elements(cultural_context)
                if cultural_elements:
                    description = self.text_formatter.smart_append(description, cultural_elements)

            # Lighting sentence.
            lighting_description_text = ""
            if lighting_info and "time_of_day" in lighting_info:
                lighting_type = lighting_info["time_of_day"]
                lighting_desc_template = self.template_manager.get_lighting_template(lighting_type)
                if lighting_desc_template:
                    lighting_description_text = lighting_desc_template

            if lighting_description_text and lighting_description_text.lower() not in description.lower():
                description = self.text_formatter.smart_append(description, lighting_description_text)

            # Viewpoint-specific observation.
            if viewpoint != "eye_level":
                viewpoint_template = self.template_manager.get_viewpoint_template(viewpoint)
                prefix = viewpoint_template.get('prefix', '')
                observation_template = viewpoint_template.get("observation", "")

                scene_elements_for_vp = "the overall layout and objects"
                if viewpoint == "aerial":
                    scene_elements_for_vp = "crossing patterns and general layout"

                viewpoint_observation_text = observation_template.format(scene_elements=scene_elements_for_vp)

                full_viewpoint_text = ""
                if prefix:
                    full_viewpoint_text = prefix.strip() + " "
                    if viewpoint_observation_text and viewpoint_observation_text[0].islower():
                        full_viewpoint_text += viewpoint_observation_text
                    elif viewpoint_observation_text:
                        full_viewpoint_text = prefix + (
                            viewpoint_observation_text[0].lower() + viewpoint_observation_text[1:]
                            if description else viewpoint_observation_text
                        )
                elif viewpoint_observation_text:
                    full_viewpoint_text = viewpoint_observation_text[0].upper() + viewpoint_observation_text[1:]

                if full_viewpoint_text and full_viewpoint_text.lower() not in description.lower():
                    description = self.text_formatter.smart_append(description, full_viewpoint_text)

            # Functional zones (list input is converted to dict form first).
            if functional_zones and len(functional_zones) > 0:
                zones_desc_text = self.object_description_generator.describe_functional_zones(
                    self._functional_zones_as_dict(functional_zones)
                )
                if zones_desc_text:
                    description = self.text_formatter.smart_append(description, zones_desc_text)

            # Remove repeated sentences when the formatter supports it.
            if hasattr(self.text_formatter, 'deduplicate_sentences_in_description'):
                deduplicated_description = self.text_formatter.deduplicate_sentences_in_description(description)
                self.logger.info(f"Description before pre-LLM deduplication (len {len(description)}): '{description[:150]}...'")
                self.logger.info(f"Description after pre-LLM deduplication (len {len(deduplicated_description)}): '{deduplicated_description[:150]}...'")
                description = deduplicated_description
            else:
                self.logger.warning("TextFormatter does not have 'deduplicate_sentences_in_description'. Skipping pre-LLM deduplication of the internally generated description.")

            final_formatted_description = self.text_formatter.format_final_description(description)

            # Strip landmark references when landmarks are disabled.
            if not enable_landmark:
                final_formatted_description = self.text_formatter.filter_landmark_references(
                    final_formatted_description, enable_landmark=False
                )

            # Fall back to the generic description if processing emptied the text.
            if not final_formatted_description.strip() or final_formatted_description.strip() == ".":
                self.logger.warning(f"Description for scene_type '{current_scene_type}' became empty after processing. Falling back.")
                final_formatted_description = self.text_formatter.format_final_description(
                    self._generate_generic_description(current_detected_objects, lighting_info)
                )

            return final_formatted_description

        except Exception as e:
            error_msg = f"Error generating scene description: {str(e)}"
            self.logger.error(f"{error_msg}\n{e.__class__.__name__}: {str(e)}")
            try:
                fallback_desc = self._generate_generic_description(detected_objects, lighting_info)
                return self.text_formatter.format_final_description(fallback_desc)
            except Exception as fallback_error:
                # Last resort: the formatter itself may be unavailable.
                self.logger.error(f"Fallback description generation failed: {str(fallback_error)}")
                return "A scene with various elements is visible."

    def _extract_placeholders(self, template: str) -> List[str]:
        """Return the names of all ``{placeholder}`` fields found in ``template``."""
        return re.findall(r'\{([^}]+)\}', template)

    def _generate_placeholder_content(self, placeholder: str, detected_objects: List[Dict],
                                      functional_zones: List, scene_type: str,
                                      object_statistics: Dict) -> str:
        """
        Produce replacement text for a single template placeholder.

        ``functional_zones`` and ``object_statistics`` are accepted but not used.

        NOTE(review): ``_generate_default_replacements`` and
        ``_get_placeholder_replacement`` are not defined in the visible part of
        this class - confirm they are provided elsewhere before calling this.
        """
        all_replacements = self._generate_default_replacements()
        return self._get_placeholder_replacement(
            placeholder, {}, all_replacements, detected_objects, scene_type
        )

    def _preprocess_functional_zones(self, functional_zones: List) -> Dict:
        """
        Normalize functional zones to a dict.

        A list becomes ``{"area <n>": ...}`` (1-based): string items are wrapped
        as ``{"description": item}``, dict items are used as-is, other items are
        skipped. A dict is returned unchanged; anything else yields ``{}``.
        """
        if isinstance(functional_zones, list):
            zones_dict = {}
            for i, zone in enumerate(functional_zones):
                if isinstance(zone, str):
                    zones_dict[f"area {i+1}"] = {"description": zone}
                elif isinstance(zone, dict):
                    zones_dict[f"area {i+1}"] = zone
            return zones_dict
        elif isinstance(functional_zones, dict):
            return functional_zones
        else:
            return {}

    def _standardize_placeholder_content(self, content: str, placeholder_type: str) -> str:
        """Return stripped ``content``, or "various elements" when it is empty."""
        if not content:
            return "various elements"
        return content.strip()

    def _finalize_description_output(self, description: str) -> str:
        """
        Collapse whitespace, ensure terminal punctuation and capitalize the first letter.

        Returns a fixed default sentence when ``description`` is empty.
        """
        if not description:
            return "A scene featuring various elements and organized areas of activity."

        # Collapse all whitespace runs to a single space.
        finalized = re.sub(r'\s+', ' ', description).strip()

        # Ensure the text ends with sentence punctuation.
        if finalized and not finalized.endswith(('.', '!', '?')):
            finalized += '.'

        # Capitalize the first character.
        if finalized:
            finalized = finalized[0].upper() + finalized[1:] if len(finalized) > 1 else finalized.upper()

        return finalized

    def _sanitize_scene_type_for_description(self, scene_type: str) -> str:
        """
        Turn an internal scene-type identifier into plain words.

        Args:
            scene_type: Original scene type name.

        Returns:
            str: ``scene_type`` with underscores replaced by spaces, or
            "general scene" if the replacement fails (e.g. non-string input).
        """
        try:
            cleaned_type = scene_type.replace('_', ' ')
            return cleaned_type
        except Exception as e:
            self.logger.warning(f"Error sanitizing scene type '{scene_type}': {str(e)}")
            return "general scene"

    def _validate_and_clean_scene_details(self, scene_details: str) -> str:
        """
        Repair common template-filling artefacts in scene details.

        Args:
            scene_details: Raw scene detail text.

        Returns:
            str: Cleaned text ending in sentence punctuation; "" for blank input.
            On error the original text is returned.
        """
        try:
            if not scene_details or not scene_details.strip():
                return ""

            cleaned = scene_details.strip()

            # Fill in prepositions left dangling by an empty placeholder ("In ,").
            cleaned = re.sub(r'\bIn\s*,\s*', 'In this scene, ', cleaned)
            cleaned = re.sub(r'\bAt\s*,\s*', 'At this location, ', cleaned)
            cleaned = re.sub(r'\bWithin\s*,\s*', 'Within this area, ', cleaned)

            # Replace underscores in identifier-like tokens, unless the token is
            # followed by "area", "zone" or "region".
            cleaned = re.sub(r'\b\w+_\w+(?:_\w+)*\b(?!\s+(area|zone|region))',
                             lambda m: m.group(0).replace('_', ' '), cleaned)

            # Ensure the text ends with sentence punctuation.
            if cleaned and not cleaned.endswith(('.', '!', '?')):
                cleaned += '.'

            return cleaned
        except Exception as e:
            self.logger.warning(f"Error validating scene details: {str(e)}")
            return scene_details if scene_details else ""

    def _generate_landmark_description(self,
                                       scene_type: str,
                                       detected_objects: List[Dict],
                                       confidence: float,
                                       lighting_info: Optional[Dict] = None,
                                       functional_zones: Optional[Dict] = None,
                                       landmark_objects: Optional[List[Dict]] = None) -> str:
        """
        Generate a scene description centred on detected landmarks.

        Args:
            scene_type: Identified scene type.
            detected_objects: Detected object dicts.
            confidence: Scene classification confidence (not read here).
            lighting_info: Optional lighting information.
            functional_zones: Optional functional zones (dict, or a list that is
                converted to dict form before being described).
            landmark_objects: Objects flagged as landmarks; derived from
                ``detected_objects`` when None.

        Returns:
            str: Formatted description; a fixed fallback sentence on error.
        """
        try:
            # Derive landmarks from the detections when not supplied.
            if landmark_objects is None:
                landmark_objects = [obj for obj in detected_objects if obj.get("is_landmark", False)]

            if not landmark_objects:
                # No landmark found: generic tourist text or the standard description.
                if scene_type in self._LANDMARK_SCENE_TYPES:
                    base_description = "A scenic area that appears to be a tourist destination, though specific landmarks are not clearly identifiable."
                else:
                    return self.text_formatter.format_final_description(self._generate_scene_details(
                        scene_type,
                        detected_objects,
                        lighting_info,
                        self.viewpoint_detector.detect_viewpoint(detected_objects)
                    ))
            else:
                # The highest-confidence landmark drives the opening sentence.
                primary_landmark = max(landmark_objects, key=lambda x: x.get("confidence", 0))
                landmark_name = primary_landmark.get("class_name", "landmark")

                # Prefer the object's own location, then the ALL_LANDMARKS entry.
                landmark_location = primary_landmark.get("location", "")
                lm_id = primary_landmark.get("landmark_id")
                if not landmark_location and lm_id and lm_id in ALL_LANDMARKS:
                    landmark_location = ALL_LANDMARKS[lm_id].get("location", "")

                # Omit the location clause entirely when unknown, to avoid "... in .".
                location_clause = f" in {landmark_location}" if landmark_location else ""

                if scene_type == "natural_landmark" or primary_landmark.get("landmark_type") == "natural":
                    base_description = f"A natural landmark scene featuring {landmark_name}{location_clause}."
                elif scene_type == "historical_monument" or primary_landmark.get("landmark_type") == "monument":
                    base_description = f"A historical monument scene showcasing {landmark_name}, a significant landmark{location_clause}."
                else:
                    base_description = f"A tourist landmark scene centered around {landmark_name}, an iconic structure{location_clause}."

            # Per-landmark extra details.
            landmark_details = []
            for landmark in landmark_objects:
                details = []

                if "year_built" in landmark:
                    details.append(f"built in {landmark['year_built']}")
                if "architectural_style" in landmark:
                    details.append(f"featuring {landmark['architectural_style']} architectural style")
                if "significance" in landmark:
                    details.append(landmark["significance"])

                # Location, again falling back to ALL_LANDMARKS.
                loc = landmark.get("location", "")
                lm_id_iter = landmark.get("landmark_id")
                if not loc and lm_id_iter and lm_id_iter in ALL_LANDMARKS:
                    loc = ALL_LANDMARKS[lm_id_iter].get("location", "")
                if loc:
                    details.append(f"located in {loc}")

                if details:
                    # .get keeps one malformed landmark from discarding the whole description.
                    landmark_label = landmark.get("class_name", "landmark")
                    landmark_details.append(f"{landmark_label} ({', '.join(details)})")

            if landmark_details:
                description = base_description + " The scene features " + ", ".join(landmark_details) + "."
            else:
                description = base_description

            viewpoint = self.viewpoint_detector.detect_viewpoint(detected_objects)

            # People (class_id 0); .get tolerates objects without a class_id.
            people_count = len([obj for obj in detected_objects if obj.get("class_id") == 0])
            if people_count > 0:
                if people_count == 1:
                    people_description = "There is one person in the scene, likely a tourist or visitor."
                elif people_count < 5:
                    people_description = f"There are {people_count} people in the scene, possibly tourists visiting the landmark."
                else:
                    people_description = f"The scene includes a group of {people_count} people, indicating this is a popular tourist destination."

                description = self.text_formatter.smart_append(description, people_description)

            # Lighting sentence (skipped when no template is returned).
            if lighting_info and "time_of_day" in lighting_info:
                lighting_type = lighting_info["time_of_day"]
                lighting_description = self.template_manager.get_lighting_template(lighting_type)
                if lighting_description:
                    description = self.text_formatter.smart_append(description, lighting_description)

            # Viewpoint prefix and observation.
            if viewpoint != "eye_level":
                viewpoint_template = self.template_manager.get_viewpoint_template(viewpoint)

                prefix = viewpoint_template.get('prefix', '')
                if prefix and not description.startswith(prefix):
                    # Lower-case the old first letter so the prefixed sentence reads naturally.
                    if description and description[0].isupper():
                        description = prefix + description[0].lower() + description[1:]
                    else:
                        description = prefix + description

                viewpoint_desc = viewpoint_template.get("observation", "").format(
                    scene_elements="the landmark and surrounding area"
                )
                if viewpoint_desc and viewpoint_desc not in description:
                    description = self.text_formatter.smart_append(description, viewpoint_desc)

            # Functional zones; generate_description passes a list here, so convert
            # it the same way that method does before describing.
            if functional_zones and len(functional_zones) > 0:
                zones_desc = self.object_description_generator.describe_functional_zones(
                    self._functional_zones_as_dict(functional_zones)
                )
                if zones_desc:
                    description = self.text_formatter.smart_append(description, zones_desc)

            # Typical activities for the landmark category.
            if scene_type == "natural_landmark" or any(obj.get("landmark_type") == "natural" for obj in landmark_objects):
                landmark_activities = [
                    "nature photography",
                    "scenic viewing",
                    "hiking or walking",
                    "guided nature tours",
                    "outdoor appreciation"
                ]
            elif scene_type == "historical_monument" or any(obj.get("landmark_type") == "monument" for obj in landmark_objects):
                landmark_activities = [
                    "historical sightseeing",
                    "educational tours",
                    "cultural appreciation",
                    "photography of historical architecture",
                    "learning about historical significance"
                ]
            else:
                landmark_activities = [
                    "sightseeing",
                    "taking photographs",
                    "guided tours",
                    "cultural tourism",
                    "souvenir shopping"
                ]

            # Only the first three activities are mentioned.
            if landmark_activities:
                activities_text = "Common activities at this location include " + ", ".join(landmark_activities[:3]) + "."
                description = self.text_formatter.smart_append(description, activities_text)

            return self.text_formatter.format_final_description(description)

        except Exception as e:
            self.logger.warning(f"Error generating landmark description: {str(e)}")
            return self.text_formatter.format_final_description(
                "A landmark scene with notable architectural or natural features."
            )

    def _is_intersection(self, detected_objects: List[Dict]) -> bool:
        """
        Guess from the pedestrian distribution whether the scene is an intersection.

        True when at least 8 objects with ``class_id == 0`` are present and their
        ``normalized_center`` values span more than 0.5 on both axes with an
        x/y range ratio strictly between 0.7 and 1.3.

        Args:
            detected_objects: Detected object dicts.

        Returns:
            bool: Whether the scene looks like an intersection; False on error.
        """
        try:
            pedestrians = [obj for obj in detected_objects if obj.get("class_id") == 0]

            if len(pedestrians) >= 8:
                positions = [obj.get("normalized_center", (0, 0)) for obj in pedestrians]
                x_coords = [pos[0] for pos in positions]
                y_coords = [pos[1] for pos in positions]

                x_range = max(x_coords) - min(x_coords)
                y_range = max(y_coords) - min(y_coords)

                # y_range > 0.5 is checked first, so the division cannot hit zero.
                if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3:
                    return True

            return False
        except Exception as e:
            self.logger.warning(f"Error detecting intersection: {str(e)}")
            return False

    def _generate_generic_description(self, detected_objects: List[Dict],
                                      lighting_info: Optional[Dict] = None) -> str:
        """
        Describe a scene purely from object counts.

        Used when the scene type is unknown or the confidence is very low.

        Args:
            detected_objects: Detected object dicts (None is treated as empty).
            lighting_info: Optional lighting information.

        Returns:
            str: Sentence listing up to five most frequent object classes, plus
            the lighting template when one is available; a fixed sentence on error.
        """
        try:
            obj_counts = {}
            for obj in detected_objects or []:
                # Missing or empty names are grouped under a readable placeholder.
                class_name = obj.get("class_name") or "unknown object"
                obj_counts[class_name] = obj_counts.get(class_name, 0) + 1

            top_objects = sorted(obj_counts.items(), key=lambda x: x[1], reverse=True)[:5]

            if not top_objects:
                base_desc = "This scene displays various elements, though specific objects are not clearly identifiable."
            else:
                objects_text = []
                for name, count in top_objects:
                    # Keep internal identifier formatting out of the text.
                    clean_name = name.replace('_', ' ') if isinstance(name, str) else str(name)
                    if count > 1:
                        objects_text.append(f"{count} {clean_name}s")
                    else:
                        article = "an" if clean_name[0].lower() in 'aeiou' else "a"
                        objects_text.append(f"{article} {clean_name}")

                if len(objects_text) == 1:
                    objects_list = objects_text[0]
                elif len(objects_text) == 2:
                    objects_list = f"{objects_text[0]} and {objects_text[1]}"
                else:
                    objects_list = ", ".join(objects_text[:-1]) + f", and {objects_text[-1]}"

                base_desc = f"This scene features {objects_list}."

            # Lighting sentence; skipped when no template is returned so that
            # the text never ends in " None".
            if lighting_info and "time_of_day" in lighting_info:
                lighting_type = lighting_info["time_of_day"]
                lighting_desc = self.template_manager.get_lighting_template(lighting_type)
                if lighting_desc:
                    base_desc += f" {lighting_desc}"

            return base_desc
        except Exception as e:
            self.logger.warning(f"Error generating generic description: {str(e)}")
            return "A general scene is visible with various elements."

    def _generate_scene_details(self,
                                scene_type: str,
                                detected_objects: List[Dict],
                                lighting_info: Optional[Dict] = None,
                                viewpoint: str = "eye_level",
                                spatial_analysis: Optional[Dict] = None,
                                image_dimensions: Optional[Tuple[int, int]] = None,
                                places365_info: Optional[Dict] = None,
                                object_statistics: Optional[Dict] = None) -> str:
        """
        Generate detailed text from the scene type and detected objects.

        Everyday scene types (or scenes without a matching detail template) are
        described dynamically by the object description generator; other
        scenes use a randomly chosen detail template.

        Args:
            scene_type: Identified scene type.
            detected_objects: Detected object dicts.
            lighting_info: Optional lighting information.
            viewpoint: Detected viewpoint.
            spatial_analysis: Optional spatial analysis result.
            image_dimensions: Optional image dimensions.
            places365_info: Optional Places365 classification result.
            object_statistics: Optional detailed object statistics.

        Returns:
            str: Scene details; a fixed sentence when empty or on error.
        """
        try:
            scene_details = ""

            everyday_scene_types = [
                "general_indoor_space", "generic_street_view",
                "desk_area_workspace", "outdoor_gathering_spot",
                "kitchen_counter_or_utility_area", "unknown"
            ]

            # NOTE(review): this tests whether scene_type is contained in the value
            # returned by get_scene_detail_templates(scene_type) - confirm that this
            # membership test is what was intended.
            is_confident_specific_scene = (
                scene_type not in everyday_scene_types
                and scene_type in self.template_manager.get_scene_detail_templates(scene_type)
            )
            treat_as_everyday = scene_type in everyday_scene_types

            # `enable_landmark` is not set anywhere in the visible class; the
            # hasattr guard keeps this branch inert unless it is set externally.
            if hasattr(self, 'enable_landmark') and not self.enable_landmark:
                if scene_type not in ["kitchen", "bedroom", "living_room", "office_workspace",
                                      "dining_area", "professional_kitchen"]:
                    treat_as_everyday = True

            if treat_as_everyday or not is_confident_specific_scene:
                self.logger.debug(f"Generating dynamic description for scene_type: {scene_type}")
                scene_details = self.object_description_generator.generate_dynamic_everyday_description(
                    detected_objects,
                    lighting_info,
                    viewpoint,
                    spatial_analysis,
                    image_dimensions,
                    places365_info,
                    object_statistics
                )
            else:
                self.logger.debug(f"Using template for scene_type: {scene_type}")
                templates_list = self.template_manager.get_scene_detail_templates(scene_type, viewpoint)

                if templates_list:
                    detail_template = random.choice(templates_list)
                    scene_details = self.template_manager.fill_template(
                        detail_template,
                        detected_objects,
                        scene_type,
                        places365_info,
                        object_statistics
                    )
                else:
                    scene_details = self.object_description_generator.generate_dynamic_everyday_description(
                        detected_objects,
                        lighting_info,
                        viewpoint,
                        spatial_analysis,
                        image_dimensions,
                        places365_info,
                        object_statistics
                    )

            # Strip landmark references when landmark detection is disabled.
            if hasattr(self, 'enable_landmark') and not self.enable_landmark:
                scene_details = self.text_formatter.filter_landmark_references(
                    scene_details, enable_landmark=False
                )

            return scene_details if scene_details else "A scene with some visual elements."

        except Exception as e:
            self.logger.warning(f"Error generating scene details: {str(e)}")
            return "A scene with various elements."

    def filter_landmark_references(self, text, enable_landmark=True):
        """
        Filter landmark references out of ``text`` (delegates to TextFormatter).

        Args:
            text: Text to filter.
            enable_landmark: Whether landmark support is enabled.

        Returns:
            str: Filtered text.
        """
        return self.text_formatter.filter_landmark_references(text, enable_landmark)

    def get_prominent_objects(self, detected_objects: List[Dict],
                              min_prominence_score: float = 0.5,
                              max_categories_to_return: Optional[int] = None,
                              max_total_objects: Optional[int] = None) -> List[Dict]:
        """
        Return the most prominent objects.

        Args:
            detected_objects: Detected object dicts.
            min_prominence_score: Minimum prominence score threshold (default 0.5).
            max_categories_to_return: Optional cap on distinct ``class_name`` values.
            max_total_objects: Optional cap on the total number of objects.

        Returns:
            List[Dict]: Prominent objects in the order returned by the object
            description generator; an empty list on error.
        """
        try:
            prominent_objects = self.object_description_generator.get_prominent_objects(
                detected_objects,
                min_prominence_score,
                max_categories_to_return
            )

            # Cap the total count first, keeping the generator's ordering.
            if max_total_objects is not None and max_total_objects > 0:
                prominent_objects = prominent_objects[:max_total_objects]

            # Then cap the number of distinct categories.
            if max_categories_to_return is not None and max_categories_to_return > 0:
                categories_seen = set()
                filtered_objects = []

                for obj in prominent_objects:
                    class_name = obj.get("class_name", "unknown")
                    if class_name in categories_seen:
                        # Objects of an accepted category are always kept.
                        filtered_objects.append(obj)
                    elif len(categories_seen) < max_categories_to_return:
                        categories_seen.add(class_name)
                        filtered_objects.append(obj)
                    # Otherwise the limit is reached: skip objects from new categories
                    # only, instead of stopping and dropping later accepted ones.

                return filtered_objects

            return prominent_objects

        except Exception as e:
            self.logger.warning(f"Error getting prominent objects: {str(e)}")
            return []

    def detect_viewpoint(self, detected_objects: List[Dict]) -> str:
        """
        Detect the image viewpoint type (delegates to ViewpointDetector).

        Args:
            detected_objects: Detected object dicts.

        Returns:
            str: Detected viewpoint; "eye_level" on error.
        """
        try:
            return self.viewpoint_detector.detect_viewpoint(detected_objects)
        except Exception as e:
            self.logger.warning(f"Error detecting viewpoint: {str(e)}")
            return "eye_level"

    def detect_cultural_context(self, scene_type: str, detected_objects: List[Dict]) -> Optional[str]:
        """
        Detect the cultural context of a scene (delegates to CulturalContextAnalyzer).

        Args:
            scene_type: Identified scene type.
            detected_objects: Detected object dicts.

        Returns:
            Optional[str]: Detected cultural context; None on CulturalContextError.
        """
        try:
            return self.cultural_context_analyzer.detect_cultural_context(scene_type, detected_objects)
        except CulturalContextError as e:
            self.logger.warning(f"Error detecting cultural context: {str(e)}")
            return None

    def generate_cultural_elements(self, cultural_context: str) -> str:
        """
        Generate descriptive elements for a cultural context.

        Args:
            cultural_context: Detected cultural context.

        Returns:
            str: Cultural element description; "" on CulturalContextError.
        """
        try:
            return self.cultural_context_analyzer.generate_cultural_elements(cultural_context)
        except CulturalContextError as e:
            self.logger.warning(f"Error generating cultural elements: {str(e)}")
            return ""

    def format_object_list_for_description(self, objects: List[Dict],
                                           use_indefinite_article_for_one: bool = False,
                                           count_threshold_for_generalization: int = -1,
                                           max_types_to_list: int = 5) -> str:
        """
        Format a list of objects as a human-readable string.

        Args:
            objects: Object dicts.
            use_indefinite_article_for_one: Whether a single object gets "a/an".
            count_threshold_for_generalization: Count threshold, forwarded unchanged.
            max_types_to_list: Maximum number of object types to list.

        Returns:
            str: Formatted object text; "various objects" on ObjectDescriptionError.
        """
        try:
            return self.object_description_generator.format_object_list_for_description(
                objects, use_indefinite_article_for_one,
                count_threshold_for_generalization, max_types_to_list
            )
        except ObjectDescriptionError as e:
            self.logger.warning(f"Error formatting object list: {str(e)}")
            return "various objects"

    def get_spatial_description(self, obj: Dict, image_width: Optional[int] = None,
                                image_height: Optional[int] = None) -> str:
        """
        Describe where an object is located in the image.

        Args:
            obj: Object dict.
            image_width: Optional image width.
            image_height: Optional image height.

        Returns:
            str: Spatial description; "in the scene" on ObjectDescriptionError.
        """
        try:
            return self.object_description_generator.get_spatial_description(obj, image_width, image_height)
        except ObjectDescriptionError as e:
            self.logger.warning(f"Error generating spatial description: {str(e)}")
            return "in the scene"

    def optimize_object_description(self, description: str) -> str:
        """
        Optimize an object description to avoid listing the same object repeatedly.

        Args:
            description: Original description text.

        Returns:
            str: Optimized text; the input unchanged on ObjectDescriptionError.
        """
        try:
            return self.object_description_generator.optimize_object_description(description)
        except ObjectDescriptionError as e:
            self.logger.warning(f"Error optimizing object description: {str(e)}")
            return description

    def describe_functional_zones(self, functional_zones: Dict) -> str:
        """
        Describe the functional zones of a scene.

        Args:
            functional_zones: Dict of identified functional zones.

        Returns:
            str: Functional zone description; "" on ObjectDescriptionError.
        """
        try:
            return self.object_description_generator.describe_functional_zones(functional_zones)
        except ObjectDescriptionError as e:
            self.logger.warning(f"Error describing functional zones: {str(e)}")
            return ""

    def smart_append(self, current_text: str, new_fragment: str) -> str:
        """
        Append a text fragment to existing text (delegates to TextFormatter).

        Args:
            current_text: Existing text to append to.
            new_fragment: Fragment to append.

        Returns:
            str: Combined text; on TextFormattingError a plain space-joined
            concatenation (or just the fragment when ``current_text`` is empty).
        """
        try:
            return self.text_formatter.smart_append(current_text, new_fragment)
        except TextFormattingError as e:
            self.logger.warning(f"Error in smart append: {str(e)}")
            return f"{current_text} {new_fragment}" if current_text else new_fragment

    def format_final_description(self, text: str) -> str:
        """
        Format the final description text (delegates to TextFormatter).

        Args:
            text: Text to format.

        Returns:
            str: Formatted text; the input unchanged on TextFormattingError.
        """
        try:
            return self.text_formatter.format_final_description(text)
        except TextFormattingError as e:
            self.logger.warning(f"Error formatting final description: {str(e)}")
            return text

    def get_template(self, category: str, key: Optional[str] = None):
        """
        Fetch a template from the template manager.

        Args:
            category: Template category name.
            key: Optional specific template key.

        Returns:
            The template content; None on TemplateLoadingError / TemplateFillError.
        """
        try:
            return self.template_manager.get_template(category, key)
        except (TemplateLoadingError, TemplateFillError) as e:
            self.logger.warning(f"Error getting template: {str(e)}")
            return None

    def get_viewpoint_confidence(self, detected_objects: List[Dict]) -> Tuple[str, float]:
        """
        Return the detected viewpoint together with its confidence.

        Args:
            detected_objects: Detected object dicts.

        Returns:
            Tuple[str, float]: (viewpoint type, confidence);
            ("eye_level", 0.5) on ViewpointDetectionError.
        """
        try:
            return self.viewpoint_detector.get_viewpoint_confidence(detected_objects)
        except ViewpointDetectionError as e:
            self.logger.warning(f"Error getting viewpoint confidence: {str(e)}")
            return "eye_level", 0.5

    def get_supported_cultures(self) -> List[str]:
        """
        Return all supported cultural contexts.

        Returns:
            List[str]: Names of the supported cultural contexts.
        """
        return self.cultural_context_analyzer.get_supported_cultures()

    def has_cultural_context(self, cultural_context: str) -> bool:
        """
        Check whether a cultural context is supported.

        Args:
            cultural_context: Cultural context name.

        Returns:
            bool: Whether the context is supported.
        """
        return self.cultural_context_analyzer.has_cultural_context(cultural_context)

    def validate_text_quality(self, text: str) -> Dict[str, bool]:
        """
        Validate text quality (delegates to TextFormatter).

        Args:
            text: Text to validate.

        Returns:
            Dict[str, bool]: Quality check results; ``{"error": True}`` on
            TextFormattingError.
        """
        try:
            return self.text_formatter.validate_text_quality(text)
        except TextFormattingError as e:
            self.logger.warning(f"Error validating text quality: {str(e)}")
            return {"error": True}

    def get_text_statistics(self, text: str) -> Dict[str, int]:
        """
        Return text statistics (delegates to TextFormatter).

        Args:
            text: Text to analyze.

        Returns:
            Dict[str, int]: Text statistics; zeroed ``characters`` / ``words`` /
            ``sentences`` counters on TextFormattingError.
        """
        try:
            return self.text_formatter.get_text_statistics(text)
        except TextFormattingError as e:
            self.logger.warning(f"Error getting text statistics: {str(e)}")
            return {"characters": 0, "words": 0, "sentences": 0}

    def reload_templates(self):
        """
        Reload all templates.

        Raises:
            EnhancedSceneDescriberError: If the template manager raises
                TemplateLoadingError or TemplateFillError.
        """
        try:
            self.template_manager.reload_templates()
            self.logger.info("Templates reloaded successfully")
        except (TemplateLoadingError, TemplateFillError) as e:
            self.logger.error(f"Error reloading templates: {str(e)}")
            raise EnhancedSceneDescriberError(f"Failed to reload templates: {str(e)}") from e

    def get_configuration(self) -> Dict[str, Any]:
        """
        Return a summary of the current configuration.

        Returns:
            Dict[str, Any]: Configuration summary; ``{"error": <message>}`` on error.
        """
        try:
            return {
                "scene_types_count": len(self.scene_types),
                "viewpoint_detector_config": self.viewpoint_detector.viewpoint_params,
                "object_generator_config": self.object_description_generator.get_configuration(),
                "supported_cultures": self.cultural_context_analyzer.get_supported_cultures(),
                "template_categories": self.template_manager.get_template_categories()
            }
        except Exception as e:
            self.logger.warning(f"Error getting configuration: {str(e)}")
            return {"error": str(e)}

    def _initialize_fallback_components(self):
        """
        Create the minimal component set after a failed full initialization.

        NOTE(review): only ``region_analyzer`` and ``object_description_generator``
        are (re)created here; components that failed earlier (e.g. the template
        manager or text formatter) stay unset, so methods relying on them will
        hit their own error handling.
        """
        try:
            self.region_analyzer = RegionAnalyzer()
            self.object_description_generator = ObjectDescriptionGenerator(
                region_analyzer=self.region_analyzer
            )
        except Exception as e:
            self.logger.error(f"Fallback component initialization failed: {str(e)}")