DawnC committed on
Commit
e6a18b7
·
verified ·
1 Parent(s): 05b8fc5

Upload 59 files

Browse files

Refactored the architecture and improved scene understanding accuracy

app.py CHANGED
@@ -642,7 +642,7 @@ def create_interface():
642
  "room_01.jpg",
643
  "street_04.jpg",
644
  "street_05.jpg",
645
- "landmark_Louvre_01.jpg",
646
  ],
647
  inputs=image_input,
648
  label="Example Images"
 
642
  "room_01.jpg",
643
  "street_04.jpg",
644
  "street_05.jpg",
645
+ "landmark_Louvre_01.jpg"
646
  ],
647
  inputs=image_input,
648
  label="Example Images"
clip_model_manager.py ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import torch
3
+ import clip
4
+ import numpy as np
5
+ import logging
6
+ import traceback
7
+ from typing import List, Dict, Tuple, Optional, Union, Any
8
+ from PIL import Image
9
+
10
class CLIPModelManager:
    """
    Manage all CLIP-model related operations: loading the model, selecting the
    device, and encoding image and text features.

    Every public operation logs the error message and traceback before
    re-raising, so callers see the original exception type.
    """

    def __init__(self, model_name: str = "ViT-B/16", device: Optional[str] = None):
        """
        Initialize the CLIP model manager and load the model immediately.

        Args:
            model_name: Name of the CLIP model, "ViT-B/16" by default.
            device: Device to run on; when None, "cuda" is used if
                torch reports it as available, otherwise "cpu".

        Raises:
            Exception: Whatever `clip.load` raises is logged and re-raised.
        """
        self.logger = logging.getLogger(__name__)
        self.model_name = model_name

        # Pick the execution device.
        if device is None:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device

        self.model = None
        self.preprocess = None

        self._initialize_model()

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _log_failure(self, message: str, error: Exception) -> None:
        """Log `message` with the error text, followed by the active traceback.

        Must be called from inside an `except` block so that
        `traceback.format_exc()` has an exception to format.
        """
        self.logger.error(f"{message}: {error}")
        self.logger.error(traceback.format_exc())

    @staticmethod
    def _to_pil(image: "Union[Image.Image, np.ndarray]") -> "Image.Image":
        """Return `image` as a PIL image, converting numpy arrays.

        The annotations are quoted so they are not evaluated when the class
        is defined.

        Raises:
            ValueError: If `image` is neither a PIL image nor a numpy array.
        """
        if isinstance(image, Image.Image):
            return image
        if isinstance(image, np.ndarray):
            return Image.fromarray(image)
        raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")

    def _encode_texts(self, prompts: List[str]) -> torch.Tensor:
        """Tokenize `prompts`, encode them with the model and L2-normalize.

        The caller is responsible for wrapping the call in `torch.no_grad()`.
        """
        tokens = clip.tokenize(prompts).to(self.device)
        features = self.model.encode_text(tokens)
        return features / features.norm(dim=-1, keepdim=True)

    def _initialize_model(self):
        """
        Load the CLIP model and its preprocessing transform onto `self.device`.

        Raises:
            Exception: Any loading error is logged and re-raised unchanged.
        """
        try:
            self.logger.info(f"Initializing CLIP model ({self.model_name}) on {self.device}")
            self.model, self.preprocess = clip.load(self.model_name, device=self.device)
            self.logger.info("Successfully loaded CLIP model")
        except Exception as e:
            self._log_failure("Error loading CLIP model", e)
            raise

    # ------------------------------------------------------------------
    # Feature encoding
    # ------------------------------------------------------------------
    def encode_image(self, image_input: torch.Tensor) -> torch.Tensor:
        """
        Encode image features.

        Args:
            image_input: Preprocessed image tensor (e.g. from `preprocess_image`).

        Returns:
            torch.Tensor: Image features normalized along the last dimension.
        """
        try:
            with torch.no_grad():
                image_features = self.model.encode_image(image_input)
                return image_features / image_features.norm(dim=-1, keepdim=True)
        except Exception as e:
            self._log_failure("Error encoding image features", e)
            raise

    def encode_text_batch(self, text_prompts: List[str], batch_size: int = 128) -> Optional[torch.Tensor]:
        """
        Encode text features in batches to avoid CUDA memory problems.

        Args:
            text_prompts: List of text prompts.
            batch_size: Number of prompts encoded per forward pass (must be >= 1).

        Returns:
            Optional[torch.Tensor]: Normalized text features for all prompts,
            concatenated along dimension 0, or None when `text_prompts` is empty.

        Raises:
            ValueError: If `batch_size` is smaller than 1.
        """
        if not text_prompts:
            return None

        try:
            # A non-positive batch size would otherwise surface as an opaque
            # range()/IndexError failure further down.
            if batch_size < 1:
                raise ValueError(f"batch_size must be >= 1, got {batch_size}")

            with torch.no_grad():
                features_list = [
                    self._encode_texts(text_prompts[start:start + batch_size])
                    for start in range(0, len(text_prompts), batch_size)
                ]
                # Join all batches into one tensor.
                return torch.cat(features_list, dim=0)

        except Exception as e:
            self._log_failure("Error encoding text features", e)
            raise

    def encode_single_text(self, text_prompts: List[str]) -> torch.Tensor:
        """
        Encode a list of prompts as one single batch.

        Args:
            text_prompts: List of text prompts.

        Returns:
            torch.Tensor: Normalized text features.
        """
        try:
            with torch.no_grad():
                return self._encode_texts(text_prompts)
        except Exception as e:
            self._log_failure("Error encoding single text batch", e)
            raise

    def calculate_similarity(self, image_features: torch.Tensor, text_features: torch.Tensor) -> np.ndarray:
        """
        Compute the similarity between image and text features.

        The scaled dot products (factor 100.0) are passed through a softmax
        over the text dimension.

        Args:
            image_features: Image feature tensor.
            text_features: Text feature tensor.

        Returns:
            np.ndarray: Array of similarity scores.
        """
        try:
            with torch.no_grad():
                similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
                # Always move to host memory: comparing the device string with
                # "cuda" missed devices such as "cuda:0" or "mps". `.cpu()` is
                # a no-op for tensors that already live on the CPU.
                return similarity.detach().cpu().numpy()
        except Exception as e:
            self._log_failure("Error calculating similarity", e)
            raise

    # ------------------------------------------------------------------
    # Image handling
    # ------------------------------------------------------------------
    def preprocess_image(self, image: "Union[Image.Image, np.ndarray]") -> torch.Tensor:
        """
        Preprocess an image for use with the CLIP model.

        Args:
            image: PIL image or numpy array.

        Returns:
            torch.Tensor: Preprocessed image tensor with a leading batch
            dimension, placed on `self.device`.

        Raises:
            ValueError: If `image` is neither a PIL image nor a numpy array.
        """
        try:
            pil_image = self._to_pil(image)
            return self.preprocess(pil_image).unsqueeze(0).to(self.device)
        except Exception as e:
            self._log_failure("Error preprocessing image", e)
            raise

    def process_image_region(self, image: "Union[Image.Image, np.ndarray]", box: List[float]) -> torch.Tensor:
        """
        Crop one region of the image and encode it.

        Args:
            image: Original image.
            box: Bounding box [x1, y1, x2, y2]; coordinates are truncated to int.

        Returns:
            torch.Tensor: Normalized features of the cropped region.

        Raises:
            ValueError: If `image` is neither a PIL image nor a numpy array.
        """
        try:
            # Make sure we are working with a PIL image.
            pil_image = self._to_pil(image)

            # Crop the region.
            x1, y1, x2, y2 = map(int, box)
            region = pil_image.crop((x1, y1, x2, y2))

            # Preprocess and encode.
            return self.encode_image(self.preprocess_image(region))

        except Exception as e:
            self._log_failure("Error processing image region", e)
            raise

    def batch_process_regions(self, image: "Union[Image.Image, np.ndarray]",
                              boxes: List[List[float]]) -> torch.Tensor:
        """
        Crop several regions of the image and encode them in a single batch.

        Args:
            image: Original image.
            boxes: List of bounding boxes, each [x1, y1, x2, y2].

        Returns:
            torch.Tensor: Normalized features for all regions, or an empty
            tensor (`torch.empty(0)`) when `boxes` is empty.

        Raises:
            ValueError: If `image` is neither a PIL image nor a numpy array.
        """
        try:
            # Validate/convert the image first so a bad image is reported even
            # when there are no boxes.
            pil_image = self._to_pil(image)

            if not boxes:
                return torch.empty(0)

            # Crop and preprocess every region.
            region_tensors = []
            for box in boxes:
                x1, y1, x2, y2 = map(int, box)
                region = pil_image.crop((x1, y1, x2, y2))
                region_tensors.append(self.preprocess(region).unsqueeze(0))

            # Encode all regions in one forward pass.
            batch_tensor = torch.cat(region_tensors).to(self.device)
            return self.encode_image(batch_tensor)

        except Exception as e:
            self._log_failure("Error batch processing regions", e)
            raise

    # ------------------------------------------------------------------
    # Introspection
    # ------------------------------------------------------------------
    def is_model_loaded(self) -> bool:
        """
        Check whether the model has been loaded successfully.

        Returns:
            bool: True when both the model and the preprocess transform are set.
        """
        return self.model is not None and self.preprocess is not None

    def get_device(self) -> str:
        """
        Get the current device.

        Returns:
            str: Device name.
        """
        return self.device

    def get_model_name(self) -> str:
        """
        Get the model name.

        Returns:
            str: Model name.
        """
        return self.model_name
clip_zero_shot_classifier.py CHANGED
@@ -3,16 +3,24 @@ import torch
3
  import clip
4
  from PIL import Image
5
  import numpy as np
 
 
6
  from typing import List, Dict, Tuple, Optional, Union, Any
7
 
8
- from landmark_data import ALL_LANDMARKS, get_all_landmark_prompts
9
- from landmark_activities import LANDMARK_ACTIVITIES
 
 
 
10
 
11
  class CLIPZeroShotClassifier:
12
  """
13
- 使用CLIP模型進行零樣本分類,專注於識別世界知名地標。
14
- 作為YOLO檢測的補充,處理標準對象檢測無法識別的地標建築。
 
 
15
  """
 
16
  def __init__(self, model_name: str = "ViT-B/16", device: str = None):
17
  """
18
  初始化CLIP零樣本分類器
@@ -21,87 +29,38 @@ class CLIPZeroShotClassifier:
21
  model_name: CLIP模型名稱,默認為"ViT-B/16"
22
  device: 運行設備,None則自動選擇
23
  """
24
- # 設置運行設備
25
- if device is None:
26
- self.device = "cuda" if torch.cuda.is_available() else "cpu"
27
- else:
28
- self.device = device
29
-
30
- print(f"Initializing CLIP Zero-Shot Landmark Classifier ({model_name}) on {self.device}")
31
- try:
32
- self.model, self.preprocess = clip.load(model_name, device=self.device)
33
- print(f"Successfully loaded CLIP model")
34
- except Exception as e:
35
- print(f"Error loading CLIP model: {e}")
36
- raise
37
 
38
- # 加載地標數據
39
- try:
40
- self.landmark_data = ALL_LANDMARKS
41
- self.landmark_prompts = get_all_landmark_prompts()
42
- print(f"Loaded {len(self.landmark_prompts)} landmark prompts for classification")
43
-
44
- # 預計算地標文本特徵
45
- self.landmark_text_features = self._precompute_text_features(self.landmark_prompts)
46
-
47
- # 創建地標ID到索引的映射,可快速查找
48
- self.landmark_id_to_index = {landmark_id: i for i, landmark_id in enumerate(ALL_LANDMARKS.keys())}
49
-
50
- # 初始化批處理參數
51
- self.batch_size = 16 # 默認批處理大小
52
- self.confidence_threshold_multipliers = {
53
- "close_up": 0.9, # 近景標準閾值
54
- "partial": 0.6, # 部分可見降低閾值要求
55
- "distant": 0.5, # 遠景更低閾值要求
56
- "full_image": 0.7 # 整張圖像需要更高閾值
57
- }
58
 
59
- self.landmark_type_thresholds = {
60
- "tower": 0.5, # 塔型建築需要更高閾值
61
- "skyscraper": 0.4, # 摩天大樓使用較低閾值
62
- "building": 0.55, # 一般建築物閾值略微降低
63
- "monument": 0.5, # 紀念碑閾值
64
- "natural": 0.6 # 自然地標可以使用較低閾值
65
- }
66
 
67
- # 初始化結果快取
68
- self.results_cache = {} # 使用圖像hash作為鍵
69
- self.cache_max_size = 100 # 最大快取項目數
70
 
71
- except ImportError:
72
- print("Warning: landmark_data.py not found. Landmark classification will be limited")
73
- self.landmark_data = {}
74
- self.landmark_prompts = []
75
- self.landmark_text_features = None
76
- self.landmark_id_to_index = {}
77
- self.results_cache = {}
78
-
79
- def _get_image_hash(self, image):
80
- """
81
- 為圖像生成簡單的 hash 值用於快取
82
-
83
- Args:
84
- image: PIL Image 或 numpy 數組
85
-
86
- Returns:
87
- str: 圖像的 hash 值
88
- """
89
- if isinstance(image, np.ndarray):
90
- # 對於 numpy 數組,降採樣並計算簡單 hash
91
- small_img = image[::10, ::10] if image.ndim == 3 else image
92
- return hash(small_img.tobytes())
93
- else:
94
- # 對於 PIL 圖像,調整大小後轉換為 bytes
95
- small_img = image.resize((32, 32))
96
- return hash(small_img.tobytes())
97
-
98
- def _manage_cache(self):
99
  """
100
- 管理結果快取大小
101
  """
102
- if len(self.results_cache) > self.cache_max_size:
103
- oldest_key = next(iter(self.results_cache))
104
- del self.results_cache[oldest_key]
 
 
 
 
 
 
 
 
 
 
105
 
106
  def set_batch_size(self, batch_size: int):
107
  """
@@ -110,436 +69,179 @@ class CLIPZeroShotClassifier:
110
  Args:
111
  batch_size: 新的批處理大小
112
  """
113
- self.batch_size = max(1, batch_size)
114
- print(f"Batch size set to {self.batch_size}")
115
-
116
 
117
  def adjust_confidence_threshold(self, detection_type: str, multiplier: float):
118
  """
119
  調整特定檢測類型的置信度閾值乘數
120
 
121
- Args:
122
  detection_type: 檢測類型 ('close_up', 'partial', 'distant', 'full_image')
123
  multiplier: 置信度閾值乘數
124
  """
125
- if detection_type in self.confidence_threshold_multipliers:
126
- self.confidence_threshold_multipliers[detection_type] = max(0.1, min(1.5, multiplier))
127
- print(f"Adjusted confidence threshold multiplier for {detection_type} to {multiplier}")
128
- else:
129
- print(f"Unknown detection type: {detection_type}")
130
 
131
-
132
- def _precompute_text_features(self, text_prompts: List[str]) -> torch.Tensor:
 
 
 
133
  """
134
- 預計算文本提示的CLIP特徵,提高批處理效率
135
 
136
  Args:
137
- text_prompts: 文本提示列表
 
 
 
138
 
139
  Returns:
140
- torch.Tensor: 預計算的文本特徵
141
- """
142
- if not text_prompts:
143
- return None
144
-
145
- with torch.no_grad():
146
- # Process in batches to avoid CUDA memory issues
147
- batch_size = 128 # Adjust based on GPU memory
148
- features_list = []
149
-
150
- for i in range(0, len(text_prompts), batch_size):
151
- batch_prompts = text_prompts[i:i+batch_size]
152
- text_tokens = clip.tokenize(batch_prompts).to(self.device)
153
- batch_features = self.model.encode_text(text_tokens)
154
- batch_features = batch_features / batch_features.norm(dim=-1, keepdim=True)
155
- features_list.append(batch_features)
156
-
157
- # Concatenate all batches
158
- if len(features_list) > 1:
159
- text_features = torch.cat(features_list, dim=0)
160
- else:
161
- text_features = features_list[0]
162
-
163
- return text_features
164
-
165
- def _perform_pyramid_analysis(self,
166
- image: Union[Image.Image, np.ndarray],
167
- levels: int = 4,
168
- base_threshold: float = 0.25,
169
- aspect_ratios: List[float] = [1.0, 0.75, 1.5]) -> Dict[str, Any]:
170
  """
171
- Performs multi-scale pyramid analysis on the image to improve landmark detection.
172
-
173
- Args:
174
- image: Input image
175
- levels: Number of pyramid levels
176
- base_threshold: Base confidence threshold
177
- aspect_ratios: Different aspect ratios to try (for tall buildings vs wide landscapes)
178
 
179
- Returns:
180
- Dict: Results of pyramid analysis
181
- """
182
- # Ensure image is PIL format
183
- if not isinstance(image, Image.Image):
184
- if isinstance(image, np.ndarray):
185
- image = Image.fromarray(image)
186
- else:
187
- raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
188
-
189
- width, height = image.size
190
- pyramid_results = []
191
-
192
- # 對每個縮放和縱橫比組合進行處理
193
- for level in range(levels):
194
- # 計算縮放因子
195
- scale_factor = 1.0 - (level * 0.2)
196
-
197
- for aspect_ratio in aspect_ratios:
198
- # 計算新尺寸,保持面積近似不變
199
- if aspect_ratio != 1.0:
200
- # 保持面積近似不變的情況下調整縱橫比
201
- new_width = int(width * scale_factor * (1/aspect_ratio)**0.5)
202
- new_height = int(height * scale_factor * aspect_ratio**0.5)
203
  else:
204
- new_width = int(width * scale_factor)
205
- new_height = int(height * scale_factor)
206
-
207
- # 調整圖像大小
208
- scaled_image = image.resize((new_width, new_height), Image.LANCZOS)
209
-
210
- # 預處理圖像
211
- image_input = self.preprocess(scaled_image).unsqueeze(0).to(self.device)
212
-
213
- # 獲取圖像特徵
214
- with torch.no_grad():
215
- image_features = self.model.encode_image(image_input)
216
- image_features = image_features / image_features.norm(dim=-1, keepdim=True)
217
 
218
- # 計算相似度
219
- similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)
220
- similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
221
 
222
- # 找到最佳匹配
223
- best_idx = similarity.argmax().item()
224
- best_score = similarity[best_idx]
225
-
226
- if best_score >= base_threshold:
227
- landmark_id = list(self.landmark_data.keys())[best_idx]
228
- landmark_info = self.landmark_data[landmark_id]
229
-
230
- pyramid_results.append({
231
- "landmark_id": landmark_id,
232
- "landmark_name": landmark_info["name"],
233
- "confidence": float(best_score),
234
- "scale_factor": scale_factor,
235
- "aspect_ratio": aspect_ratio,
236
- "location": landmark_info["location"]
237
- })
238
 
239
- # 按置信度排序
240
- pyramid_results.sort(key=lambda x: x["confidence"], reverse=True)
 
 
241
 
242
- return {
243
- "is_landmark": len(pyramid_results) > 0,
244
- "results": pyramid_results,
245
- "best_result": pyramid_results[0] if pyramid_results else None
246
- }
247
 
248
- def _enhance_features(self, image: Union[Image.Image, np.ndarray]) -> Image.Image:
249
- """
250
- Enhances image features to improve landmark detection.
 
251
 
252
- Args:
253
- image: Input image
 
 
 
254
 
255
- Returns:
256
- PIL.Image: Enhanced image
257
- """
258
- # Ensure image is PIL format
259
- if not isinstance(image, Image.Image):
260
- if isinstance(image, np.ndarray):
261
- image = Image.fromarray(image)
262
- else:
263
- raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
264
 
265
- # Convert to numpy for processing
266
- img_array = np.array(image)
267
 
268
- # Skip processing for grayscale images
269
- if len(img_array.shape) < 3:
270
- return image
 
271
 
272
- # Apply adaptive contrast enhancement
273
- # Convert to LAB color space
274
- from skimage import color, exposure
275
- try:
276
- # Convert to LAB color space
277
- if img_array.shape[2] == 4: # Handle RGBA
278
- img_array = img_array[:,:,:3]
279
-
280
- lab = color.rgb2lab(img_array[:,:,:3] / 255.0)
281
- l_channel = lab[:,:,0]
282
-
283
- # Enhance contrast of L channel
284
- p2, p98 = np.percentile(l_channel, (2, 98))
285
- l_channel_enhanced = exposure.rescale_intensity(l_channel, in_range=(p2, p98))
286
-
287
- # Replace L channel and convert back to RGB
288
- lab[:,:,0] = l_channel_enhanced
289
- enhanced_img = color.lab2rgb(lab) * 255.0
290
- enhanced_img = enhanced_img.astype(np.uint8)
291
-
292
- return Image.fromarray(enhanced_img)
293
- except ImportError:
294
- print("Warning: skimage not available for feature enhancement")
295
- return image
296
- except Exception as e:
297
- print(f"Error in feature enhancement: {e}")
298
- return image
299
 
300
- def _determine_landmark_type(self, landmark_id):
301
- """
302
- 自動判斷地標類型,基於地標數據和命名
303
 
304
- Returns:
305
- str: 地標類型,用於調整閾值
306
- """
307
- if not landmark_id:
308
- return "building" # 預設類型
309
-
310
- # 獲取地標詳細數據
311
- landmark_data = self.landmark_data if hasattr(self, 'landmark_data') else {}
312
- landmark_info = landmark_data.get(landmark_id, {})
313
-
314
- # 獲取地標相關文本
315
- landmark_id_lower = landmark_id.lower()
316
- landmark_name = landmark_info.get("name", "").lower()
317
- landmark_location = landmark_info.get("location", "").lower()
318
- landmark_aliases = [alias.lower() for alias in landmark_info.get("aliases", [])]
319
-
320
- # 合併所有文本數據用於特徵判斷
321
- combined_text = " ".join([landmark_id_lower, landmark_name] + landmark_aliases)
322
-
323
- # 地標類型的特色特徵
324
- type_features = {
325
- "skyscraper": ["skyscraper", "tall", "tower", "高樓", "摩天", "大厦", "タワー"],
326
- "tower": ["tower", "bell", "clock", "塔", "鐘樓", "タワー", "campanile"],
327
- "monument": ["monument", "memorial", "statue", "紀念", "雕像", "像", "memorial"],
328
- "natural": ["mountain", "lake", "canyon", "falls", "beach", "山", "湖", "峽谷", "瀑布", "海灘"],
329
- "temple": ["temple", "shrine", "寺", "神社", "廟"],
330
- "palace": ["palace", "castle", "宮", "城", "皇宮", "宫殿"],
331
- "distinctive": ["unique", "leaning", "slanted", "傾斜", "斜", "獨特", "傾く"]
332
- }
333
-
334
- # 檢查是否位於亞洲地區
335
- asian_regions = ["china", "japan", "korea", "taiwan", "singapore", "vietnam", "thailand",
336
- "hong kong", "中國", "日本", "韓國", "台灣", "新加坡", "越南", "泰國", "香港"]
337
- is_asian = any(region in landmark_location for region in asian_regions)
338
-
339
- # 判斷地標類型
340
- best_type = None
341
- max_matches = 0
342
-
343
- for type_name, features in type_features.items():
344
- # 計算特徵詞匹配數量
345
- matches = sum(1 for feature in features if feature in combined_text)
346
- if matches > max_matches:
347
- max_matches = matches
348
- best_type = type_name
349
-
350
- # 處理亞洲地區特例
351
- if is_asian and best_type == "tower":
352
- best_type = "skyscraper" # 亞洲地區的塔型建築閾值較低
353
-
354
- # 特例處理:檢測傾斜建築
355
- if any(term in combined_text for term in ["leaning", "slanted", "tilt", "inclined", "斜", "傾斜"]):
356
- return "distinctive" # 傾斜建築需要特殊處理
357
-
358
- return best_type if best_type and max_matches > 0 else "building" # 預設為一般建築
359
-
360
- def classify_image_region(self,
361
- image: Union[Image.Image, np.ndarray],
362
- box: List[float],
363
- threshold: float = 0.25,
364
- detection_type: str = "close_up") -> Dict[str, Any]:
365
- """
366
- 對圖像的特定區域進行地標分類,具有增強的多尺度和部分識別能力
367
-
368
- Args:
369
- image: 原始圖像 (PIL Image 或 numpy數組)
370
- box: 邊界框 [x1, y1, x2, y2]
371
- threshold: 基礎分類置信度閾值
372
- detection_type: 檢測類型,影響置信度調整
373
 
374
- Returns:
375
- Dict: 地標分類結果
376
- """
377
- # 確保圖像是PIL格式
378
- if not isinstance(image, Image.Image):
379
- if isinstance(image, np.ndarray):
380
- image = Image.fromarray(image)
381
- else:
382
- raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
383
-
384
- # 生成圖像區域的hash用於快取
385
- region_key = (self._get_image_hash(image), tuple(box), detection_type)
386
- if region_key in self.results_cache:
387
- return self.results_cache[region_key]
388
-
389
- # 裁剪區域
390
- x1, y1, x2, y2 = map(int, box)
391
- cropped_image = image.crop((x1, y1, x2, y2))
392
- enhanced_image = self._enhance_features(cropped_image)
393
-
394
- # 分析視角信息
395
- viewpoint_info = self._analyze_viewpoint(enhanced_image)
396
- dominant_viewpoint = viewpoint_info["dominant_viewpoint"]
397
-
398
- # 計算區域信息
399
- region_width = x2 - x1
400
- region_height = y2 - y1
401
- image_width, image_height = image.size
402
-
403
- # 根據區域大小判斷可能的檢測類型
404
- region_area_ratio = (region_width * region_height) / (image_width * image_height)
405
- if detection_type == "auto":
406
- if region_area_ratio > 0.5:
407
- detection_type = "close_up"
408
- elif region_area_ratio > 0.2:
409
- detection_type = "partial"
410
- else:
411
- detection_type = "distant"
412
-
413
- # 根據視角調整檢測類型
414
- if dominant_viewpoint == "close_up" and detection_type != "close_up":
415
- detection_type = "close_up"
416
- elif dominant_viewpoint == "distant" and detection_type != "distant":
417
- detection_type = "distant"
418
- elif dominant_viewpoint == "angled_view":
419
- detection_type = "partial" # 角度視圖可能是部分可見
420
-
421
- # 調整置信度閾值
422
- base_multiplier = self.confidence_threshold_multipliers.get(detection_type, 1.0)
423
- adjusted_threshold = threshold * base_multiplier
424
-
425
- # 調整多尺度處理的尺度範圍和縱橫比 - 增強對傾斜建築的支持
426
- scales = [1.0] # 默認尺度
427
-
428
- # 基於視角選擇合適的尺度和縱橫比
429
- if detection_type in ["partial", "distant"]:
430
- scales = [0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3] # 標準範圍
431
-
432
- # 如果是特殊視角,進一步調整尺度和縱橫比 - 新增
433
- if dominant_viewpoint in ["angled_view", "low_angle"]:
434
- scales = [0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4] # 更寬的範圍
435
-
436
- # 準備縱橫比 - 同時支持水平和垂直地標
437
- aspect_ratios = [1.0, 0.8, 1.2] # 標準縱橫比
438
-
439
- # 針對可能的傾斜建築增加更多縱橫比 - 新增
440
- if dominant_viewpoint in ["angled_view", "unique_feature"]:
441
- aspect_ratios = [0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5] # 更多樣的縱橫比
442
-
443
- best_result = {
444
- "landmark_id": None,
445
- "landmark_name": None,
446
- "confidence": 0.0,
447
- "is_landmark": False
448
- }
449
-
450
- # 多尺度和縱橫比分析
451
- for scale in scales:
452
- for aspect_ratio in aspect_ratios:
453
- # 縮放裁剪區域
454
- current_width, current_height = cropped_image.size
455
-
456
- # 計算新尺寸,保持面積不變但調整縱橫比
457
- if aspect_ratio != 1.0:
458
- new_width = int(current_width * scale * (1/aspect_ratio)**0.5)
459
- new_height = int(current_height * scale * aspect_ratio**0.5)
460
- else:
461
- new_width = int(current_width * scale)
462
- new_height = int(current_height * scale)
463
 
464
- # 確保尺寸至少為1像素
465
- new_width = max(1, new_width)
466
- new_height = max(1, new_height)
 
 
 
467
 
468
- # 縮放圖像
469
- try:
470
- scaled_image = cropped_image.resize((new_width, new_height), Image.LANCZOS)
471
- except Exception as e:
472
- print(f"Failed to resize image to {new_width}x{new_height}: {e}")
473
- continue
474
 
475
- # 預處理裁剪圖像
476
- try:
477
- image_input = self.preprocess(scaled_image).unsqueeze(0).to(self.device)
478
- except Exception as e:
479
- print(f"Failed to preprocess image: {e}")
480
- continue
481
 
482
- # 獲取圖像特徵
483
- with torch.no_grad():
484
- try:
485
- image_features = self.model.encode_image(image_input)
486
- image_features = image_features / image_features.norm(dim=-1, keepdim=True)
487
 
488
- # 計算與地標提示的相似度
489
- similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)
490
- similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
491
 
492
  # 找到最佳匹配
493
- best_idx = similarity.argmax().item()
494
- best_score = similarity[best_idx]
495
 
496
  # 如果當前尺度結果更好,則更新
497
  if best_score > best_result["confidence"]:
498
- landmark_id = list(self.landmark_data.keys())[best_idx]
499
- landmark_info = self.landmark_data[landmark_id]
500
-
501
- best_result = {
502
- "landmark_id": landmark_id,
503
- "landmark_name": landmark_info["name"],
504
- "location": landmark_info["location"],
505
- "confidence": float(best_score),
506
- "is_landmark": best_score >= adjusted_threshold,
507
- "scale_used": scale,
508
- "aspect_ratio_used": aspect_ratio,
509
- "viewpoint": dominant_viewpoint
510
- }
511
-
512
- # 添加額外可用信息
513
- for key in ["year_built", "architectural_style", "significance"]:
514
- if key in landmark_info:
515
- best_result[key] = landmark_info[key]
 
 
 
 
 
 
516
  except Exception as e:
517
- print(f"Error in calculating similarity: {e}")
518
  continue
519
 
520
- # 只有在有識別出地標ID且信心度足夠高時才應用地標類型閾值調整
521
- if best_result["landmark_id"]:
522
- landmark_type = self._determine_landmark_type(best_result["landmark_id"])
 
523
 
524
- # 檢測是否為特殊類型的建築如斜塔
525
- if landmark_type == "distinctive":
526
- # 特殊建築的閾值降低25%
527
- type_multiplier = 0.75
528
- else:
529
- # 使用已有的類型閾值
530
- type_multiplier = self.landmark_type_thresholds.get(landmark_type, 1.0) / 0.5
531
 
532
- # 更新判斷是否為地標的標準
533
- final_threshold = adjusted_threshold * type_multiplier
534
- best_result["is_landmark"] = best_result["confidence"] >= final_threshold
535
- best_result["landmark_type"] = landmark_type # 添加地標類型信息
536
- best_result["threshold_applied"] = final_threshold # 記錄應用的閾值
537
 
538
- # 快取結果
539
- self.results_cache[region_key] = best_result
540
- self._manage_cache()
 
 
 
541
 
542
- return best_result
543
 
544
  def classify_batch_regions(self,
545
  image: Union[Image.Image, np.ndarray],
@@ -556,73 +258,76 @@ class CLIPZeroShotClassifier:
556
  Returns:
557
  List[Dict]: 分類結果列表
558
  """
559
- if not self.landmark_text_features is not None:
560
- return [{"is_landmark": False, "confidence": 0.0} for _ in boxes]
561
-
562
- # 確保圖像是PIL格式
563
- if not isinstance(image, Image.Image):
564
- if isinstance(image, np.ndarray):
565
- image = Image.fromarray(image)
566
- else:
567
- raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
568
-
569
- # 無框可處理時
570
- if not boxes:
571
- return []
572
 
573
- # 裁剪並預處理所有區域
574
- cropped_inputs = []
575
- for box in boxes:
576
- x1, y1, x2, y2 = map(int, box)
577
- cropped_image = image.crop((x1, y1, x2, y2))
578
- processed_image = self.preprocess(cropped_image).unsqueeze(0)
579
- cropped_inputs.append(processed_image)
580
 
581
- # batch process
582
- batch_tensor = torch.cat(cropped_inputs).to(self.device)
583
 
584
- # batch encoding
585
- with torch.no_grad():
586
- image_features = self.model.encode_image(batch_tensor)
587
- image_features = image_features / image_features.norm(dim=-1, keepdim=True)
588
 
589
  # 計算相似度
590
- similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)
591
- similarity = similarity.cpu().numpy() if self.device == "cuda" else similarity.numpy()
592
-
593
- # 處理每個區域的結果
594
- results = []
595
- for i, sim in enumerate(similarity):
596
- best_idx = sim.argmax().item()
597
- best_score = sim[best_idx]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
598
 
599
- if best_score >= threshold:
600
- landmark_id = list(self.landmark_data.keys())[best_idx]
601
- landmark_info = self.landmark_data[landmark_id]
602
 
603
- results.append({
604
- "landmark_id": landmark_id,
605
- "landmark_name": landmark_info["name"],
606
- "location": landmark_info["location"],
607
- "confidence": float(best_score),
608
- "is_landmark": True,
609
- "box": boxes[i]
610
- })
611
- else:
612
- results.append({
613
- "landmark_id": None,
614
- "landmark_name": None,
615
- "confidence": float(best_score),
616
- "is_landmark": False,
617
- "box": boxes[i]
618
- })
619
-
620
- return results
621
 
622
  def search_entire_image(self,
623
- image: Union[Image.Image, np.ndarray],
624
- threshold: float = 0.35,
625
- detailed_analysis: bool = False) -> Dict[str, Any]:
626
  """
627
  檢查整張圖像是否包含地標,具有增強的分析能力
628
 
@@ -634,92 +339,103 @@ class CLIPZeroShotClassifier:
634
  Returns:
635
  Dict: 地標分類結果
636
  """
637
- # 確保圖像是PIL格式
638
- if not isinstance(image, Image.Image):
639
- if isinstance(image, np.ndarray):
640
- image = Image.fromarray(image)
641
- else:
642
- raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
643
-
644
- # 檢查快取
645
- image_key = (self._get_image_hash(image), "entire_image", detailed_analysis)
646
- if image_key in self.results_cache:
647
- return self.results_cache[image_key]
648
-
649
- # 調整閾值
650
- adjusted_threshold = threshold * self.confidence_threshold_multipliers.get("full_image", 1.0)
651
-
652
- # 預處理圖像
653
- image_input = self.preprocess(image).unsqueeze(0).to(self.device)
654
-
655
- # 獲取圖像特徵
656
- with torch.no_grad():
657
- image_features = self.model.encode_image(image_input)
658
- image_features = image_features / image_features.norm(dim=-1, keepdim=True)
659
-
660
- # 計算與地標提示的相似度
661
- similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)
662
- similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
663
-
664
- # 找到最佳匹配
665
- best_idx = similarity.argmax().item()
666
- best_score = similarity[best_idx]
667
-
668
- # top3 landmark
669
- top_indices = similarity.argsort()[-3:][::-1]
670
- top_landmarks = []
671
-
672
- for idx in top_indices:
673
- score = similarity[idx]
674
- landmark_id = list(self.landmark_data.keys())[idx]
675
- landmark_info = self.landmark_data[landmark_id]
676
-
677
- landmark_result = {
678
- "landmark_id": landmark_id,
679
- "landmark_name": landmark_info["name"],
680
- "location": landmark_info["location"],
681
- "confidence": float(score)
682
- }
683
 
684
- # 添加額外可用信息
685
- if "year_built" in landmark_info:
686
- landmark_result["year_built"] = landmark_info["year_built"]
687
- if "architectural_style" in landmark_info:
688
- landmark_result["architectural_style"] = landmark_info["architectural_style"]
689
- if "significance" in landmark_info:
690
- landmark_result["significance"] = landmark_info["significance"]
691
 
692
- top_landmarks.append(landmark_result)
 
 
693
 
694
- # main result
695
- result = {}
696
- if best_score >= adjusted_threshold:
697
- landmark_id = list(self.landmark_data.keys())[best_idx]
698
- landmark_info = self.landmark_data[landmark_id]
699
 
700
- # 應用地標類型特定閾值
701
- landmark_type = self._determine_landmark_type(landmark_id)
702
- type_multiplier = self.landmark_type_thresholds.get(landmark_type, 1.0) / 0.5
703
- final_threshold = adjusted_threshold * type_multiplier
704
 
705
- if best_score >= final_threshold:
706
- result = {
707
- "landmark_id": landmark_id,
708
- "landmark_name": landmark_info["name"],
709
- "location": landmark_info["location"],
710
- "confidence": float(best_score),
711
- "is_landmark": True,
712
- "landmark_type": landmark_type,
713
- "top_landmarks": top_landmarks
714
- }
 
 
 
 
 
 
 
 
715
 
716
- # 添加額外可用信息
717
- if "year_built" in landmark_info:
718
- result["year_built"] = landmark_info["year_built"]
719
- if "architectural_style" in landmark_info:
720
- result["architectural_style"] = landmark_info["architectural_style"]
721
- if "significance" in landmark_info:
722
- result["significance"] = landmark_info["significance"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
723
  else:
724
  result = {
725
  "landmark_id": None,
@@ -729,266 +445,49 @@ class CLIPZeroShotClassifier:
729
  "top_landmarks": top_landmarks
730
  }
731
 
732
- # 如果請求詳細分析且是地標,進一步分析圖像區域
733
- if detailed_analysis and result.get("is_landmark", False):
734
- # 創建不同區域進行更深入分析
735
- width, height = image.size
736
- regions = [
737
- # 中心區域
738
- [width * 0.25, height * 0.25, width * 0.75, height * 0.75],
739
- # 左半部
740
- [0, 0, width * 0.5, height],
741
- # 右半部
742
- [width * 0.5, 0, width, height],
743
- # 上半部
744
- [0, 0, width, height * 0.5],
745
- # 下半部
746
- [0, height * 0.5, width, height]
747
- ]
748
-
749
- region_results = []
750
- for i, box in enumerate(regions):
751
- region_result = self.classify_image_region(
752
- image,
753
- box,
754
- threshold=threshold * 0.9,
755
- detection_type="partial"
756
- )
757
- if region_result["is_landmark"]:
758
- region_result["region_name"] = ["center", "left", "right", "top", "bottom"][i]
759
- region_results.append(region_result)
760
-
761
- # 添加區域分析結果
762
- if region_results:
763
- result["region_analyses"] = region_results
764
-
765
- # 快取結果
766
- self.results_cache[image_key] = result
767
- self._manage_cache()
768
-
769
- return result
770
-
771
- def enhanced_landmark_detection(self,
772
- image: Union[Image.Image, np.ndarray],
773
- threshold: float = 0.3) -> Dict[str, Any]:
774
- """
775
- Enhanced landmark detection using multiple analysis techniques.
776
-
777
- Args:
778
- image: Input image
779
- threshold: Base confidence threshold
780
-
781
- Returns:
782
- Dict: Comprehensive landmark detection results
783
- """
784
- # Ensure image is PIL format
785
- if not isinstance(image, Image.Image):
786
- if isinstance(image, np.ndarray):
787
- image = Image.fromarray(image)
788
- else:
789
- raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
790
-
791
- # Phase 1: Analyze viewpoint to adjust detection parameters
792
- viewpoint_info = self._analyze_viewpoint(image)
793
- viewpoint = viewpoint_info["dominant_viewpoint"]
794
-
795
- # Adjust threshold based on viewpoint
796
- if viewpoint == "distant":
797
- adjusted_threshold = threshold * 0.7 # Lower threshold for distant views
798
- elif viewpoint == "close_up":
799
- adjusted_threshold = threshold * 1.1 # Higher threshold for close-ups
800
- else:
801
- adjusted_threshold = threshold
802
-
803
- # Phase 2: Perform multi-scale pyramid analysis
804
- pyramid_results = self._perform_pyramid_analysis(image, levels=3, base_threshold=adjusted_threshold)
805
-
806
- # Phase 3: Perform grid-based region analysis
807
- grid_results = []
808
- width, height = image.size
809
-
810
- # Create adaptive grid based on viewpoint
811
- if viewpoint == "distant":
812
- grid_size = 3 # Coarser grid for distant views
813
- elif viewpoint == "close_up":
814
- grid_size = 5 # Finer grid for close-ups
815
- else:
816
- grid_size = 4 # Default grid size
817
-
818
- # Generate grid regions
819
- for i in range(grid_size):
820
- for j in range(grid_size):
821
- box = [
822
- width * (j/grid_size),
823
- height * (i/grid_size),
824
- width * ((j+1)/grid_size),
825
- height * ((i+1)/grid_size)
826
  ]
827
 
828
- # Apply feature enhancement
829
- region_result = self.classify_image_region(
830
- image,
831
- box,
832
- threshold=adjusted_threshold,
833
- detection_type="auto"
834
- )
835
-
836
- if region_result["is_landmark"]:
837
- region_result["grid_position"] = (i, j)
838
- grid_results.append(region_result)
839
-
840
- # Phase 4: Cross-validate and combine results
841
- all_detections = []
842
-
843
- # Add pyramid results
844
- if pyramid_results["is_landmark"] and pyramid_results["best_result"]:
845
- all_detections.append({
846
- "source": "pyramid",
847
- "landmark_id": pyramid_results["best_result"]["landmark_id"],
848
- "landmark_name": pyramid_results["best_result"]["landmark_name"],
849
- "confidence": pyramid_results["best_result"]["confidence"],
850
- "scale_factor": pyramid_results["best_result"].get("scale_factor", 1.0)
851
- })
852
-
853
- # Add grid results
854
- for result in grid_results:
855
- all_detections.append({
856
- "source": "grid",
857
- "landmark_id": result["landmark_id"],
858
- "landmark_name": result["landmark_name"],
859
- "confidence": result["confidence"],
860
- "grid_position": result.get("grid_position", (0, 0))
861
- })
862
-
863
- # Search entire image
864
- full_image_result = self.search_entire_image(image, threshold=adjusted_threshold)
865
- if full_image_result and full_image_result.get("is_landmark", False):
866
- all_detections.append({
867
- "source": "full_image",
868
- "landmark_id": full_image_result["landmark_id"],
869
- "landmark_name": full_image_result["landmark_name"],
870
- "confidence": full_image_result["confidence"]
871
- })
872
-
873
- # Group by landmark_id and calculate aggregate confidence
874
- landmark_groups = {}
875
- for detection in all_detections:
876
- landmark_id = detection["landmark_id"]
877
- if landmark_id not in landmark_groups:
878
- landmark_groups[landmark_id] = {
879
- "landmark_id": landmark_id,
880
- "landmark_name": detection["landmark_name"],
881
- "detections": [],
882
- "sources": set()
883
- }
884
-
885
- landmark_groups[landmark_id]["detections"].append(detection)
886
- landmark_groups[landmark_id]["sources"].add(detection["source"])
887
-
888
- # Calculate aggregate confidence for each landmark
889
- for landmark_id, group in landmark_groups.items():
890
- detections = group["detections"]
891
-
892
- # Base confidence is the maximum confidence from any source
893
- max_confidence = max(d["confidence"] for d in detections)
894
 
895
- # Bonus for detection from multiple sources
896
- source_count = len(group["sources"])
897
- source_bonus = min(0.15, (source_count - 1) * 0.05) # Up to 15% bonus
898
 
899
- # Consistency bonus for multiple detections of the same landmark
900
- detection_count = len(detections)
901
- consistency_bonus = min(0.1, (detection_count - 1) * 0.02) # Up to 10% bonus
902
 
903
- # Calculate final confidence
904
- aggregate_confidence = min(1.0, max_confidence + source_bonus + consistency_bonus)
905
 
906
- group["confidence"] = aggregate_confidence
907
- group["detection_count"] = detection_count
908
- group["source_count"] = source_count
909
-
910
- # Sort landmarks by confidence
911
- sorted_landmarks = sorted(
912
- landmark_groups.values(),
913
- key=lambda x: x["confidence"],
914
- reverse=True
915
- )
916
-
917
- return {
918
- "is_landmark_scene": len(sorted_landmarks) > 0,
919
- "detected_landmarks": sorted_landmarks,
920
- "viewpoint_info": viewpoint_info,
921
- "primary_landmark": sorted_landmarks[0] if sorted_landmarks else None
922
- }
923
-
924
- def _analyze_architectural_features(self, image):
925
- """
926
- Analyzes the architectural features of a structure in the image without hardcoding specific landmarks.
927
 
928
- Args:
929
- image: Input image
930
-
931
- Returns:
932
- Dict: Architectural feature analysis results
933
- """
934
- # Define universal architectural feature prompts that apply to all types of landmarks
935
- architecture_prompts = {
936
- "tall_structure": "a tall vertical structure standing alone",
937
- "tiered_building": "a building with multiple stacked tiers or segments",
938
- "historical_structure": "a building with historical architectural elements",
939
- "modern_design": "a modern structure with contemporary architectural design",
940
- "segmented_exterior": "a structure with visible segmented or sectioned exterior",
941
- "viewing_platform": "a tall structure with observation area at the top",
942
- "time_display": "a structure with timepiece features",
943
- "glass_facade": "a building with prominent glass exterior surfaces",
944
- "memorial_structure": "a monument or memorial structure",
945
- "ancient_construction": "ancient constructed elements or archaeological features",
946
- "natural_landmark": "a natural geographic formation or landmark",
947
- "slanted_design": "a structure with non-vertical or leaning profile"
948
- }
949
-
950
- # Calculate similarity scores against universal architectural patterns
951
- context_scores = self.calculate_similarity_scores(image, architecture_prompts)
952
-
953
- # Determine most relevant architectural features
954
- top_features = sorted(context_scores.items(), key=lambda x: x[1], reverse=True)[:3]
955
-
956
- # Calculate feature confidence
957
- context_confidence = sum(score for _, score in top_features) / 3
958
-
959
- # Determine primary architectural category based on top features
960
- architectural_categories = {
961
- "tower": ["tall_structure", "viewing_platform", "time_display"],
962
- "skyscraper": ["tall_structure", "modern_design", "glass_facade"],
963
- "historical": ["historical_structure", "ancient_construction", "memorial_structure"],
964
- "natural": ["natural_landmark"],
965
- "distinctive": ["tiered_building", "segmented_exterior", "slanted_design"]
966
- }
967
-
968
- # Score each category based on the top features
969
- category_scores = {}
970
- for category, features in architectural_categories.items():
971
- category_score = 0
972
- for feature, score in context_scores.items():
973
- if feature in features:
974
- category_score += score
975
- category_scores[category] = category_score
976
-
977
- primary_category = max(category_scores.items(), key=lambda x: x[1])[0]
978
-
979
- return {
980
- "architectural_features": top_features,
981
- "context_confidence": context_confidence,
982
- "primary_category": primary_category,
983
- "category_scores": category_scores
984
- }
985
 
986
  def intelligent_landmark_search(self,
987
- image: Union[Image.Image, np.ndarray],
988
- yolo_boxes: Optional[List[List[float]]] = None,
989
- base_threshold: float = 0.25) -> Dict[str, Any]:
990
  """
991
- 對圖像進行智能地標搜索,綜合整張圖像分析和區域分析
992
 
993
  Args:
994
  image: 原始圖像
@@ -998,158 +497,121 @@ class CLIPZeroShotClassifier:
998
  Returns:
999
  Dict: 包含所有檢測結果的綜合分析
1000
  """
1001
- # 確保圖像是PIL格式
1002
- if not isinstance(image, Image.Image):
1003
- if isinstance(image, np.ndarray):
1004
- image = Image.fromarray(image)
1005
- else:
1006
- raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
 
 
 
 
 
 
 
 
1007
 
1008
- # No YOLO 框時,可以稍微降低閾值以提高召回率
1009
- actual_threshold = base_threshold * 0.85 if yolo_boxes is None or len(yolo_boxes) == 0 else base_threshold
1010
 
1011
- # 首先對整張圖像進行分析
1012
- try:
1013
  full_image_result = self.search_entire_image(
1014
  image,
1015
  threshold=actual_threshold,
1016
- detailed_analysis=True # 確保詳細分析開啟
1017
  )
1018
 
1019
- # No YOLO 框,則進行多尺度分析以提高檢測機會
1020
  if (yolo_boxes is None or len(yolo_boxes) == 0) and (not full_image_result or not full_image_result.get("is_landmark", False)):
1021
- print("No YOLO boxes provided, attempting multi-scale pyramid analysis")
1022
- try:
1023
- if hasattr(self, '_perform_pyramid_analysis'):
1024
- pyramid_results = self._perform_pyramid_analysis(
1025
- image,
1026
- levels=4, #
1027
- base_threshold=actual_threshold,
1028
- aspect_ratios=[1.0, 0.75, 1.5, 0.5, 2.0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1029
  )
1030
 
1031
- if pyramid_results and pyramid_results.get("is_landmark", False) and pyramid_results.get("best_result", {}).get("confidence", 0) > actual_threshold:
1032
- # 使用金字塔分析結果增強或替代全圖結果
1033
- if not full_image_result or not full_image_result.get("is_landmark", False):
1034
- full_image_result = {
1035
- "is_landmark": True,
1036
- "landmark_id": pyramid_results["best_result"]["landmark_id"],
1037
- "landmark_name": pyramid_results["best_result"]["landmark_name"],
1038
- "confidence": pyramid_results["best_result"]["confidence"],
1039
- "location": pyramid_results["best_result"].get("location", "Unknown Location")
1040
- }
1041
- print(f"Pyramid analysis detected landmark: {pyramid_results['best_result']['landmark_name']} with confidence {pyramid_results['best_result']['confidence']:.3f}")
1042
- else:
1043
- print("Pyramid analysis not available, skipping multi-scale detection")
1044
- except Exception as e:
1045
- print(f"Error in pyramid analysis: {e}")
1046
- except Exception as e:
1047
- print(f"Error in search_entire_image: {e}")
1048
- import traceback
1049
- traceback.print_exc()
1050
- full_image_result = None
1051
-
1052
- # 初始化結果字典
1053
- result = {
1054
- "full_image_analysis": full_image_result if full_image_result else {},
1055
- "is_landmark_scene": False, # 默認值
1056
- "detected_landmarks": []
1057
- }
1058
-
1059
- # 上下文感知比較,處理接近的排名結果
1060
- if full_image_result and "top_landmarks" in full_image_result and len(full_image_result["top_landmarks"]) >= 2:
1061
- top_landmarks = full_image_result["top_landmarks"]
1062
-
1063
- # 檢查前兩個結果是否非常接近(信心度差異小於 0.1)
1064
- if len(top_landmarks) >= 2 and abs(top_landmarks[0]["confidence"] - top_landmarks[1]["confidence"]) < 0.1:
1065
- # 對於接近的結果,使用通用建築特徵分析進行區分
1066
- try:
1067
- # 分析建築特徵
1068
- if hasattr(self, '_analyze_architectural_features'):
1069
- architectural_analysis = self._analyze_architectural_features(image)
1070
- top_features = architectural_analysis.get("architectural_features", [])
1071
- primary_category = architectural_analysis.get("primary_category", "")
1072
-
1073
- # 根據建築特徵調整地標置信度
1074
- for i, landmark in enumerate(top_landmarks[:2]):
1075
- if i >= len(top_landmarks):
1076
- continue
1077
 
1078
- landmark_id = landmark.get("landmark_id", "").lower()
1079
- confidence_boost = 0
1080
-
1081
- # 使用主要建築類別來調整置信度,使用通用條件而非特定地標名稱
1082
- if primary_category == "tower" and any(term in landmark_id for term in ["tower", "spire", "needle"]):
1083
- confidence_boost += 0.05
1084
- elif primary_category == "skyscraper" and any(term in landmark_id for term in ["building", "skyscraper", "tall"]):
1085
- confidence_boost += 0.05
1086
- elif primary_category == "historical" and any(term in landmark_id for term in ["monument", "castle", "palace", "temple"]):
1087
- confidence_boost += 0.05
1088
- elif primary_category == "distinctive" and any(term in landmark_id for term in ["unusual", "unique", "special", "famous"]):
1089
- confidence_boost += 0.05
1090
-
1091
- # 根據特定特徵進一步微調,使用通用特徵描述而非特定地標
1092
- for feature, score in top_features:
1093
- if feature == "time_display" and "clock" in landmark_id:
1094
- confidence_boost += 0.03
1095
- elif feature == "segmented_exterior" and "segmented" in landmark_id:
1096
- confidence_boost += 0.03
1097
- elif feature == "slanted_design" and "leaning" in landmark_id:
1098
- confidence_boost += 0.03
1099
-
1100
- # 應用信心度調整
1101
- if confidence_boost > 0 and i < len(top_landmarks):
1102
- top_landmarks[i]["confidence"] += confidence_boost
1103
- print(f"Boosted {landmark['landmark_name']} confidence by {confidence_boost:.2f} based on architectural features ({primary_category})")
1104
-
1105
- # 重新排序
1106
- top_landmarks.sort(key=lambda x: x["confidence"], reverse=True)
1107
- full_image_result["top_landmarks"] = top_landmarks
1108
- if top_landmarks:
1109
- full_image_result["landmark_id"] = top_landmarks[0]["landmark_id"]
1110
- full_image_result["landmark_name"] = top_landmarks[0]["landmark_name"]
1111
- full_image_result["confidence"] = top_landmarks[0]["confidence"]
1112
- full_image_result["location"] = top_landmarks[0].get("location", "Unknown Location")
1113
- except Exception as e:
1114
- print(f"Error in architectural feature analysis: {e}")
1115
- import traceback
1116
- traceback.print_exc()
1117
-
1118
- if full_image_result and full_image_result.get("is_landmark", False):
1119
- result["is_landmark_scene"] = True
1120
- landmark_id = full_image_result.get("landmark_id", "unknown")
1121
-
1122
- # extract landmark info
1123
- landmark_specific_info = self._extract_landmark_specific_info(landmark_id)
1124
-
1125
- landmark_info = {
1126
- "landmark_id": landmark_id,
1127
- "landmark_name": full_image_result.get("landmark_name", "Unknown Landmark"),
1128
- "confidence": full_image_result.get("confidence", 0.0),
1129
- "location": full_image_result.get("location", "Unknown Location"),
1130
- "region_type": "full_image",
1131
- "box": [0, 0, getattr(image, 'width', 0), getattr(image, 'height', 0)]
1132
- }
1133
 
1134
- # 整合地標特定info,確保正確的名稱被使用
1135
- landmark_info.update(landmark_specific_info)
 
 
1136
 
1137
- # 如果特定信息中有更準確的地標名稱,使用它
1138
- if landmark_specific_info.get("landmark_name"):
1139
- landmark_info["landmark_name"] = landmark_specific_info["landmark_name"]
1140
 
1141
- result["detected_landmarks"].append(landmark_info)
 
 
 
 
 
 
 
1142
 
1143
- # 確保地標特定活動被正確設置為主要結果
1144
- if landmark_specific_info.get("has_specific_activities", False):
1145
- result["primary_landmark_activities"] = landmark_specific_info.get("landmark_specific_activities", [])
1146
- print(f"Set primary landmark activities: {len(result['primary_landmark_activities'])} activities for {landmark_info['landmark_name']}")
1147
 
1148
- # 如果提供了YOLO邊界框,分析這些區域
1149
- if yolo_boxes and len(yolo_boxes) > 0:
1150
- for box in yolo_boxes:
1151
- try:
1152
- if hasattr(self, 'classify_image_region'):
 
 
 
 
 
 
 
 
1153
  box_result = self.classify_image_region(
1154
  image,
1155
  box,
@@ -1157,13 +619,10 @@ class CLIPZeroShotClassifier:
1157
  detection_type="auto"
1158
  )
1159
 
1160
- # 如果檢測到地標
1161
  if box_result and box_result.get("is_landmark", False):
1162
- # 檢查是否與已檢測的地標重複
1163
  is_duplicate = False
1164
  for existing in result["detected_landmarks"]:
1165
  if existing.get("landmark_id") == box_result.get("landmark_id"):
1166
- # 如果新的置信度更高,則更新
1167
  if box_result.get("confidence", 0) > existing.get("confidence", 0):
1168
  existing.update({
1169
  "confidence": box_result.get("confidence", 0),
@@ -1173,7 +632,6 @@ class CLIPZeroShotClassifier:
1173
  is_duplicate = True
1174
  break
1175
 
1176
- # 如果不是重複的,添加到列表
1177
  if not is_duplicate:
1178
  result["detected_landmarks"].append({
1179
  "landmark_id": box_result.get("landmark_id", "unknown"),
@@ -1183,234 +641,250 @@ class CLIPZeroShotClassifier:
1183
  "region_type": "yolo_box",
1184
  "box": box
1185
  })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1186
  except Exception as e:
1187
- print(f"Error in analyzing YOLO box: {e}")
1188
- continue
1189
-
1190
- # 最後,執行額外的網格搜索以捕獲可能被遺漏的地標
1191
- # 但只有在尚未發現地標或僅發現低置信度地標時
1192
- should_do_grid_search = (
1193
- len(result["detected_landmarks"]) == 0 or
1194
- max([landmark.get("confidence", 0) for landmark in result["detected_landmarks"]], default=0) < 0.5
1195
- )
1196
-
1197
- if should_do_grid_search and hasattr(self, 'classify_image_region'):
1198
- try:
1199
- # 創建5x5網格
1200
- width, height = getattr(image, 'size', (getattr(image, 'width', 0), getattr(image, 'height', 0)))
1201
- if not isinstance(width, (int, float)) or width <= 0:
1202
- width = getattr(image, 'width', 0)
1203
- if not isinstance(height, (int, float)) or height <= 0:
1204
- height = getattr(image, 'height', 0)
1205
-
1206
- if width > 0 and height > 0:
1207
- grid_boxes = []
1208
- for i in range(5):
1209
- for j in range(5):
1210
- grid_boxes.append([
1211
- width * (j/5), height * (i/5),
1212
- width * ((j+1)/5), height * ((i+1)/5)
1213
- ])
1214
-
1215
- # 分析每個網格區域
1216
- for box in grid_boxes:
1217
- try:
1218
- grid_result = self.classify_image_region(
1219
- image,
1220
- box,
1221
- threshold=base_threshold * 0.9, # 稍微降低網格搜索閾值
1222
- detection_type="partial"
1223
- )
1224
-
1225
- # 如果檢測到地標
1226
- if grid_result and grid_result.get("is_landmark", False):
1227
- # 檢查是否與已檢測的地標重複
1228
- is_duplicate = False
1229
- for existing in result["detected_landmarks"]:
1230
- if existing.get("landmark_id") == grid_result.get("landmark_id"):
1231
- is_duplicate = True
1232
- break
1233
-
1234
- # 如果不是重複的,添加到列表
1235
- if not is_duplicate:
1236
- result["detected_landmarks"].append({
1237
- "landmark_id": grid_result.get("landmark_id", "unknown"),
1238
- "landmark_name": grid_result.get("landmark_name", "Unknown Landmark"),
1239
- "confidence": grid_result.get("confidence", 0.0),
1240
- "location": grid_result.get("location", "Unknown Location"),
1241
- "region_type": "grid",
1242
- "box": box
1243
- })
1244
- except Exception as e:
1245
- print(f"Error in analyzing grid region: {e}")
1246
- continue
1247
- except Exception as e:
1248
- print(f"Error in grid search: {e}")
1249
- import traceback
1250
- traceback.print_exc()
1251
 
1252
- # 按置信度排序檢測結果
1253
- result["detected_landmarks"].sort(key=lambda x: x.get("confidence", 0), reverse=True)
1254
 
1255
- # 更新整體場景類型判斷
1256
- if len(result["detected_landmarks"]) > 0:
1257
- result["is_landmark_scene"] = True
1258
- result["primary_landmark"] = result["detected_landmarks"][0]
1259
 
1260
- # 添加 clip_analysis_on_full_image 結果,以便給 LLM 提供更多上下文
1261
- if full_image_result and "clip_analysis" in full_image_result:
1262
- result["clip_analysis_on_full_image"] = full_image_result["clip_analysis"]
1263
 
1264
- return result
1265
 
1266
- def _extract_landmark_specific_info(self, landmark_id: str) -> Dict[str, Any]:
 
 
 
 
 
 
 
 
 
 
 
1267
  """
1268
- 提取特定地標的詳細信息,包括特色模板和活動建議
1269
 
1270
  Args:
1271
- landmark_id: 地標ID
 
1272
 
1273
  Returns:
1274
- Dict: 地標特定信息
1275
  """
1276
- if not landmark_id or landmark_id == "unknown":
1277
- return {"has_specific_activities": False}
1278
-
1279
- specific_info = {"has_specific_activities": False}
1280
-
1281
- # 從 ALL_LANDMARKS 或 self.landmark_data 中提取基本信息
1282
- landmark_data_source = None
1283
-
1284
- # 優先嘗試從類屬性獲取
1285
- if hasattr(self, 'landmark_data') and self.landmark_data and landmark_id in self.landmark_data:
1286
- landmark_data_source = self.landmark_data[landmark_id]
1287
- print(f"Using landmark data from class attribute for {landmark_id}")
1288
- else:
1289
- try:
1290
- if landmark_id in ALL_LANDMARKS:
1291
- landmark_data_source = ALL_LANDMARKS[landmark_id]
1292
- print(f"Using landmark data from ALL_LANDMARKS for {landmark_id}")
1293
- except ImportError:
1294
- print("Warning: Could not import ALL_LANDMARKS from landmark_data")
1295
- except Exception as e:
1296
- print(f"Error accessing ALL_LANDMARKS: {e}")
1297
-
1298
- # 處理地標基本數據
1299
- if landmark_data_source:
1300
- # 提取正確的地標名稱
1301
- if "name" in landmark_data_source:
1302
- specific_info["landmark_name"] = landmark_data_source["name"]
1303
-
1304
- # 提取所有可用的 prompts 作為特色模板
1305
- if "prompts" in landmark_data_source:
1306
- specific_info["feature_templates"] = landmark_data_source["prompts"][:5]
1307
- specific_info["primary_template"] = landmark_data_source["prompts"][0]
1308
-
1309
- # 提取別名info
1310
- if "aliases" in landmark_data_source:
1311
- specific_info["aliases"] = landmark_data_source["aliases"]
1312
-
1313
- # 提取位置信息
1314
- if "location" in landmark_data_source:
1315
- specific_info["location"] = landmark_data_source["location"]
1316
-
1317
- # 提取其他相關信息
1318
- for key in ["year_built", "architectural_style", "significance", "description"]:
1319
- if key in landmark_data_source:
1320
- specific_info[key] = landmark_data_source[key]
1321
-
1322
- # 嘗試從 LANDMARK_ACTIVITIES 中提取活動建議
1323
  try:
1324
- if landmark_id in LANDMARK_ACTIVITIES:
1325
- activities = LANDMARK_ACTIVITIES[landmark_id]
1326
- specific_info["landmark_specific_activities"] = activities
1327
- specific_info["has_specific_activities"] = True
1328
- print(f"Found {len(activities)} specific activities for landmark {landmark_id}")
1329
- else:
1330
- print(f"No specific activities found for landmark {landmark_id} in LANDMARK_ACTIVITIES")
1331
- specific_info["has_specific_activities"] = False
1332
- except ImportError:
1333
- print("Warning: Could not import LANDMARK_ACTIVITIES from landmark_activities")
1334
- specific_info["has_specific_activities"] = False
1335
- except Exception as e:
1336
- print(f"Error loading landmark activities for {landmark_id}: {e}")
1337
- specific_info["has_specific_activities"] = False
1338
 
1339
- return specific_info
 
 
 
 
 
1340
 
1341
- def _analyze_viewpoint(self, image: Union[Image.Image, np.ndarray]) -> Dict[str, float]:
1342
- """
1343
- Analyzes the image viewpoint to adjust detection parameters.
1344
 
1345
- Args:
1346
- image: Input image
 
 
 
 
 
1347
 
1348
- Returns:
1349
- Dict: Viewpoint analysis results
1350
- """
1351
- viewpoint_prompts = {
1352
- "aerial_view": "an aerial view from above looking down",
1353
- "street_level": "a street level view looking up at a tall structure",
1354
- "eye_level": "an eye-level horizontal view of a landmark",
1355
- "distant": "a distant view of a landmark on the horizon",
1356
- "close_up": "a close-up detailed view of architectural features",
1357
- "interior": "an interior view inside a structure"
1358
- }
1359
-
1360
- # Calculate similarity scores
1361
- viewpoint_scores = self.calculate_similarity_scores(image, viewpoint_prompts)
1362
-
1363
- # Find dominant viewpoint
1364
- dominant_viewpoint = max(viewpoint_scores.items(), key=lambda x: x[1])
1365
-
1366
- return {
1367
- "viewpoint_scores": viewpoint_scores,
1368
- "dominant_viewpoint": dominant_viewpoint[0],
1369
- "confidence": dominant_viewpoint[1]
1370
- }
1371
-
1372
- def calculate_similarity_scores(self, image: Union[Image.Image, np.ndarray],
1373
- prompts: Dict[str, str]) -> Dict[str, float]:
1374
- """
1375
- 計算圖像與一組特定提示之間的相似度分數
1376
 
1377
- Args:
1378
- image: 輸入圖像
1379
- prompts: 提示詞字典 {名稱: 提示文本}
1380
 
1381
- Returns:
1382
- Dict[str, float]: 每個提示的相似度分數
1383
- """
1384
- # 確保圖像是PIL格式
1385
- if not isinstance(image, Image.Image):
1386
- if isinstance(image, np.ndarray):
1387
- image = Image.fromarray(image)
1388
  else:
1389
- raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1390
 
1391
- # 預處理圖像
1392
- image_input = self.preprocess(image).unsqueeze(0).to(self.device)
 
1393
 
1394
- # 獲取圖像特徵
1395
- with torch.no_grad():
1396
- image_features = self.model.encode_image(image_input)
1397
- image_features = image_features / image_features.norm(dim=-1, keepdim=True)
1398
 
1399
- # 計算與每個提示的相似度
1400
- scores = {}
1401
- prompt_texts = list(prompts.values())
1402
- prompt_tokens = clip.tokenize(prompt_texts).to(self.device)
1403
 
1404
- with torch.no_grad():
1405
- prompt_features = self.model.encode_text(prompt_tokens)
1406
- prompt_features = prompt_features / prompt_features.norm(dim=-1, keepdim=True)
1407
 
1408
- # calculate similarity
1409
- similarity = (100.0 * image_features @ prompt_features.T).softmax(dim=-1)
1410
- similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
1411
 
1412
- # 填充結果字典
1413
- for i, (name, _) in enumerate(prompts.items()):
1414
- scores[name] = float(similarity[i])
 
 
 
 
 
 
 
1415
 
1416
- return scores
 
 
 
 
 
 
 
 
 
 
 
3
  import clip
4
  from PIL import Image
5
  import numpy as np
6
+ import logging
7
+ import traceback
8
  from typing import List, Dict, Tuple, Optional, Union, Any
9
 
10
+ from clip_model_manager import CLIPModelManager
11
+ from landmark_data_manager import LandmarkDataManager
12
+ from image_analyzer import ImageAnalyzer
13
+ from confidence_manager import ConfidenceManager
14
+ from result_cache_manager import ResultCacheManager
15
 
16
  class CLIPZeroShotClassifier:
17
  """
18
+ 使用CLIP模型進行zero shot,專注於辨識世界知名地標。
19
+ 作為YOLO的補充,處理YOLO無法辨識到的地標。
20
+
21
+ 這是一個總窗口class,協調各個組件的工作以提供統一的對外接口。
22
  """
23
+
24
  def __init__(self, model_name: str = "ViT-B/16", device: str = None):
25
  """
26
  初始化CLIP零樣本分類器
 
29
  model_name: CLIP模型名稱,默認為"ViT-B/16"
30
  device: 運行設備,None則自動選擇
31
  """
32
+ self.logger = logging.getLogger(__name__)
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
+ # 初始化各個組件
35
+ self.clip_model_manager = CLIPModelManager(model_name, device)
36
+ self.landmark_data_manager = LandmarkDataManager()
37
+ self.image_analyzer = ImageAnalyzer()
38
+ self.confidence_manager = ConfidenceManager()
39
+ self.cache_manager = ResultCacheManager()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
+ # 預計算地標文本特徵
42
+ self.landmark_text_features = None
43
+ self._precompute_landmark_features()
 
 
 
 
44
 
45
+ self.logger.info(f"Initializing CLIP Zero-Shot Landmark Classifier ({model_name}) on {self.clip_model_manager.get_device()}")
 
 
46
 
47
+ def _precompute_landmark_features(self):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  """
49
+ 預計算地標文本特徵,提高批處理效率
50
  """
51
+ try:
52
+ if self.landmark_data_manager.is_landmark_enabled():
53
+ landmark_prompts = self.landmark_data_manager.get_landmark_prompts()
54
+ if landmark_prompts:
55
+ self.landmark_text_features = self.clip_model_manager.encode_text_batch(landmark_prompts)
56
+ self.logger.info(f"Precomputed text features for {len(landmark_prompts)} landmark prompts")
57
+ else:
58
+ self.logger.warning("No landmark prompts available for precomputation")
59
+ else:
60
+ self.logger.warning("Landmark data not enabled, skipping feature precomputation")
61
+ except Exception as e:
62
+ self.logger.error(f"Error precomputing landmark features: {e}")
63
+ self.logger.error(traceback.format_exc())
64
 
65
  def set_batch_size(self, batch_size: int):
66
  """
 
69
  Args:
70
  batch_size: 新的批處理大小
71
  """
72
+ self.confidence_manager.set_batch_size(batch_size)
 
 
73
 
74
  def adjust_confidence_threshold(self, detection_type: str, multiplier: float):
75
  """
76
  調整特定檢測類型的置信度閾值乘數
77
 
78
+ Args:
79
  detection_type: 檢測類型 ('close_up', 'partial', 'distant', 'full_image')
80
  multiplier: 置信度閾值乘數
81
  """
82
+ self.confidence_manager.adjust_confidence_threshold(detection_type, multiplier)
 
 
 
 
83
 
84
+ def classify_image_region(self,
85
+ image: Union[Image.Image, np.ndarray],
86
+ box: List[float],
87
+ threshold: float = 0.25,
88
+ detection_type: str = "close_up") -> Dict[str, Any]:
89
  """
90
+ 對圖像的特定區域進行地標分類,具有增強的多尺度和部分識別能力
91
 
92
  Args:
93
+ image: 原始圖像 (PIL Image 或 numpy數組)
94
+ box: 邊界框 [x1, y1, x2, y2]
95
+ threshold: 基礎分類置信度閾值
96
+ detection_type: 檢測類型,影響置信度調整
97
 
98
  Returns:
99
+ Dict: 地標分類結果
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  """
101
+ try:
102
+ if not self.landmark_data_manager.is_landmark_enabled():
103
+ return {"is_landmark": False, "confidence": 0.0}
 
 
 
 
104
 
105
+ # 確保圖像是PIL格式
106
+ if not isinstance(image, Image.Image):
107
+ if isinstance(image, np.ndarray):
108
+ image = Image.fromarray(image)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  else:
110
+ raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
+ # 生成圖像區域的hash用於快取
113
+ image_hash = self.image_analyzer.get_image_hash(image)
114
+ region_key = self.cache_manager.get_region_cache_key(image_hash, tuple(box), detection_type)
115
 
116
+ # 檢查快取
117
+ cached_result = self.cache_manager.get_cached_result(region_key)
118
+ if cached_result is not None:
119
+ return cached_result
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
+ # 裁剪區域
122
+ x1, y1, x2, y2 = map(int, box)
123
+ cropped_image = image.crop((x1, y1, x2, y2))
124
+ enhanced_image = self.image_analyzer.enhance_features(cropped_image)
125
 
126
+ # 分析視角信息
127
+ viewpoint_info = self.image_analyzer.analyze_viewpoint(enhanced_image, self.clip_model_manager)
128
+ dominant_viewpoint = viewpoint_info["dominant_viewpoint"]
 
 
129
 
130
+ # 計算區域信息
131
+ region_width = x2 - x1
132
+ region_height = y2 - y1
133
+ image_width, image_height = image.size
134
 
135
+ # 根據區域大小判斷可能的檢測類型
136
+ if detection_type == "auto":
137
+ detection_type = self.confidence_manager.determine_detection_type_from_region(
138
+ region_width, region_height, image_width, image_height
139
+ )
140
 
141
+ # 根據視角調整檢測類型
142
+ detection_type = self.confidence_manager.adjust_detection_type_by_viewpoint(detection_type, dominant_viewpoint)
 
 
 
 
 
 
 
143
 
144
+ # 調整置信度閾值
145
+ adjusted_threshold = self.confidence_manager.calculate_adjusted_threshold(threshold, detection_type)
146
 
147
+ # 準備多尺度和縱橫比分析
148
+ scales = [1.0]
149
+ if detection_type in ["partial", "distant"]:
150
+ scales = [0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3]
151
 
152
+ if dominant_viewpoint in ["angled_view", "low_angle"]:
153
+ scales = [0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
 
155
+ aspect_ratios = [1.0, 0.8, 1.2]
156
+ if dominant_viewpoint in ["angled_view", "unique_feature"]:
157
+ aspect_ratios = [0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5]
158
 
159
+ best_result = {
160
+ "landmark_id": None,
161
+ "landmark_name": None,
162
+ "confidence": 0.0,
163
+ "is_landmark": False
164
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
 
166
+ # 多尺度和縱橫比分析
167
+ for scale in scales:
168
+ for aspect_ratio in aspect_ratios:
169
+ try:
170
+ # 縮放裁剪區域
171
+ current_width, current_height = cropped_image.size
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
+ if aspect_ratio != 1.0:
174
+ new_width = int(current_width * scale * (1/aspect_ratio)**0.5)
175
+ new_height = int(current_height * scale * aspect_ratio**0.5)
176
+ else:
177
+ new_width = int(current_width * scale)
178
+ new_height = int(current_height * scale)
179
 
180
+ new_width = max(1, new_width)
181
+ new_height = max(1, new_height)
 
 
 
 
182
 
183
+ scaled_image = cropped_image.resize((new_width, new_height), Image.LANCZOS)
 
 
 
 
 
184
 
185
+ # 預處理並獲取特徵
186
+ image_input = self.clip_model_manager.preprocess_image(scaled_image)
187
+ image_features = self.clip_model_manager.encode_image(image_input)
 
 
188
 
189
+ # 計算相似度
190
+ similarity = self.clip_model_manager.calculate_similarity(image_features, self.landmark_text_features)
 
191
 
192
  # 找到最佳匹配
193
+ best_idx = similarity[0].argmax().item()
194
+ best_score = similarity[0][best_idx]
195
 
196
  # 如果當前尺度結果更好,則更新
197
  if best_score > best_result["confidence"]:
198
+ landmark_id, landmark_info = self.landmark_data_manager.get_landmark_by_index(best_idx)
199
+
200
+ if landmark_id:
201
+ # 先從 LandmarkDataManager 拿 location
202
+ loc = landmark_info.get("location", "")
203
+ # 如果 loc 為空,就從全域 ALL_LANDMARKS 補上
204
+ if not loc and landmark_id in ALL_LANDMARKS:
205
+ loc = ALL_LANDMARKS[landmark_id].get("location", "")
206
+ best_result = {
207
+ "landmark_id": landmark_id,
208
+ "landmark_name": landmark_info.get("name", "Unknown"),
209
+ "location": loc or "Unknown Location",
210
+ "confidence": float(best_score),
211
+ "is_landmark": best_score >= adjusted_threshold,
212
+ "scale_used": scale,
213
+ "aspect_ratio_used": aspect_ratio,
214
+ "viewpoint": dominant_viewpoint
215
+ }
216
+
217
+ # 添加額外可用信息
218
+ for key in ["year_built", "architectural_style", "significance"]:
219
+ if key in landmark_info:
220
+ best_result[key] = landmark_info[key]
221
+
222
  except Exception as e:
223
+ self.logger.error(f"Error in scale analysis: {e}")
224
  continue
225
 
226
+ # 應用地標類型閾值調整
227
+ if best_result["landmark_id"]:
228
+ landmark_type = self.landmark_data_manager.determine_landmark_type(best_result["landmark_id"])
229
+ final_threshold = self.confidence_manager.calculate_final_threshold(adjusted_threshold, detection_type, landmark_type)
230
 
231
+ best_result["is_landmark"] = self.confidence_manager.evaluate_confidence(best_result["confidence"], final_threshold)
232
+ best_result["landmark_type"] = landmark_type
233
+ best_result["threshold_applied"] = final_threshold
 
 
 
 
234
 
235
+ # 快取結果
236
+ self.cache_manager.set_cached_result(region_key, best_result)
 
 
 
237
 
238
+ return best_result
239
+
240
+ except Exception as e:
241
+ self.logger.error(f"Error in classify_image_region: {e}")
242
+ self.logger.error(traceback.format_exc())
243
+ return {"is_landmark": False, "confidence": 0.0}
244
 
 
245
 
246
  def classify_batch_regions(self,
247
  image: Union[Image.Image, np.ndarray],
 
258
  Returns:
259
  List[Dict]: 分類結果列表
260
  """
261
+ try:
262
+ if not self.landmark_data_manager.is_landmark_enabled() or self.landmark_text_features is None:
263
+ return [{"is_landmark": False, "confidence": 0.0} for _ in boxes]
 
 
 
 
 
 
 
 
 
 
264
 
265
+ # 確保圖像是PIL格式
266
+ if not isinstance(image, Image.Image):
267
+ if isinstance(image, np.ndarray):
268
+ image = Image.fromarray(image)
269
+ else:
270
+ raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
 
271
 
272
+ if not boxes:
273
+ return []
274
 
275
+ # 批量處理所有區域
276
+ batch_features = self.clip_model_manager.batch_process_regions(image, boxes)
 
 
277
 
278
  # 計算相似度
279
+ similarity = self.clip_model_manager.calculate_similarity(batch_features, self.landmark_text_features)
280
+
281
+ # 處理每個區域的結果
282
+ results = []
283
+ for i, sim in enumerate(similarity):
284
+ best_idx = sim.argmax().item()
285
+ best_score = sim[best_idx]
286
+
287
+ if best_score >= threshold:
288
+ landmark_id, landmark_info = self.landmark_data_manager.get_landmark_by_index(best_idx)
289
+
290
+ if landmark_id:
291
+ # 如果landmark_info["location"] 為空,則從 ALL_LANDMARKS 補
292
+ loc = landmark_info.get("location", "")
293
+ if not loc and landmark_id in ALL_LANDMARKS:
294
+ loc = ALL_LANDMARKS[landmark_id].get("location", "")
295
+ results.append({
296
+ "landmark_id": landmark_id,
297
+ "landmark_name": landmark_info.get("name", "Unknown"),
298
+ "location": loc or "Unknown Location",
299
+ "confidence": float(best_score),
300
+ "is_landmark": True,
301
+ "box": boxes[i]
302
+ })
303
+ else:
304
+ results.append({
305
+ "landmark_id": None,
306
+ "landmark_name": None,
307
+ "confidence": float(best_score),
308
+ "is_landmark": False,
309
+ "box": boxes[i]
310
+ })
311
+ else:
312
+ results.append({
313
+ "landmark_id": None,
314
+ "landmark_name": None,
315
+ "confidence": float(best_score),
316
+ "is_landmark": False,
317
+ "box": boxes[i]
318
+ })
319
 
320
+ return results
 
 
321
 
322
+ except Exception as e:
323
+ self.logger.error(f"Error in classify_batch_regions: {e}")
324
+ self.logger.error(traceback.format_exc())
325
+ return [{"is_landmark": False, "confidence": 0.0} for _ in boxes]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
326
 
327
  def search_entire_image(self,
328
+ image: Union[Image.Image, np.ndarray],
329
+ threshold: float = 0.35,
330
+ detailed_analysis: bool = False) -> Dict[str, Any]:
331
  """
332
  檢查整張圖像是否包含地標,具有增強的分析能力
333
 
 
339
  Returns:
340
  Dict: 地標分類結果
341
  """
342
+ try:
343
+ if not self.landmark_data_manager.is_landmark_enabled() or self.landmark_text_features is None:
344
+ return {"is_landmark": False, "confidence": 0.0}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345
 
346
+ # 確保圖像是PIL格式
347
+ if not isinstance(image, Image.Image):
348
+ if isinstance(image, np.ndarray):
349
+ image = Image.fromarray(image)
350
+ else:
351
+ raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
 
352
 
353
+ # 檢查cache
354
+ image_hash = self.image_analyzer.get_image_hash(image)
355
+ image_key = self.cache_manager.get_image_cache_key(image_hash, "entire_image", detailed_analysis)
356
 
357
+ cached_result = self.cache_manager.get_cached_result(image_key)
358
+ if cached_result is not None:
359
+ return cached_result
 
 
360
 
361
+ # 調整閾值
362
+ adjusted_threshold = self.confidence_manager.calculate_adjusted_threshold(threshold, "full_image")
 
 
363
 
364
+ # 預處理並獲取特徵
365
+ image_input = self.clip_model_manager.preprocess_image(image)
366
+ image_features = self.clip_model_manager.encode_image(image_input)
367
+
368
+ # calculate相似度
369
+ similarity = self.clip_model_manager.calculate_similarity(image_features, self.landmark_text_features)
370
+
371
+ # 找到最佳匹配
372
+ best_idx = similarity[0].argmax().item()
373
+ best_score = similarity[0][best_idx]
374
+
375
+ # 獲取top3地標
376
+ top_indices = similarity[0].argsort()[-3:][::-1]
377
+ top_landmarks = []
378
+
379
+ for idx in top_indices:
380
+ score = similarity[0][idx]
381
+ landmark_id, landmark_info = self.landmark_data_manager.get_landmark_by_index(idx)
382
 
383
+ if landmark_id:
384
+ # location
385
+ loc_top = landmark_info.get("location", "")
386
+ if not loc_top and landmark_id in ALL_LANDMARKS:
387
+ loc_top = ALL_LANDMARKS[landmark_id].get("location", "")
388
+ landmark_result = {
389
+ "landmark_id": landmark_id,
390
+ "landmark_name": landmark_info.get("name", "Unknown"),
391
+ "location": loc_top or "Unknown Location",
392
+ "confidence": float(score)
393
+ }
394
+
395
+ # 加額外可用信息
396
+ for key in ["year_built", "architectural_style", "significance"]:
397
+ if key in landmark_info:
398
+ landmark_result[key] = landmark_info[key]
399
+
400
+ top_landmarks.append(landmark_result)
401
+
402
+ # main result
403
+ result = {}
404
+ if best_score >= adjusted_threshold:
405
+ landmark_id, landmark_info = self.landmark_data_manager.get_landmark_by_index(best_idx)
406
+
407
+ if landmark_id:
408
+ # 應用地標類型特定閾值
409
+ landmark_type = self.landmark_data_manager.determine_landmark_type(landmark_id)
410
+ final_threshold = self.confidence_manager.calculate_final_threshold(adjusted_threshold, "full_image", landmark_type)
411
+
412
+ if self.confidence_manager.evaluate_confidence(best_score, final_threshold):
413
+ # 補 location
414
+ loc_main = landmark_info.get("location", "")
415
+ if not loc_main and landmark_id in ALL_LANDMARKS:
416
+ loc_main = ALL_LANDMARKS[landmark_id].get("location", "")
417
+ result = {
418
+ "landmark_id": landmark_id,
419
+ "landmark_name": landmark_info.get("name", "Unknown"),
420
+ "location": loc_main or "Unknown Location",
421
+ "confidence": float(best_score),
422
+ "is_landmark": True,
423
+ "landmark_type": landmark_type,
424
+ "top_landmarks": top_landmarks
425
+ }
426
+
427
+ # 添加額外可用信息
428
+ for key in ["year_built", "architectural_style", "significance"]:
429
+ if key in landmark_info:
430
+ result[key] = landmark_info[key]
431
+ else:
432
+ result = {
433
+ "landmark_id": None,
434
+ "landmark_name": None,
435
+ "confidence": float(best_score),
436
+ "is_landmark": False,
437
+ "top_landmarks": top_landmarks
438
+ }
439
  else:
440
  result = {
441
  "landmark_id": None,
 
445
  "top_landmarks": top_landmarks
446
  }
447
 
448
+ # 詳細分析
449
+ if detailed_analysis and result.get("is_landmark", False):
450
+ width, height = image.size
451
+ regions = [
452
+ [width * 0.25, height * 0.25, width * 0.75, height * 0.75],
453
+ [0, 0, width * 0.5, height],
454
+ [width * 0.5, 0, width, height],
455
+ [0, 0, width, height * 0.5],
456
+ [0, height * 0.5, width, height]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
457
  ]
458
 
459
+ region_results = []
460
+ for i, box in enumerate(regions):
461
+ region_result = self.classify_image_region(
462
+ image,
463
+ box,
464
+ threshold=threshold * 0.9,
465
+ detection_type="partial"
466
+ )
467
+ if region_result["is_landmark"]:
468
+ region_result["region_name"] = ["center", "left", "right", "top", "bottom"][i]
469
+ region_results.append(region_result)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
470
 
471
+ if region_results:
472
+ result["region_analyses"] = region_results
 
473
 
474
+ # 快取結果
475
+ self.cache_manager.set_cached_result(image_key, result)
 
476
 
477
+ return result
 
478
 
479
+ except Exception as e:
480
+ self.logger.error(f"Error in search_entire_image: {e}")
481
+ self.logger.error(traceback.format_exc())
482
+ return {"is_landmark": False, "confidence": 0.0}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
483
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
484
 
485
  def intelligent_landmark_search(self,
486
+ image: Union[Image.Image, np.ndarray],
487
+ yolo_boxes: Optional[List[List[float]]] = None,
488
+ base_threshold: float = 0.25) -> Dict[str, Any]:
489
  """
490
+ 對圖像進行地標搜索,綜合整張圖像分析和區域分析
491
 
492
  Args:
493
  image: 原始圖像
 
497
  Returns:
498
  Dict: 包含所有檢測結果的綜合分析
499
  """
500
+ try:
501
+ if not self.landmark_data_manager.is_landmark_enabled():
502
+ return {
503
+ "full_image_analysis": {},
504
+ "is_landmark_scene": False,
505
+ "detected_landmarks": []
506
+ }
507
+
508
+ # 確保圖像是PIL格式
509
+ if not isinstance(image, Image.Image):
510
+ if isinstance(image, np.ndarray):
511
+ image = Image.fromarray(image)
512
+ else:
513
+ raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
514
 
515
+ # 調整閾值
516
+ actual_threshold = base_threshold * 0.85 if yolo_boxes is None or len(yolo_boxes) == 0 else base_threshold
517
 
518
+ # 首先對整張圖像進行分析
 
519
  full_image_result = self.search_entire_image(
520
  image,
521
  threshold=actual_threshold,
522
+ detailed_analysis=True
523
  )
524
 
525
+ # 如果沒有YOLO框且全圖分析未發現地標,進行金字塔分析
526
  if (yolo_boxes is None or len(yolo_boxes) == 0) and (not full_image_result or not full_image_result.get("is_landmark", False)):
527
+ self.logger.info("No YOLO boxes provided, attempting multi-scale pyramid analysis")
528
+ pyramid_results = self.image_analyzer.perform_pyramid_analysis(
529
+ image,
530
+ self.clip_model_manager,
531
+ self.landmark_data_manager,
532
+ levels=4,
533
+ base_threshold=actual_threshold,
534
+ aspect_ratios=[1.0, 0.75, 1.5, 0.5, 2.0]
535
+ )
536
+
537
+ if pyramid_results and pyramid_results.get("is_landmark", False) and pyramid_results.get("best_result", {}).get("confidence", 0) > actual_threshold:
538
+ if not full_image_result or not full_image_result.get("is_landmark", False):
539
+ full_image_result = {
540
+ "is_landmark": True,
541
+ "landmark_id": pyramid_results["best_result"]["landmark_id"],
542
+ "landmark_name": pyramid_results["best_result"]["landmark_name"],
543
+ "confidence": pyramid_results["best_result"]["confidence"],
544
+ "location": pyramid_results["best_result"].get("location", "Unknown Location")
545
+ }
546
+ self.logger.info(f"Pyramid analysis detected landmark: {pyramid_results['best_result']['landmark_name']} with confidence {pyramid_results['best_result']['confidence']:.3f}")
547
+
548
+ # 初始化結果dict
549
+ result = {
550
+ "full_image_analysis": full_image_result if full_image_result else {},
551
+ "is_landmark_scene": False,
552
+ "detected_landmarks": []
553
+ }
554
+
555
+ # 處理上下文感知比較
556
+ if full_image_result and "top_landmarks" in full_image_result and len(full_image_result["top_landmarks"]) >= 2:
557
+ top_landmarks = full_image_result["top_landmarks"]
558
+
559
+ if len(top_landmarks) >= 2 and abs(top_landmarks[0]["confidence"] - top_landmarks[1]["confidence"]) < 0.1:
560
+ architectural_analysis = self.image_analyzer.analyze_architectural_features(image, self.clip_model_manager)
561
+
562
+ for i, landmark in enumerate(top_landmarks[:2]):
563
+ if i >= len(top_landmarks):
564
+ continue
565
+
566
+ adjusted_confidence = self.confidence_manager.apply_architectural_boost(
567
+ landmark["confidence"],
568
+ architectural_analysis,
569
+ landmark.get("landmark_id", "")
570
  )
571
 
572
+ if adjusted_confidence != landmark["confidence"]:
573
+ top_landmarks[i]["confidence"] = adjusted_confidence
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
574
 
575
+ # 重新排序
576
+ top_landmarks.sort(key=lambda x: x["confidence"], reverse=True)
577
+ full_image_result["top_landmarks"] = top_landmarks
578
+ if top_landmarks:
579
+ full_image_result["landmark_id"] = top_landmarks[0]["landmark_id"]
580
+ full_image_result["landmark_name"] = top_landmarks[0]["landmark_name"]
581
+ full_image_result["confidence"] = top_landmarks[0]["confidence"]
582
+ full_image_result["location"] = top_landmarks[0].get("location", "Unknown Location")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
583
 
584
+ # 處理全圖結果
585
+ if full_image_result and full_image_result.get("is_landmark", False):
586
+ result["is_landmark_scene"] = True
587
+ landmark_id = full_image_result.get("landmark_id", "unknown")
588
 
589
+ landmark_specific_info = self.landmark_data_manager.extract_landmark_specific_info(landmark_id)
 
 
590
 
591
+ landmark_info = {
592
+ "landmark_id": landmark_id,
593
+ "landmark_name": full_image_result.get("landmark_name", "Unknown Landmark"),
594
+ "confidence": full_image_result.get("confidence", 0.0),
595
+ "location": full_image_result.get("location", "Unknown Location"),
596
+ "region_type": "full_image",
597
+ "box": [0, 0, getattr(image, 'width', 0), getattr(image, 'height', 0)]
598
+ }
599
 
600
+ landmark_info.update(landmark_specific_info)
 
 
 
601
 
602
+ if landmark_specific_info.get("landmark_name"):
603
+ landmark_info["landmark_name"] = landmark_specific_info["landmark_name"]
604
+
605
+ result["detected_landmarks"].append(landmark_info)
606
+
607
+ if landmark_specific_info.get("has_specific_activities", False):
608
+ result["primary_landmark_activities"] = landmark_specific_info.get("landmark_specific_activities", [])
609
+ self.logger.info(f"Set primary landmark activities: {len(result['primary_landmark_activities'])} activities for {landmark_info['landmark_name']}")
610
+
611
+ # 處理YOLO邊界框
612
+ if yolo_boxes and len(yolo_boxes) > 0:
613
+ for box in yolo_boxes:
614
+ try:
615
  box_result = self.classify_image_region(
616
  image,
617
  box,
 
619
  detection_type="auto"
620
  )
621
 
 
622
  if box_result and box_result.get("is_landmark", False):
 
623
  is_duplicate = False
624
  for existing in result["detected_landmarks"]:
625
  if existing.get("landmark_id") == box_result.get("landmark_id"):
 
626
  if box_result.get("confidence", 0) > existing.get("confidence", 0):
627
  existing.update({
628
  "confidence": box_result.get("confidence", 0),
 
632
  is_duplicate = True
633
  break
634
 
 
635
  if not is_duplicate:
636
  result["detected_landmarks"].append({
637
  "landmark_id": box_result.get("landmark_id", "unknown"),
 
641
  "region_type": "yolo_box",
642
  "box": box
643
  })
644
+ except Exception as e:
645
+ self.logger.error(f"Error in analyzing YOLO box: {e}")
646
+ continue
647
+
648
+ # 網格搜索(如果需要)
649
+ should_do_grid_search = (
650
+ len(result["detected_landmarks"]) == 0 or
651
+ max([landmark.get("confidence", 0) for landmark in result["detected_landmarks"]], default=0) < 0.5
652
+ )
653
+
654
+ if should_do_grid_search:
655
+ try:
656
+ width, height = getattr(image, 'size', (getattr(image, 'width', 0), getattr(image, 'height', 0)))
657
+ if not isinstance(width, (int, float)) or width <= 0:
658
+ width = getattr(image, 'width', 0)
659
+ if not isinstance(height, (int, float)) or height <= 0:
660
+ height = getattr(image, 'height', 0)
661
+
662
+ if width > 0 and height > 0:
663
+ grid_boxes = []
664
+ for i in range(5):
665
+ for j in range(5):
666
+ grid_boxes.append([
667
+ width * (j/5), height * (i/5),
668
+ width * ((j+1)/5), height * ((i+1)/5)
669
+ ])
670
+
671
+ for box in grid_boxes:
672
+ try:
673
+ grid_result = self.classify_image_region(
674
+ image,
675
+ box,
676
+ threshold=base_threshold * 0.9,
677
+ detection_type="partial"
678
+ )
679
+
680
+ if grid_result and grid_result.get("is_landmark", False):
681
+ is_duplicate = False
682
+ for existing in result["detected_landmarks"]:
683
+ if existing.get("landmark_id") == grid_result.get("landmark_id"):
684
+ is_duplicate = True
685
+ break
686
+
687
+ if not is_duplicate:
688
+ result["detected_landmarks"].append({
689
+ "landmark_id": grid_result.get("landmark_id", "unknown"),
690
+ "landmark_name": grid_result.get("landmark_name", "Unknown Landmark"),
691
+ "confidence": grid_result.get("confidence", 0.0),
692
+ "location": grid_result.get("location", "Unknown Location"),
693
+ "region_type": "grid",
694
+ "box": box
695
+ })
696
+ except Exception as e:
697
+ self.logger.error(f"Error in analyzing grid region: {e}")
698
+ continue
699
  except Exception as e:
700
+ self.logger.error(f"Error in grid search: {e}")
701
+ self.logger.error(traceback.format_exc())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
702
 
703
+ # 按置信度排序檢測結果
704
+ result["detected_landmarks"].sort(key=lambda x: x.get("confidence", 0), reverse=True)
705
 
706
+ # 更新整體場景類型判斷
707
+ if len(result["detected_landmarks"]) > 0:
708
+ result["is_landmark_scene"] = True
709
+ result["primary_landmark"] = result["detected_landmarks"][0]
710
 
711
+ if full_image_result and "clip_analysis" in full_image_result:
712
+ result["clip_analysis_on_full_image"] = full_image_result["clip_analysis"]
 
713
 
714
+ return result
715
 
716
+ except Exception as e:
717
+ self.logger.error(f"Error in intelligent_landmark_search: {e}")
718
+ self.logger.error(traceback.format_exc())
719
+ return {
720
+ "full_image_analysis": {},
721
+ "is_landmark_scene": False,
722
+ "detected_landmarks": []
723
+ }
724
+
725
def enhanced_landmark_detection(self,
                                image: Union[Image.Image, np.ndarray],
                                threshold: float = 0.3) -> Dict[str, Any]:
    """
    Detect landmarks by cross-checking three independent analyses.

    The image is analysed with a multi-scale pyramid, a viewpoint-dependent
    grid of regions and a whole-image search. Detections naming the same
    landmark are grouped; each group's confidence is its best single
    detection plus small bonuses for being confirmed by several sources
    and by several detections (capped at 1.0).

    Args:
        image: Input image as a PIL image or a numpy array.
        threshold: Base confidence threshold. It is scaled by 0.7 for a
            "distant" viewpoint and by 1.1 for a "close_up" viewpoint.

    Returns:
        Dict: On success, a dict with "is_landmark_scene",
        "detected_landmarks" (groups sorted by descending confidence),
        "viewpoint_info" and "primary_landmark" (best group or None).
        When landmark detection is disabled or an error occurs, only
        "is_landmark_scene" (False) and "detected_landmarks" (empty list)
        are returned.
    """
    try:
        if not self.landmark_data_manager.is_landmark_enabled():
            return {"is_landmark_scene": False, "detected_landmarks": []}

        # Work on a PIL image from here on
        if not isinstance(image, Image.Image):
            if isinstance(image, np.ndarray):
                image = Image.fromarray(image)
            else:
                raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")

        # Step 1: the dominant viewpoint drives both the threshold and the grid density
        viewpoint_info = self.image_analyzer.analyze_viewpoint(image, self.clip_model_manager)
        viewpoint = viewpoint_info["dominant_viewpoint"]

        if viewpoint == "distant":
            adjusted_threshold = threshold * 0.7
            grid_size = 3
        elif viewpoint == "close_up":
            adjusted_threshold = threshold * 1.1
            grid_size = 5
        else:
            adjusted_threshold = threshold
            grid_size = 4

        # Step 2: multi-scale pyramid analysis
        pyramid_results = self.image_analyzer.perform_pyramid_analysis(
            image,
            self.clip_model_manager,
            self.landmark_data_manager,
            levels=3,
            base_threshold=adjusted_threshold
        )

        # Step 3: classify every cell of a grid_size x grid_size grid
        grid_results = []
        width, height = image.size
        for i in range(grid_size):
            for j in range(grid_size):
                box = [
                    width * (j/grid_size),
                    height * (i/grid_size),
                    width * ((j+1)/grid_size),
                    height * ((i+1)/grid_size)
                ]

                region_result = self.classify_image_region(
                    image,
                    box,
                    threshold=adjusted_threshold,
                    detection_type="auto"
                )

                if region_result and region_result.get("is_landmark", False):
                    region_result["grid_position"] = (i, j)
                    grid_results.append(region_result)

        # Step 4: cross-validate and merge the results of all sources
        all_detections = []

        # A missing or partial pyramid result must not raise here: the
        # blanket handler below would then throw away the grid and
        # full-image detections as well.
        pyramid_results = pyramid_results or {}
        best_pyramid = pyramid_results.get("best_result")
        if pyramid_results.get("is_landmark", False) and best_pyramid:
            all_detections.append({
                "source": "pyramid",
                "landmark_id": best_pyramid["landmark_id"],
                "landmark_name": best_pyramid["landmark_name"],
                "confidence": best_pyramid["confidence"],
                "scale_factor": best_pyramid.get("scale_factor", 1.0)
            })

        for region in grid_results:
            all_detections.append({
                "source": "grid",
                "landmark_id": region["landmark_id"],
                "landmark_name": region["landmark_name"],
                "confidence": region["confidence"],
                "grid_position": region.get("grid_position", (0, 0))
            })

        # Whole-image search
        full_image_result = self.search_entire_image(image, threshold=adjusted_threshold)
        if full_image_result and full_image_result.get("is_landmark", False):
            all_detections.append({
                "source": "full_image",
                "landmark_id": full_image_result["landmark_id"],
                "landmark_name": full_image_result["landmark_name"],
                "confidence": full_image_result["confidence"]
            })

        # Group the detections by landmark ID
        landmark_groups = {}
        for detection in all_detections:
            group = landmark_groups.setdefault(detection["landmark_id"], {
                "landmark_id": detection["landmark_id"],
                "landmark_name": detection["landmark_name"],
                "detections": [],
                "sources": set()
            })
            group["detections"].append(detection)
            group["sources"].add(detection["source"])

        # Compute the overall confidence of each landmark
        for group in landmark_groups.values():
            detections = group["detections"]

            # Base confidence: the best confidence reported by any source
            max_confidence = max(d["confidence"] for d in detections)

            # Bonus for being confirmed by more than one source
            source_count = len(group["sources"])
            source_bonus = min(0.15, (source_count - 1) * 0.05)

            # Bonus for being detected more than once
            detection_count = len(detections)
            consistency_bonus = min(0.1, (detection_count - 1) * 0.02)

            group["confidence"] = min(1.0, max_confidence + source_bonus + consistency_bonus)
            group["detection_count"] = detection_count
            group["source_count"] = source_count

        # Best landmark first
        sorted_landmarks = sorted(
            landmark_groups.values(),
            key=lambda x: x["confidence"],
            reverse=True
        )

        return {
            "is_landmark_scene": len(sorted_landmarks) > 0,
            "detected_landmarks": sorted_landmarks,
            "viewpoint_info": viewpoint_info,
            "primary_landmark": sorted_landmarks[0] if sorted_landmarks else None
        }

    except Exception as e:
        self.logger.error(f"Error in enhanced_landmark_detection: {e}")
        self.logger.error(traceback.format_exc())
        return {"is_landmark_scene": False, "detected_landmarks": []}
component_initializer.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import traceback
3
+ import logging
4
+ from typing import Dict, Optional, Any, Tuple
5
+
6
+ from spatial_analyzer import SpatialAnalyzer
7
+ from scene_description import SceneDescriptor
8
+ from enhance_scene_describer import EnhancedSceneDescriber
9
+ from clip_analyzer import CLIPAnalyzer
10
+ from clip_zero_shot_classifier import CLIPZeroShotClassifier
11
+ from llm_enhancer import LLMEnhancer
12
+ from landmark_activities import LANDMARK_ACTIVITIES
13
+ from scene_type import SCENE_TYPES
14
+ from object_categories import OBJECT_CATEGORIES
15
+
16
+
17
+ class ComponentInitializer:
18
+ """
19
+ 負責初始化和管理 SceneAnalyzer 的所有子組件。
20
+ 處理組件初始化失敗的情況並提供優雅的降級機制。
21
+ """
22
+
23
def __init__(self, class_names: Optional[Dict[int, str]] = None, use_llm: bool = True,
             use_clip: bool = True, enable_landmark: bool = True,
             llm_model_path: Optional[str] = None):
    """
    Store the configuration and immediately initialize every component.

    Args:
        class_names: Mapping from YOLO class ID to class name.
        use_llm: Whether LLM enhancement is enabled.
        use_clip: Whether CLIP analysis is enabled.
        enable_landmark: Whether landmark detection is enabled.
        llm_model_path: Path of the LLM model (optional).
    """
    self.logger = logging.getLogger(__name__)

    # Keep the constructor arguments for the initialization steps
    self.class_names = class_names
    self.use_llm = use_llm
    self.use_clip = use_clip
    self.enable_landmark = enable_landmark
    self.llm_model_path = llm_model_path

    # Containers filled in by the initialization steps
    self.components = {}
    self.data_structures = {}
    self.initialization_status = {}

    # Build everything right away
    self._initialize_all_components()
52
+
53
def _initialize_all_components(self):
    """
    Run every initialization step in order and report the overall outcome.

    Order: data structures, core analyzers, then CLIP components (when
    use_clip is set) and LLM components (when use_llm is set).

    Raises:
        Exception: Any exception raised by a step is logged together with
            its traceback and re-raised unchanged.
    """
    try:
        # 1. Data tables come first
        self._load_data_structures()

        # 2. Core analysis components
        self._initialize_core_analyzers()

        # 3. CLIP-related components
        if self.use_clip:
            self._initialize_clip_components()

        # 4. LLM components
        if self.use_llm:
            self._initialize_llm_components()

    except Exception as e:
        self.logger.error(f"Error during component initialization: {e}")
        # Send the traceback through the logger rather than straight to stderr
        self.logger.error(traceback.format_exc())
        raise

    # Steps may record a failure in initialization_status without raising,
    # so only claim full success when nothing is marked as failed.
    failed = sorted(name for name, ok in self.initialization_status.items() if not ok)
    if failed:
        self.logger.warning(f"Component initialization finished with failures: {', '.join(failed)}")
    else:
        self.logger.info("All components initialized successfully")
76
+
77
def _load_data_structures(self):
    """
    Load every static data table into self.data_structures.

    Each table is loaded independently. When a loader raises, the table is
    replaced by an empty dict and its entry in self.initialization_status
    is set to False; otherwise the entry is set to True.
    """
    loaders = (
        ('LANDMARK_ACTIVITIES', self._load_landmark_activities),
        ('SCENE_TYPES', self._load_scene_types),
        ('OBJECT_CATEGORIES', self._load_object_categories),
    )

    for data_name, load in loaders:
        try:
            self.data_structures[data_name] = load()
            self.initialization_status[data_name] = True
            self.logger.info(f"Loaded {data_name} successfully")
        except Exception as e:
            # One broken table must not prevent the others from loading
            self.logger.warning(f"Failed to load {data_name}: {e}")
            self.data_structures[data_name] = {}
            self.initialization_status[data_name] = False
94
+
95
def _load_landmark_activities(self) -> Dict:
    """
    Return the landmark-activity table, or an empty dict if it cannot be imported.

    The import is performed inside the try block so that the ImportError
    fallback can actually trigger; reading an already-imported name never
    raises ImportError.
    """
    # NOTE(review): the same name is also imported at module level; while
    # that import stays, a missing module still fails when this file is imported.
    try:
        from landmark_activities import LANDMARK_ACTIVITIES
    except ImportError as e:
        self.logger.warning(f"Could not import LANDMARK_ACTIVITIES: {e}")
        return {}
    return LANDMARK_ACTIVITIES
102
+
103
def _load_scene_types(self) -> Dict:
    """
    Return the scene-type table, or an empty dict if it cannot be imported.

    The import is performed inside the try block so that the ImportError
    fallback can actually trigger; reading an already-imported name never
    raises ImportError.
    """
    # NOTE(review): the same name is also imported at module level; while
    # that import stays, a missing module still fails when this file is imported.
    try:
        from scene_type import SCENE_TYPES
    except ImportError as e:
        self.logger.warning(f"Could not import SCENE_TYPES: {e}")
        return {}
    return SCENE_TYPES
110
+
111
def _load_object_categories(self) -> Dict:
    """
    Return the object-category table, or an empty dict if it cannot be imported.

    The import is performed inside the try block so that the ImportError
    fallback can actually trigger; reading an already-imported name never
    raises ImportError.
    """
    # NOTE(review): the same name is also imported at module level; while
    # that import stays, a missing module still fails when this file is imported.
    try:
        from object_categories import OBJECT_CATEGORIES
    except ImportError as e:
        self.logger.warning(f"Could not import OBJECT_CATEGORIES: {e}")
        return {}
    return OBJECT_CATEGORIES
118
+
119
def _initialize_core_analyzers(self):
    """
    Create the SpatialAnalyzer, SceneDescriptor and EnhancedSceneDescriber.

    Each component is built independently. A failure is logged (message and
    traceback), recorded as False in self.initialization_status and leaves
    None in self.components; it never propagates to the caller. The
    EnhancedSceneDescriber is only attempted when a SpatialAnalyzer is
    available.
    """
    def build(key, label, factory):
        # Create one component and record the outcome without raising.
        try:
            self.components[key] = factory()
            self.initialization_status[key] = True
            self.logger.info(f"Initialized {label} successfully")
        except Exception as e:
            self.logger.error(f"Error initializing {label}: {e}")
            # Keep the traceback in the log instead of printing it to stderr
            self.logger.error(traceback.format_exc())
            self.initialization_status[key] = False
            self.components[key] = None

    build('spatial_analyzer', 'SpatialAnalyzer', lambda: SpatialAnalyzer(
        class_names=self.class_names,
        object_categories=self.data_structures.get('OBJECT_CATEGORIES', {})
    ))

    build('descriptor', 'SceneDescriptor', lambda: SceneDescriptor(
        scene_types=self.data_structures.get('SCENE_TYPES', {}),
        object_categories=self.data_structures.get('OBJECT_CATEGORIES', {})
    ))

    # The enhanced describer is constructed around the spatial analyzer
    if self.components.get('spatial_analyzer'):
        build('scene_describer', 'EnhancedSceneDescriber', lambda: EnhancedSceneDescriber(
            scene_types=self.data_structures.get('SCENE_TYPES', {}),
            spatial_analyzer_instance=self.components['spatial_analyzer']
        ))
    else:
        self.logger.warning("Cannot initialize EnhancedSceneDescriber without SpatialAnalyzer")
        self.initialization_status['scene_describer'] = False
        self.components['scene_describer'] = None
167
+
168
+ def _initialize_clip_components(self):
169
+ """初始化 CLIP 相關組件。"""
170
+ # 初始化 CLIPAnalyzer
171
+ try:
172
+ self.components['clip_analyzer'] = CLIPAnalyzer()
173
+ self.initialization_status['clip_analyzer'] = True
174
+ self.logger.info("Initialized CLIPAnalyzer successfully")
175
+
176
+ # 如果啟用地標檢測,初始化 CLIPZeroShotClassifier
177
+ if self.enable_landmark:
178
+ self._initialize_landmark_classifier()
179
+
180
+ except Exception as e:
181
+ self.logger.warning(f"Could not initialize CLIP analyzer: {e}")
182
+ self.logger.info("Scene analysis will proceed without CLIP. Install CLIP with 'pip install clip' for enhanced scene understanding.")
183
+ self.use_clip = False
184
+ self.initialization_status['clip_analyzer'] = False
185
+ self.components['clip_analyzer'] = None
186
+
187
+ def _initialize_landmark_classifier(self):
188
+ """初始化地標分類器。"""
189
+ try:
190
+ # 嘗試使用已載入的 CLIP 模型實例
191
+ if (self.components.get('clip_analyzer') and
192
+ hasattr(self.components['clip_analyzer'], 'get_clip_instance')):
193
+ model, preprocess, device = self.components['clip_analyzer'].get_clip_instance()
194
+ self.components['landmark_classifier'] = CLIPZeroShotClassifier(device=device)
195
+ self.logger.info("Initialized landmark classifier with shared CLIP model")
196
+ else:
197
+ self.components['landmark_classifier'] = CLIPZeroShotClassifier()
198
+ self.logger.info("Initialized landmark classifier with independent CLIP model")
199
+
200
+ # 配置地標檢測器參數
201
+ self._configure_landmark_classifier()
202
+ self.initialization_status['landmark_classifier'] = True
203
+
204
+ except (ImportError, Exception) as e:
205
+ self.logger.warning(f"Could not initialize landmark classifier: {e}")
206
+ self.initialization_status['landmark_classifier'] = False
207
+ self.components['landmark_classifier'] = None
208
+ # 不完全禁用地標檢測,允許運行時重新嘗試
209
+
210
+ def _configure_landmark_classifier(self):
211
+ """配置地標分類器的參數。"""
212
+ if self.components.get('landmark_classifier'):
213
+ try:
214
+ classifier = self.components['landmark_classifier']
215
+ classifier.set_batch_size(8)
216
+ classifier.adjust_confidence_threshold("full_image", 0.8)
217
+ classifier.adjust_confidence_threshold("distant", 0.65)
218
+ self.logger.info("Landmark detection enabled with optimized settings")
219
+ except Exception as e:
220
+ self.logger.warning(f"Error configuring landmark classifier: {e}")
221
+
222
+ def _initialize_llm_components(self):
223
+ """初始化 LLM 組件。"""
224
+ try:
225
+ self.components['llm_enhancer'] = LLMEnhancer(model_path=self.llm_model_path)
226
+ self.initialization_status['llm_enhancer'] = True
227
+ self.logger.info("LLM enhancer initialized successfully")
228
+ except Exception as e:
229
+ self.logger.warning(f"Could not initialize LLM enhancer: {e}")
230
+ self.logger.info("Scene analysis will proceed without LLM. Make sure required packages are installed.")
231
+ self.use_llm = False
232
+ self.initialization_status['llm_enhancer'] = False
233
+ self.components['llm_enhancer'] = None
234
+
235
def get_component(self, component_name: str) -> Optional[Any]:
    """
    Look up a component instance by name.

    Args:
        component_name: Key of the component in ``self.components``.

    Returns:
        The stored component, or None when the name is absent or the stored
        value is None.
    """
    component = self.components.get(component_name)
    return component
246
+
247
def get_data_structure(self, data_name: str) -> Dict:
    """
    Look up a loaded data structure by name.

    Args:
        data_name: Key of the data structure in ``self.data_structures``.

    Returns:
        The stored data structure, or an empty dict when the name is absent.
    """
    fallback = {}
    return self.data_structures.get(data_name, fallback)
258
+
259
def is_component_available(self, component_name: str) -> bool:
    """
    Report whether a component was initialized successfully.

    Args:
        component_name: Key of the component in ``self.initialization_status``.

    Returns:
        The recorded status for the component, or False when no status was
        recorded.
    """
    status = self.initialization_status.get(component_name, False)
    return status
270
+
271
def get_initialization_summary(self) -> Dict[str, bool]:
    """
    Return a snapshot of every component's initialization status.

    Returns:
        A shallow copy of the component-name to status mapping, so callers
        cannot mutate the manager's own record.
    """
    return dict(self.initialization_status)
279
+
280
def reinitialize_component(self, component_name: str) -> bool:
    """
    Re-run initialization for a named component.

    Only ``'landmark_classifier'`` is supported, and only while both
    ``self.use_clip`` and ``self.enable_landmark`` are truthy.

    Args:
        component_name: Name of the component to re-initialize.

    Returns:
        The landmark classifier's recorded status after re-initialization,
        or False when the request is unsupported or an exception occurred.
    """
    try:
        supported = (
            component_name == 'landmark_classifier'
            and self.use_clip
            and self.enable_landmark
        )
        if not supported:
            self.logger.warning(f"Reinitializing {component_name} is not supported")
            return False

        self._initialize_landmark_classifier()
        return self.initialization_status.get('landmark_classifier', False)
    except Exception as exc:
        self.logger.error(f"Error reinitializing {component_name}: {exc}")
        return False
300
+
301
def update_landmark_enable_status(self, enable_landmark: bool):
    """
    Switch landmark detection on or off and propagate the flag.

    When detection is being enabled, the landmark classifier is not
    available and CLIP is in use, a re-initialization of the classifier is
    attempted. The flag is then copied onto every registered component among
    scene_describer, clip_analyzer and landmark_classifier that has an
    ``enable_landmark`` attribute.

    Args:
        enable_landmark: Whether landmark detection should be enabled.
    """
    self.enable_landmark = enable_landmark

    needs_classifier = (
        enable_landmark
        and not self.is_component_available('landmark_classifier')
        and self.use_clip
    )
    if needs_classifier:
        self.reinitialize_component('landmark_classifier')

    # Propagate the flag to the components that track it.
    for name in ('scene_describer', 'clip_analyzer', 'landmark_classifier'):
        target = self.get_component(name)
        if target and hasattr(target, 'enable_landmark'):
            target.enable_landmark = enable_landmark
confidence_manager.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import logging
3
+ import traceback
4
+ from typing import Dict, Any, Optional
5
+
6
class ConfidenceManager:
    """
    Manage confidence-related logic for landmark detection.

    Covers per-detection-type threshold multipliers, per-landmark-type
    thresholds, batch-size bookkeeping and small confidence boosts derived
    from an architectural analysis result.
    """

    def __init__(self):
        """Set up the logger, the default batch size and the threshold tables."""
        self.logger = logging.getLogger(__name__)

        # Default batch size.
        self.batch_size = 16

        # Multipliers applied to a base threshold, keyed by detection type.
        self.confidence_threshold_multipliers = {
            "close_up": 0.9,
            "partial": 0.6,
            "distant": 0.5,
            "full_image": 0.7
        }

        # Thresholds keyed by landmark type. calculate_final_threshold()
        # divides these by 0.5, so 0.5 leaves a threshold unchanged, smaller
        # values lower it and larger values raise it.
        self.landmark_type_thresholds = {
            "tower": 0.5,
            "skyscraper": 0.4,
            "building": 0.55,
            "monument": 0.5,
            "natural": 0.6
        }

    def set_batch_size(self, batch_size: int):
        """
        Set the batch size.

        Args:
            batch_size: Requested batch size; values below 1 are raised to 1.
        """
        self.batch_size = max(1, batch_size)
        self.logger.info(f"Batch size set to {self.batch_size}")

    def adjust_confidence_threshold(self, detection_type: str, multiplier: float):
        """
        Change the threshold multiplier of one detection type.

        Args:
            detection_type: One of 'close_up', 'partial', 'distant',
                'full_image'. Unknown types are ignored with a warning.
            multiplier: Requested multiplier; clamped to the range [0.1, 1.5].
        """
        if detection_type in self.confidence_threshold_multipliers:
            applied = max(0.1, min(1.5, multiplier))
            self.confidence_threshold_multipliers[detection_type] = applied
            # Log the value that was actually stored, not the raw request,
            # so the log stays truthful when the request was clamped.
            self.logger.info(f"Adjusted confidence threshold multiplier for {detection_type} to {applied}")
        else:
            self.logger.warning(f"Unknown detection type: {detection_type}")

    def get_detection_type_multiplier(self, detection_type: str) -> float:
        """
        Return the threshold multiplier for a detection type.

        Args:
            detection_type: Detection type to look up.

        Returns:
            float: The configured multiplier, or 1.0 for unknown types.
        """
        return self.confidence_threshold_multipliers.get(detection_type, 1.0)

    def get_landmark_type_threshold(self, landmark_type: str) -> float:
        """
        Return the threshold for a landmark type.

        Args:
            landmark_type: Landmark type to look up.

        Returns:
            float: The configured threshold, or 0.5 for unknown types.
        """
        return self.landmark_type_thresholds.get(landmark_type, 0.5)

    def calculate_adjusted_threshold(self, base_threshold: float, detection_type: str) -> float:
        """
        Scale a base threshold by the detection type's multiplier.

        Args:
            base_threshold: Threshold before adjustment.
            detection_type: Detection type whose multiplier is applied.

        Returns:
            float: ``base_threshold`` times the multiplier, or
            ``base_threshold`` unchanged if the calculation raises.
        """
        try:
            base_multiplier = self.get_detection_type_multiplier(detection_type)
            adjusted_threshold = base_threshold * base_multiplier
            return adjusted_threshold
        except Exception as e:
            self.logger.error(f"Error calculating adjusted threshold: {e}")
            self.logger.error(traceback.format_exc())
            return base_threshold

    def calculate_final_threshold(self, base_threshold: float, detection_type: str,
                                  landmark_type: str) -> float:
        """
        Combine detection-type and landmark-type adjustments into one threshold.

        Args:
            base_threshold: Threshold before adjustment.
            detection_type: Detection type whose multiplier is applied first.
            landmark_type: Landmark type used for the second adjustment.

        Returns:
            float: The fully adjusted threshold, or ``base_threshold``
            unchanged if the calculation raises.
        """
        try:
            # First adjustment: detection type.
            adjusted_threshold = self.calculate_adjusted_threshold(base_threshold, detection_type)

            # Second adjustment: landmark type.
            if landmark_type == "distinctive":
                # "distinctive" landmarks get a 25% lower threshold.
                type_multiplier = 0.75
            else:
                # Express the configured type threshold relative to 0.5.
                type_multiplier = self.get_landmark_type_threshold(landmark_type) / 0.5

            final_threshold = adjusted_threshold * type_multiplier
            return final_threshold

        except Exception as e:
            self.logger.error(f"Error calculating final threshold: {e}")
            self.logger.error(traceback.format_exc())
            return base_threshold

    def evaluate_confidence(self, confidence: float, threshold: float) -> bool:
        """
        Check whether a confidence score reaches a threshold.

        Args:
            confidence: Confidence score.
            threshold: Threshold to compare against.

        Returns:
            bool: True when ``confidence >= threshold``.
        """
        return confidence >= threshold

    def apply_architectural_boost(self, confidence: float, architectural_analysis: Dict[str, Any],
                                  landmark_id: str) -> float:
        """
        Raise a confidence score when architectural analysis agrees with the landmark id.

        A boost of 0.05 is added when the analysis' primary category matches
        generic terms found in the landmark id, and 0.03 for each listed
        feature whose keyword also appears in the landmark id. The result is
        not capped.

        Args:
            confidence: Original confidence score.
            architectural_analysis: Analysis result; the keys
                "architectural_features" (iterable of (feature, score) pairs)
                and "primary_category" are read, both optional.
            landmark_id: Landmark identifier; matched case-insensitively.

        Returns:
            float: The boosted confidence, or the original confidence when no
            boost applies or an error occurs.
        """
        try:
            confidence_boost = 0
            landmark_id_lower = landmark_id.lower()

            top_features = architectural_analysis.get("architectural_features", [])
            primary_category = architectural_analysis.get("primary_category", "")

            # Category-level boost, based on generic terms rather than
            # specific landmark names.
            if primary_category == "tower" and any(term in landmark_id_lower for term in ["tower", "spire", "needle"]):
                confidence_boost += 0.05
            elif primary_category == "skyscraper" and any(term in landmark_id_lower for term in ["building", "skyscraper", "tall"]):
                confidence_boost += 0.05
            elif primary_category == "historical" and any(term in landmark_id_lower for term in ["monument", "castle", "palace", "temple"]):
                confidence_boost += 0.05
            elif primary_category == "distinctive" and any(term in landmark_id_lower for term in ["unusual", "unique", "special", "famous"]):
                confidence_boost += 0.05

            # Feature-level fine-tuning; the per-feature score is not used.
            for feature, _score in top_features:
                if feature == "time_display" and "clock" in landmark_id_lower:
                    confidence_boost += 0.03
                elif feature == "segmented_exterior" and "segmented" in landmark_id_lower:
                    confidence_boost += 0.03
                elif feature == "slanted_design" and "leaning" in landmark_id_lower:
                    confidence_boost += 0.03

            # Apply the accumulated boost, if any.
            if confidence_boost > 0:
                adjusted_confidence = confidence + confidence_boost
                self.logger.info(f"Boosted confidence by {confidence_boost:.2f} based on architectural features ({primary_category})")
                return adjusted_confidence

            return confidence

        except Exception as e:
            self.logger.error(f"Error applying architectural boost: {e}")
            self.logger.error(traceback.format_exc())
            return confidence

    def determine_detection_type_from_region(self, region_width: int, region_height: int,
                                             image_width: int, image_height: int) -> str:
        """
        Classify a region as close-up, partial or distant from its relative area.

        Args:
            region_width: Region width.
            region_height: Region height.
            image_width: Image width.
            image_height: Image height.

        Returns:
            str: "close_up" when the region covers more than 50% of the image
            area, "partial" above 20%, otherwise "distant". "partial" is also
            returned when the ratio cannot be computed (e.g. zero image area).
        """
        try:
            region_area_ratio = (region_width * region_height) / (image_width * image_height)

            if region_area_ratio > 0.5:
                return "close_up"
            elif region_area_ratio > 0.2:
                return "partial"
            else:
                return "distant"

        except Exception as e:
            self.logger.error(f"Error determining detection type from region: {e}")
            self.logger.error(traceback.format_exc())
            return "partial"

    def adjust_detection_type_by_viewpoint(self, detection_type: str, dominant_viewpoint: str) -> str:
        """
        Override the detection type according to the dominant viewpoint.

        Args:
            detection_type: Original detection type.
            dominant_viewpoint: Dominant viewpoint label.

        Returns:
            str: "close_up" or "distant" when the viewpoint says so,
            "partial" for an "angled_view" viewpoint, otherwise the original
            detection type.
        """
        try:
            if dominant_viewpoint == "close_up" and detection_type != "close_up":
                return "close_up"
            elif dominant_viewpoint == "distant" and detection_type != "distant":
                return "distant"
            elif dominant_viewpoint == "angled_view":
                # An angled view is treated as a partially visible landmark.
                return "partial"
            else:
                return detection_type

        except Exception as e:
            self.logger.error(f"Error adjusting detection type by viewpoint: {e}")
            self.logger.error(traceback.format_exc())
            return detection_type

    def get_batch_size(self) -> int:
        """
        Return the current batch size.

        Returns:
            int: Batch size.
        """
        return self.batch_size

    def get_all_threshold_multipliers(self) -> Dict[str, float]:
        """
        Return a copy of all detection-type threshold multipliers.

        Returns:
            Dict[str, float]: Detection type to multiplier.
        """
        return self.confidence_threshold_multipliers.copy()

    def get_all_landmark_type_thresholds(self) -> Dict[str, float]:
        """
        Return a copy of all landmark-type thresholds.

        Returns:
            Dict[str, float]: Landmark type to threshold.
        """
        return self.landmark_type_thresholds.copy()
configuration_manager.py ADDED
@@ -0,0 +1,418 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Any, List, Tuple, Optional, Union
2
+ import json
3
+ import os
4
+ from dataclasses import dataclass, field
5
+ from pathlib import Path
6
+
7
+
8
@dataclass
class FeatureThresholds:
    """Thresholds used during feature extraction."""

    # Pixel darkness / brightness cut-offs.
    dark_pixel_threshold: float = 50.0
    bright_pixel_threshold: float = 220.0

    # Sky-blue detection: hue window plus minimum saturation and value.
    sky_blue_hue_min: float = 95.0
    sky_blue_hue_max: float = 135.0
    sky_blue_sat_min: float = 40.0
    sky_blue_val_min: float = 90.0

    # Gray detection: maximum saturation and value window.
    gray_sat_max: float = 70.0
    gray_val_min: float = 60.0
    gray_val_max: float = 220.0

    # Absolute threshold for light sources.
    light_source_abs_thresh: float = 220.0
21
+
22
+
23
@dataclass
class IndoorOutdoorThresholds:
    """Thresholds used for indoor/outdoor classification."""

    # Sky and openness cues.
    sky_blue_dominance_thresh: float = 0.18
    sky_brightness_ratio_thresh: float = 1.25
    openness_top_thresh: float = 0.68
    sky_texture_complexity_thresh: float = 0.35

    # Ceiling and boundary cues.
    ceiling_likelihood_thresh: float = 0.4
    boundary_clarity_thresh: float = 0.38

    # Brightness uniformity and bright spots.
    brightness_uniformity_thresh_indoor: float = 0.6
    brightness_uniformity_thresh_outdoor: float = 0.40
    many_bright_spots_thresh: int = 6
    dim_scene_for_spots_thresh: float = 115.0

    # Home-environment pattern and warm indoor lighting.
    home_pattern_thresh_strong: float = 2.0
    home_pattern_thresh_moderate: float = 1.0
    warm_indoor_max_brightness_thresh: float = 135.0

    # Aerial-view cues.
    aerial_top_dark_ratio_thresh: float = 0.9
    aerial_top_complex_thresh: float = 0.60
    aerial_min_avg_brightness_thresh: float = 65.0
42
+
43
+
44
@dataclass
class LightingThresholds:
    """Thresholds used for lighting-condition analysis."""

    # Outdoor, night.
    outdoor_night_thresh_brightness: float = 80.0
    outdoor_night_lights_thresh: int = 2

    # Outdoor, dusk / dawn.
    outdoor_dusk_dawn_thresh_brightness: float = 130.0
    outdoor_dusk_dawn_color_thresh: float = 0.10

    # Outdoor, daytime (bright and cloudy variants).
    outdoor_day_bright_thresh: float = 140.0
    outdoor_day_blue_thresh: float = 0.05
    outdoor_day_cloudy_thresh: float = 120.0
    outdoor_day_gray_thresh: float = 0.18

    # Indoor brightness levels.
    indoor_bright_thresh: float = 130.0
    indoor_moderate_thresh: float = 95.0

    # Commercial and stadium lighting.
    commercial_min_brightness_thresh: float = 105.0
    commercial_min_spots_thresh: int = 3
    stadium_min_spots_thresh: int = 6

    # Neon lighting.
    neon_yellow_orange_thresh: float = 0.12
    neon_bright_spots_thresh: int = 4
    neon_avg_saturation_thresh: float = 60.0
63
+
64
+
65
@dataclass
class WeightingFactors:
    """Weights applied to the individual features."""

    # Sky / openness weights.
    # NOTE(review): the original comment says "negative values push towards
    # outdoor", yet the defaults here are positive - presumably the sign is
    # applied by the consumer; confirm against the scoring code.
    sky_blue_dominance_w: float = 3.5
    sky_brightness_ratio_w: float = 3.0
    openness_top_w: float = 2.8
    sky_texture_w: float = 2.0

    # Ceiling / enclosure weights (positive values push towards indoor).
    ceiling_likelihood_w: float = 1.5
    boundary_clarity_w: float = 1.2

    # Brightness weights.
    brightness_uniformity_w: float = 0.6
    brightness_non_uniformity_outdoor_w: float = 1.0
    brightness_non_uniformity_indoor_penalty_w: float = 0.1

    # Light-source weights.
    circular_lights_w: float = 1.2
    indoor_light_score_w: float = 0.8
    many_bright_spots_indoor_w: float = 0.3

    # Color-atmosphere weights.
    warm_atmosphere_indoor_w: float = 0.15

    # Environment-pattern weights.
    home_env_strong_w: float = 1.5
    home_env_moderate_w: float = 0.7

    # Structural-pattern weights.
    aerial_street_w: float = 2.5
    places365_outdoor_scene_w: float = 4.0
    places365_indoor_scene_w: float = 3.0
    places365_attribute_w: float = 1.5
100
+
101
+
102
@dataclass
class OverrideFactors:
    """Override and reduction factors."""

    # Factors keyed to a sky override.
    sky_override_factor_ceiling: float = 0.1
    sky_override_factor_boundary: float = 0.2
    sky_override_factor_uniformity: float = 0.15
    sky_override_factor_lights: float = 0.05
    sky_override_factor_p365_indoor_decision: float = 0.3

    # Aerial and ceiling factors.
    aerial_enclosure_reduction_factor: float = 0.75
    ceiling_sky_override_factor: float = 0.1

    # Places365-driven factors.
    p365_outdoor_reduces_enclosure_factor: float = 0.3
    p365_indoor_boosts_ceiling_factor: float = 1.5
114
+
115
+
116
@dataclass
class ColorRanges:
    """Hue ranges that define warm and cool colors."""

    # Each range is a (min, max) pair; every instance gets its own fresh list.
    warm_hue_ranges: List[Tuple[float, float]] = field(default_factory=lambda: [(0, 50), (330, 360)])
    cool_hue_ranges: List[Tuple[float, float]] = field(default_factory=lambda: [(90, 270)])
125
+
126
+
127
@dataclass
class AlgorithmParameters:
    """Algorithm-specific parameters."""

    # Indoor score shaping and decision cut-off.
    indoor_score_sigmoid_scale: float = 0.3
    indoor_decision_threshold: float = 0.5

    # Places365 confidence cut-offs.
    places365_high_confidence_thresh: float = 0.75
    places365_moderate_confidence_thresh: float = 0.5
    places365_attribute_confidence_thresh: float = 0.6

    # Diagnostics flag.
    include_diagnostics: bool = True
136
+
137
+
138
class ConfigurationManager:
    """
    Manage the parameters used by the lighting analysis.

    The parameters are grouped into typed sections (feature thresholds,
    indoor/outdoor thresholds, lighting thresholds, weighting factors,
    override factors, color ranges and algorithm parameters). The class
    exposes each section as a read-only property, can load from and save to a
    JSON file, and offers a basic consistency check.
    """

    def __init__(self, config_path: Optional[Union[str, Path]] = None):
        """
        Initialize the configuration manager.

        Args:
            config_path: Optional path to an external configuration file.
                If None, the default configuration is used.
        """
        self._feature_thresholds = FeatureThresholds()
        self._indoor_outdoor_thresholds = IndoorOutdoorThresholds()
        self._lighting_thresholds = LightingThresholds()
        self._weighting_factors = WeightingFactors()
        self._override_factors = OverrideFactors()
        self._color_ranges = ColorRanges()
        self._algorithm_parameters = AlgorithmParameters()

        if config_path is not None:
            self.load_from_file(config_path)

    @property
    def feature_thresholds(self) -> FeatureThresholds:
        """Get feature extraction thresholds."""
        return self._feature_thresholds

    @property
    def indoor_outdoor_thresholds(self) -> IndoorOutdoorThresholds:
        """Get indoor/outdoor classification thresholds."""
        return self._indoor_outdoor_thresholds

    @property
    def lighting_thresholds(self) -> LightingThresholds:
        """Get lighting condition analysis thresholds."""
        return self._lighting_thresholds

    @property
    def weighting_factors(self) -> WeightingFactors:
        """Get feature weighting factors."""
        return self._weighting_factors

    @property
    def override_factors(self) -> OverrideFactors:
        """Get override and reduction factors."""
        return self._override_factors

    @property
    def color_ranges(self) -> ColorRanges:
        """Get color range definitions."""
        return self._color_ranges

    @property
    def algorithm_parameters(self) -> AlgorithmParameters:
        """Get algorithm-specific parameters."""
        return self._algorithm_parameters

    def _flat_sections(self) -> List[object]:
        """Return the sections stored at the top level of the legacy dict, in legacy order."""
        return [
            self._feature_thresholds,
            self._indoor_outdoor_thresholds,
            self._lighting_thresholds,
            self._override_factors,
            self._color_ranges,
            self._algorithm_parameters,
        ]

    def _find_threshold_section(self, threshold_name: str) -> object:
        """Return the section that owns ``threshold_name`` or raise AttributeError.

        Color ranges are not searched, matching the sections the threshold
        accessors have always covered.
        """
        for config_section in [
            self._feature_thresholds,
            self._indoor_outdoor_thresholds,
            self._lighting_thresholds,
            self._override_factors,
            self._algorithm_parameters,
            self._weighting_factors
        ]:
            # Match real instance fields only; hasattr() would also accept
            # dunder and class attributes such as "__class__".
            if threshold_name in vars(config_section):
                return config_section

        raise AttributeError(f"Threshold '{threshold_name}' not found")

    def get_legacy_config_dict(self) -> Dict[str, Any]:
        """
        Generate legacy configuration dictionary for backward compatibility.

        Returns:
            Dictionary containing all configuration parameters in the original
            flat format; weighting factors are nested under the
            'indoor_outdoor_weights' key.
        """
        config_dict = {}

        for section in self._flat_sections():
            config_dict.update(vars(section))

        # Weighting factors - stored under 'indoor_outdoor_weights' key
        config_dict["indoor_outdoor_weights"] = vars(self._weighting_factors).copy()

        return config_dict

    def load_from_file(self, config_path: Union[str, Path]) -> None:
        """
        Load configuration from external JSON file.

        Args:
            config_path: Path to the configuration file.

        Raises:
            FileNotFoundError: If the configuration file doesn't exist.
            ValueError: If the configuration file contains invalid data.
        """
        config_path = Path(config_path)

        if not config_path.exists():
            raise FileNotFoundError(f"Configuration file not found: {config_path}")

        try:
            with open(config_path, 'r', encoding='utf-8') as file:
                config_data = json.load(file)

            self._update_from_dict(config_data)

        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON in configuration file: {e}") from e
        except Exception as e:
            raise ValueError(f"Error loading configuration: {e}") from e

    def save_to_file(self, config_path: Union[str, Path]) -> None:
        """
        Save current configuration to JSON file.

        Parent directories are created when missing.

        Args:
            config_path: Path where to save the configuration file.
        """
        config_path = Path(config_path)
        config_path.parent.mkdir(parents=True, exist_ok=True)

        config_dict = self.get_legacy_config_dict()

        with open(config_path, 'w', encoding='utf-8') as file:
            json.dump(config_dict, file, indent=2, ensure_ascii=False)

    def _update_from_dict(self, config_data: Dict[str, Any]) -> None:
        """
        Update configuration from dictionary data.

        Args:
            config_data: Dictionary containing configuration parameters in
                the legacy format.
        """
        for section in self._flat_sections():
            self._update_dataclass_from_dict(section, config_data)

        # Update weighting factors from nested dictionary
        if "indoor_outdoor_weights" in config_data:
            self._update_dataclass_from_dict(
                self._weighting_factors,
                config_data["indoor_outdoor_weights"]
            )

    def _update_dataclass_from_dict(self, dataclass_instance: object, data_dict: Dict[str, Any]) -> None:
        """
        Update dataclass instance fields from dictionary.

        Keys that are not existing fields of the instance are ignored.

        Args:
            dataclass_instance: The dataclass instance to update.
            data_dict: Dictionary containing the update values.
        """
        known_fields = vars(dataclass_instance)
        for field_name, field_value in data_dict.items():
            # Only real instance fields may be overwritten; a hasattr() check
            # would also let keys such as "__doc__" through.
            if field_name in known_fields:
                # Type validation could be added here
                setattr(dataclass_instance, field_name, field_value)

    def validate_configuration(self) -> List[str]:
        """
        Validate the current configuration for logical consistency.

        Returns:
            List of validation error messages. Empty list if configuration is valid.
        """
        errors = []

        # Validate threshold ranges
        ft = self._feature_thresholds
        if ft.dark_pixel_threshold >= ft.bright_pixel_threshold:
            errors.append("Dark pixel threshold must be less than bright pixel threshold")

        if ft.sky_blue_hue_min >= ft.sky_blue_hue_max:
            errors.append("Sky blue hue min must be less than sky blue hue max")

        if ft.gray_val_min >= ft.gray_val_max:
            errors.append("Gray value min must be less than gray value max")

        # Validate probability thresholds
        ap = self._algorithm_parameters
        if not (0.0 <= ap.indoor_decision_threshold <= 1.0):
            errors.append("Indoor decision threshold must be between 0 and 1")

        if not (0.0 <= ap.places365_high_confidence_thresh <= 1.0):
            errors.append("Places365 high confidence threshold must be between 0 and 1")

        # Validate color ranges
        for warm_range in self._color_ranges.warm_hue_ranges:
            if warm_range[0] >= warm_range[1]:
                errors.append(f"Invalid warm hue range: {warm_range}")

        for cool_range in self._color_ranges.cool_hue_ranges:
            if cool_range[0] >= cool_range[1]:
                errors.append(f"Invalid cool hue range: {cool_range}")

        return errors

    def get_threshold_value(self, threshold_name: str) -> Any:
        """
        Get a specific threshold value by name.

        Args:
            threshold_name: Name of the threshold parameter.

        Returns:
            The threshold value.

        Raises:
            AttributeError: If the threshold name doesn't exist.
        """
        return getattr(self._find_threshold_section(threshold_name), threshold_name)

    def update_threshold(self, threshold_name: str, value: Any) -> None:
        """
        Update a specific threshold value.

        Args:
            threshold_name: Name of the threshold parameter.
            value: New value for the threshold.

        Raises:
            AttributeError: If the threshold name doesn't exist.
        """
        setattr(self._find_threshold_section(threshold_name), threshold_name, value)
cultural_context_analyzer.py ADDED
@@ -0,0 +1,637 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import traceback
3
+ import random
4
+ from typing import Dict, List, Optional, Any
5
+
6
+ from cultural_templates import CULTURAL_TEMPLATES
7
+
8
class CulturalContextError(Exception):
    """Custom exception raised during cultural-context analysis."""
11
+
12
+
13
class CulturalContextAnalyzer:
    """
    Cultural context analyzer: detects cultural traits in a scene and builds
    a matching descriptive sentence.

    The class looks for cultural cues in the scene type name and in the names
    of detected objects, maps them to one of the known cultural contexts
    (e.g. "asian", "european") and fills that context's description template.

    Templates held by an instance are private copies: neither the module-level
    CULTURAL_TEMPLATES constant nor dictionaries handed in or out by callers
    share mutable state with ``self.cultural_templates``.
    """

    # Minimum number of indicator objects required before object analysis
    # reports a cultural context.
    _MIN_INDICATOR_OBJECTS = 2

    # Scene-name keywords, checked in this order; the first culture with a
    # hit wins.
    # NOTE(review): matching is plain substring containment, so short keywords
    # also hit longer words (e.g. "pub" inside "public") - confirm intended.
    _SCENE_NAME_KEYWORDS = (
        ("asian", (
            "asian", "chinese", "japanese", "korean", "thai", "vietnamese",
            "temple", "pagoda", "zen", "oriental", "bamboo", "tatami",
        )),
        ("european", (
            "european", "french", "italian", "spanish", "german", "british",
            "plaza", "piazza", "cathedral", "gothic", "baroque", "renaissance",
            "cafe", "bistro", "pub",
        )),
        ("mediterranean", (
            "mediterranean", "greek", "turkish", "coastal", "terrace",
            "villa", "courtyard",
        )),
        ("american", (
            "american", "diner", "fast_food", "mall", "suburban",
            "downtown", "strip_mall",
        )),
    )

    # Object-name indicators used by _detect_from_object_analysis. Each object
    # is credited to the first culture (in this order) whose indicators match.
    _OBJECT_INDICATORS = (
        ("asian", (
            "lantern", "chopsticks", "rice", "noodles", "tea",
            "bamboo", "pagoda", "shrine", "torii",
        )),
        ("european", (
            "wine", "cheese", "bread", "fountain", "column",
            "statue", "cathedral", "clock_tower",
        )),
        ("mediterranean", (
            "olive", "terracotta", "pergola", "villa",
            "coastal", "maritime",
        )),
        ("american", (
            "burger", "pizza", "hotdog", "soda",
            "drive_through", "parking_lot",
        )),
    )

    # Wider indicator table used by _is_cultural_indicator (and therefore by
    # analyze_cultural_diversity); built once instead of on every call.
    _DIVERSITY_INDICATORS = {
        "asian": (
            "lantern", "chopsticks", "rice", "noodles", "tea",
            "bamboo", "pagoda", "shrine", "torii", "kimono",
            "sushi", "ramen", "dim_sum",
        ),
        "european": (
            "wine", "cheese", "bread", "fountain", "column",
            "statue", "cathedral", "clock_tower", "baguette",
            "croissant", "espresso", "gelato",
        ),
        "mediterranean": (
            "olive", "terracotta", "pergola", "villa",
            "coastal", "maritime", "cypress", "vineyard",
        ),
        "american": (
            "burger", "pizza", "hotdog", "soda",
            "drive_through", "parking_lot", "diner",
            "strip_mall", "suburb",
        ),
    }

    def __init__(self, cultural_templates: Optional[Dict] = None):
        """
        Initialize the cultural context analyzer.

        Args:
            cultural_templates: Optional custom templates; when given they are
                merged into the default templates.

        Raises:
            CulturalContextError: If any initialization step fails.
        """
        self.logger = logging.getLogger(self.__class__.__name__)

        try:
            # Load the base templates
            self.cultural_templates = self._load_cultural_templates()

            # Merge caller-supplied templates, if any
            if cultural_templates:
                self._merge_custom_templates(cultural_templates)

            # Build the scene-type -> cultural-context lookup
            self.scene_cultural_mapping = self._initialize_scene_cultural_mapping()

            self.logger.info("CulturalContextAnalyzer initialized with %d cultural templates",
                             len(self.cultural_templates))

        except Exception as e:
            error_msg = f"Failed to initialize CulturalContextAnalyzer: {str(e)}"
            self.logger.error(f"{error_msg}\n{traceback.format_exc()}")
            raise CulturalContextError(error_msg) from e

    @staticmethod
    def _clone_template(template: Any) -> Any:
        """
        Return a copy of a template that shares no mutable containers with it.

        The template dict itself and any list values (notably "elements") are
        copied; other values are reused as-is. Non-dict input is returned
        unchanged.

        Args:
            template: Template dictionary (or any other value).

        Returns:
            Any: Independent copy of the template, or the input if not a dict.
        """
        if not isinstance(template, dict):
            return template
        return {
            key: list(value) if isinstance(value, list) else value
            for key, value in template.items()
        }

    def _load_cultural_templates(self) -> Dict:
        """
        Load the cultural templates.

        Returns:
            Dict: Cultural template dictionary owned by this instance.

        Raises:
            CulturalContextError: If the templates cannot be loaded.
        """
        try:
            self.logger.debug("Loading cultural templates")

            # Copy each template so that later merges never write into the
            # module-level CULTURAL_TEMPLATES constant shared by all instances.
            templates = {
                culture: self._clone_template(template_data)
                for culture, template_data in CULTURAL_TEMPLATES.items()
            }

            # Structure problems are only logged, never fatal
            self._validate_cultural_templates(templates)

            # Fall back to the built-in defaults when nothing was loaded
            if not templates:
                self.logger.warning("No cultural templates loaded, using defaults")
                templates = self._get_default_cultural_templates()

            self.logger.debug("Successfully loaded %d cultural template categories", len(templates))
            return templates

        except ImportError as e:
            self.logger.warning(f"Failed to import cultural templates: {str(e)}, using defaults")
            return self._get_default_cultural_templates()
        except Exception as e:
            error_msg = f"Error loading cultural templates: {str(e)}"
            self.logger.error(f"{error_msg}\n{traceback.format_exc()}")
            raise CulturalContextError(error_msg) from e

    def _get_default_cultural_templates(self) -> Dict:
        """
        Build the built-in default cultural templates.

        Returns:
            Dict: Default cultural template dictionary (a fresh object per call).
        """
        return {
            "asian": {
                "elements": [
                    "traditional architectural elements",
                    "cultural signage",
                    "Asian design features",
                    "oriental decorative patterns",
                    "traditional building materials",
                    "characteristic roofline styles",
                    "cultural landscaping elements"
                ],
                "description": "The scene displays distinctive Asian cultural characteristics with {elements}."
            },
            "european": {
                "elements": [
                    "classical architecture",
                    "European design elements",
                    "historic features",
                    "traditional stonework",
                    "characteristic window styles",
                    "ornamental facades",
                    "heritage building elements"
                ],
                "description": "The scene exhibits European architectural and cultural elements including {elements}."
            },
            "american": {
                "elements": [
                    "modern architectural styles",
                    "contemporary design features",
                    "commercial signage",
                    "urban planning elements",
                    "standardized building designs"
                ],
                "description": "The scene shows American urban characteristics featuring {elements}."
            },
            "mediterranean": {
                "elements": [
                    "coastal architectural styles",
                    "warm climate adaptations",
                    "traditional building colors",
                    "characteristic outdoor spaces"
                ],
                "description": "The scene reflects Mediterranean cultural influences with {elements}."
            }
        }

    def _validate_cultural_templates(self, templates: Dict):
        """
        Check the structure of the cultural templates.

        Every problem found is logged as a warning; nothing is raised and the
        templates are not modified.

        Args:
            templates: Template dictionary to validate.
        """
        try:
            for culture, template_data in templates.items():
                if not isinstance(template_data, dict):
                    self.logger.warning(f"Invalid cultural template structure for '{culture}': not a dictionary")
                    continue

                for key in ("elements", "description"):
                    if key not in template_data:
                        self.logger.warning(f"Missing required key '{key}' in cultural template '{culture}'")

                # Validate the element list
                if "elements" in template_data:
                    if not isinstance(template_data["elements"], list):
                        self.logger.warning(f"Cultural template '{culture}' elements should be a list")
                    elif not template_data["elements"]:
                        self.logger.warning(f"Cultural template '{culture}' has empty elements list")

                # Validate the description template
                if "description" in template_data:
                    if not isinstance(template_data["description"], str):
                        self.logger.warning(f"Cultural template '{culture}' description should be a string")
                    elif "{elements}" not in template_data["description"]:
                        self.logger.warning(f"Cultural template '{culture}' description missing {{elements}} placeholder")

            self.logger.debug("Cultural templates validation completed")

        except Exception as e:
            self.logger.warning(f"Error validating cultural templates: {str(e)}")

    def _merge_custom_templates(self, custom_templates: Dict):
        """
        Merge custom cultural templates into this instance's templates.

        For a culture that already exists (and both templates are dicts) the
        element lists are concatenated and every other key is overwritten by
        the custom value. Otherwise the custom template replaces / adds the
        entry. Merging always builds new objects, so neither the previous
        template object nor the caller's dictionaries are mutated.

        Args:
            custom_templates: Custom template dictionary keyed by culture.
        """
        try:
            for culture, template_data in custom_templates.items():
                current = self.cultural_templates.get(culture)

                if isinstance(current, dict) and isinstance(template_data, dict):
                    merged = self._clone_template(current)

                    if "elements" in template_data:
                        new_elements = template_data["elements"]
                        existing_elements = merged.get("elements")
                        if isinstance(existing_elements, list) and isinstance(new_elements, list):
                            merged["elements"] = existing_elements + new_elements
                        elif "elements" not in merged:
                            # Existing template had no elements at all: adopt
                            # the custom ones instead of dropping them.
                            merged["elements"] = (
                                list(new_elements) if isinstance(new_elements, list) else new_elements
                            )

                    # Every other key is simply overwritten
                    for key, value in template_data.items():
                        if key != "elements":
                            merged[key] = value

                    self.cultural_templates[culture] = merged
                else:
                    # New culture, or incompatible structures: take the custom one
                    self.cultural_templates[culture] = self._clone_template(template_data)

                self.logger.debug(f"Merged custom template for culture: {culture}")

            self.logger.info("Successfully merged custom cultural templates")

        except Exception as e:
            self.logger.warning(f"Error merging custom cultural templates: {str(e)}")

    def _initialize_scene_cultural_mapping(self) -> Dict[str, str]:
        """
        Build the mapping from scene type to cultural context.

        Returns:
            Dict[str, str]: Scene type name -> cultural context name.
        """
        return {
            "asian_commercial_street": "asian",
            "asian_night_market": "asian",
            "asian_temple_area": "asian",
            "chinese_restaurant": "asian",
            "japanese_restaurant": "asian",
            "korean_restaurant": "asian",
            "european_plaza": "european",
            "european_cafe": "european",
            "mediterranean_restaurant": "mediterranean",
            "american_diner": "american",
            "american_fast_food": "american"
        }

    def detect_cultural_context(self, scene_type: str, detected_objects: List[Dict]) -> Optional[str]:
        """
        Detect the cultural context of a scene.

        Three strategies are tried in order: the direct scene-type mapping,
        keyword patterns in the scene-type name, then indicator objects.

        Args:
            scene_type: Recognized scene type.
            detected_objects: List of detected objects.

        Returns:
            Optional[str]: Detected cultural context (asian, european, ...) or
            None when nothing is detected or an error occurs.
        """
        try:
            self.logger.debug(f"Detecting cultural context for scene_type: {scene_type}")

            # 1) Scene type directly implies a culture
            if scene_type in self.scene_cultural_mapping:
                cultural_context = self.scene_cultural_mapping[scene_type]
                self.logger.debug(f"Direct cultural mapping found: {scene_type} -> {cultural_context}")
                return cultural_context

            # 2) Keyword patterns in the scene type name
            cultural_context = self._detect_from_scene_name_patterns(scene_type)
            if cultural_context:
                self.logger.debug(f"Cultural context detected from name patterns: {cultural_context}")
                return cultural_context

            # 3) Cultural indicators among the detected objects
            cultural_context = self._detect_from_object_analysis(detected_objects)
            if cultural_context:
                self.logger.debug(f"Cultural context detected from object analysis: {cultural_context}")
                return cultural_context

            self.logger.debug("No specific cultural context detected")
            return None

        except Exception as e:
            self.logger.warning(f"Error detecting cultural context: {str(e)}")
            return None

    def _detect_from_scene_name_patterns(self, scene_type: str) -> Optional[str]:
        """
        Detect the cultural context from keywords in the scene type name.

        Args:
            scene_type: Scene type name.

        Returns:
            Optional[str]: Detected cultural context or None.
        """
        try:
            scene_lower = scene_type.lower()

            # Cultures are tested in table order; the first hit wins
            for culture, keywords in self._SCENE_NAME_KEYWORDS:
                if any(keyword in scene_lower for keyword in keywords):
                    return culture

            return None

        except Exception as e:
            self.logger.warning(f"Error detecting cultural context from scene name patterns: {str(e)}")
            return None

    def _detect_from_object_analysis(self, detected_objects: List[Dict]) -> Optional[str]:
        """
        Detect the cultural context from the names of detected objects.

        Each object counts toward at most one culture. A culture is reported
        only when it has the highest count and that count reaches
        ``_MIN_INDICATOR_OBJECTS``.

        Args:
            detected_objects: List of detected objects; each is read through
                its "class_name" entry.

        Returns:
            Optional[str]: Detected cultural context or None.
        """
        try:
            if not detected_objects:
                return None

            # Dict order decides ties: max() keeps the first highest entry
            cultural_indicators = {
                "asian": 0,
                "european": 0,
                "american": 0,
                "mediterranean": 0
            }

            for obj in detected_objects:
                # A missing/None class name must not abort the whole analysis
                class_name = str(obj.get("class_name") or "").lower()

                for culture, indicators in self._OBJECT_INDICATORS:
                    if any(indicator in class_name for indicator in indicators):
                        cultural_indicators[culture] += 1
                        break

            dominant_culture, max_score = max(cultural_indicators.items(), key=lambda item: item[1])
            if max_score >= self._MIN_INDICATOR_OBJECTS:
                return dominant_culture

            return None

        except Exception as e:
            self.logger.warning(f"Error detecting cultural context from object analysis: {str(e)}")
            return None

    def generate_cultural_elements(self, cultural_context: str) -> str:
        """
        Generate the description sentence for a detected cultural context.

        One or two elements are picked at random from the context's template
        and substituted into its "description" format string.

        Args:
            cultural_context: Detected cultural context.

        Returns:
            str: Cultural description, or "" when the context is empty, has no
            template, or has no elements.

        Raises:
            CulturalContextError: If generating the description fails.
        """
        try:
            if not cultural_context:
                return ""

            self.logger.debug(f"Generating cultural elements for context: {cultural_context}")

            if cultural_context not in self.cultural_templates:
                self.logger.warning(f"No template found for cultural context: {cultural_context}")
                return ""

            template = self.cultural_templates[cultural_context]
            elements = template.get("elements", [])

            if not elements:
                self.logger.warning(f"No elements found for cultural context: {cultural_context}")
                return ""

            # Pick 1-2 random elements (never more than are available)
            num_elements = min(len(elements), random.randint(1, 2))
            selected_elements = random.sample(elements, num_elements)
            elements_text = " and ".join(selected_elements)

            description_template = template.get("description", "")
            if not description_template:
                return f"The scene displays {cultural_context} cultural characteristics."

            # Fill the placeholder
            cultural_description = description_template.format(elements=elements_text)

            self.logger.debug(f"Generated cultural description: {cultural_description}")
            return cultural_description

        except Exception as e:
            error_msg = f"Error generating cultural elements for context '{cultural_context}': {str(e)}"
            self.logger.error(f"{error_msg}\n{traceback.format_exc()}")
            raise CulturalContextError(error_msg) from e

    def get_cultural_template(self, cultural_context: str) -> Dict[str, Any]:
        """
        Return the template for the given cultural context.

        The returned dictionary (including its element list) is a copy, so
        callers may modify it without affecting the analyzer.

        Args:
            cultural_context: Cultural context name.

        Returns:
            Dict[str, Any]: Cultural template, or a generic fallback template
            when the context is unknown or an error occurs.
        """
        try:
            if cultural_context in self.cultural_templates:
                template = self.cultural_templates[cultural_context]
                if isinstance(template, dict):
                    return self._clone_template(template)
                return template.copy()

            # Unknown context: return a generic fallback
            self.logger.warning(f"Cultural template not found for '{cultural_context}', using fallback")
            return {
                "elements": ["various cultural elements"],
                "description": f"The scene displays {cultural_context} cultural characteristics."
            }

        except Exception as e:
            self.logger.warning(f"Error getting cultural template for '{cultural_context}': {str(e)}")
            return {
                "elements": ["various elements"],
                "description": "The scene displays cultural characteristics."
            }

    def add_cultural_template(self, cultural_context: str, template: Dict[str, Any]):
        """
        Add or replace a cultural template.

        The template is copied on insertion, so later changes to the caller's
        dictionary or element list do not affect the analyzer.

        Args:
            cultural_context: Cultural context name.
            template: Template dictionary with a list under "elements" and a
                string under "description".

        Raises:
            CulturalContextError: If the template format is invalid.
        """
        try:
            if not isinstance(template, dict):
                raise CulturalContextError("Template must be a dictionary")

            for key in ("elements", "description"):
                if key not in template:
                    raise CulturalContextError(f"Template missing required key: {key}")

            if not isinstance(template["elements"], list):
                raise CulturalContextError("Template 'elements' must be a list")

            if not isinstance(template["description"], str):
                raise CulturalContextError("Template 'description' must be a string")

            self.cultural_templates[cultural_context] = self._clone_template(template)

            self.logger.info(f"Added cultural template for context: {cultural_context}")

        except CulturalContextError:
            raise
        except Exception as e:
            error_msg = f"Error adding cultural template for '{cultural_context}': {str(e)}"
            self.logger.error(f"{error_msg}\n{traceback.format_exc()}")
            raise CulturalContextError(error_msg) from e

    def get_supported_cultures(self) -> List[str]:
        """
        List every supported cultural context.

        Returns:
            List[str]: Names of the cultural contexts that have a template.
        """
        return list(self.cultural_templates.keys())

    def has_cultural_context(self, cultural_context: str) -> bool:
        """
        Tell whether a cultural context is supported.

        Args:
            cultural_context: Cultural context name.

        Returns:
            bool: True when a template exists for the context.
        """
        return cultural_context in self.cultural_templates

    def analyze_cultural_diversity(self, detected_objects: List[Dict]) -> Dict[str, int]:
        """
        Count cultural indicator objects per cultural context.

        Unlike _detect_from_object_analysis, one object may count toward
        several cultures here.

        Args:
            detected_objects: List of detected objects.

        Returns:
            Dict[str, int]: Indicator-object count for every template culture
            (all zeros on error or empty input).
        """
        try:
            cultural_scores = {culture: 0 for culture in self.cultural_templates.keys()}

            if not detected_objects:
                return cultural_scores

            for obj in detected_objects:
                # A missing/None class name must not abort the whole analysis
                class_name = str(obj.get("class_name") or "").lower()

                for culture in cultural_scores:
                    if self._is_cultural_indicator(class_name, culture):
                        cultural_scores[culture] += 1

            self.logger.debug(f"Cultural diversity analysis: {cultural_scores}")
            return cultural_scores

        except Exception as e:
            self.logger.warning(f"Error analyzing cultural diversity: {str(e)}")
            return {culture: 0 for culture in self.cultural_templates.keys()}

    def _is_cultural_indicator(self, object_name: str, culture: str) -> bool:
        """
        Tell whether an object name is an indicator of the given culture.

        Args:
            object_name: Object name (callers in this class pass it lower-cased).
            culture: Cultural context.

        Returns:
            bool: True when any indicator keyword of the culture occurs in the
            object name; False for cultures without an indicator list.
        """
        try:
            keywords = self._DIVERSITY_INDICATORS.get(culture)
            if keywords is None:
                return False

            return any(keyword in object_name for keyword in keywords)

        except Exception as e:
            self.logger.warning(f"Error checking cultural indicator for {object_name}, {culture}: {str(e)}")
            return False

    def get_template_summary(self) -> Dict[str, Dict[str, Any]]:
        """
        Summarize every cultural template.

        Returns:
            Dict[str, Dict[str, Any]]: Per culture: the element count, whether
            a description exists, and up to three sample elements. Empty dict
            on error.
        """
        try:
            summary = {}

            for culture, template in self.cultural_templates.items():
                elements = template.get("elements", [])
                summary[culture] = {
                    "element_count": len(elements),
                    "has_description": bool(template.get("description", "")),
                    "sample_elements": elements[:3]  # first three as a sample
                }

            return summary

        except Exception as e:
            self.logger.warning(f"Error generating template summary: {str(e)}")
            return {}
enhanced_scene_describer.py ADDED
@@ -0,0 +1,1254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+ import logging
5
+ import random
6
+ import numpy as np
7
+ from typing import Dict, List, Tuple, Any, Optional
8
+
9
+ from scene_type import SCENE_TYPES
10
+ from scene_detail_templates import SCENE_DETAIL_TEMPLATES
11
+ from object_template_fillers import OBJECT_TEMPLATE_FILLERS
12
+ from lighting_conditions import LIGHTING_CONDITIONS
13
+ from viewpoint_templates import VIEWPOINT_TEMPLATES
14
+ from cultural_templates import CULTURAL_TEMPLATES
15
+ from confidence_templates import CONFIDENCE_TEMPLATES
16
+ from landmark_data import ALL_LANDMARKS
17
+ from region_analyzer import RegionAnalyzer
18
+ from viewpoint_detector import ViewpointDetector, ViewpointDetectionError
19
+ from template_manager import TemplateManager, TemplateLoadingError, TemplateFillError
20
+ from object_description_generator import ObjectDescriptionGenerator, ObjectDescriptionError
21
+ from cultural_context_analyzer import CulturalContextAnalyzer, CulturalContextError
22
+ from text_formatter import TextFormatter, TextFormattingError
23
+
24
class EnhancedSceneDescriberError(Exception):
    """Custom exception raised when scene description generation fails."""
27
+
28
+ class EnhancedSceneDescriber:
29
+ """
30
+ 增強場景描述器 - 提供詳細自然語言場景描述的主要窗口,其他相關class匯集於此
31
+
32
+ 此class會協調多個專門組件來生成高質量的場景描述,包括視角檢測、
33
+ 模板管理、物件描述、文化語境分析和文本格式化。
34
+ """
35
+
36
+ def __init__(self, templates_db: Optional[Dict] = None, scene_types: Optional[Dict] = None, spatial_analyzer_instance: Optional[Any] = None):
37
+ """
38
+ 初始化增強場景描述器
39
+
40
+ Args:
41
+ templates_db: 可選的自定義模板數據庫
42
+ scene_types: 場景類型定義字典
43
+ spatial_analyzer_instance: 空間分析器實例(保持兼容性)
44
+ """
45
+ self.logger = logging.getLogger(self.__class__.__name__)
46
+ self.logger.setLevel(logging.INFO)
47
+
48
+ # 如果沒有logger,就加一個
49
+ if not self.logger.hasHandlers():
50
+ handler = logging.StreamHandler()
51
+ formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
52
+ handler.setFormatter(formatter)
53
+ self.logger.addHandler(handler)
54
+
55
+ try:
56
+ # 載入場景類型定義
57
+ self.scene_types = scene_types or self._load_default_scene_types()
58
+
59
+ # 初始化子組件
60
+ self._initialize_components(templates_db)
61
+
62
+ # 保存空間分析器實例以保持兼容性
63
+ self.spatial_analyzer_instance = spatial_analyzer_instance
64
+
65
+ self.logger.info("EnhancedSceneDescriber initialized successfully with %d scene types",
66
+ len(self.scene_types))
67
+
68
+ except Exception as e:
69
+ error_msg = f"Failed to initialize EnhancedSceneDescriber: {str(e)}"
70
+ self.logger.error(f"{error_msg}\n{e.__class__.__name__}: {str(e)}")
71
+ raise EnhancedSceneDescriberError(error_msg) from e
72
+
73
+ def _load_default_scene_types(self) -> Dict:
74
+ """
75
+ 載入默認場景類型
76
+
77
+ Returns:
78
+ Dict: 場景類型定義
79
+ """
80
+ try:
81
+ return SCENE_TYPES
82
+ except Exception as e:
83
+ self.logger.error(f"Failed to import SCENE_TYPES: {str(e)}")
84
+ return {} # 返回空字典
85
+
86
+ def _initialize_components(self, templates_db: Optional[Dict]):
87
+ """
88
+ 初始化所有子組件
89
+
90
+ Args:
91
+ templates_db: 可選的模板數據庫
92
+ """
93
+ try:
94
+ # 初始化視角檢測器
95
+ self.viewpoint_detector = ViewpointDetector()
96
+
97
+ # 初始化區域分析器
98
+ self.region_analyzer = RegionAnalyzer()
99
+
100
+ # 初始化模板管理器
101
+ self.template_manager = TemplateManager(custom_templates_db=templates_db)
102
+
103
+ # 初始化物件描述生成器,傳入區域分析器
104
+ self.object_description_generator = ObjectDescriptionGenerator(
105
+ region_analyzer=self.region_analyzer
106
+ )
107
+
108
+ # 初始化文化語境分析器
109
+ self.cultural_context_analyzer = CulturalContextAnalyzer()
110
+
111
+ # 初始化文本格式化器
112
+ self.text_formatter = TextFormatter()
113
+
114
+ self.logger.debug("All components initialized successfully")
115
+
116
+ except Exception as e:
117
+ error_msg = f"Component initialization failed: {str(e)}"
118
+ self.logger.error(error_msg)
119
+ # 初始化基本組件而不是拋出異常
120
+ self._initialize_fallback_components()
121
+
122
+
123
+ def generate_description(self, scene_type: str, detected_objects: List[Dict], confidence: float,
124
+ lighting_info: Dict, functional_zones: List[str], enable_landmark: bool = True,
125
+ scene_scores: Optional[Dict] = None, spatial_analysis: Optional[Dict] = None,
126
+ image_dimensions: Optional[Tuple[int, int]] = None, # 改為 Tuple
127
+ places365_info: Optional[Dict] = None,
128
+ object_statistics: Optional[Dict] = None) -> str:
129
+ try:
130
+ traffic_list = [obj for obj in detected_objects if obj.get("class_name", "") == "traffic light"]
131
+ # print(f"[DEBUG] generate_description 一開始接收到的 traffic light 數量: {len(traffic_list)}") # 原始的 print
132
+ self.logger.debug(f"Initial traffic light count in generate_description: {len(traffic_list)}") # 改用 logger
133
+ # for idx, tl in enumerate(traffic_list): # 這部分 log 可能過於詳細,先註解
134
+ # self.logger.debug(f" idx={idx}, confidence={tl.get('confidence', 0):.4f}, bbox={tl.get('bbox')}, region={tl.get('region')}")
135
+
136
+ if scene_type == "unknown" or confidence < 0.4:
137
+ generic_desc = self._generate_generic_description(detected_objects, lighting_info)
138
+ return self.text_formatter.format_final_description(generic_desc)
139
+
140
+ current_detected_objects = detected_objects
141
+ if not enable_landmark:
142
+ current_detected_objects = [obj for obj in detected_objects if not obj.get("is_landmark", False)]
143
+
144
+ places365_context = ""
145
+ if places365_info and places365_info.get('confidence', 0) > 0.3:
146
+ scene_label = places365_info.get('scene_label', '')
147
+ attributes = places365_info.get('attributes', [])
148
+ is_indoor = places365_info.get('is_indoor', None)
149
+ if scene_label:
150
+ places365_context = f"Scene context: {scene_label}"
151
+ if attributes:
152
+ places365_context += f" with characteristics: {', '.join(attributes[:3])}"
153
+ if is_indoor is not None:
154
+ indoor_outdoor = "indoor" if is_indoor else "outdoor"
155
+ places365_context += f" ({indoor_outdoor} environment)"
156
+ self.logger.debug(f"Enhanced description incorporating Places365 context: {places365_context}")
157
+
158
+ landmark_objects_in_scene = [obj for obj in current_detected_objects if obj.get("is_landmark", False)]
159
+ has_landmark_in_scene = len(landmark_objects_in_scene) > 0
160
+
161
+ if enable_landmark and (scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"] or has_landmark_in_scene):
162
+ landmark_desc = self._generate_landmark_description(
163
+ scene_type, current_detected_objects, confidence,
164
+ lighting_info, functional_zones, landmark_objects_in_scene
165
+ )
166
+ return self.text_formatter.format_final_description(landmark_desc)
167
+
168
+ viewpoint = self.viewpoint_detector.detect_viewpoint(current_detected_objects)
169
+ current_scene_type = scene_type
170
+
171
+ if viewpoint == "aerial":
172
+ if "intersection" in current_scene_type.lower() or self._is_intersection(current_detected_objects):
173
+ current_scene_type = "aerial_view_intersection"
174
+ elif any(keyword in current_scene_type.lower() for keyword in ["commercial", "shopping", "retail"]):
175
+ current_scene_type = "aerial_view_commercial_area"
176
+ elif any(keyword in current_scene_type.lower() for keyword in ["plaza", "square"]):
177
+ current_scene_type = "aerial_view_plaza"
178
+ else:
179
+ current_scene_type = "aerial_view_general"
180
+
181
+ current_scene_type = self._sanitize_scene_type_for_description(current_scene_type)
182
+
183
+ # 偵測文化背景資訊
184
+ cultural_context = None
185
+ if viewpoint != "aerial":
186
+ cultural_context = self.cultural_context_analyzer.detect_cultural_context(current_scene_type, current_detected_objects)
187
+
188
+ # 設定基礎描述
189
+ base_description = "A scene"
190
+ if viewpoint == "aerial":
191
+ if current_scene_type in self.scene_types: # 確保 self.scene_types 已有
192
+ base_description = self.scene_types.get(current_scene_type, {}).get("description", "An aerial view showing the layout and movement patterns from above")
193
+ else:
194
+ base_description = "An aerial view showing the layout and movement patterns from above"
195
+ elif current_scene_type in self.scene_types: # 確保 self.scene_types 已有
196
+ base_description = self.scene_types.get(current_scene_type, {}).get("description", "A scene")
197
+
198
+ # 假設 template_manager 內部可以處理 List[str] 的 functional_zones
199
+ selected_template = self.template_manager.get_template_by_scene_type(
200
+ scene_type=current_scene_type,
201
+ detected_objects=current_detected_objects,
202
+ functional_zones=functional_zones or [] # 傳入 List[str]
203
+ )
204
+
205
+ # 用於 fill_template 中的某些佔位符
206
+ processed_functional_zones = {}
207
+ if functional_zones:
208
+ if isinstance(functional_zones, dict): # 如果外部傳入的就是dict
209
+ processed_functional_zones = functional_zones
210
+ elif isinstance(functional_zones, list): # 如果是 list of strings
211
+ processed_functional_zones = {f"zone_{i}": {"description": zone_desc} for i, zone_desc in enumerate(functional_zones)}
212
+
213
+
214
+ # 組織場景資料
215
+ scene_data = {
216
+ "detected_objects": current_detected_objects,
217
+ "functional_zones": processed_functional_zones, # 傳入處理過的字典
218
+ "scene_type": current_scene_type,
219
+ "object_statistics": object_statistics or {},
220
+ "lighting_info": lighting_info,
221
+ "spatial_analysis": spatial_analysis,
222
+ "places365_info": places365_info
223
+ }
224
+
225
+ # 應用模板產生核心場景描述
226
+ core_scene_details = self.template_manager.apply_template(selected_template, scene_data)
227
+
228
+ # 組合基礎描述與核心場景細節
229
+ description = base_description
230
+ if core_scene_details and core_scene_details.strip():
231
+ cleaned_scene_details = self._validate_and_clean_scene_details(core_scene_details)
232
+ if base_description.lower() == "a scene" and len(cleaned_scene_details) > len(base_description):
233
+ description = cleaned_scene_details
234
+ else:
235
+ description = self.text_formatter.smart_append(description, cleaned_scene_details)
236
+ elif not core_scene_details and not description: # 如果兩者都為空
237
+ description = self._generate_generic_description(current_detected_objects, lighting_info)
238
+
239
+ # 添加次要描述資訊
240
+ if current_scene_type in self.scene_types and "secondary_description" in self.scene_types[current_scene_type]:
241
+ secondary_desc = self.scene_types[current_scene_type]["secondary_description"]
242
+ if secondary_desc:
243
+ description = self.text_formatter.smart_append(description, secondary_desc)
244
+
245
+ # 處理人物相關的描述
246
+ people_objs = [obj for obj in current_detected_objects if obj.get("class_id") == 0]
247
+ if people_objs:
248
+ people_count = len(people_objs)
249
+ if people_count == 1: people_phrase = "a single person"
250
+ elif 1 < people_count <= 3: people_phrase = f"{people_count} people"
251
+ elif 3 < people_count <= 7: people_phrase = "several people"
252
+ else: people_phrase = "multiple people"
253
+ if not any(p_word in description.lower() for p_word in ["person", "people", "pedestrian"]):
254
+ description = self.text_formatter.smart_append(description, f"The scene includes {people_phrase}.")
255
+
256
+ # 添加文化背景元素(非空中視角)
257
+ if cultural_context and viewpoint != "aerial":
258
+ cultural_elements = self.cultural_context_analyzer.generate_cultural_elements(cultural_context)
259
+ if cultural_elements:
260
+ description = self.text_formatter.smart_append(description, cultural_elements)
261
+
262
+ # 處理光照條件描述
263
+ lighting_description_text = ""
264
+ if lighting_info and "time_of_day" in lighting_info:
265
+ lighting_type = lighting_info["time_of_day"]
266
+ lighting_desc_template = self.template_manager.get_lighting_template(lighting_type)
267
+ if lighting_desc_template: lighting_description_text = lighting_desc_template
268
+ if lighting_description_text and lighting_description_text.lower() not in description.lower():
269
+ description = self.text_formatter.smart_append(description, lighting_description_text)
270
+
271
+ # 添加視角特定的觀察描述
272
+ if viewpoint != "eye_level":
273
+ viewpoint_template = self.template_manager.get_viewpoint_template(viewpoint)
274
+ prefix = viewpoint_template.get('prefix', '')
275
+ observation_template = viewpoint_template.get("observation", "")
276
+ scene_elements_for_vp = "the overall layout and objects"
277
+ if viewpoint == "aerial": scene_elements_for_vp = "crossing patterns and general layout"
278
+ viewpoint_observation_text = observation_template.format(scene_elements=scene_elements_for_vp)
279
+ full_viewpoint_text = ""
280
+ if prefix:
281
+ full_viewpoint_text = prefix.strip() + " "
282
+ if viewpoint_observation_text and viewpoint_observation_text[0].islower():
283
+ full_viewpoint_text += viewpoint_observation_text
284
+ elif viewpoint_observation_text:
285
+ full_viewpoint_text = prefix + (viewpoint_observation_text[0].lower() + viewpoint_observation_text[1:] if description else viewpoint_observation_text)
286
+ elif viewpoint_observation_text:
287
+ full_viewpoint_text = viewpoint_observation_text[0].upper() + viewpoint_observation_text[1:]
288
+ if full_viewpoint_text and full_viewpoint_text.lower() not in description.lower():
289
+ description = self.text_formatter.smart_append(description, full_viewpoint_text)
290
+
291
+ # 需要轉換或調整 describe_functional_zones
292
+ if functional_zones and len(functional_zones) > 0:
293
+ if isinstance(functional_zones, dict):
294
+ zones_desc_text = self.object_description_generator.describe_functional_zones(functional_zones)
295
+ else: # 如果是 list of strings
296
+ temp_zones_dict = {f"area_{i}": {"description": desc} for i, desc in enumerate(functional_zones)}
297
+ zones_desc_text = self.object_description_generator.describe_functional_zones(temp_zones_dict)
298
+
299
+ if zones_desc_text:
300
+ description = self.text_formatter.smart_append(description, zones_desc_text)
301
+
302
+ # 避免重複提到
303
+ if hasattr(self.text_formatter, 'deduplicate_sentences_in_description'):
304
+ deduplicated_description = self.text_formatter.deduplicate_sentences_in_description(description)
305
+ self.logger.info(f"Description before pre-LLM deduplication (len {len(description)}): '{description[:150]}...'")
306
+ self.logger.info(f"Description after pre-LLM deduplication (len {len(deduplicated_description)}): '{deduplicated_description[:150]}...'")
307
+ description = deduplicated_description # 更新 description 為去除重複後的版本
308
+ else:
309
+ self.logger.warning("TextFormatter does not have 'deduplicate_sentences_in_description'. Skipping pre-LLM deduplication of the internally generated description.")
310
+
311
+ # 格式化最終描述
312
+ final_formatted_description = self.text_formatter.format_final_description(description)
313
+
314
+ # 如果禁用地標,過濾地標引用
315
+ if not enable_landmark:
316
+ final_formatted_description = self.text_formatter.filter_landmark_references(final_formatted_description, enable_landmark=False)
317
+
318
+ # 如果描述為空,使用備用描述
319
+ if not final_formatted_description.strip() or final_formatted_description.strip() == ".":
320
+ self.logger.warning(f"Description for scene_type '{current_scene_type}' became empty after processing. Falling back.")
321
+ final_formatted_description = self.text_formatter.format_final_description(
322
+ self._generate_generic_description(current_detected_objects, lighting_info)
323
+ )
324
+
325
+ return final_formatted_description
326
+
327
+ except Exception as e:
328
+ error_msg = f"Error generating scene description: {str(e)}"
329
+ self.logger.error(f"{error_msg}\n{e.__class__.__name__}: {str(e)}")
330
+ try:
331
+ fallback_desc = self._generate_generic_description(detected_objects, lighting_info)
332
+ return self.text_formatter.format_final_description(fallback_desc)
333
+ except:
334
+ return "A scene with various elements is visible."
335
+
336
def deduplicate_sentences_in_description(self, description: str, similarity_threshold: float = 0.80) -> str:
    """
    Remove repeated or near-identical sentences from a description.

    Sentences are compared on their lower-cased, punctuation-free word sets
    using Jaccard similarity. When two sentences reach the threshold, the one
    with more distinct words survives, occupying the slot of the sentence
    that was seen first.

    Args:
        description (str): The original description text.
        similarity_threshold (float): Jaccard similarity (0 to 1) at or above
            which two sentences count as duplicates. Defaults to 0.8.

    Returns:
        str: The description with duplicated sentences removed. An empty
            string is returned for empty/blank input; the untouched input is
            returned if an unexpected error occurs.
    """
    try:
        if not description or not description.strip():
            self.logger.debug("deduplicate_sentences_in_description: Received empty or blank description.")
            return ""

        # Split after sentence-ending punctuation so each piece keeps its terminator.
        raw_sentences = re.split(r'(?<=[.!?])\s+', description.strip())
        if not raw_sentences:
            self.logger.debug("deduplicate_sentences_in_description: No sentences found after splitting.")
            return ""

        kept = []  # (sentence text, set of normalised words) in output order

        for candidate in raw_sentences:
            candidate = candidate.strip()
            if not candidate:
                continue

            # Normalise for comparison: lower-case, drop punctuation, split into a word set.
            candidate_words = set(re.sub(r'[^\w\s\d]', '', candidate.lower()).split())

            if not candidate_words:
                # Punctuation-only fragment: it is only retained when nothing has been kept yet.
                if not kept:
                    kept.append((candidate, candidate_words))
                continue

            replace_at = None
            is_duplicate = False

            for position, (kept_text, kept_words) in enumerate(kept):
                if not kept_words:
                    continue

                union_size = len(candidate_words | kept_words)
                similarity = len(candidate_words & kept_words) / union_size if union_size > 0 else 0.0

                if similarity >= similarity_threshold:
                    if len(candidate_words) > len(kept_words):
                        # The newcomer carries more distinct words: it takes over the earlier slot.
                        self.logger.debug(f'Deduplication: Replacing shorter "{kept_text[:50]}..." '
                                          f'with longer similar "{candidate[:50]}..." (Jaccard: {similarity:.2f})')
                        replace_at = position
                    else:
                        is_duplicate = True
                        self.logger.debug(f'Deduplication: Current sentence "{candidate[:50]}..." '
                                          f'is subsumed by or highly similar to "{kept_text[:50]}..." (Jaccard: {similarity:.2f}). Skipping.')
                    # Only the first sufficiently similar kept sentence is considered.
                    break

            if replace_at is not None:
                kept[replace_at] = (candidate, candidate_words)
            elif not is_duplicate:
                kept.append((candidate, candidate_words))

        # Re-assemble: every sentence ends with punctuation, sentences are space separated.
        pieces = [
            text if re.search(r'[.!?]$', text) else text + "."
            for text, _ in kept
        ]
        result = " ".join(pieces).strip()

        self.logger.debug(f"Deduplicated description (len {len(result)}): '{result[:150]}...'")
        return result

    except Exception as e:
        self.logger.error(f"Error in deduplicate_sentences_in_description: {str(e)}")
        self.logger.error(traceback.format_exc())
        return description  # fall back to the original text on failure
439
+
440
+ def _extract_placeholders(self, template: str) -> List[str]:
441
+ """提取模板中的佔位符"""
442
+ import re
443
+ return re.findall(r'\{([^}]+)\}', template)
444
+
445
+ def _generate_placeholder_content(self, placeholder: str, detected_objects: List[Dict],
446
+ functional_zones: List, scene_type: str,
447
+ object_statistics: Dict) -> str:
448
+ """生成佔位符內容"""
449
+ all_replacements = self._generate_default_replacements()
450
+ return self._get_placeholder_replacement(
451
+ placeholder, {}, all_replacements, detected_objects, scene_type
452
+ )
453
+
454
+ def _preprocess_functional_zones(self, functional_zones: List) -> Dict:
455
+ """預處理功能區域數據"""
456
+ if isinstance(functional_zones, list):
457
+ # 將列表轉換為字典格式
458
+ zones_dict = {}
459
+ for i, zone in enumerate(functional_zones):
460
+ if isinstance(zone, str):
461
+ zones_dict[f"area {i+1}"] = {"description": zone}
462
+ elif isinstance(zone, dict):
463
+ zones_dict[f"area {i+1}"] = zone
464
+ return zones_dict
465
+ elif isinstance(functional_zones, dict):
466
+ return functional_zones
467
+ else:
468
+ return {}
469
+
470
+ def _standardize_placeholder_content(self, content: str, placeholder_type: str) -> str:
471
+ """標準化佔位符內容"""
472
+ if not content:
473
+ return "various elements"
474
+ return content.strip()
475
+
476
+ def _finalize_description_output(self, description: str) -> str:
477
+ """最終化描述輸出"""
478
+ if not description:
479
+ return "A scene featuring various elements and organized areas of activity."
480
+
481
+ # 基本清理
482
+ import re
483
+ finalized = re.sub(r'\s+', ' ', description).strip()
484
+
485
+ # 確保適當結尾
486
+ if finalized and not finalized.endswith(('.', '!', '?')):
487
+ finalized += '.'
488
+
489
+ # 首字母大寫
490
+ if finalized:
491
+ finalized = finalized[0].upper() + finalized[1:] if len(finalized) > 1 else finalized.upper()
492
+
493
+ return finalized
494
+
495
+ def _sanitize_scene_type_for_description(self, scene_type: str) -> str:
496
+ """
497
+ 清理場景類型名稱,確保不包含內部標識符格式
498
+
499
+ Args:
500
+ scene_type: 原始場景類型名稱
501
+
502
+ Returns:
503
+ str: 清理後的場景類型名稱
504
+ """
505
+ try:
506
+ # 移除下劃線並轉換為空格分隔的自然語言
507
+ cleaned_type = scene_type.replace('_', ' ')
508
+
509
+ # 確保不直接在描述中使用技術性場景類型名稱
510
+ return cleaned_type
511
+
512
+ except Exception as e:
513
+ self.logger.warning(f"Error sanitizing scene type '{scene_type}': {str(e)}")
514
+ return "general scene"
515
+
516
+ def _validate_and_clean_scene_details(self, scene_details: str) -> str:
517
+ """
518
+ 驗證並清理場景詳細信息,移除可能的模板填充錯誤
519
+
520
+ Args:
521
+ scene_details: 原始場景詳細信息
522
+
523
+ Returns:
524
+ str: 清理後的場景詳細信息
525
+ """
526
+ try:
527
+ if not scene_details or not scene_details.strip():
528
+ return ""
529
+
530
+ cleaned = scene_details.strip()
531
+
532
+ # 移除常見的模板填充錯誤模式
533
+ import re
534
+
535
+ # 修復 "In ," 類型的錯誤
536
+ cleaned = re.sub(r'\bIn\s*,\s*', 'In this scene, ', cleaned)
537
+ cleaned = re.sub(r'\bAt\s*,\s*', 'At this location, ', cleaned)
538
+ cleaned = re.sub(r'\bWithin\s*,\s*', 'Within this area, ', cleaned)
539
+
540
+ # 移除內部標識符格式
541
+ cleaned = re.sub(r'\b\w+_\w+(?:_\w+)*\b(?!\s+(area|zone|region))',
542
+ lambda m: m.group(0).replace('_', ' '), cleaned)
543
+
544
+ # 確保句子完整性
545
+ if cleaned and not cleaned.endswith(('.', '!', '?')):
546
+ cleaned += '.'
547
+
548
+ return cleaned
549
+
550
+ except Exception as e:
551
+ self.logger.warning(f"Error validating scene details: {str(e)}")
552
+ return scene_details if scene_details else ""
553
+
554
+ def _generate_landmark_description(self,
555
+ scene_type: str,
556
+ detected_objects: List[Dict],
557
+ confidence: float,
558
+ lighting_info: Optional[Dict] = None,
559
+ functional_zones: Optional[Dict] = None,
560
+ landmark_objects: Optional[List[Dict]] = None) -> str:
561
+ """
562
+ 生成包含地標信息的場景描述
563
+
564
+ Args:
565
+ scene_type: 識別的場景類型
566
+ detected_objects: 檢測到的物件列表
567
+ confidence: 場景分類置信度
568
+ lighting_info: 照明條件信息
569
+ functional_zones: 功能區域信息
570
+ landmark_objects: 識別為地標的物件列表
571
+
572
+ Returns:
573
+ str: 包含地標信息的自然語言場景描述
574
+ """
575
+ try:
576
+ # 如果沒有提供地標物件,從檢測物件中篩選
577
+ if landmark_objects is None:
578
+ landmark_objects = [obj for obj in detected_objects if obj.get("is_landmark", False)]
579
+
580
+ # 如果沒有地標,退回到標準描述
581
+ if not landmark_objects:
582
+ if scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"]:
583
+ base_description = "A scenic area that appears to be a tourist destination, though specific landmarks are not clearly identifiable."
584
+ else:
585
+ return self.text_formatter.format_final_description(self._generate_scene_details(
586
+ scene_type,
587
+ detected_objects,
588
+ lighting_info,
589
+ self.viewpoint_detector.detect_viewpoint(detected_objects)
590
+ ))
591
+ else:
592
+ # 獲取主要地標
593
+ primary_landmark = max(landmark_objects, key=lambda x: x.get("confidence", 0))
594
+ landmark_name = primary_landmark.get("class_name", "landmark")
595
+ # 先取原生 location
596
+ landmark_location = primary_landmark.get("location", "")
597
+ # 如果 location 為空,就從全域 ALL_LANDMARKS 補上
598
+ lm_id = primary_landmark.get("landmark_id")
599
+ if not landmark_location and lm_id and lm_id in ALL_LANDMARKS:
600
+ landmark_location = ALL_LANDMARKS[lm_id].get("location", "")
601
+
602
+ # 根據地標類型選擇適當的描述模板,並插入 location
603
+ if scene_type == "natural_landmark" or primary_landmark.get("landmark_type") == "natural":
604
+ base_description = f"A natural landmark scene featuring {landmark_name} in {landmark_location}."
605
+ elif scene_type == "historical_monument" or primary_landmark.get("landmark_type") == "monument":
606
+ base_description = f"A historical monument scene showcasing {landmark_name}, a significant landmark in {landmark_location}."
607
+ else:
608
+ base_description = f"A tourist landmark scene centered around {landmark_name}, an iconic structure in {landmark_location}."
609
+
610
+ # 添加地標的額外信息
611
+ landmark_details = []
612
+ for landmark in landmark_objects:
613
+ details = []
614
+
615
+ if "year_built" in landmark:
616
+ details.append(f"built in {landmark['year_built']}")
617
+
618
+ if "architectural_style" in landmark:
619
+ details.append(f"featuring {landmark['architectural_style']} architectural style")
620
+
621
+ if "significance" in landmark:
622
+ details.append(landmark["significance"])
623
+
624
+ # 補 location(如果該物件沒有 location,就再從 ALL_LANDMARKS 撈一次)
625
+ loc = landmark.get("location", "")
626
+ lm_id_iter = landmark.get("landmark_id")
627
+ if not loc and lm_id_iter and lm_id_iter in ALL_LANDMARKS:
628
+ loc = ALL_LANDMARKS[lm_id_iter].get("location", "")
629
+ if loc:
630
+ details.append(f"located in {loc}")
631
+
632
+ if details:
633
+ landmark_details.append(f"{landmark['class_name']} ({', '.join(details)})")
634
+
635
+ # 將詳細信息添加到基本描述中
636
+ if landmark_details:
637
+ description = base_description + " The scene features " + ", ".join(landmark_details) + "."
638
+ else:
639
+ description = base_description
640
+
641
+ # 獲取視角
642
+ viewpoint = self.viewpoint_detector.detect_viewpoint(detected_objects)
643
+
644
+ # 生成人員活動描述
645
+ people_count = len([obj for obj in detected_objects if obj["class_id"] == 0])
646
+
647
+ if people_count > 0:
648
+ if people_count == 1:
649
+ people_description = "There is one person in the scene, likely a tourist or visitor."
650
+ elif people_count < 5:
651
+ people_description = f"There are {people_count} people in the scene, possibly tourists visiting the landmark."
652
+ else:
653
+ people_description = f"The scene includes a group of {people_count} people, indicating this is a popular tourist destination."
654
+
655
+ description = self.text_formatter.smart_append(description, people_description)
656
+
657
+ # 添加照明信息
658
+ if lighting_info and "time_of_day" in lighting_info:
659
+ lighting_type = lighting_info["time_of_day"]
660
+ lighting_description = self.template_manager.get_lighting_template(lighting_type)
661
+ description = self.text_formatter.smart_append(description, lighting_description)
662
+
663
+ # 添加視角描述
664
+ if viewpoint != "eye_level":
665
+ viewpoint_template = self.template_manager.get_viewpoint_template(viewpoint)
666
+
667
+ prefix = viewpoint_template.get('prefix', '')
668
+ if prefix and not description.startswith(prefix):
669
+ if description and description[0].isupper():
670
+ description = prefix + description[0].lower() + description[1:]
671
+ else:
672
+ description = prefix + description
673
+
674
+ viewpoint_desc = viewpoint_template.get("observation", "").format(
675
+ scene_elements="the landmark and surrounding area"
676
+ )
677
+
678
+ if viewpoint_desc and viewpoint_desc not in description:
679
+ description = self.text_formatter.smart_append(description, viewpoint_desc)
680
+
681
+ # 添加功能區域描述
682
+ if functional_zones and len(functional_zones) > 0:
683
+ zones_desc = self.object_description_generator.describe_functional_zones(functional_zones)
684
+ if zones_desc:
685
+ description = self.text_formatter.smart_append(description, zones_desc)
686
+
687
+ # 描述可能的活動
688
+ landmark_activities = []
689
+
690
+ if scene_type == "natural_landmark" or any(obj.get("landmark_type") == "natural" for obj in landmark_objects):
691
+ landmark_activities = [
692
+ "nature photography",
693
+ "scenic viewing",
694
+ "hiking or walking",
695
+ "guided nature tours",
696
+ "outdoor appreciation"
697
+ ]
698
+ elif scene_type == "historical_monument" or any(obj.get("landmark_type") == "monument" for obj in landmark_objects):
699
+ landmark_activities = [
700
+ "historical sightseeing",
701
+ "educational tours",
702
+ "cultural appreciation",
703
+ "photography of historical architecture",
704
+ "learning about historical significance"
705
+ ]
706
+ else:
707
+ landmark_activities = [
708
+ "sightseeing",
709
+ "taking photographs",
710
+ "guided tours",
711
+ "cultural tourism",
712
+ "souvenir shopping"
713
+ ]
714
+
715
+ # 添加活動描述
716
+ if landmark_activities:
717
+ activities_text = "Common activities at this location include " + ", ".join(landmark_activities[:3]) + "."
718
+ description = self.text_formatter.smart_append(description, activities_text)
719
+
720
+ return self.text_formatter.format_final_description(description)
721
+
722
+ except Exception as e:
723
+ self.logger.warning(f"Error generating landmark description: {str(e)}")
724
+ # 備用處理
725
+ return self.text_formatter.format_final_description(
726
+ "A landmark scene with notable architectural or natural features."
727
+ )
728
+
729
+
730
+ def _is_intersection(self, detected_objects: List[Dict]) -> bool:
731
+ """
732
+ 通過分析物件分布來判斷場景是否為十字路口
733
+
734
+ Args:
735
+ detected_objects: 檢測到的物件列表
736
+
737
+ Returns:
738
+ bool: 是否為十字路口
739
+ """
740
+ try:
741
+ pedestrians = [obj for obj in detected_objects if obj.get("class_id") == 0]
742
+
743
+ if len(pedestrians) >= 8:
744
+ positions = [obj.get("normalized_center", (0, 0)) for obj in pedestrians]
745
+
746
+ x_coords = [pos[0] for pos in positions]
747
+ y_coords = [pos[1] for pos in positions]
748
+
749
+ x_variance = np.var(x_coords) if len(x_coords) > 1 else 0
750
+ y_variance = np.var(y_coords) if len(y_coords) > 1 else 0
751
+
752
+ x_range = max(x_coords) - min(x_coords)
753
+ y_range = max(y_coords) - min(y_coords)
754
+
755
+ if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3:
756
+ return True
757
+
758
+ return False
759
+
760
+ except Exception as e:
761
+ self.logger.warning(f"Error detecting intersection: {str(e)}")
762
+ return False
763
+
764
+ def _generate_generic_description(self, detected_objects: List[Dict], lighting_info: Optional[Dict] = None) -> str:
765
+ """
766
+ 當場景類型未知或置信度極低時生成通用描述
767
+
768
+ Args:
769
+ detected_objects: 檢測到的物件列表
770
+ lighting_info: 可選的照明條件信息
771
+
772
+ Returns:
773
+ str: 基於檢測物件的通用描述
774
+ """
775
+ try:
776
+ obj_counts = {}
777
+ for obj in detected_objects:
778
+ class_name = obj.get("class_name", "unknown object")
779
+ if class_name not in obj_counts:
780
+ obj_counts[class_name] = 0
781
+ obj_counts[class_name] += 1
782
+
783
+ top_objects = sorted(obj_counts.items(), key=lambda x: x[1], reverse=True)[:5]
784
+
785
+ if not top_objects:
786
+ base_desc = "This scene displays various elements, though specific objects are not clearly identifiable."
787
+ else:
788
+ objects_text = []
789
+ for name, count in top_objects:
790
+ # 確保物件名稱不包含技術性格式
791
+ clean_name = name.replace('_', ' ') if isinstance(name, str) else str(name)
792
+ if count > 1:
793
+ objects_text.append(f"{count} {clean_name}s")
794
+ else:
795
+ objects_text.append(f"a {clean_name}" if clean_name[0].lower() not in 'aeiou' else f"an {clean_name}")
796
+
797
+ if len(objects_text) == 1:
798
+ objects_list = objects_text[0]
799
+ elif len(objects_text) == 2:
800
+ objects_list = f"{objects_text[0]} and {objects_text[1]}"
801
+ else:
802
+ objects_list = ", ".join(objects_text[:-1]) + f", and {objects_text[-1]}"
803
+
804
+ base_desc = f"This scene features {objects_list}."
805
+
806
+ # 添加照明信息
807
+ if lighting_info and "time_of_day" in lighting_info:
808
+ lighting_type = lighting_info["time_of_day"]
809
+ lighting_desc = self.template_manager.get_lighting_template(lighting_type)
810
+ base_desc += f" {lighting_desc}"
811
+
812
+ return base_desc
813
+
814
+ except Exception as e:
815
+ self.logger.warning(f"Error generating generic description: {str(e)}")
816
+ return "A general scene is visible with various elements."
817
+
818
+ def _generate_scene_details(self,
819
+ scene_type: str,
820
+ detected_objects: List[Dict],
821
+ lighting_info: Optional[Dict] = None,
822
+ viewpoint: str = "eye_level",
823
+ spatial_analysis: Optional[Dict] = None,
824
+ image_dimensions: Optional[Tuple[int, int]] = None,
825
+ places365_info: Optional[Dict] = None,
826
+ object_statistics: Optional[Dict] = None) -> str:
827
+ """
828
+ 基於場景類型和檢測物件生成詳細描述
829
+
830
+ Args:
831
+ scene_type: 識別的場景類型
832
+ detected_objects: 檢測到的物件列表
833
+ lighting_info: 可選的照明條件信息
834
+ viewpoint: 檢測到的視角
835
+ spatial_analysis: 可選的空間分析結果
836
+ image_dimensions: 可選的圖像尺寸
837
+ places365_info: 可選的 Places365 場景分類結果
838
+ object_statistics: 可選的詳細物件統計信息
839
+
840
+ Returns:
841
+ str: 詳細場景描述
842
+ """
843
+ try:
844
+ scene_details = ""
845
+
846
+ # 日常場景類型列表
847
+ everyday_scene_types = [
848
+ "general_indoor_space", "generic_street_view",
849
+ "desk_area_workspace", "outdoor_gathering_spot",
850
+ "kitchen_counter_or_utility_area", "unknown"
851
+ ]
852
+
853
+ # 預處理場景類型以避免內部格式洩漏
854
+ processed_scene_type = self._sanitize_scene_type_for_description(scene_type)
855
+
856
+ # 確定場景描述方法
857
+ is_confident_specific_scene = scene_type not in everyday_scene_types and scene_type in self.template_manager.get_scene_detail_templates(scene_type)
858
+ treat_as_everyday = scene_type in everyday_scene_types
859
+
860
+ if hasattr(self, 'enable_landmark') and not self.enable_landmark:
861
+ if scene_type not in ["kitchen", "bedroom", "living_room", "office_workspace", "dining_area", "professional_kitchen"]:
862
+ treat_as_everyday = True
863
+
864
+ if treat_as_everyday or not is_confident_specific_scene:
865
+ self.logger.debug(f"Generating dynamic description for scene_type: {scene_type}")
866
+ scene_details = self.object_description_generator.generate_dynamic_everyday_description(
867
+ detected_objects,
868
+ lighting_info,
869
+ viewpoint,
870
+ spatial_analysis,
871
+ image_dimensions,
872
+ places365_info,
873
+ object_statistics
874
+ )
875
+ else:
876
+ self.logger.debug(f"Using template for scene_type: {scene_type}")
877
+ templates_list = self.template_manager.get_scene_detail_templates(scene_type, viewpoint)
878
+
879
+ if templates_list:
880
+ detail_template = random.choice(templates_list)
881
+ scene_details = self.template_manager.fill_template(
882
+ detail_template,
883
+ detected_objects,
884
+ scene_type,
885
+ places365_info,
886
+ object_statistics
887
+ )
888
+ else:
889
+ scene_details = self.object_description_generator.generate_dynamic_everyday_description(
890
+ detected_objects, lighting_info, viewpoint, spatial_analysis,
891
+ image_dimensions, places365_info, object_statistics
892
+ )
893
+
894
+ # 如果禁用地標檢測,過濾地標引用
895
+ if hasattr(self, 'enable_landmark') and not self.enable_landmark:
896
+ scene_details = self.text_formatter.filter_landmark_references(scene_details, enable_landmark=False)
897
+
898
+ return scene_details if scene_details else "A scene with some visual elements."
899
+
900
+ except Exception as e:
901
+ self.logger.warning(f"Error generating scene details: {str(e)}")
902
+ return "A scene with various elements."
903
+
904
def filter_landmark_references(self, text, enable_landmark=True):
    """
    Filter landmark references out of a text.

    Thin pass-through to ``self.text_formatter.filter_landmark_references``.

    Args:
        text: Text to filter.
        enable_landmark: Whether the landmark feature is enabled.

    Returns:
        str: Whatever the text formatter returns for these arguments.
    """
    formatter = self.text_formatter
    filtered_text = formatter.filter_landmark_references(text, enable_landmark)
    return filtered_text
916
+
917
def get_prominent_objects(self, detected_objects: List[Dict],
                          min_prominence_score: float = 0.5,
                          max_categories_to_return: Optional[int] = None,
                          max_total_objects: Optional[int] = None) -> List[Dict]:
    """
    Return the most prominent objects.

    The selection itself is delegated to the object description generator;
    this method then applies the optional caps on total object count and on
    the number of distinct categories.

    Args:
        detected_objects: List of detected object dictionaries.
        min_prominence_score: Minimum prominence score threshold (default 0.5).
        max_categories_to_return: Optional cap on the number of distinct
            ``class_name`` values in the result.
        max_total_objects: Optional cap on the number of objects considered.

    Returns:
        List[Dict]: Prominent objects in the order supplied by the generator,
            or an empty list if an error occurs.
    """
    try:
        prominent_objects = self.object_description_generator.get_prominent_objects(
            detected_objects,
            min_prominence_score,
            max_categories_to_return
        )

        # Cap the total count first; the generator's ordering is preserved.
        if max_total_objects is not None and max_total_objects > 0:
            prominent_objects = prominent_objects[:max_total_objects]

        if max_categories_to_return is not None and max_categories_to_return > 0:
            accepted_categories = set()
            filtered_objects = []

            for obj in prominent_objects:
                class_name = obj.get("class_name", "unknown")
                if class_name not in accepted_categories:
                    if len(accepted_categories) >= max_categories_to_return:
                        # Category limit reached: skip objects of new categories,
                        # but keep scanning so later objects of accepted
                        # categories are not lost.
                        continue
                    accepted_categories.add(class_name)
                filtered_objects.append(obj)

            return filtered_objects

        return prominent_objects

    except Exception as e:
        self.logger.warning(f"Error getting prominent objects: {str(e)}")
        return []
972
+
973
def detect_viewpoint(self, detected_objects: List[Dict]) -> str:
    """
    Determine the viewpoint type of the image.

    Delegates to ``self.viewpoint_detector``; any exception is logged as a
    warning and answered with ``"eye_level"``.

    Args:
        detected_objects: List of detected object dictionaries.

    Returns:
        str: The detected viewpoint type.
    """
    try:
        detector = self.viewpoint_detector
        return detector.detect_viewpoint(detected_objects)
    except Exception as exc:
        self.logger.warning(f"Error detecting viewpoint: {str(exc)}")
        return "eye_level"
988
+
989
def detect_cultural_context(self, scene_type: str, detected_objects: List[Dict]) -> Optional[str]:
    """
    Detect the cultural context of a scene.

    Delegates to ``self.cultural_context_analyzer``. A
    ``CulturalContextError`` is logged as a warning and answered with None;
    other exceptions propagate.

    Args:
        scene_type: Identified scene type.
        detected_objects: List of detected object dictionaries.

    Returns:
        Optional[str]: The detected cultural context, or None.
    """
    try:
        analyzer = self.cultural_context_analyzer
        return analyzer.detect_cultural_context(scene_type, detected_objects)
    except CulturalContextError as exc:
        self.logger.warning(f"Error detecting cultural context: {str(exc)}")
        return None
1005
+
1006
def generate_cultural_elements(self, cultural_context: str) -> str:
    """
    Generate descriptive elements for a detected cultural context.

    Delegates to ``self.cultural_context_analyzer``. A
    ``CulturalContextError`` is logged as a warning and answered with an
    empty string; other exceptions propagate.

    Args:
        cultural_context: The detected cultural context.

    Returns:
        str: Description of the cultural elements.
    """
    try:
        analyzer = self.cultural_context_analyzer
        return analyzer.generate_cultural_elements(cultural_context)
    except CulturalContextError as exc:
        self.logger.warning(f"Error generating cultural elements: {str(exc)}")
        return ""
1021
+
1022
def format_object_list_for_description(self, objects: List[Dict],
                                       use_indefinite_article_for_one: bool = False,
                                       count_threshold_for_generalization: int = -1,
                                       max_types_to_list: int = 5) -> str:
    """
    Format a list of objects as a human-readable string.

    Delegates to ``self.object_description_generator``. An
    ``ObjectDescriptionError`` is logged as a warning and answered with
    "various objects"; other exceptions propagate.

    Args:
        objects: List of object dictionaries.
        use_indefinite_article_for_one: Whether a single object gets "a/an".
        count_threshold_for_generalization: Count threshold.
        max_types_to_list: Maximum number of object types.

    Returns:
        str: The formatted object description string.
    """
    try:
        generator = self.object_description_generator
        return generator.format_object_list_for_description(
            objects,
            use_indefinite_article_for_one,
            count_threshold_for_generalization,
            max_types_to_list,
        )
    except ObjectDescriptionError as exc:
        self.logger.warning(f"Error formatting object list: {str(exc)}")
        return "various objects"
1045
+
1046
def get_spatial_description(self, obj: Dict, image_width: Optional[int] = None,
                            image_height: Optional[int] = None) -> str:
    """
    Generate a spatial position description for an object.

    Delegates to ``self.object_description_generator``. An
    ``ObjectDescriptionError`` is logged as a warning and answered with
    "in the scene"; other exceptions propagate.

    Args:
        obj: Object dictionary.
        image_width: Optional image width.
        image_height: Optional image height.

    Returns:
        str: The spatial description string.
    """
    try:
        generator = self.object_description_generator
        return generator.get_spatial_description(obj, image_width, image_height)
    except ObjectDescriptionError as exc:
        self.logger.warning(f"Error generating spatial description: {str(exc)}")
        return "in the scene"
1064
+
1065
def optimize_object_description(self, description: str) -> str:
    """
    Optimise an object description to avoid listing the same objects repeatedly.

    Delegates to ``self.object_description_generator``. An
    ``ObjectDescriptionError`` is logged as a warning and the input is
    returned unchanged; other exceptions propagate.

    Args:
        description: Original description text.

    Returns:
        str: The optimised description text.
    """
    try:
        generator = self.object_description_generator
        return generator.optimize_object_description(description)
    except ObjectDescriptionError as exc:
        self.logger.warning(f"Error optimizing object description: {str(exc)}")
        return description
1080
+
1081
def describe_functional_zones(self, functional_zones: Dict) -> str:
    """
    Generate a description of the scene's functional zones.

    Delegates to ``self.object_description_generator``. An
    ``ObjectDescriptionError`` is logged as a warning and answered with an
    empty string; other exceptions propagate.

    Args:
        functional_zones: Dictionary of identified functional zones.

    Returns:
        str: The functional zone description.
    """
    try:
        generator = self.object_description_generator
        return generator.describe_functional_zones(functional_zones)
    except ObjectDescriptionError as exc:
        self.logger.warning(f"Error describing functional zones: {str(exc)}")
        return ""
1096
+
1097
def smart_append(self, current_text: str, new_fragment: str) -> str:
    """
    Append a new text fragment to existing text via the text formatter.

    Args:
        current_text: Existing text to append to.
        new_fragment: Fragment to append.

    Returns:
        str: The merged text from ``self.text_formatter.smart_append``. If the
        formatter raises ``TextFormattingError`` a warning is logged and the
        two pieces are joined with a single space (or ``new_fragment`` alone
        is returned when ``current_text`` is falsy).
    """
    try:
        merged = self.text_formatter.smart_append(current_text, new_fragment)
    except TextFormattingError as exc:
        self.logger.warning(f"Error in smart append: {str(exc)}")
        if current_text:
            merged = f"{current_text} {new_fragment}"
        else:
            merged = new_fragment
    return merged
1113
+
1114
def format_final_description(self, text: str) -> str:
    """
    Format the final description text.

    Delegates to ``self.text_formatter.format_final_description``.

    Args:
        text: Text to format.

    Returns:
        str: The formatted text, or the unmodified ``text`` when the formatter
        raises ``TextFormattingError`` (a warning is logged).
    """
    formatter = self.text_formatter
    try:
        formatted = formatter.format_final_description(text)
    except TextFormattingError as exc:
        self.logger.warning(f"Error formatting final description: {str(exc)}")
        formatted = text
    return formatted
1129
+
1130
def get_template(self, category: str, key: Optional[str] = None):
    """
    Fetch a template for the given category.

    Delegates to ``self.template_manager.get_template``.

    Args:
        category: Template category name.
        key: Optional specific template key.

    Returns:
        The template content returned by the template manager, or ``None``
        when it raises ``TemplateLoadingError`` or ``TemplateFillError``
        (a warning is logged).
    """
    manager = self.template_manager
    try:
        template = manager.get_template(category, key)
    except (TemplateLoadingError, TemplateFillError) as exc:
        self.logger.warning(f"Error getting template: {str(exc)}")
        template = None
    return template
1146
+
1147
def get_viewpoint_confidence(self, detected_objects: List[Dict]) -> Tuple[str, float]:
    """
    Return the detected viewpoint together with its confidence.

    Delegates to ``self.viewpoint_detector.get_viewpoint_confidence``.

    Args:
        detected_objects: List of detected objects.

    Returns:
        Tuple[str, float]: ``(viewpoint type, confidence)``; falls back to
        ``("eye_level", 0.5)`` when the detector raises
        ``ViewpointDetectionError`` (a warning is logged).
    """
    detector = self.viewpoint_detector
    try:
        outcome = detector.get_viewpoint_confidence(detected_objects)
    except ViewpointDetectionError as exc:
        self.logger.warning(f"Error getting viewpoint confidence: {str(exc)}")
        outcome = ("eye_level", 0.5)
    return outcome
1162
+
1163
def get_supported_cultures(self) -> List[str]:
    """
    List every supported cultural context.

    Returns:
        List[str]: Whatever ``self.cultural_context_analyzer.get_supported_cultures``
        returns; exceptions from the analyzer propagate to the caller.
    """
    analyzer = self.cultural_context_analyzer
    supported = analyzer.get_supported_cultures()
    return supported
1171
+
1172
def has_cultural_context(self, cultural_context: str) -> bool:
    """
    Check whether a given cultural context is supported.

    Args:
        cultural_context: Name of the cultural context.

    Returns:
        bool: Result of ``self.cultural_context_analyzer.has_cultural_context``;
        exceptions from the analyzer propagate to the caller.
    """
    analyzer = self.cultural_context_analyzer
    is_supported = analyzer.has_cultural_context(cultural_context)
    return is_supported
1183
+
1184
def validate_text_quality(self, text: str) -> Dict[str, bool]:
    """
    Validate the quality of a text.

    Delegates to ``self.text_formatter.validate_text_quality``.

    Args:
        text: Text to validate.

    Returns:
        Dict[str, bool]: Quality-check results, or ``{"error": True}`` when the
        formatter raises ``TextFormattingError`` (a warning is logged).
    """
    formatter = self.text_formatter
    try:
        report = formatter.validate_text_quality(text)
    except TextFormattingError as exc:
        self.logger.warning(f"Error validating text quality: {str(exc)}")
        report = {"error": True}
    return report
1199
+
1200
def get_text_statistics(self, text: str) -> Dict[str, int]:
    """
    Collect statistics about a text.

    Delegates to ``self.text_formatter.get_text_statistics``.

    Args:
        text: Text to analyse.

    Returns:
        Dict[str, int]: Statistics from the formatter, or an all-zero dict with
        keys ``characters``, ``words`` and ``sentences`` when the formatter
        raises ``TextFormattingError`` (a warning is logged).
    """
    formatter = self.text_formatter
    try:
        stats = formatter.get_text_statistics(text)
    except TextFormattingError as exc:
        self.logger.warning(f"Error getting text statistics: {str(exc)}")
        stats = {"characters": 0, "words": 0, "sentences": 0}
    return stats
1215
+
1216
def reload_templates(self):
    """
    Reload all templates through the template manager.

    Logs an info message on success.

    Raises:
        EnhancedSceneDescriberError: When the template manager raises
            ``TemplateLoadingError`` or ``TemplateFillError``; the original
            exception is chained as the cause and an error is logged first.
    """
    try:
        self.template_manager.reload_templates()
    except (TemplateLoadingError, TemplateFillError) as exc:
        self.logger.error(f"Error reloading templates: {str(exc)}")
        raise EnhancedSceneDescriberError(f"Failed to reload templates: {str(exc)}") from exc
    else:
        self.logger.info("Templates reloaded successfully")
1226
+
1227
def get_configuration(self) -> Dict[str, Any]:
    """
    Collect a snapshot of the current configuration.

    Returns:
        Dict[str, Any]: Dictionary with the number of scene types, the
        viewpoint detector parameters, the object generator configuration,
        the supported cultures and the template categories. If gathering any
        entry raises, a warning is logged and ``{"error": <message>}`` is
        returned instead.
    """
    try:
        snapshot = {}
        snapshot["scene_types_count"] = len(self.scene_types)
        snapshot["viewpoint_detector_config"] = self.viewpoint_detector.viewpoint_params
        snapshot["object_generator_config"] = self.object_description_generator.get_configuration()
        snapshot["supported_cultures"] = self.cultural_context_analyzer.get_supported_cultures()
        snapshot["template_categories"] = self.template_manager.get_template_categories()
        return snapshot
    except Exception as exc:
        # Best-effort accessor: report the failure to the caller instead of raising.
        self.logger.warning(f"Error getting configuration: {str(exc)}")
        return {"error": str(exc)}
1245
+
1246
+ def _initialize_fallback_components(self):
1247
+ """備用組件初始化"""
1248
+ try:
1249
+ self.region_analyzer = RegionAnalyzer()
1250
+ self.object_description_generator = ObjectDescriptionGenerator(
1251
+ region_analyzer=self.region_analyzer
1252
+ )
1253
+ except Exception as e:
1254
+ self.logger.error(f"Fallback component initialization failed: {str(e)}")
feature_extractor.py ADDED
@@ -0,0 +1,822 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import cv2
3
+ import logging
4
+ import traceback
5
+ from typing import Dict, Any, Optional
6
+ from configuration_manager import ConfigurationManager
7
+
8
+
9
+ class FeatureExtractor:
10
+ """
11
+ Extracts comprehensive lighting and scene features from images.(主要從圖片提取光線資訊)
12
+
13
+ This class handles all basic feature computation including brightness analysis,
14
+ color characteristics, texture complexity, and structural features for
15
+ lighting analysis and scene understanding.
16
+ """
17
+
18
def __init__(self, config_manager: ConfigurationManager):
    """
    Set up the feature extractor.

    Stores the configuration manager and creates the instance logger via
    ``_setup_logger``.

    Args:
        config_manager: Configuration manager instance used to look up thresholds.
    """
    self.config_manager = config_manager
    extractor_logger = self._setup_logger()
    self.logger = extractor_logger
27
+
28
+ def _setup_logger(self) -> logging.Logger:
29
+ """Set up logger for feature extraction operations."""
30
+ logger = logging.getLogger(f"{__name__}.FeatureExtractor")
31
+ if not logger.handlers:
32
+ handler = logging.StreamHandler()
33
+ formatter = logging.Formatter(
34
+ '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
35
+ )
36
+ handler.setFormatter(formatter)
37
+ logger.addHandler(handler)
38
+ logger.setLevel(logging.INFO)
39
+ return logger
40
+
41
def extract_features(self, image_rgb: np.ndarray) -> Dict[str, Any]:
    """
    Extract every feature group from an RGB image.

    Args:
        image_rgb: Input image as a numpy array in RGB format.

    Returns:
        Dictionary merging brightness, colour, texture, structure and
        legacy-compatibility features. When validation fails or any step
        raises, the result of ``_get_default_features()`` is returned instead
        (errors are logged together with the traceback).
    """
    try:
        # Bail out early on unusable input.
        if not self._validate_image(image_rgb):
            return self._get_default_features()

        height, width = image_rgb.shape[:2]
        scale_factor = self._calculate_scale_factor(height, width)

        # Derived images: a downscaled copy for gradient work plus
        # full-resolution HSV and grayscale conversions.
        reduced_size = (width // scale_factor, height // scale_factor)
        small_rgb = cv2.resize(image_rgb, reduced_size, interpolation=cv2.INTER_AREA)
        hsv_img = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2HSV)
        gray_img = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2GRAY)
        small_gray = cv2.cvtColor(small_rgb, cv2.COLOR_RGB2GRAY)

        # Later groups overwrite earlier ones on key collisions.
        features = {}
        features.update(self.compute_brightness_features(hsv_img, height, width))
        features.update(self.compute_color_features(hsv_img, height, width))
        features.update(self.compute_texture_features(small_gray, gray_img, height, width))
        features.update(
            self.compute_structure_features(
                small_gray, gray_img, hsv_img, height, width, scale_factor
            )
        )

        # Aliases and extra values expected by legacy callers.
        features.update(
            self._compute_legacy_compatibility_features(
                hsv_img, small_gray, features, scale_factor
            )
        )

        self.logger.debug(f"Successfully extracted {len(features)} features from image")
        return features

    except Exception as exc:
        self.logger.error(f"Error in feature extraction: {str(exc)}")
        self.logger.error(f"Traceback: {traceback.format_exc()}")
        return self._get_default_features()
94
+
95
def compute_brightness_features(self, hsv_img: np.ndarray, height: int, width: int) -> Dict[str, float]:
    """
    Derive brightness statistics from the V channel of an HSV image.

    Args:
        hsv_img: Image in HSV colour space.
        height: Image height.
        width: Image width.

    Returns:
        Dictionary with ``avg_brightness``, ``brightness_std``,
        ``dark_pixel_ratio``, ``bright_pixel_ratio`` and
        ``brightness_uniformity``. Fixed neutral values are returned when
        anything raises (the error is logged).
    """
    try:
        thresholds = self.config_manager.feature_thresholds
        value_plane = hsv_img[:, :, 2]  # V channel carries brightness
        pixel_count = height * width

        mean_v = float(np.mean(value_plane))
        std_v = float(np.std(value_plane))

        # Share of pixels below / above the configured thresholds.
        dark_share = float(np.sum(value_plane < thresholds.dark_pixel_threshold) / pixel_count)
        bright_share = float(np.sum(value_plane > thresholds.bright_pixel_threshold) / pixel_count)

        # One minus the coefficient of variation, with the variation capped at 1.
        relative_spread = std_v / max(mean_v, 1e-5)
        uniformity = 1.0 - min(1.0, relative_spread)

        return {
            "avg_brightness": mean_v,
            "brightness_std": std_v,
            "dark_pixel_ratio": dark_share,
            "bright_pixel_ratio": bright_share,
            "brightness_uniformity": uniformity
        }

    except Exception as exc:
        self.logger.error(f"Error computing brightness features: {str(exc)}")
        return {
            "avg_brightness": 100.0,
            "brightness_std": 50.0,
            "dark_pixel_ratio": 0.0,
            "bright_pixel_ratio": 0.0,
            "brightness_uniformity": 0.5
        }
142
+
143
def compute_color_features(self, hsv_img: np.ndarray, height: int, width: int) -> Dict[str, Any]:
    """
    Derive colour statistics from an HSV image.

    Args:
        hsv_img: Image in HSV colour space.
        height: Image height.
        width: Image width.

    Returns:
        Dictionary with the blue, sky-like blue, yellow/orange and gray pixel
        ratios, the mean saturation, and the entries produced by
        ``_analyze_sky_region`` and ``_analyze_color_atmosphere``. On any
        exception the result of ``_get_default_color_features()`` is returned
        (the error is logged).
    """
    try:
        h_channel, s_channel, v_channel = cv2.split(hsv_img)
        total_pixels = height * width
        ft = self.config_manager.feature_thresholds

        def fraction(mask):
            # Share of all pixels selected by a boolean mask.
            return float(np.sum(mask) / total_pixels)

        # Hue bands for generic blue and yellow/orange are fixed; the sky-blue
        # and gray masks use the configured thresholds.
        generic_blue = (h_channel >= 90) & (h_channel <= 140)
        sky_blue = (
            (h_channel >= ft.sky_blue_hue_min) & (h_channel <= ft.sky_blue_hue_max) &
            (s_channel > ft.sky_blue_sat_min) & (v_channel > ft.sky_blue_val_min)
        )
        yellow_orange = (h_channel >= 15) & (h_channel <= 45)
        grayish = (
            (s_channel < ft.gray_sat_max) &
            (v_channel > ft.gray_val_min) &
            (v_channel < ft.gray_val_max)
        )

        color_features = {
            "blue_ratio": fraction(generic_blue),
            "sky_like_blue_ratio": fraction(sky_blue),
            "yellow_orange_ratio": fraction(yellow_orange),
            "gray_ratio": fraction(grayish),
            "avg_saturation": float(np.mean(s_channel)),
        }

        # Top-third ("sky region") statistics, then the warm/cool balance.
        color_features.update(
            self._analyze_sky_region(h_channel, s_channel, v_channel, height)
        )
        color_features.update(
            self._analyze_color_atmosphere(h_channel, s_channel, total_pixels)
        )
        return color_features

    except Exception as exc:
        self.logger.error(f"Error computing color features: {str(exc)}")
        return self._get_default_color_features()
202
+
203
def compute_texture_features(self, small_gray: np.ndarray, gray_img: np.ndarray,
                             height: int, width: int) -> Dict[str, float]:
    """
    Derive gradient- and texture-based features.

    Args:
        small_gray: Downscaled grayscale image used for the Sobel/Laplacian work.
        gray_img: Full-resolution grayscale image used for brightness statistics.
        height: Original image height.
        width: Original image width.

    Returns:
        Dictionary with the vertical/horizontal gradient ratio, the top-region
        texture complexity, a shadow clarity score, the two directional
        gradient strengths and an edge density value. Fixed neutral values are
        returned when anything raises (the error is logged).
    """
    try:
        # Gradients are taken on the small image to keep this cheap.
        grad_x = cv2.Sobel(small_gray, cv2.CV_32F, 1, 0, ksize=3)
        grad_y = cv2.Sobel(small_gray, cv2.CV_32F, 0, 1, ksize=3)
        horizontal_strength = float(np.mean(np.abs(grad_x)))
        vertical_strength = float(np.mean(np.abs(grad_y)))

        # Laplacian variance of the top third, scaled by 1000 and capped at 1.
        top_band = small_gray[:small_gray.shape[0] // 3, :]
        if top_band.size > 0:
            top_texture = min(1.0, cv2.Laplacian(top_band, cv2.CV_64F).var() / 1000.0)
        else:
            top_texture = 0.5

        # Shadow clarity is a three-level heuristic over full-resolution statistics.
        gray_std = float(np.std(gray_img))
        gray_mean = float(np.mean(gray_img))
        dark_fraction = float(np.sum(gray_img < 50) / (height * width))

        shadow_clarity = 0.5
        if gray_std > 60 and dark_fraction < 0.15 and gray_mean > 100:
            shadow_clarity = 0.7
        elif gray_std < 30 and dark_fraction > 0.1:
            shadow_clarity = 0.3

        return {
            "gradient_ratio_vertical_horizontal": vertical_strength / max(horizontal_strength, 1e-5),
            "top_region_texture_complexity": top_texture,
            "shadow_clarity_score": shadow_clarity,
            "vertical_strength": vertical_strength,
            "horizontal_strength": horizontal_strength,
            "edges_density": min(1.0, (horizontal_strength + vertical_strength) / 100.0)
        }

    except Exception as exc:
        self.logger.error(f"Error computing texture features: {str(exc)}")
        return {
            "gradient_ratio_vertical_horizontal": 1.0,
            "top_region_texture_complexity": 0.5,
            "shadow_clarity_score": 0.5,
            "vertical_strength": 0.0,
            "horizontal_strength": 0.0,
            "edges_density": 0.0
        }
272
+
273
def compute_structure_features(self, small_gray: np.ndarray, gray_img: np.ndarray,
                               hsv_img: np.ndarray, height: int, width: int,
                               scale_factor: int) -> Dict[str, float]:
    """
    Compute structural features: ceiling likelihood, boundary clarity, openness.

    Args:
        small_gray: Downscaled grayscale image.
        gray_img: Full-resolution grayscale image.
        hsv_img: HSV image used for brightness analysis.
        height: Original image height.
        width: Original image width (not used by this method).
        scale_factor: Downscaling factor used.

    Returns:
        Dictionary with ``ceiling_likelihood``, ``boundary_clarity``,
        ``openness_top_edge`` plus the legacy keys produced by
        ``_compute_legacy_structure_features`` (``ceiling_uniformity``,
        ``horizontal_line_ratio``, ``top_region_std``,
        ``boundary_edge_score``). On any exception the same key set is
        returned with neutral defaults (the error is logged).
    """
    try:
        # Gradients on the downscaled image feed all three analyses below.
        gx = cv2.Sobel(small_gray, cv2.CV_32F, 1, 0, ksize=3)
        gy = cv2.Sobel(small_gray, cv2.CV_32F, 0, 1, ksize=3)
        avg_abs_gx = float(np.mean(np.abs(gx)))
        avg_abs_gy = float(np.mean(np.abs(gy)))

        ceiling_features = self._analyze_ceiling_likelihood(
            small_gray, hsv_img, gx, avg_abs_gx, height, scale_factor
        )
        boundary_clarity = self._compute_boundary_clarity(small_gray, avg_abs_gx, avg_abs_gy)
        openness_top_edge = self._compute_openness_top_edge(gy, avg_abs_gy)

        # Legacy keys are merged last so older consumers keep working.
        legacy_structure = self._compute_legacy_structure_features(gray_img, height)

        return {
            "ceiling_likelihood": ceiling_features["ceiling_likelihood"],
            "boundary_clarity": boundary_clarity,
            "openness_top_edge": openness_top_edge,
            **legacy_structure
        }

    except Exception as e:
        self.logger.error(f"Error computing structure features: {str(e)}")
        # Keep the fallback key set identical to the success path; the two
        # legacy keys below were previously missing here. Their defaults match
        # the fallback of _compute_legacy_structure_features.
        return {
            "ceiling_likelihood": 0.0,
            "boundary_clarity": 0.0,
            "openness_top_edge": 0.5,
            "ceiling_uniformity": 0.5,
            "horizontal_line_ratio": 0.0,
            "top_region_std": 0.0,
            "boundary_edge_score": 0.0
        }
329
+
330
+ def _analyze_sky_region(self, h_channel: np.ndarray, s_channel: np.ndarray,
331
+ v_channel: np.ndarray, height: int) -> Dict[str, float]:
332
+ """Analyze features specific to the sky region (top third of image)."""
333
+ try:
334
+ top_third_height = height // 3
335
+ sky_region_v = v_channel[:top_third_height, :]
336
+ sky_region_s = s_channel[:top_third_height, :]
337
+ sky_region_h = h_channel[:top_third_height, :]
338
+
339
+ if sky_region_v.size == 0:
340
+ return self._get_default_sky_features()
341
+
342
+ # Sky region brightness analysis
343
+ sky_region_avg_brightness = float(np.mean(sky_region_v))
344
+ overall_avg_brightness = float(np.mean(v_channel))
345
+ sky_region_brightness_ratio = sky_region_avg_brightness / max(overall_avg_brightness, 1e-5)
346
+ sky_region_saturation = float(np.mean(sky_region_s))
347
+
348
+ # Sky blue dominance in sky region
349
+ ft = self.config_manager.feature_thresholds
350
+ sky_region_blue_pixels = np.sum(
351
+ (sky_region_h >= ft.sky_blue_hue_min) & (sky_region_h <= ft.sky_blue_hue_max) &
352
+ (sky_region_s > ft.sky_blue_sat_min) & (sky_region_v > ft.sky_blue_val_min)
353
+ )
354
+ sky_region_blue_dominance = float(sky_region_blue_pixels / max(1, sky_region_v.size))
355
+
356
+ return {
357
+ "sky_region_brightness_ratio": sky_region_brightness_ratio,
358
+ "sky_region_saturation": sky_region_saturation,
359
+ "sky_region_blue_dominance": sky_region_blue_dominance,
360
+ "sky_brightness": sky_region_avg_brightness
361
+ }
362
+
363
+ except Exception as e:
364
+ self.logger.error(f"Error analyzing sky region: {str(e)}")
365
+ return self._get_default_sky_features()
366
+
367
+ def _analyze_color_atmosphere(self, h_channel: np.ndarray, s_channel: np.ndarray,
368
+ total_pixels: int) -> Dict[str, Any]:
369
+ """Analyze warm/cool color atmosphere."""
370
+ try:
371
+ cr = self.config_manager.color_ranges
372
+
373
+ # Warm colors detection
374
+ warm_mask = np.zeros_like(h_channel, dtype=bool)
375
+ for h_min, h_max in cr.warm_hue_ranges:
376
+ warm_mask |= ((h_channel >= h_min) & (h_channel <= h_max))
377
+ warm_ratio = float(np.sum(warm_mask & (s_channel > 30)) / total_pixels)
378
+
379
+ # Cool colors detection
380
+ cool_mask = np.zeros_like(h_channel, dtype=bool)
381
+ for h_min, h_max in cr.cool_hue_ranges:
382
+ cool_mask |= ((h_channel >= h_min) & (h_channel <= h_max))
383
+ cool_ratio = float(np.sum(cool_mask & (s_channel > 30)) / total_pixels)
384
+
385
+ # Determine overall atmosphere
386
+ if warm_ratio > cool_ratio and warm_ratio > 0.3:
387
+ color_atmosphere = "warm"
388
+ elif cool_ratio > warm_ratio and cool_ratio > 0.3:
389
+ color_atmosphere = "cool"
390
+ else:
391
+ color_atmosphere = "neutral"
392
+
393
+ return {
394
+ "warm_ratio": warm_ratio,
395
+ "cool_ratio": cool_ratio,
396
+ "color_atmosphere": color_atmosphere
397
+ }
398
+
399
+ except Exception as e:
400
+ self.logger.error(f"Error analyzing color atmosphere: {str(e)}")
401
+ return {
402
+ "warm_ratio": 0.0,
403
+ "cool_ratio": 0.0,
404
+ "color_atmosphere": "neutral"
405
+ }
406
+
407
+ def _analyze_ceiling_likelihood(self, small_gray: np.ndarray, hsv_img: np.ndarray,
408
+ gx: np.ndarray, avg_abs_gx: float, height: int,
409
+ scale_factor: int) -> Dict[str, float]:
410
+ """Analyze likelihood of ceiling presence."""
411
+ try:
412
+ ceiling_likelihood = 0.0
413
+ config = self.config_manager.indoor_outdoor_thresholds
414
+
415
+ # Get sky region brightness for analysis
416
+ v_channel = hsv_img[:, :, 2]
417
+ top_third_height = height // 3
418
+ sky_region_v = v_channel[:top_third_height, :]
419
+ sky_region_avg_brightness = float(np.mean(sky_region_v)) if sky_region_v.size > 0 else 0
420
+
421
+ # Get top region texture complexity
422
+ small_top_third_height = small_gray.shape[0] // 3
423
+ small_sky_region_gray = small_gray[:small_top_third_height, :]
424
+
425
+ if small_sky_region_gray.size > 0:
426
+ laplacian_var = cv2.Laplacian(small_sky_region_gray, cv2.CV_64F).var()
427
+ top_region_texture_complexity = min(1.0, laplacian_var / 1000.0)
428
+ else:
429
+ top_region_texture_complexity = 0.5
430
+
431
+ # Condition 1: Simple texture and moderate brightness
432
+ ceiling_texture_thresh = getattr(config, 'ceiling_texture_thresh', 0.4)
433
+ ceiling_brightness_min = getattr(config, 'ceiling_brightness_min', 60)
434
+ ceiling_brightness_max = getattr(config, 'ceiling_brightness_max', 230)
435
+
436
+ if (top_region_texture_complexity < ceiling_texture_thresh and
437
+ ceiling_brightness_min < sky_region_avg_brightness < ceiling_brightness_max):
438
+ ceiling_likelihood += 0.45
439
+
440
+ # Condition 2: Horizontal line strength
441
+ top_horizontal_lines_strength = float(np.mean(np.abs(gx[:small_gray.shape[0]//3, :])))
442
+ ceiling_horizontal_line_factor = getattr(config, 'ceiling_horizontal_line_factor', 1.15)
443
+
444
+ if top_horizontal_lines_strength > avg_abs_gx * ceiling_horizontal_line_factor:
445
+ ceiling_likelihood += 0.35
446
+
447
+ # Condition 3: Central bright spot (lamp detection)
448
+ center_y_sm, center_x_sm = small_gray.shape[0]//2, small_gray.shape[1]//2
449
+ lamp_check_radius_y = small_gray.shape[0] // 8
450
+ lamp_check_radius_x = small_gray.shape[1] // 8
451
+
452
+ center_region = small_gray[
453
+ max(0, center_y_sm - lamp_check_radius_y):min(small_gray.shape[0], center_y_sm + lamp_check_radius_y),
454
+ max(0, center_x_sm - lamp_check_radius_x):min(small_gray.shape[1], center_x_sm + lamp_check_radius_x)
455
+ ]
456
+
457
+ if center_region.size > 0:
458
+ avg_brightness = float(np.mean(small_gray))
459
+ center_brightness = float(np.mean(center_region))
460
+ ceiling_center_bright_factor = getattr(config, 'ceiling_center_bright_factor', 1.25)
461
+
462
+ if center_brightness > avg_brightness * ceiling_center_bright_factor:
463
+ ceiling_likelihood += 0.30
464
+
465
+ # Sky dominance analysis for penalty
466
+ sky_region_blue_dominance = self._compute_sky_blue_dominance(hsv_img, height)
467
+ sky_region_brightness_ratio = sky_region_avg_brightness / max(float(np.mean(v_channel)), 1e-5)
468
+
469
+ # Penalties for strong sky signals
470
+ ceiling_max_sky_blue_thresh = getattr(config, 'ceiling_max_sky_blue_thresh', 0.08)
471
+ ceiling_max_sky_brightness_ratio = getattr(config, 'ceiling_max_sky_brightness_ratio', 1.15)
472
+
473
+ if (sky_region_blue_dominance < ceiling_max_sky_blue_thresh and
474
+ sky_region_brightness_ratio < ceiling_max_sky_brightness_ratio):
475
+ ceiling_likelihood += 0.15
476
+
477
+ # Strong sky override
478
+ sky_blue_dominance_strong_thresh = getattr(config, 'sky_blue_dominance_strong_thresh', 0.25)
479
+ sky_brightness_strong_thresh = getattr(config, 'sky_brightness_strong_thresh', 1.25)
480
+ ceiling_sky_override_factor = getattr(config, 'ceiling_sky_override_factor', 0.1)
481
+
482
+ if (sky_region_blue_dominance > sky_blue_dominance_strong_thresh and
483
+ sky_region_brightness_ratio > sky_brightness_strong_thresh):
484
+ ceiling_likelihood *= ceiling_sky_override_factor
485
+
486
+ ceiling_likelihood = min(1.0, ceiling_likelihood)
487
+
488
+ return {"ceiling_likelihood": ceiling_likelihood}
489
+
490
+ except Exception as e:
491
+ self.logger.error(f"Error analyzing ceiling likelihood: {str(e)}")
492
+ return {"ceiling_likelihood": 0.0}
493
+
494
+ def _compute_sky_blue_dominance(self, hsv_img: np.ndarray, height: int) -> float:
495
+ """Compute blue dominance in sky region."""
496
+ try:
497
+ h_channel, s_channel, v_channel = cv2.split(hsv_img)
498
+ top_third_height = height // 3
499
+ sky_region_h = h_channel[:top_third_height, :]
500
+ sky_region_s = s_channel[:top_third_height, :]
501
+ sky_region_v = v_channel[:top_third_height, :]
502
+
503
+ if sky_region_h.size == 0:
504
+ return 0.0
505
+
506
+ ft = self.config_manager.feature_thresholds
507
+ sky_region_blue_pixels = np.sum(
508
+ (sky_region_h >= ft.sky_blue_hue_min) & (sky_region_h <= ft.sky_blue_hue_max) &
509
+ (sky_region_s > ft.sky_blue_sat_min) & (sky_region_v > ft.sky_blue_val_min)
510
+ )
511
+
512
+ return float(sky_region_blue_pixels / max(1, sky_region_h.size))
513
+
514
+ except Exception as e:
515
+ self.logger.error(f"Error computing sky blue dominance: {str(e)}")
516
+ return 0.0
517
+
518
+ def _compute_boundary_clarity(self, small_gray: np.ndarray, avg_abs_gx: float,
519
+ avg_abs_gy: float) -> float:
520
+ """Compute boundary clarity score."""
521
+ try:
522
+ edge_width_sm = max(1, small_gray.shape[1] // 10)
523
+ edge_height_sm = max(1, small_gray.shape[0] // 10)
524
+
525
+ # Edge gradients
526
+ left_edge_grad_x = 0.0
527
+ right_edge_grad_x = 0.0
528
+ top_edge_grad_y = 0.0
529
+
530
+ if small_gray.shape[1] > edge_width_sm:
531
+ left_edge = small_gray[:, :edge_width_sm]
532
+ right_edge = small_gray[:, -edge_width_sm:]
533
+ left_edge_grad_x = float(np.mean(np.abs(cv2.Sobel(left_edge, cv2.CV_32F, 1, 0, ksize=3))))
534
+ right_edge_grad_x = float(np.mean(np.abs(cv2.Sobel(right_edge, cv2.CV_32F, 1, 0, ksize=3))))
535
+
536
+ if small_gray.shape[0] > edge_height_sm:
537
+ top_edge = small_gray[:edge_height_sm, :]
538
+ top_edge_grad_y = float(np.mean(np.abs(cv2.Sobel(top_edge, cv2.CV_32F, 0, 1, ksize=3))))
539
+
540
+ # Normalize against average gradients
541
+ boundary_clarity = (left_edge_grad_x + right_edge_grad_x + top_edge_grad_y) / (
542
+ 3 * max(avg_abs_gx, avg_abs_gy, 1e-5)
543
+ )
544
+ boundary_clarity = min(1.0, boundary_clarity / 1.5)
545
+
546
+ return boundary_clarity
547
+
548
+ except Exception as e:
549
+ self.logger.error(f"Error computing boundary clarity: {str(e)}")
550
+ return 0.0
551
+
552
+ def _compute_openness_top_edge(self, gy: np.ndarray, avg_abs_gy: float) -> float:
553
+ """Compute openness of top edge."""
554
+ try:
555
+ top_edge_strip_gy = float(np.mean(np.abs(gy[:max(1, gy.shape[0]//20), :])))
556
+ openness_top_edge = 1.0 - min(1.0, top_edge_strip_gy / max(avg_abs_gy, 1e-5) / 0.5)
557
+ return openness_top_edge
558
+ except Exception as e:
559
+ self.logger.error(f"Error computing top edge openness: {str(e)}")
560
+ return 0.5
561
+
562
+ def _compute_legacy_compatibility_features(self, hsv_img: np.ndarray, small_gray: np.ndarray,
563
+ features: Dict[str, Any], scale_factor: int) -> Dict[str, Any]:
564
+ """Compute additional features for backward compatibility."""
565
+ try:
566
+ v_channel = hsv_img[:, :, 2]
567
+
568
+ # Light source detection
569
+ light_features = self._detect_light_sources(v_channel, features["avg_brightness"],
570
+ features["brightness_std"], scale_factor)
571
+
572
+ # Street line detection
573
+ street_score = self._compute_street_line_score(small_gray)
574
+
575
+ # Additional legacy features
576
+ legacy_features = {
577
+ **light_features,
578
+ "street_line_score": street_score,
579
+ "sky_blue_ratio": features.get("sky_like_blue_ratio", 0.0), # Alias
580
+ "gradient_ratio": features.get("gradient_ratio_vertical_horizontal", 1.0) # Alias
581
+ }
582
+
583
+ return legacy_features
584
+
585
+ except Exception as e:
586
+ self.logger.error(f"Error computing legacy compatibility features: {str(e)}")
587
+ return {}
588
+
589
+ def _detect_light_sources(self, v_channel: np.ndarray, avg_brightness: float,
590
+ brightness_std: float, scale_factor: int) -> Dict[str, float]:
591
+ """Detect artificial light sources in the image."""
592
+ try:
593
+ # Sample pixels for efficiency
594
+ sampled_v = v_channel[::scale_factor*2, ::scale_factor*2]
595
+
596
+ # Light threshold
597
+ light_threshold = min(
598
+ self.config_manager.feature_thresholds.light_source_abs_thresh,
599
+ avg_brightness + 2 * brightness_std
600
+ )
601
+
602
+ is_bright_spots = sampled_v > light_threshold
603
+ bright_spot_count = int(np.sum(is_bright_spots))
604
+
605
+ # Initialize light features
606
+ circular_light_count = 0
607
+ indoor_light_score = 0.0
608
+ light_distribution_uniformity = 0.5
609
+
610
+ # Analyze light distribution if spots are found
611
+ if 1 < bright_spot_count < 20:
612
+ bright_y, bright_x = np.where(is_bright_spots)
613
+ if len(bright_y) > 1:
614
+ mean_x, mean_y = np.mean(bright_x), np.mean(bright_y)
615
+ dist_from_center = np.sqrt((bright_x - mean_x)**2 + (bright_y - mean_y)**2)
616
+
617
+ if np.std(dist_from_center) < np.mean(dist_from_center):
618
+ circular_light_count = min(3, len(bright_y) // 2)
619
+ light_distribution_uniformity = 0.7
620
+
621
+ if np.mean(bright_y) < sampled_v.shape[0] / 2:
622
+ indoor_light_score = 0.6
623
+ else:
624
+ indoor_light_score = 0.3
625
+
626
+ return {
627
+ "bright_spot_count": bright_spot_count,
628
+ "circular_light_count": circular_light_count,
629
+ "indoor_light_score": indoor_light_score,
630
+ "light_distribution_uniformity": light_distribution_uniformity
631
+ }
632
+
633
+ except Exception as e:
634
+ self.logger.error(f"Error detecting light sources: {str(e)}")
635
+ return {
636
+ "bright_spot_count": 0,
637
+ "circular_light_count": 0,
638
+ "indoor_light_score": 0.0,
639
+ "light_distribution_uniformity": 0.5
640
+ }
641
+
642
+ def _compute_street_line_score(self, small_gray: np.ndarray) -> float:
643
+ """Compute street line detection score."""
644
+ try:
645
+ street_line_score = 0.0
646
+ bottom_half_sm = small_gray[small_gray.shape[0]//2:, :]
647
+
648
+ if bottom_half_sm.size > 0:
649
+ bottom_vert_gradient = cv2.Sobel(bottom_half_sm, cv2.CV_32F, 0, 1, ksize=3)
650
+ strong_vert_lines = np.abs(bottom_vert_gradient) > 50
651
+
652
+ if np.sum(strong_vert_lines) > (bottom_half_sm.size * 0.05):
653
+ street_line_score = 0.7
654
+
655
+ return street_line_score
656
+
657
+ except Exception as e:
658
+ self.logger.error(f"Error computing street line score: {str(e)}")
659
+ return 0.0
660
+
661
+ def _compute_legacy_structure_features(self, gray_img: np.ndarray, height: int) -> Dict[str, float]:
662
+ """Compute legacy structure features for backward compatibility."""
663
+ try:
664
+ # Top region analysis for ceiling uniformity
665
+ top_region = gray_img[:height//4, :]
666
+ top_region_std = float(np.std(top_region)) if top_region.size > 0 else 0.0
667
+ ceiling_uniformity = 1.0 - min(1.0, top_region_std / max(float(np.mean(top_region)) if top_region.size > 0 else 1e-5, 1e-5))
668
+
669
+ # Horizontal line detection in top region
670
+ if top_region.size > 0:
671
+ top_gradients = np.abs(cv2.Sobel(top_region, cv2.CV_32F, 0, 1, ksize=3))
672
+ horizontal_lines_strength = float(np.mean(top_gradients))
673
+ horizontal_line_ratio = min(1.0, horizontal_lines_strength / 40.0)
674
+ else:
675
+ horizontal_line_ratio = 0.0
676
+
677
+ # Boundary edge score computation
678
+ boundary_edge_score = self._compute_legacy_boundary_score(gray_img)
679
+
680
+ return {
681
+ "ceiling_uniformity": ceiling_uniformity,
682
+ "horizontal_line_ratio": horizontal_line_ratio,
683
+ "top_region_std": top_region_std,
684
+ "boundary_edge_score": boundary_edge_score
685
+ }
686
+
687
+ except Exception as e:
688
+ self.logger.error(f"Error computing legacy structure features: {str(e)}")
689
+ return {
690
+ "ceiling_uniformity": 0.5,
691
+ "horizontal_line_ratio": 0.0,
692
+ "top_region_std": 0.0,
693
+ "boundary_edge_score": 0.0
694
+ }
695
+
696
+ def _compute_legacy_boundary_score(self, gray_img: np.ndarray) -> float:
697
+ """Compute legacy boundary edge score."""
698
+ try:
699
+ height, width = gray_img.shape
700
+
701
+ # Create small version for boundary analysis
702
+ small_height, small_width = height // 4, width // 4
703
+ small_gray = cv2.resize(gray_img, (small_width, small_height), interpolation=cv2.INTER_AREA)
704
+
705
+ # Edge regions
706
+ left_edge_sm = small_gray[:, :small_width//6] if small_width > 6 else small_gray
707
+ right_edge_sm = small_gray[:, 5*small_width//6:] if small_width > 6 else small_gray
708
+ top_edge_sm = small_gray[:small_height//6, :] if small_height > 6 else small_gray
709
+
710
+ # Compute gradients for each edge
711
+ left_gradient = float(np.mean(np.abs(cv2.Sobel(left_edge_sm, cv2.CV_32F, 1, 0, ksize=3)))) if left_edge_sm.size > 0 else 0
712
+ right_gradient = float(np.mean(np.abs(cv2.Sobel(right_edge_sm, cv2.CV_32F, 1, 0, ksize=3)))) if right_edge_sm.size > 0 else 0
713
+ top_gradient = float(np.mean(np.abs(cv2.Sobel(top_edge_sm, cv2.CV_32F, 0, 1, ksize=3)))) if top_edge_sm.size > 0 else 0
714
+
715
+ # Combine and normalize
716
+ boundary_edge_score = (min(1.0, left_gradient/50) + min(1.0, right_gradient/50) + min(1.0, top_gradient/50)) / 3
717
+
718
+ return boundary_edge_score
719
+
720
+ except Exception as e:
721
+ self.logger.error(f"Error computing legacy boundary score: {str(e)}")
722
+ return 0.0
723
+
724
+ def _validate_image(self, image_rgb: np.ndarray) -> bool:
725
+ """Validate input image format and dimensions."""
726
+ try:
727
+ if not isinstance(image_rgb, np.ndarray):
728
+ self.logger.error("Input is not a numpy array")
729
+ return False
730
+
731
+ if len(image_rgb.shape) != 3 or image_rgb.shape[2] != 3:
732
+ self.logger.error(f"Invalid image shape: {image_rgb.shape}. Expected (H, W, 3)")
733
+ return False
734
+
735
+ height, width = image_rgb.shape[:2]
736
+ if height == 0 or width == 0:
737
+ self.logger.error(f"Invalid image dimensions: {height}x{width}")
738
+ return False
739
+
740
+ return True
741
+
742
+ except Exception as e:
743
+ self.logger.error(f"Error validating image: {str(e)}")
744
+ return False
745
+
746
+ def _calculate_scale_factor(self, height: int, width: int) -> int:
747
+ """Calculate appropriate scale factor for image processing efficiency."""
748
+ try:
749
+ base_scale = 4
750
+ scale_factor = base_scale + min(8, max(0, int((height * width) / (1000 * 1000)) if height * width > 0 else 0))
751
+ return max(1, scale_factor)
752
+ except Exception as e:
753
+ self.logger.error(f"Error calculating scale factor: {str(e)}")
754
+ return 4
755
+
756
+ def _get_default_features(self) -> Dict[str, Any]:
757
+ """Return default feature values in case of processing errors."""
758
+ return {
759
+ "avg_brightness": 100.0,
760
+ "brightness_std": 50.0,
761
+ "dark_pixel_ratio": 0.0,
762
+ "bright_pixel_ratio": 0.0,
763
+ "brightness_uniformity": 0.5,
764
+ "blue_ratio": 0.0,
765
+ "sky_like_blue_ratio": 0.0,
766
+ "yellow_orange_ratio": 0.0,
767
+ "gray_ratio": 0.0,
768
+ "avg_saturation": 100.0,
769
+ "sky_region_brightness_ratio": 1.0,
770
+ "sky_region_saturation": 0.0,
771
+ "sky_region_blue_dominance": 0.0,
772
+ "sky_brightness": 100.0,
773
+ "warm_ratio": 0.0,
774
+ "cool_ratio": 0.0,
775
+ "color_atmosphere": "neutral",
776
+ "gradient_ratio_vertical_horizontal": 1.0,
777
+ "top_region_texture_complexity": 0.5,
778
+ "shadow_clarity_score": 0.5,
779
+ "vertical_strength": 0.0,
780
+ "horizontal_strength": 0.0,
781
+ "edges_density": 0.0,
782
+ "ceiling_likelihood": 0.0,
783
+ "boundary_clarity": 0.0,
784
+ "openness_top_edge": 0.5,
785
+ "ceiling_uniformity": 0.5,
786
+ "horizontal_line_ratio": 0.0,
787
+ "top_region_std": 0.0,
788
+ "boundary_edge_score": 0.0,
789
+ "bright_spot_count": 0,
790
+ "circular_light_count": 0,
791
+ "indoor_light_score": 0.0,
792
+ "light_distribution_uniformity": 0.5,
793
+ "street_line_score": 0.0,
794
+ "sky_blue_ratio": 0.0,
795
+ "gradient_ratio": 1.0
796
+ }
797
+
798
+ def _get_default_color_features(self) -> Dict[str, Any]:
799
+ """Return default color feature values."""
800
+ return {
801
+ "blue_ratio": 0.0,
802
+ "sky_like_blue_ratio": 0.0,
803
+ "yellow_orange_ratio": 0.0,
804
+ "gray_ratio": 0.0,
805
+ "avg_saturation": 100.0,
806
+ "sky_region_brightness_ratio": 1.0,
807
+ "sky_region_saturation": 0.0,
808
+ "sky_region_blue_dominance": 0.0,
809
+ "sky_brightness": 100.0,
810
+ "warm_ratio": 0.0,
811
+ "cool_ratio": 0.0,
812
+ "color_atmosphere": "neutral"
813
+ }
814
+
815
+ def _get_default_sky_features(self) -> Dict[str, float]:
816
+ """Return default sky region feature values."""
817
+ return {
818
+ "sky_region_brightness_ratio": 1.0,
819
+ "sky_region_saturation": 0.0,
820
+ "sky_region_blue_dominance": 0.0,
821
+ "sky_brightness": 100.0
822
+ }
functional_zone_identifier.py ADDED
@@ -0,0 +1,938 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import logging
3
+ import traceback
4
+ from typing import Dict, List, Any, Optional
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+ class FunctionalZoneIdentifier:
9
+ """
10
+ 作為功能區域辨識的主要窗口
11
+ 整合區域評估和場景特定的區域辨識邏輯,提供統一的功能區域辨識接口
12
+ """
13
+
14
+ def __init__(self, zone_evaluator=None, scene_zone_identifier=None, scene_viewpoint_analyzer=None):
15
+ """
16
+ 初始化功能區域識別器
17
+
18
+ Args:
19
+ zone_evaluator: 區域評估器實例
20
+ scene_zone_identifier: 場景區域辨識器實例
21
+ scene_viewpoint_analyzer: 場景視角分析器
22
+ """
23
+ try:
24
+ self.zone_evaluator = zone_evaluator
25
+ self.scene_zone_identifier = scene_zone_identifier
26
+
27
+ self.scene_viewpoint_analyzer = scene_viewpoint_analyzer
28
+ self.viewpoint_detector = scene_viewpoint_analyzer
29
+
30
+ logger.info("FunctionalZoneIdentifier initialized successfully with SceneViewpointAnalyzer")
31
+
32
+ except Exception as e:
33
+ logger.error(f"Failed to initialize FunctionalZoneIdentifier: {str(e)}")
34
+ logger.error(traceback.format_exc())
35
+ raise
36
+
37
+ def identify_functional_zones(self, detected_objects: List[Dict], scene_type: str) -> Dict:
38
+ """
39
+ 識別場景內的功能區域,具有針對不同視角和文化背景的改進檢測能力。
40
+ 如果偵測到 is_landmark=True 的物件,則優先直接呼叫 identify_landmark_zones 並回傳結果。
41
+ """
42
+
43
+ try:
44
+ # 1. 如果沒有啟用地標功能,就先把所有有 is_landmark=True 的物件過濾掉
45
+ if not getattr(self, 'enable_landmark', True):
46
+ detected_objects = [obj for obj in detected_objects if not obj.get("is_landmark", False)]
47
+
48
+ # 2. 只要檢測到任何 is_landmark=True 的物件,立即優先使用 identify_landmark_zones
49
+ landmark_objects = [obj for obj in detected_objects if obj.get("is_landmark", False)]
50
+ if landmark_objects and self.scene_zone_identifier:
51
+ lm_zones = self.scene_zone_identifier.identify_landmark_zones(landmark_objects)
52
+ return self._standardize_zone_keys_and_descriptions(lm_zones)
53
+
54
+ # 3. city_street
55
+ if scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"]:
56
+ scene_type = "city_street"
57
+
58
+ # 4. 判斷與物件數量檢查
59
+ if self.zone_evaluator:
60
+ should_identify = self.zone_evaluator.evaluate_zone_identification_feasibility(
61
+ detected_objects, scene_type
62
+ )
63
+ if not should_identify:
64
+ logger.info(f"Zone identification not feasible for scene type '{scene_type}'")
65
+ return {}
66
+ else:
67
+ if len(detected_objects) < 2:
68
+ logger.info("Insufficient objects for zone identification")
69
+ return {}
70
+
71
+ # 5. 建立 category_regions
72
+ category_regions = self._build_category_regions_mapping(detected_objects)
73
+ zones = {}
74
+
75
+ # 6. 檢測場景視角
76
+ viewpoint_info = {"viewpoint": "eye_level"}
77
+ if self.scene_viewpoint_analyzer:
78
+ viewpoint_info = self.scene_viewpoint_analyzer.detect_scene_viewpoint(detected_objects)
79
+
80
+ # 7. 根據不同 scene_type 使用各種自己的區域辨識
81
+ if scene_type in ["living_room", "bedroom", "dining_area", "kitchen", "office_workspace", "meeting_room"]:
82
+ if self.scene_zone_identifier:
83
+ raw_zones = self.scene_zone_identifier.identify_indoor_zones(
84
+ category_regions, detected_objects, scene_type
85
+ )
86
+ zones.update(self._standardize_zone_keys_and_descriptions(raw_zones))
87
+
88
+ elif scene_type in ["city_street", "parking_lot", "park_area"]:
89
+ if self.scene_zone_identifier:
90
+ raw_zones = self.scene_zone_identifier.identify_outdoor_general_zones(
91
+ category_regions, detected_objects, scene_type
92
+ )
93
+ zones.update(self._standardize_zone_keys_and_descriptions(raw_zones))
94
+
95
+ elif "aerial" in scene_type or viewpoint_info.get("viewpoint") == "aerial":
96
+ if self.scene_zone_identifier:
97
+ raw_zones = self.scene_zone_identifier.identify_aerial_view_zones(
98
+ category_regions, detected_objects, scene_type
99
+ )
100
+ zones.update(self._standardize_zone_keys_and_descriptions(raw_zones))
101
+
102
+ elif "asian" in scene_type:
103
+ if self.scene_zone_identifier:
104
+ asian_zones = self.scene_zone_identifier.identify_asian_cultural_zones(
105
+ category_regions, detected_objects, scene_type
106
+ )
107
+ zones.update(self._standardize_zone_keys_and_descriptions(asian_zones))
108
+
109
+ elif scene_type == "urban_intersection":
110
+ if self.scene_zone_identifier:
111
+ raw_zones = self.scene_zone_identifier.identify_intersection_zones(
112
+ category_regions, detected_objects, viewpoint_info.get("viewpoint")
113
+ )
114
+ zones.update(self._standardize_zone_keys_and_descriptions(raw_zones))
115
+ used_tl_count_per_region = {}
116
+ for zone_info in raw_zones.values():
117
+ obj_list = zone_info.get("objects", [])
118
+ if "traffic light" in obj_list:
119
+ rg = zone_info.get("region", "")
120
+ count_in_zone = obj_list.count("traffic light")
121
+ used_tl_count_per_region[rg] = used_tl_count_per_region.get(rg, 0) + count_in_zone
122
+
123
+ signal_regions = {}
124
+ for t in [obj for obj in detected_objects if obj.get("class_id") == 9]:
125
+ region = t.get("region", "")
126
+ signal_regions.setdefault(region, []).append(t)
127
+
128
+ for idx, (region, signals) in enumerate(signal_regions.items()):
129
+ total_in_region = len(signals)
130
+ used_in_region = used_tl_count_per_region.get(region, 0)
131
+ remaining_in_region = total_in_region - used_in_region
132
+
133
+ if remaining_in_region > 0:
134
+ direction = self._get_directional_description(region)
135
+ if direction and direction != "central":
136
+ zone_key = f"{direction} traffic control area"
137
+ else:
138
+ zone_key = "primary traffic control area" if idx == 0 else "auxiliary traffic control area"
139
+
140
+ if zone_key in zones:
141
+ suffix = 1
142
+ new_key = f"{zone_key} ({suffix})"
143
+ while new_key in zones:
144
+ suffix += 1
145
+ new_key = f"{zone_key} ({suffix})"
146
+ zone_key = new_key
147
+
148
+ zones[zone_key] = {
149
+ "region": region,
150
+ "objects": ["traffic light"] * remaining_in_region,
151
+ "description": f"Traffic control area with {remaining_in_region} traffic lights in {region}"
152
+ }
153
+
154
+ for region, signals in signal_regions.items():
155
+ used = used_tl_count_per_region.get(region, 0)
156
+ total = len(signals)
157
+ remaining = total - used
158
+ # print(f"[DEBUG] Region '{region}': Total TL = {total}, Used in crossing = {used}, Remaining = {remaining}")
159
+
160
+ elif scene_type == "financial_district":
161
+ if self.scene_zone_identifier:
162
+ fd_zones = self.scene_zone_identifier.identify_financial_district_zones(
163
+ category_regions, detected_objects
164
+ )
165
+ zones.update(self._standardize_zone_keys_and_descriptions(fd_zones))
166
+
167
+ elif scene_type == "upscale_dining":
168
+ if self.scene_zone_identifier:
169
+ ud_zones = self.scene_zone_identifier.identify_upscale_dining_zones(
170
+ category_regions, detected_objects
171
+ )
172
+ zones.update(self._standardize_zone_keys_and_descriptions(ud_zones))
173
+
174
+ else:
175
+ # 如果不是上述任何一種場景,就用「預設功能區」
176
+ default_zones = self._identify_default_zones(category_regions, detected_objects)
177
+ zones.update(self._standardize_zone_keys_and_descriptions(default_zones))
178
+
179
+ # 8. 如果此時 zones 仍為空,就會變成 default → basic → fallback
180
+ if not zones:
181
+ default_zones = self._identify_default_zones(category_regions, detected_objects)
182
+ if default_zones:
183
+ zones.update(self._standardize_zone_keys_and_descriptions(default_zones))
184
+ else:
185
+ basic_zones = self._create_basic_zones_from_objects(detected_objects, scene_type)
186
+ zones.update(self._standardize_zone_keys_and_descriptions(basic_zones))
187
+
188
+ # 通用 fallback:把所有還沒被列出的 (class_name, region) 通通補進去
189
+ fallback_zones = self._generate_category_fallback_zones(detected_objects, zones)
190
+ zones.update(fallback_zones)
191
+
192
+ # Debug: 列印出各功能區的 traffic light 統計
193
+ total_tl_in_zones = 0
194
+ for zone_key, zone_info in zones.items():
195
+ if isinstance(zone_info, dict):
196
+ sub_objs = zone_info.get("objects", [])
197
+ else:
198
+ sub_objs = []
199
+ t_in_zone = [obj for obj in sub_objs if obj == "traffic light"]
200
+ # print(f"[DEBUG] identify_functional_zones - Zone '{zone_key}' has {len(t_in_zone)} traffic light(s).")
201
+ total_tl_in_zones += len(t_in_zone)
202
+ # print(f"[DEBUG] identify_functional_zones - Total traffic lights in zones: {total_tl_in_zones}")
203
+
204
+ logger.info(f"Identified {len(zones)} functional zones for scene type '{scene_type}'")
205
+ return zones
206
+
207
+ except Exception as e:
208
+ logger.error(f"Error identifying functional zones: {str(e)}")
209
+ logger.error(traceback.format_exc())
210
+ return {}
211
+
212
+ def _standardize_zone_keys_and_descriptions(self, raw_zones: Dict) -> Dict:
213
+ """
214
+ 標準化區域鍵名和描述,將內部標識符轉換為描述性名稱
215
+
216
+ Args:
217
+ raw_zones: 原始區域識別結果
218
+
219
+ Returns:
220
+ Dict: 標準化後的區域字典
221
+ """
222
+ try:
223
+ standardized_zones = {}
224
+
225
+ for zone_key, zone_data in raw_zones.items():
226
+ # 生成描述性的區域鍵名
227
+ descriptive_key = self._generate_descriptive_zone_key(zone_key, zone_data)
228
+
229
+ # 確保區域描述也經過標準化
230
+ if isinstance(zone_data, dict) and "description" in zone_data:
231
+ zone_data["description"] = self._enhance_zone_description(zone_data["description"], zone_data)
232
+
233
+ standardized_zones[descriptive_key] = zone_data
234
+
235
+ return standardized_zones
236
+
237
+ except Exception as e:
238
+ logger.error(f"Error standardizing zone keys and descriptions: {str(e)}")
239
+ return raw_zones
240
+
241
+ def _generate_descriptive_zone_key(self, original_key: str, zone_data: Dict) -> str:
242
+ """
243
+ 基於區域內容生成描述性的鍵名
244
+ 核心修改:只要該區域內有任一個 'traffic light',就優先回傳 'traffic control zone',
245
+ """
246
+ try:
247
+ objects = zone_data.get("objects", [])
248
+ region = zone_data.get("region", "")
249
+
250
+ # 優先檢查是否含有 traffic light
251
+ if any(obj == "traffic light" or "traffic light" in obj for obj in objects):
252
+ return "traffic control zone"
253
+
254
+ # 如果沒有 traffic light,才繼續分析「主要物件」順序
255
+ primary_objects = self._analyze_primary_objects(objects)
256
+
257
+ # 依序檢查人、車、家具、紅綠燈等
258
+ if "person" in primary_objects:
259
+ if len([o for o in objects if o == "person"]) > 1:
260
+ return "pedestrian activity area"
261
+ else:
262
+ return "individual activity zone"
263
+ elif any(vehicle in primary_objects for vehicle in ["car", "truck", "bus", "motorcycle"]):
264
+ return "vehicle movement area"
265
+ elif any(furniture in primary_objects for furniture in ["chair", "table", "sofa", "bed"]):
266
+ return "furniture arrangement area"
267
+
268
+ # 若上述都不符合,改用「基於位置」做 fallback
269
+ position_descriptions = {
270
+ "top_left": "upper left area",
271
+ "top_center": "upper central area",
272
+ "top_right": "upper right area",
273
+ "middle_left": "left side area",
274
+ "middle_center": "main crossing area",
275
+ "middle_right": "right side area",
276
+ "bottom_left": "lower left area",
277
+ "bottom_center": "lower central area",
278
+ "bottom_right": "lower right area"
279
+ }
280
+ if region in position_descriptions:
281
+ return position_descriptions[region]
282
+
283
+ # 再次檢查主要物件,給出另一種 fallback 命名
284
+ if primary_objects:
285
+ if "traffic light" in primary_objects:
286
+ return "traffic control zone"
287
+ elif any(vehicle in primary_objects for vehicle in ["car", "truck", "bus"]):
288
+ return "vehicle movement area"
289
+ elif "person" in primary_objects:
290
+ return "pedestrian activity area"
291
+
292
+ # 最後最後的備用名稱
293
+ return "activity area"
294
+
295
+ except Exception as e:
296
+ logger.warning(f"Error generating descriptive key for '{original_key}': {str(e)}")
297
+ return "activity area"
298
+
299
+ def _analyze_primary_objects(self, objects: List[str]) -> List[str]:
300
+ """
301
+ 分析區域中的主要物件類型
302
+
303
+ Args:
304
+ objects: 物件名稱列表
305
+
306
+ Returns:
307
+ List[str]: 主要物件類型列表
308
+ """
309
+ try:
310
+ # 計算物件出現頻率
311
+ object_counts = {}
312
+ for obj in objects:
313
+ normalized_obj = obj.replace('_', ' ').lower().strip()
314
+ object_counts[normalized_obj] = object_counts.get(normalized_obj, 0) + 1
315
+
316
+ # 按出現頻率排序,返回前三個主要物件
317
+ sorted_objects = sorted(object_counts.items(), key=lambda x: x[1], reverse=True)
318
+ return [obj[0] for obj in sorted_objects[:3]]
319
+
320
+ except Exception as e:
321
+ logger.warning(f"Error analyzing primary objects: {str(e)}")
322
+ return []
323
+
324
+ def _enhance_zone_description(self, original_description: str, zone_data: Dict) -> str:
325
+ """
326
+ 增強區域描述的自然性和完整性
327
+ """
328
+ try:
329
+ if not original_description or not original_description.strip():
330
+ return self._generate_fallback_description(zone_data)
331
+
332
+ import re
333
+ enhanced = original_description.strip()
334
+
335
+ # 改善技術性表達為自然語言
336
+ enhanced = re.sub(r'\bin central direction\b', 'in the center', enhanced)
337
+ enhanced = re.sub(r'\bin west area\b', 'on the left side', enhanced)
338
+ enhanced = re.sub(r'\bin east direction\b', 'on the right side', enhanced)
339
+ enhanced = re.sub(r'\bnear traffic signals\b', 'near the traffic lights', enhanced)
340
+ enhanced = re.sub(r'\bwith (\d+) (\w+)\b', r'where \1 \2 can be seen', enhanced)
341
+
342
+ # 移除重複和冗餘表達
343
+ enhanced = re.sub(r'\barea with.*?in.*?area\b', lambda m: m.group(0).split(' in ')[0], enhanced)
344
+ enhanced = enhanced.replace('traffic area', 'area').replace('crossing area', 'crossing')
345
+
346
+ # 標準化描述結構
347
+ if enhanced.startswith('Pedestrian'):
348
+ enhanced = re.sub(r'^Pedestrian crossing area', 'The main pedestrian crossing', enhanced)
349
+ elif enhanced.startswith('Vehicle'):
350
+ enhanced = re.sub(r'^Vehicle traffic area', 'The vehicle movement area', enhanced)
351
+ elif enhanced.startswith('Traffic control'):
352
+ enhanced = re.sub(r'^Traffic control area', 'Traffic management elements', enhanced)
353
+
354
+ # 移除內部標識符格式
355
+ enhanced = re.sub(r'\b\w+_\w+(?:_\w+)*\b', lambda m: m.group(0).replace('_', ' '), enhanced)
356
+
357
+ # 確保描述的完整性
358
+ if not enhanced.endswith('.'):
359
+ enhanced += '.'
360
+
361
+ # 改善描述的自然性
362
+ enhanced = enhanced.replace('with with', 'with')
363
+ enhanced = re.sub(r'\s{2,}', ' ', enhanced)
364
+
365
+ return enhanced
366
+
367
+ except Exception as e:
368
+ logger.warning(f"Error enhancing zone description: {str(e)}")
369
+ return original_description if original_description else "A functional area within the scene."
370
+
371
+ def _generate_fallback_description(self, zone_data: Dict) -> str:
372
+ """
373
+ 為缺少描述的區域生成備用描述
374
+
375
+ Args:
376
+ zone_data: 區域數據
377
+
378
+ Returns:
379
+ str: 備用描述
380
+ """
381
+ try:
382
+ objects = zone_data.get("objects", [])
383
+ region = zone_data.get("region", "")
384
+
385
+ if objects:
386
+ object_count = len(objects)
387
+ unique_objects = list(set(objects))
388
+
389
+ if object_count == 1:
390
+ return f"Area containing {unique_objects[0].replace('_', ' ')}."
391
+ elif len(unique_objects) <= 3:
392
+ obj_list = ", ".join([obj.replace('_', ' ') for obj in unique_objects])
393
+ return f"Area featuring {obj_list}."
394
+ else:
395
+ return f"Multi-functional area with {object_count} elements including various objects."
396
+
397
+ return "Functional area within the scene."
398
+
399
+ except Exception as e:
400
+ logger.warning(f"Error generating fallback description: {str(e)}")
401
+ return "Activity area."
402
+
403
+ def _build_category_regions_mapping(self, detected_objects: List[Dict]) -> Dict:
404
+ """
405
+ 建立物件按類別和區域的分組映射
406
+
407
+ Args:
408
+ detected_objects: 檢測到的物件列表
409
+
410
+ Returns:
411
+ 按類別和區域分組的物件字典
412
+ """
413
+ try:
414
+ category_regions = {}
415
+
416
+ for obj in detected_objects:
417
+ category = self._categorize_object(obj)
418
+ if not category:
419
+ continue
420
+
421
+ if category not in category_regions:
422
+ category_regions[category] = {}
423
+
424
+ region = obj.get("region", "center")
425
+ if region not in category_regions[category]:
426
+ category_regions[category][region] = []
427
+
428
+ category_regions[category][region].append(obj)
429
+
430
+ logger.debug(f"Built category regions mapping with {len(category_regions)} categories")
431
+ return category_regions
432
+
433
+ except Exception as e:
434
+ logger.error(f"Error building category regions mapping: {str(e)}")
435
+ logger.error(traceback.format_exc())
436
+ return {}
437
+
438
+ def _categorize_object(self, obj: Dict) -> str:
439
+ """
440
+ 將檢測到的物件分類到功能類別中,用於區域識別
441
+
442
+ Args:
443
+ obj: 物件字典
444
+
445
+ Returns:
446
+ 物件功能類別字串
447
+ """
448
+ try:
449
+ class_id = obj.get("class_id", -1)
450
+ class_name = obj.get("class_name", "").lower()
451
+
452
+ # 使用現有的類別映射(如果可用)
453
+ if hasattr(self, 'OBJECT_CATEGORIES') and self.OBJECT_CATEGORIES:
454
+ for category, ids in self.OBJECT_CATEGORIES.items():
455
+ if class_id in ids:
456
+ return category
457
+
458
+ # 基於COCO類別名稱的後備分類
459
+ furniture_items = ["chair", "couch", "bed", "dining table", "toilet"]
460
+ plant_items = ["potted plant"]
461
+ electronic_items = ["tv", "laptop", "mouse", "remote", "keyboard", "cell phone"]
462
+ vehicle_items = ["bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat"]
463
+ person_items = ["person"]
464
+ kitchen_items = ["bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
465
+ "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog",
466
+ "pizza", "donut", "cake", "refrigerator", "oven", "toaster", "sink", "microwave"]
467
+ sports_items = ["frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
468
+ "baseball glove", "skateboard", "surfboard", "tennis racket"]
469
+ personal_items = ["handbag", "tie", "suitcase", "umbrella", "backpack"]
470
+
471
+ if any(item in class_name for item in furniture_items):
472
+ return "furniture"
473
+ elif any(item in class_name for item in plant_items):
474
+ return "plant"
475
+ elif any(item in class_name for item in electronic_items):
476
+ return "electronics"
477
+ elif any(item in class_name for item in vehicle_items):
478
+ return "vehicle"
479
+ elif any(item in class_name for item in person_items):
480
+ return "person"
481
+ elif any(item in class_name for item in kitchen_items):
482
+ return "kitchen_items"
483
+ elif any(item in class_name for item in sports_items):
484
+ return "sports"
485
+ elif any(item in class_name for item in personal_items):
486
+ return "personal_items"
487
+ else:
488
+ return "misc"
489
+
490
+ except Exception as e:
491
+ logger.error(f"Error categorizing object: {str(e)}")
492
+ logger.error(traceback.format_exc())
493
+ return "misc"
494
+
495
+ def _identify_default_zones(self, category_regions: Dict, detected_objects: List[Dict]) -> Dict:
496
+ """
497
+ 當沒有匹配到特定場景類型時的一般功能區域識別
498
+
499
+ Args:
500
+ category_regions: 按類別和區域分組的物件字典
501
+ detected_objects: 檢測到的物件列表
502
+
503
+ Returns:
504
+ 預設功能區域字典
505
+ """
506
+ try:
507
+ zones = {}
508
+
509
+ # 按類別分組物件並找到主要集中區域
510
+ for category, regions in category_regions.items():
511
+ if not regions:
512
+ continue
513
+
514
+ # 找到此類別中物件最多的區域
515
+ main_region = max(regions.items(),
516
+ key=lambda x: len(x[1]),
517
+ default=(None, []))
518
+
519
+ if main_region[0] is None or len(main_region[1]) < 2:
520
+ continue
521
+
522
+ # 創建基於物件類別的區域
523
+ zone_objects = [obj["class_name"] for obj in main_region[1]]
524
+
525
+ # 如果物件太少,跳過
526
+ if len(zone_objects) < 2:
527
+ continue
528
+
529
+ # 根據類別創建區域名稱和描述
530
+ if category == "furniture":
531
+ zones["furniture arrangement area"] = {
532
+ "region": main_region[0],
533
+ "objects": zone_objects,
534
+ "description": f"Furniture arrangement area featuring {self._format_object_list_naturally(zone_objects[:3])}"
535
+ }
536
+ elif category == "electronics":
537
+ zones["electronics area"] = {
538
+ "region": main_region[0],
539
+ "objects": zone_objects,
540
+ "description": f"Electronics area containing {self._format_object_list_naturally(zone_objects[:3])}"
541
+ }
542
+ elif category == "kitchen_items":
543
+ zones["dining_zone"] = {
544
+ "region": main_region[0],
545
+ "objects": zone_objects,
546
+ "description": f"Dining or food area with {', '.join(zone_objects[:3])}"
547
+ }
548
+ elif category == "vehicle":
549
+ zones["vehicle_zone"] = {
550
+ "region": main_region[0],
551
+ "objects": zone_objects,
552
+ "description": f"Area with vehicles including {', '.join(zone_objects[:3])}"
553
+ }
554
+ elif category == "personal_items":
555
+ zones["personal_items_zone"] = {
556
+ "region": main_region[0],
557
+ "objects": zone_objects,
558
+ "description": f"Area with personal items including {', '.join(zone_objects[:3])}"
559
+ }
560
+
561
+ # 檢查人群聚集
562
+ people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
563
+ if len(people_objs) >= 2:
564
+ people_regions = {}
565
+ for obj in people_objs:
566
+ region = obj["region"]
567
+ if region not in people_regions:
568
+ people_regions[region] = []
569
+ people_regions[region].append(obj)
570
+
571
+ if people_regions:
572
+ main_people_region = max(people_regions.items(),
573
+ key=lambda x: len(x[1]),
574
+ default=(None, []))
575
+
576
+ if main_people_region[0] is not None:
577
+ zones["people_zone"] = {
578
+ "region": main_people_region[0],
579
+ "objects": ["person"] * len(main_people_region[1]),
580
+ "description": f"Area with {len(main_people_region[1])} people"
581
+ }
582
+
583
+ logger.debug(f"Identified {len(zones)} default zones")
584
+ return zones
585
+
586
+ except Exception as e:
587
+ logger.error(f"Error identifying default zones: {str(e)}")
588
+ logger.error(traceback.format_exc())
589
+ return {}
590
+
591
+ def _format_object_list_naturally(self, object_list: List[str]) -> str:
592
+ """
593
+ 將物件列表格式化為自然語言表達
594
+
595
+ Args:
596
+ object_list: 物件名稱列表
597
+
598
+ Returns:
599
+ str: 自然語言格式的物件列表
600
+ """
601
+ try:
602
+ if not object_list:
603
+ return "various items"
604
+
605
+ # 標準化物件名稱
606
+ normalized_objects = []
607
+ for obj in object_list:
608
+ normalized = obj.replace('_', ' ').strip()
609
+ if normalized:
610
+ normalized_objects.append(normalized)
611
+
612
+ if not normalized_objects:
613
+ return "various items"
614
+
615
+ # 格式化列表
616
+ if len(normalized_objects) == 1:
617
+ return normalized_objects[0]
618
+ elif len(normalized_objects) == 2:
619
+ return f"{normalized_objects[0]} and {normalized_objects[1]}"
620
+ else:
621
+ return ", ".join(normalized_objects[:-1]) + f", and {normalized_objects[-1]}"
622
+
623
+ except Exception as e:
624
+ logger.warning(f"Error formatting object list naturally: {str(e)}")
625
+ return "various items"
626
+
627
def _create_basic_zones_from_objects(self, detected_objects: List[Dict], scene_type: str) -> Dict:
    """
    Build basic functional zones from individual high-confidence objects.

    This is the fallback used when the standard zone identification
    did not produce any zone.

    Args:
        detected_objects: Detected object dicts. Each one is read for
            "class_name" (required), "confidence" and "region" (optional).
        scene_type: Scene type, forwarded to the description helper.

    Returns:
        Dict mapping a descriptive zone key to a dict with "region",
        "objects" and "description". An empty dict is returned on any error.
    """
    try:
        # Prefer detections with confidence >= 0.6; if there are none,
        # fall back to every detection.
        confident = [det for det in detected_objects if det.get("confidence", 0) >= 0.6]
        candidates = confident or detected_objects

        basic_zones = {}
        seen_classes = set()  # one zone per object class at most

        # Only the first three candidates are considered.
        for detection in candidates[:3]:
            name = detection["class_name"]
            where = detection.get("region", "center")

            if name in seen_classes:
                continue
            seen_classes.add(name)

            description = self._get_basic_zone_description(name, scene_type)
            zone_key = self._generate_object_based_zone_key(name, where)
            if not (description and zone_key):
                continue

            basic_zones[zone_key] = {
                "region": where,
                "objects": [name],
                "description": description
            }

        logger.debug(f"Created {len(basic_zones)} basic zones from high confidence objects")
        return basic_zones

    except Exception as e:
        logger.error(f"Error creating basic zones from objects: {str(e)}")
        logger.error(traceback.format_exc())
        return {}
678
+
679
+ def _generate_object_based_zone_key(self, class_name: str, region: str) -> str:
680
+ """
681
+ 基於物件類型和位置生成描述性的區域鍵名
682
+
683
+ Args:
684
+ class_name: 物件類別名稱
685
+ region: 區域位置
686
+
687
+ Returns:
688
+ str: 描述性區域鍵名
689
+ """
690
+ try:
691
+ # 標準化物件名稱
692
+ normalized_class = class_name.replace('_', ' ').lower().strip()
693
+
694
+ # 物件類型對應的區域描述
695
+ object_zone_mapping = {
696
+ 'person': 'activity area',
697
+ 'car': 'vehicle area',
698
+ 'truck': 'vehicle area',
699
+ 'bus': 'vehicle area',
700
+ 'motorcycle': 'vehicle area',
701
+ 'bicycle': 'cycling area',
702
+ 'traffic light': 'traffic control area',
703
+ 'chair': 'seating area',
704
+ 'sofa': 'seating area',
705
+ 'bed': 'rest area',
706
+ 'dining table': 'dining area',
707
+ 'tv': 'entertainment area',
708
+ 'laptop': 'workspace area',
709
+ 'potted plant': 'decorative area'
710
+ }
711
+
712
+ base_description = object_zone_mapping.get(normalized_class, f"{normalized_class} area")
713
+
714
+ # 添加位置信息以提供更具體的描述
715
+ position_modifiers = {
716
+ 'top_left': 'upper left',
717
+ 'top_center': 'upper central',
718
+ 'top_right': 'upper right',
719
+ 'middle_left': 'left side',
720
+ 'middle_center': 'central',
721
+ 'middle_right': 'right side',
722
+ 'bottom_left': 'lower left',
723
+ 'bottom_center': 'lower central',
724
+ 'bottom_right': 'lower right'
725
+ }
726
+
727
+ if region in position_modifiers:
728
+ return f"{position_modifiers[region]} {base_description}"
729
+
730
+ return base_description
731
+
732
+ except Exception as e:
733
+ logger.warning(f"Error generating object-based zone key for '{class_name}': {str(e)}")
734
+ return "activity area"
735
+
736
+ def _get_basic_zone_description(self, class_name: str, scene_type: str) -> str:
737
+ """
738
+ 基於物件和場景類型生成基本區域描述
739
+
740
+ Args:
741
+ class_name: 物件類別名稱
742
+ scene_type: 場景類型
743
+
744
+ Returns:
745
+ 區域描述字串
746
+ """
747
+ try:
748
+ # 物件特定描述
749
+ descriptions = {
750
+ "bed": "Sleeping and rest area",
751
+ "sofa": "Seating and relaxation area",
752
+ "chair": "Seating area",
753
+ "dining table": "Dining and meal area",
754
+ "tv": "Entertainment and media area",
755
+ "laptop": "Work and computing area",
756
+ "potted plant": "Decorative and green space area",
757
+ "refrigerator": "Food storage and kitchen area",
758
+ "car": "Vehicle and transportation area",
759
+ "person": "Activity and social area"
760
+ }
761
+
762
+ return descriptions.get(class_name, f"Functional area with {class_name}")
763
+
764
+ except Exception as e:
765
+ logger.error(f"Error getting basic zone description for '{class_name}': {str(e)}")
766
+ return f"Functional area with {class_name}"
767
+
768
+
769
+ def _generate_category_fallback_zones(self, all_detected_objects: List[Dict], current_zones: Dict) -> Dict:
770
+ """
771
+ 通用 fallback:針對 all_detected_objects 裡,每一個 (class_name, region) 組合是否已經
772
+ 在 current_zones 裡出現過。如果還沒,就為它們產生一個 fallback zone。
773
+ """
774
+ general_fallback = {
775
+ 0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus',
776
+ 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant',
777
+ 11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat',
778
+ 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant', 21: 'bear',
779
+ 22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag',
780
+ 27: 'tie', 28: 'suitcase', 29: 'frisbee', 30: 'skis', 31: 'snowboard',
781
+ 32: 'sports ball', 33: 'kite', 34: 'baseball bat', 35: 'baseball glove',
782
+ 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle',
783
+ 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl',
784
+ 46: 'banana', 47: 'apple', 48: 'sandwich', 49: 'orange', 50: 'broccoli',
785
+ 51: 'carrot', 52: 'hot dog', 53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair',
786
+ 57: 'couch', 58: 'potted plant', 59: 'bed', 60: 'dining table', 61: 'toilet',
787
+ 62: 'tv', 63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard',
788
+ 67: 'cell phone', 68: 'microwave', 69: 'oven', 70: 'toaster', 71: 'sink',
789
+ 72: 'refrigerator', 73: 'book', 74: 'clock', 75: 'vase', 76: 'scissors',
790
+ 77: 'teddy bear', 78: 'hair drier', 79: 'toothbrush'
791
+
792
+ }
793
+
794
+ # 1. 統計 current_zones 裡,已使用掉的 (class_name, region) 次數
795
+ used_count = {}
796
+ for zone_info in current_zones.values():
797
+ rg = zone_info.get("region", "")
798
+ for obj_name in zone_info.get("objects", []):
799
+ key = (obj_name, rg)
800
+ used_count[key] = used_count.get(key, 0) + 1
801
+
802
+ # 2. 統計 all_detected_objects 裡的 (class_name, region) 總次數
803
+ total_count = {}
804
+ for obj in all_detected_objects:
805
+ cname = obj.get("class_name", "")
806
+ rg = obj.get("region", "")
807
+ key = (cname, rg)
808
+ total_count[key] = total_count.get(key, 0) + 1
809
+
810
+ # 3. 把 default_classes 轉換成「class_name → fallback 區域 type」的對照表
811
+ category_to_fallback = {
812
+ # 行人與交通工具
813
+ "person": "pedestrian area",
814
+ "bicycle": "vehicle movement area",
815
+ "car": "vehicle movement area",
816
+ "motorcycle": "vehicle movement area",
817
+ "airplane": "vehicle movement area",
818
+ "bus": "vehicle movement area",
819
+ "train": "vehicle movement area",
820
+ "truck": "vehicle movement area",
821
+ "boat": "vehicle movement area",
822
+ "traffic light": "traffic control area",
823
+ "fire hydrant": "traffic control area",
824
+ "stop sign": "traffic control area",
825
+ "parking meter": "traffic control area",
826
+ "bench": "public furniture area",
827
+
828
+ # 動物類、鳥類
829
+ "bird": "animal area",
830
+ "cat": "animal area",
831
+ "dog": "animal area",
832
+ "horse": "animal area",
833
+ "sheep": "animal area",
834
+ "cow": "animal area",
835
+ "elephant": "animal area",
836
+ "bear": "animal area",
837
+ "zebra": "animal area",
838
+ "giraffe": "animal area",
839
+
840
+ # 托運與行李
841
+ "backpack": "personal items area",
842
+ "umbrella": "personal items area",
843
+ "handbag": "personal items area",
844
+ "tie": "personal items area",
845
+ "suitcase": "personal items area",
846
+
847
+ # 運動器材
848
+ "frisbee": "sports area",
849
+ "skis": "sports area",
850
+ "snowboard": "sports area",
851
+ "sports ball": "sports area",
852
+ "kite": "sports area",
853
+ "baseball bat": "sports area",
854
+ "baseball glove":"sports area",
855
+ "skateboard": "sports area",
856
+ "surfboard": "sports area",
857
+ "tennis racket": "sports area",
858
+
859
+ # 廚房與食品(Kitchen)
860
+ "bottle": "kitchen area",
861
+ "wine glass": "kitchen area",
862
+ "cup": "kitchen area",
863
+ "fork": "kitchen area",
864
+ "knife": "kitchen area",
865
+ "spoon": "kitchen area",
866
+ "bowl": "kitchen area",
867
+ "banana": "kitchen area",
868
+ "apple": "kitchen area",
869
+ "sandwich": "kitchen area",
870
+ "orange": "kitchen area",
871
+ "broccoli": "kitchen area",
872
+ "carrot": "kitchen area",
873
+ "hot dog": "kitchen area",
874
+ "pizza": "kitchen area",
875
+ "donut": "kitchen area",
876
+ "cake": "kitchen area",
877
+ "dining table": "furniture arrangement area",
878
+ "refrigerator": "kitchen area",
879
+ "oven": "kitchen area",
880
+ "microwave": "kitchen area",
881
+ "toaster": "kitchen area",
882
+ "sink": "kitchen area",
883
+ "book": "miscellaneous area",
884
+ "clock": "miscellaneous area",
885
+ "vase": "decorative area",
886
+ "scissors": "miscellaneous area",
887
+ "teddy bear": "miscellaneous area",
888
+ "hair drier": "miscellaneous area",
889
+ "toothbrush": "miscellaneous area",
890
+
891
+ # 電子產品
892
+ "tv": "electronics area",
893
+ "laptop": "electronics area",
894
+ "mouse": "electronics area",
895
+ "remote": "electronics area",
896
+ "keyboard": "electronics area",
897
+ "cell phone": "electronics area",
898
+
899
+ # 家具類
900
+ "chair": "furniture arrangement area",
901
+ "couch": "furniture arrangement area",
902
+ "bed": "furniture arrangement area",
903
+ "toilet": "furniture arrangement area",
904
+
905
+ # 植物(室內植物或戶外綠化)
906
+ "potted plant": "decorative area",
907
+ }
908
+
909
+ # 4. 計算缺少的 (class_name, region) 並建立 fallback zone
910
+ for (cname, rg), total in total_count.items():
911
+ used = used_count.get((cname, rg), 0)
912
+ missing = total - used
913
+ if missing <= 0:
914
+ continue
915
+
916
+ # (A) 決定這個 cname 在 fallback 裡屬於哪個大 class(zone_type)
917
+ zone_type = category_to_fallback.get(cname, "miscellaneous area")
918
+
919
+ # (B) 根據 region 與 zone_type 組合成 fallback_key
920
+ fallback_key = f"{rg} {zone_type}"
921
+
922
+ # (C) 如果名稱重複,就在後面加 (1),(2),… 避免掉衝突
923
+ if fallback_key in current_zones or fallback_key in general_fallback:
924
+ suffix = 1
925
+ new_key = f"{fallback_key} ({suffix})"
926
+ while new_key in current_zones or new_key in general_fallback:
927
+ suffix += 1
928
+ new_key = f"{fallback_key} ({suffix})"
929
+ fallback_key = new_key
930
+
931
+ # (D) 建立這支 fallback zone,objects 裡放 missing 個 cname
932
+ general_fallback[fallback_key] = {
933
+ "region": rg,
934
+ "objects": [cname] * missing,
935
+ "description": f"{missing} {cname}(s) placed in fallback {zone_type} for region {rg}"
936
+ }
937
+
938
+ return general_fallback
image_analyzer.py ADDED
@@ -0,0 +1,365 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import numpy as np
3
+ import logging
4
+ import traceback
5
+ from typing import List, Dict, Tuple, Optional, Union, Any
6
+ from PIL import Image
7
+
8
class ImageAnalyzer:
    """
    Image analysis and preprocessing helpers.

    Covers image hashing for caching, contrast enhancement, CLIP-based
    viewpoint and architectural-feature scoring, and multi-scale pyramid
    landmark matching.
    """

    def __init__(self):
        """Initialize the image analyzer with a module-scoped logger."""
        self.logger = logging.getLogger(__name__)

    def get_image_hash(self, image: Union[Image.Image, np.ndarray]) -> int:
        """
        Generate a simple hash value of an image for caching.

        Args:
            image: PIL Image or numpy array.

        Returns:
            int: Hash of a down-sampled copy of the image, or 0 if hashing
            fails. The value comes from the built-in hash() of bytes, so it
            is only meaningful for comparisons inside the running process.
        """
        try:
            if isinstance(image, np.ndarray):
                # 3-D arrays are sub-sampled every 10th row/column;
                # other arrays are hashed as they are.
                small_img = image[::10, ::10] if image.ndim == 3 else image
                return hash(small_img.tobytes())
            else:
                # PIL images are shrunk to 32x32 before hashing.
                small_img = image.resize((32, 32))
                return hash(small_img.tobytes())
        except Exception as e:
            self.logger.error(f"Error generating image hash: {e}")
            self.logger.error(traceback.format_exc())
            return 0

    def enhance_features(self, image: Union[Image.Image, np.ndarray]) -> Image.Image:
        """
        Enhance image contrast to improve landmark detection.

        The lightness (L) channel in LAB space is stretched between its
        2nd and 98th percentile and the image is converted back to RGB.

        Args:
            image: Input image (PIL Image or numpy array).

        Returns:
            PIL.Image: The enhanced RGB image. The input image (converted
            to PIL if needed) is returned unchanged for grayscale input,
            when skimage is not installed, or when enhancement fails.
        """
        try:
            # ensure PIL format
            if not isinstance(image, Image.Image):
                if isinstance(image, np.ndarray):
                    image = Image.fromarray(image)
                else:
                    raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")

            img_array = np.array(image)

            # Grayscale images are not processed.
            if img_array.ndim < 3:
                return image

            try:
                from skimage import color, exposure
            except ImportError:
                self.logger.warning("skimage not available for feature enhancement")
                return image

            # Keep only the RGB channels (drops alpha for RGBA input) and
            # convert to LAB; assumes 8-bit channel values.
            lab = color.rgb2lab(img_array[:, :, :3] / 255.0)
            l_channel = lab[:, :, 0]

            # Stretch the lightness between its 2nd and 98th percentile.
            # out_range must be given explicitly: for float input the
            # default output range is [0, 1], while LAB lightness is
            # expressed on a 0-100 scale.
            p2, p98 = np.percentile(l_channel, (2, 98))
            lab[:, :, 0] = exposure.rescale_intensity(
                l_channel, in_range=(p2, p98), out_range=(0, 100)
            )

            # Back to 8-bit RGB.
            enhanced_img = (color.lab2rgb(lab) * 255.0).astype(np.uint8)
            return Image.fromarray(enhanced_img)

        except Exception as e:
            self.logger.error(f"Error in feature enhancement: {e}")
            self.logger.error(traceback.format_exc())
            return image

    def analyze_viewpoint(self, image: Union[Image.Image, np.ndarray],
                          clip_model_manager) -> Dict[str, Any]:
        """
        Analyze the image viewpoint so detection parameters can be adjusted.

        Args:
            image: Input image.
            clip_model_manager: CLIP model manager instance.

        Returns:
            Dict with "viewpoint_scores" (score per viewpoint),
            "dominant_viewpoint" (best-scoring name) and "confidence"
            (its score). When no scores are available the defaults
            {} / "eye_level" / 0.0 are returned.
        """
        default_result = {
            "viewpoint_scores": {},
            "dominant_viewpoint": "eye_level",
            "confidence": 0.0
        }

        try:
            viewpoint_prompts = {
                "aerial_view": "an aerial view from above looking down",
                "street_level": "a street level view looking up at a tall structure",
                "eye_level": "an eye-level horizontal view of a landmark",
                "distant": "a distant view of a landmark on the horizon",
                "close_up": "a close-up detailed view of architectural features",
                "interior": "an interior view inside a structure",
                "angled_view": "an angled view of a structure",
                "low_angle": "a low angle view looking up at a building"
            }

            viewpoint_scores = self.calculate_similarity_scores(image, viewpoint_prompts, clip_model_manager)

            # Scoring returns {} on failure (already logged there); go
            # straight to the defaults instead of failing inside max().
            if not viewpoint_scores:
                return default_result

            dominant_name, dominant_score = max(viewpoint_scores.items(), key=lambda x: x[1])

            return {
                "viewpoint_scores": viewpoint_scores,
                "dominant_viewpoint": dominant_name,
                "confidence": dominant_score
            }

        except Exception as e:
            self.logger.error(f"Error in viewpoint analysis: {e}")
            self.logger.error(traceback.format_exc())
            return default_result

    def calculate_similarity_scores(self, image: Union[Image.Image, np.ndarray],
                                    prompts: Dict[str, str],
                                    clip_model_manager) -> Dict[str, float]:
        """
        Compute similarity scores between an image and a set of prompts.

        Args:
            image: Input image.
            prompts: Prompt dictionary {name: prompt text}.
            clip_model_manager: CLIP model manager instance.

        Returns:
            Dict[str, float]: Similarity score per prompt name; an empty
            dict if anything fails.
        """
        try:
            # ensure PIL format
            if not isinstance(image, Image.Image):
                if isinstance(image, np.ndarray):
                    image = Image.fromarray(image)
                else:
                    raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")

            image_input = clip_model_manager.preprocess_image(image)
            image_features = clip_model_manager.encode_image(image_input)

            # NOTE(review): encode_single_text receives the whole list of
            # prompt texts here - confirm it accepts a list.
            prompt_texts = list(prompts.values())
            prompt_features = clip_model_manager.encode_single_text(prompt_texts)

            similarity = clip_model_manager.calculate_similarity(image_features, prompt_features)

            # Column i of the first similarity row belongs to the i-th prompt.
            return {name: float(similarity[0][i]) for i, name in enumerate(prompts)}

        except Exception as e:
            self.logger.error(f"Error calculating similarity scores: {e}")
            self.logger.error(traceback.format_exc())
            return {}

    def analyze_architectural_features(self, image: Union[Image.Image, np.ndarray],
                                       clip_model_manager) -> Dict[str, Any]:
        """
        Analyze architectural features of the structure in the image
        without hard-coding specific landmarks.

        Args:
            image: Input image.
            clip_model_manager: CLIP model manager instance.

        Returns:
            Dict with "architectural_features" (top three (name, score)
            pairs), "context_confidence" (their mean score),
            "primary_category" and "category_scores". When no scores are
            available the defaults [] / 0.0 / "building" / {} are returned.
        """
        default_result = {
            "architectural_features": [],
            "context_confidence": 0.0,
            "primary_category": "building",
            "category_scores": {}
        }

        try:
            # Generic architectural prompts, applicable to any landmark type.
            architecture_prompts = {
                "tall_structure": "a tall vertical structure standing alone",
                "tiered_building": "a building with multiple stacked tiers or segments",
                "historical_structure": "a building with historical architectural elements",
                "modern_design": "a modern structure with contemporary architectural design",
                "segmented_exterior": "a structure with visible segmented or sectioned exterior",
                "viewing_platform": "a tall structure with observation area at the top",
                "time_display": "a structure with timepiece features",
                "glass_facade": "a building with prominent glass exterior surfaces",
                "memorial_structure": "a monument or memorial structure",
                "ancient_construction": "ancient constructed elements or archaeological features",
                "natural_landmark": "a natural geographic formation or landmark",
                "slanted_design": "a structure with non-vertical or leaning profile"
            }

            context_scores = self.calculate_similarity_scores(image, architecture_prompts, clip_model_manager)

            # Scoring returns {} on failure. Without scores every category
            # would tie at zero and the first one would be reported, so
            # return the neutral defaults instead.
            if not context_scores:
                return default_result

            # Three most relevant features and their mean score.
            top_features = sorted(context_scores.items(), key=lambda x: x[1], reverse=True)[:3]
            context_confidence = sum(score for _, score in top_features) / len(top_features)

            architectural_categories = {
                "tower": ["tall_structure", "viewing_platform", "time_display"],
                "skyscraper": ["tall_structure", "modern_design", "glass_facade"],
                "historical": ["historical_structure", "ancient_construction", "memorial_structure"],
                "natural": ["natural_landmark"],
                "distinctive": ["tiered_building", "segmented_exterior", "slanted_design"]
            }

            # A category's score is the sum of the scores of all its features.
            category_scores = {}
            for category, features in architectural_categories.items():
                category_score = 0
                for feature, score in context_scores.items():
                    if feature in features:
                        category_score += score
                category_scores[category] = category_score

            primary_category = max(category_scores.items(), key=lambda x: x[1])[0]

            return {
                "architectural_features": top_features,
                "context_confidence": context_confidence,
                "primary_category": primary_category,
                "category_scores": category_scores
            }

        except Exception as e:
            self.logger.error(f"Error in architectural feature analysis: {e}")
            self.logger.error(traceback.format_exc())
            return default_result

    def perform_pyramid_analysis(self, image: Union[Image.Image, np.ndarray],
                                 clip_model_manager, landmark_data_manager,
                                 levels: int = 4, base_threshold: float = 0.25,
                                 aspect_ratios: Optional[List[float]] = None) -> Dict[str, Any]:
        """
        Run a multi-scale pyramid analysis to improve landmark detection.

        The image is resized for every combination of scale level and
        aspect ratio and each variant is matched against the landmark
        prompts.

        Args:
            image: Input image.
            clip_model_manager: CLIP model manager instance.
            landmark_data_manager: Landmark data manager instance.
            levels: Number of pyramid levels; level k uses the scale
                factor 1.0 - 0.2 * k, and levels whose factor is not
                positive are skipped.
            base_threshold: Minimum similarity for a match to be kept.
            aspect_ratios: Aspect ratios to try; defaults to
                [1.0, 0.75, 1.5] when None.

        Returns:
            Dict with "is_landmark", "results" (matches sorted by
            descending confidence) and "best_result" (the first match,
            or None).
        """
        no_match = {
            "is_landmark": False,
            "results": [],
            "best_result": None
        }

        try:
            if aspect_ratios is None:
                aspect_ratios = [1.0, 0.75, 1.5]

            # ensure PIL format
            if not isinstance(image, Image.Image):
                if isinstance(image, np.ndarray):
                    image = Image.fromarray(image)
                else:
                    raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")

            width, height = image.size
            pyramid_results = []

            landmark_prompts = landmark_data_manager.get_landmark_prompts()
            if not landmark_prompts:
                return no_match

            # Text features are encoded once and reused for every variant.
            landmark_text_features = clip_model_manager.encode_text_batch(landmark_prompts)

            for level in range(levels):
                scale_factor = 1.0 - (level * 0.2)
                if scale_factor <= 0:
                    # Further levels would produce empty or negative sizes.
                    break

                for aspect_ratio in aspect_ratios:
                    if aspect_ratio != 1.0:
                        # Change the aspect ratio while keeping the area
                        # approximately constant.
                        new_width = int(width * scale_factor * (1 / aspect_ratio) ** 0.5)
                        new_height = int(height * scale_factor * aspect_ratio ** 0.5)
                    else:
                        new_width = int(width * scale_factor)
                        new_height = int(height * scale_factor)

                    # Very small inputs could round down to 0, which the
                    # resize call rejects; clamp to at least one pixel.
                    new_width = max(1, new_width)
                    new_height = max(1, new_height)

                    scaled_image = image.resize((new_width, new_height), Image.LANCZOS)

                    image_input = clip_model_manager.preprocess_image(scaled_image)
                    image_features = clip_model_manager.encode_image(image_input)
                    similarity = clip_model_manager.calculate_similarity(image_features, landmark_text_features)

                    # Best-matching landmark prompt for this variant.
                    best_idx = similarity[0].argmax().item()
                    best_score = similarity[0][best_idx]

                    if best_score >= base_threshold:
                        landmark_id, landmark_info = landmark_data_manager.get_landmark_by_index(best_idx)
                        if landmark_id:
                            pyramid_results.append({
                                "landmark_id": landmark_id,
                                "landmark_name": landmark_info.get("name", "Unknown"),
                                "confidence": float(best_score),
                                "scale_factor": scale_factor,
                                "aspect_ratio": aspect_ratio,
                                "location": landmark_info.get("location", "Unknown Location")
                            })

            pyramid_results.sort(key=lambda x: x["confidence"], reverse=True)

            return {
                "is_landmark": len(pyramid_results) > 0,
                "results": pyramid_results,
                "best_result": pyramid_results[0] if pyramid_results else None
            }

        except Exception as e:
            self.logger.error(f"Error in pyramid analysis: {e}")
            self.logger.error(traceback.format_exc())
            return no_match
image_processor.py CHANGED
@@ -32,6 +32,26 @@ class ImageProcessor:
32
  self.enable_places365 = enable_places365
33
  self.model_instances = {}
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  # Initialize ColorMapper
36
  self.color_mapper = ColorMapper()
37
  print("ColorMapper initialized successfully")
@@ -57,12 +77,12 @@ class ImageProcessor:
57
 
58
  # Initialize SceneAnalyzer with error handling
59
  self.scene_analyzer = None
60
- self.class_names = None # Will be set when first model is loaded
61
 
62
  try:
63
  # Initialize SceneAnalyzer without class_names (will be set later)
64
  self.scene_analyzer = SceneAnalyzer(
65
- class_names=None,
66
  use_llm=self.use_llm,
67
  use_clip=True,
68
  enable_landmark=True,
@@ -365,9 +385,14 @@ class ImageProcessor:
365
  else:
366
  # Update existing scene analyzer with current settings
367
  if result and hasattr(result, 'names'):
368
- self.scene_analyzer.class_names = result.names
 
 
 
369
  if hasattr(self.scene_analyzer, 'spatial_analyzer') and self.scene_analyzer.spatial_analyzer:
370
- self.scene_analyzer.spatial_analyzer.class_names = result.names
 
 
371
 
372
  self.scene_analyzer.enable_landmark = enable_landmark
373
  if hasattr(self.scene_analyzer, 'spatial_analyzer') and self.scene_analyzer.spatial_analyzer:
 
32
  self.enable_places365 = enable_places365
33
  self.model_instances = {}
34
 
35
+ self.coco_class_names = {
36
+ 0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane',
37
+ 5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light',
38
+ 10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench',
39
+ 14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow',
40
+ 20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack',
41
+ 25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee',
42
+ 30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite', 34: 'baseball bat',
43
+ 35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket',
44
+ 39: 'bottle', 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife',
45
+ 44: 'spoon', 45: 'bowl', 46: 'banana', 47: 'apple', 48: 'sandwich',
46
+ 49: 'orange', 50: 'broccoli', 51: 'carrot', 52: 'hot dog', 53: 'pizza',
47
+ 54: 'donut', 55: 'cake', 56: 'chair', 57: 'couch', 58: 'potted plant',
48
+ 59: 'bed', 60: 'dining table', 61: 'toilet', 62: 'tv', 63: 'laptop',
49
+ 64: 'mouse', 65: 'remote', 66: 'keyboard', 67: 'cell phone',
50
+ 68: 'microwave', 69: 'oven', 70: 'toaster', 71: 'sink', 72: 'refrigerator',
51
+ 73: 'book', 74: 'clock', 75: 'vase', 76: 'scissors', 77: 'teddy bear',
52
+ 78: 'hair drier', 79: 'toothbrush'
53
+ }
54
+
55
  # Initialize ColorMapper
56
  self.color_mapper = ColorMapper()
57
  print("ColorMapper initialized successfully")
 
77
 
78
  # Initialize SceneAnalyzer with error handling
79
  self.scene_analyzer = None
80
+ self.class_names = self.coco_class_names
81
 
82
  try:
83
  # Initialize SceneAnalyzer without class_names (will be set later)
84
  self.scene_analyzer = SceneAnalyzer(
85
+ class_names=self.coco_class_names,
86
  use_llm=self.use_llm,
87
  use_clip=True,
88
  enable_landmark=True,
 
385
  else:
386
  # Update existing scene analyzer with current settings
387
  if result and hasattr(result, 'names'):
388
+ # 使用檢測結果的類別名稱或回退到預定義映射
389
+ current_class_names = result.names if result.names else self.coco_class_names
390
+
391
+ self.scene_analyzer.class_names = current_class_names
392
  if hasattr(self.scene_analyzer, 'spatial_analyzer') and self.scene_analyzer.spatial_analyzer:
393
+ self.scene_analyzer.spatial_analyzer.update_class_names(current_class_names)
394
+
395
+ logger.info(f"Updated class names in scene analyzer: {list(current_class_names.keys())}")
396
 
397
  self.scene_analyzer.enable_landmark = enable_landmark
398
  if hasattr(self.scene_analyzer, 'spatial_analyzer') and self.scene_analyzer.spatial_analyzer:
indoor_outdoor_classifier.py ADDED
@@ -0,0 +1,755 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import logging
3
+ import traceback
4
+ from typing import Dict, Any, Optional, List
5
+ from configuration_manager import ConfigurationManager
6
+
7
+
8
+ class IndoorOutdoorClassifier:
9
+ """
10
+ Classifies scenes as indoor or outdoor based on visual features and Places365 context.(判斷室內室外)
11
+ 此class會融入PLACES365,使判斷更準確
12
+
13
+ This class implements sophisticated decision logic that combines multiple evidence sources
14
+ including visual scene analysis, structural features, and external scene classification
15
+ data to determine whether a scene is indoor or outdoor.
16
+ """
17
+
18
def __init__(self, config_manager: ConfigurationManager):
    """
    Set up the indoor/outdoor classifier.

    Args:
        config_manager: Configuration manager instance used to access
            thresholds and weights.
    """
    self.config_manager = config_manager
    self.logger = self._setup_logger()

    # Places365 confidence levels treated as "high" and "moderate".
    self.P365_HIGH_CONF_THRESHOLD = 0.65
    self.P365_MODERATE_CONF_THRESHOLD = 0.4

    # Places365 scene keywords that indicate a definitely-outdoor scene.
    outdoor_keywords = [
        "street", "road", "highway", "park", "beach", "mountain",
        "forest", "field", "outdoor", "sky", "coast", "courtyard",
        "square", "plaza", "bridge", "parking_lot", "playground",
        "stadium", "construction_site", "river", "ocean", "desert",
        "garden", "trail", "intersection", "crosswalk", "sidewalk",
        "pathway", "avenue", "boulevard", "downtown", "city_center",
        "market_outdoor",
    ]

    # Places365 scene keywords that indicate a definitely-indoor scene.
    indoor_keywords = [
        "bedroom", "office", "kitchen", "library", "classroom",
        "conference_room", "living_room", "bathroom", "hospital",
        "hotel_room", "cabin", "interior", "museum", "gallery", "mall",
        "market_indoor", "basement", "corridor", "lobby",
        "restaurant_indoor", "bar_indoor", "shop_indoor", "gym_indoor",
    ]

    self.DEFINITELY_OUTDOOR_KEYWORDS_P365 = outdoor_keywords
    self.DEFINITELY_INDOOR_KEYWORDS_P365 = indoor_keywords
47
+
48
+ def _setup_logger(self) -> logging.Logger:
49
+ """Set up logger for classification operations."""
50
+ logger = logging.getLogger(f"{__name__}.IndoorOutdoorClassifier")
51
+ if not logger.handlers:
52
+ handler = logging.StreamHandler()
53
+ formatter = logging.Formatter(
54
+ '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
55
+ )
56
+ handler.setFormatter(formatter)
57
+ logger.addHandler(handler)
58
+ logger.setLevel(logging.INFO)
59
+ return logger
60
+
61
def classify(self, features: Dict[str, Any], places365_info: Optional[Dict] = None) -> Dict[str, Any]:
    """
    Decide whether a scene is indoor or outdoor.

    Visual evidence from the extracted features is combined with the
    optional Places365 context, after which a Places365 override may
    replace the combined decision.

    Args:
        features: Dictionary containing extracted image features.
        places365_info: Optional Places365 classification information.

    Returns:
        Dictionary with "is_indoor", "indoor_probability",
        "indoor_score_raw", "feature_contributions" and "diagnostics".
        If any step raises, the default classification result is returned.
    """
    try:
        self.logger.debug("Starting indoor/outdoor classification")

        # Shared accumulators; the helper methods below receive
        # diagnostics so they can record their own notes in it.
        diagnostics = {}
        feature_contributions = {}

        # Places365 context comes first so later steps can use it.
        p365_context = self._extract_places365_context(places365_info, diagnostics)

        # Visual evidence and its per-feature contributions.
        visual_analysis = self._analyze_visual_evidence(features, diagnostics)
        visual_score = visual_analysis["visual_score"]
        feature_contributions.update(visual_analysis["contributions"])

        # Places365 influence, informed by whether a strong sky signal was seen.
        has_strong_sky = visual_analysis.get("strong_sky_signal", False)
        p365_analysis = self._analyze_places365_influence(
            p365_context, has_strong_sky, diagnostics
        )
        p365_influence_score = p365_analysis["influence_score"]
        # Only report the influence when it is not negligible.
        if abs(p365_influence_score) > 0.01:
            feature_contributions["places365_influence_score"] = round(p365_influence_score, 2)

        # Combined score -> classification.
        final_indoor_score = visual_score + p365_influence_score
        classification_result = self._compute_final_classification(
            final_indoor_score, visual_score, p365_influence_score, diagnostics
        )

        # Places365 may override the combined decision when its conditions are met.
        override_result = self._apply_places365_override(
            classification_result, p365_context, diagnostics
        )

        # Fill in default values for contributions that were not set.
        self._ensure_default_contributions(feature_contributions)

        # Final result
        result = {
            "is_indoor": override_result["is_indoor"],
            "indoor_probability": override_result["indoor_probability"],
            "indoor_score_raw": override_result["final_score"],
            "feature_contributions": feature_contributions,
            "diagnostics": diagnostics
        }

        self.logger.debug(f"Classification complete: indoor={result['is_indoor']}, "
                          f"probability={result['indoor_probability']:.3f}")

        return result

    except Exception as e:
        self.logger.error(f"Error in indoor/outdoor classification: {str(e)}")
        self.logger.error(f"Traceback: {traceback.format_exc()}")
        return self._get_default_classification_result()
129
+
130
+ def _extract_places365_context(self, places365_info: Optional[Dict],
131
+ diagnostics: Dict[str, Any]) -> Dict[str, Any]:
132
+ """Extract and validate Places365 context information."""
133
+ context = {
134
+ "mapped_scene": "unknown",
135
+ "is_indoor_from_classification": None,
136
+ "attributes": [],
137
+ "confidence": 0.0,
138
+ "is_indoor": None
139
+ }
140
+
141
+ if places365_info:
142
+ context["mapped_scene"] = places365_info.get('mapped_scene_type', 'unknown').lower()
143
+ context["attributes"] = [attr.lower() for attr in places365_info.get('attributes', [])]
144
+ context["confidence"] = places365_info.get('confidence', 0.0)
145
+ context["is_indoor_from_classification"] = places365_info.get('is_indoor_from_classification', None)
146
+ context["is_indoor"] = places365_info.get('is_indoor', None)
147
+
148
+ diagnostics["p365_context_received"] = (
149
+ f"P365 Scene: {context['mapped_scene']}, P365 SceneConf: {context['confidence']:.2f}, "
150
+ f"P365 DirectIndoor: {context['is_indoor_from_classification']}, "
151
+ f"P365 Attrs: {context['attributes']}"
152
+ )
153
+
154
+ return context
155
+
156
+ def _analyze_visual_evidence(self, features: Dict[str, Any],
157
+ diagnostics: Dict[str, Any]) -> Dict[str, Any]:
158
+ """Analyze visual evidence for indoor/outdoor classification."""
159
+ visual_score = 0.0
160
+ contributions = {}
161
+ strong_sky_signal = False
162
+
163
+ # Sky and openness analysis
164
+ sky_analysis = self._analyze_sky_evidence(features, diagnostics)
165
+ visual_score += sky_analysis["score"]
166
+ if sky_analysis["score"] != 0:
167
+ contributions["sky_openness_features_visual"] = round(sky_analysis["score"], 2)
168
+ strong_sky_signal = sky_analysis["strong_signal"]
169
+
170
+ # Enclosure and structural analysis
171
+ enclosure_analysis = self._analyze_enclosure_evidence(features, strong_sky_signal, diagnostics)
172
+ visual_score += enclosure_analysis["score"]
173
+ if enclosure_analysis["score"] != 0:
174
+ contributions["enclosure_features"] = round(enclosure_analysis["score"], 2)
175
+
176
+ # Brightness uniformity analysis
177
+ uniformity_analysis = self._analyze_brightness_uniformity(features, strong_sky_signal, diagnostics)
178
+ visual_score += uniformity_analysis["score"]
179
+ if uniformity_analysis["score"] != 0:
180
+ contributions["brightness_uniformity_contribution"] = round(uniformity_analysis["score"], 2)
181
+
182
+ # Light source analysis
183
+ light_analysis = self._analyze_light_sources(features, strong_sky_signal, diagnostics)
184
+ visual_score += light_analysis["score"]
185
+ if light_analysis["score"] != 0:
186
+ contributions["light_source_features"] = round(light_analysis["score"], 2)
187
+
188
+ # Color atmosphere analysis
189
+ atmosphere_analysis = self._analyze_color_atmosphere(features, strong_sky_signal, diagnostics)
190
+ visual_score += atmosphere_analysis["score"]
191
+ if atmosphere_analysis["score"] != 0:
192
+ contributions["warm_atmosphere_indoor_visual_contrib"] = round(atmosphere_analysis["score"], 2)
193
+
194
+ # Home environment pattern analysis
195
+ home_analysis = self._analyze_home_environment_pattern(features, strong_sky_signal, diagnostics)
196
+ visual_score += home_analysis["score"]
197
+ if home_analysis["score"] != 0:
198
+ contributions["home_environment_pattern_visual"] = round(home_analysis["score"], 2)
199
+
200
+ # Aerial street pattern analysis
201
+ aerial_analysis = self._analyze_aerial_street_pattern(features, strong_sky_signal, contributions, diagnostics)
202
+ visual_score += aerial_analysis["score"]
203
+ if aerial_analysis["score"] != 0:
204
+ contributions["aerial_street_pattern_visual"] = round(aerial_analysis["score"], 2)
205
+
206
+ diagnostics["visual_indoor_score_subtotal"] = round(visual_score, 3)
207
+
208
+ return {
209
+ "visual_score": visual_score,
210
+ "contributions": contributions,
211
+ "strong_sky_signal": strong_sky_signal
212
+ }
213
+
214
+ def _analyze_sky_evidence(self, features: Dict[str, Any],
215
+ diagnostics: Dict[str, Any]) -> Dict[str, Any]:
216
+ """Analyze sky-related evidence for outdoor classification."""
217
+ sky_evidence_score = 0.0
218
+ strong_sky_signal = False
219
+
220
+ # Extract relevant features
221
+ sky_blue_dominance = features.get("sky_region_blue_dominance", 0.0)
222
+ sky_brightness_ratio = features.get("sky_region_brightness_ratio", 1.0)
223
+ texture_complexity = features.get("top_region_texture_complexity", 0.5)
224
+ openness_top_edge = features.get("openness_top_edge", 0.5)
225
+
226
+ # Get thresholds
227
+ thresholds = self.config_manager.indoor_outdoor_thresholds
228
+ weights = self.config_manager.weighting_factors
229
+
230
+ # Strong blue sky signal
231
+ if sky_blue_dominance > thresholds.sky_blue_dominance_thresh:
232
+ sky_evidence_score -= weights.sky_blue_dominance_w * sky_blue_dominance
233
+ diagnostics["sky_detection_reason_visual"] = f"Visual: Strong sky-like blue ({sky_blue_dominance:.2f})"
234
+ strong_sky_signal = True
235
+
236
+ # Bright top region with low texture
237
+ elif (sky_brightness_ratio > getattr(thresholds, 'sky_brightness_ratio_strong_thresh', 1.35) and
238
+ texture_complexity < getattr(thresholds, 'sky_texture_complexity_clear_thresh', 0.25)):
239
+ outdoor_push = weights.sky_brightness_ratio_w * (sky_brightness_ratio - 1.0)
240
+ sky_evidence_score -= outdoor_push
241
+ sky_evidence_score -= weights.sky_texture_w
242
+ diagnostics["sky_detection_reason_visual"] = (
243
+ f"Visual: Top brighter (ratio:{sky_brightness_ratio:.2f}) & low texture."
244
+ )
245
+ strong_sky_signal = True
246
+
247
+ # High top edge openness
248
+ elif openness_top_edge > getattr(thresholds, 'openness_top_strong_thresh', 0.80):
249
+ sky_evidence_score -= weights.openness_top_w * openness_top_edge
250
+ diagnostics["sky_detection_reason_visual"] = (
251
+ f"Visual: Very high top edge openness ({openness_top_edge:.2f})."
252
+ )
253
+ strong_sky_signal = True
254
+
255
+ # Weak sky signal (cloudy conditions)
256
+ elif (not strong_sky_signal and
257
+ texture_complexity < getattr(thresholds, 'sky_texture_complexity_cloudy_thresh', 0.20) and
258
+ sky_brightness_ratio > getattr(thresholds, 'sky_brightness_ratio_cloudy_thresh', 0.95)):
259
+ sky_evidence_score -= weights.sky_texture_w * (1.0 - texture_complexity) * 0.5
260
+ diagnostics["sky_detection_reason_visual"] = (
261
+ f"Visual: Weak sky signal (low texture, brightish top: {texture_complexity:.2f}), less weight."
262
+ )
263
+
264
+ if strong_sky_signal:
265
+ diagnostics["strong_sky_signal_visual_detected"] = True
266
+
267
+ return {
268
+ "score": sky_evidence_score,
269
+ "strong_signal": strong_sky_signal
270
+ }
271
+
272
+ def _analyze_enclosure_evidence(self, features: Dict[str, Any], strong_sky_signal: bool,
273
+ diagnostics: Dict[str, Any]) -> Dict[str, Any]:
274
+ """Analyze enclosure evidence for indoor classification."""
275
+ enclosure_score = 0.0
276
+
277
+ # Extract features
278
+ ceiling_likelihood = features.get("ceiling_likelihood", 0.0)
279
+ boundary_clarity = features.get("boundary_clarity", 0.0)
280
+ texture_complexity = features.get("top_region_texture_complexity", 0.5)
281
+ openness_top_edge = features.get("openness_top_edge", 0.5)
282
+
283
+ # Get configuration
284
+ thresholds = self.config_manager.indoor_outdoor_thresholds
285
+ weights = self.config_manager.weighting_factors
286
+ override_factors = self.config_manager.override_factors
287
+
288
+ # Ceiling likelihood analysis
289
+ if ceiling_likelihood > thresholds.ceiling_likelihood_thresh:
290
+ current_ceiling_score = weights.ceiling_likelihood_w * ceiling_likelihood
291
+ if strong_sky_signal:
292
+ current_ceiling_score *= override_factors.sky_override_factor_ceiling
293
+ enclosure_score += current_ceiling_score
294
+ diagnostics["indoor_reason_ceiling_visual"] = (
295
+ f"Visual Ceiling: {ceiling_likelihood:.2f}, ScoreCont: {current_ceiling_score:.2f}"
296
+ )
297
+
298
+ # Boundary clarity analysis
299
+ if boundary_clarity > thresholds.boundary_clarity_thresh:
300
+ current_boundary_score = weights.boundary_clarity_w * boundary_clarity
301
+ if strong_sky_signal:
302
+ current_boundary_score *= override_factors.sky_override_factor_boundary
303
+ enclosure_score += current_boundary_score
304
+ diagnostics["indoor_reason_boundary_visual"] = (
305
+ f"Visual Boundary: {boundary_clarity:.2f}, ScoreCont: {current_boundary_score:.2f}"
306
+ )
307
+
308
+ # Complex urban top detection
309
+ if (not strong_sky_signal and texture_complexity > 0.7 and
310
+ openness_top_edge < 0.3 and ceiling_likelihood < 0.35):
311
+ diagnostics["complex_urban_top_visual"] = True
312
+ if boundary_clarity > 0.5:
313
+ enclosure_score *= 0.5
314
+ diagnostics["reduced_enclosure_for_urban_top_visual"] = True
315
+
316
+ return {"score": enclosure_score}
317
+
318
+ def _analyze_brightness_uniformity(self, features: Dict[str, Any], strong_sky_signal: bool,
319
+ diagnostics: Dict[str, Any]) -> Dict[str, Any]:
320
+ """Analyze brightness uniformity patterns."""
321
+ uniformity_score = 0.0
322
+
323
+ # Calculate brightness uniformity
324
+ brightness_std = features.get("brightness_std", 50.0)
325
+ avg_brightness = features.get("avg_brightness", 100.0)
326
+ brightness_uniformity = 1.0 - min(1.0, brightness_std / max(avg_brightness, 1e-5))
327
+ shadow_clarity = features.get("shadow_clarity_score", 0.5)
328
+
329
+ # Get configuration
330
+ thresholds = self.config_manager.indoor_outdoor_thresholds
331
+ weights = self.config_manager.weighting_factors
332
+ override_factors = self.config_manager.override_factors
333
+
334
+ # High uniformity (indoor indicator)
335
+ if brightness_uniformity > thresholds.brightness_uniformity_thresh_indoor:
336
+ uniformity_score = weights.brightness_uniformity_w * brightness_uniformity
337
+ if strong_sky_signal:
338
+ uniformity_score *= override_factors.sky_override_factor_uniformity
339
+
340
+ # Low uniformity (potential outdoor indicator)
341
+ elif brightness_uniformity < thresholds.brightness_uniformity_thresh_outdoor:
342
+ if shadow_clarity > 0.65:
343
+ uniformity_score = -weights.brightness_non_uniformity_outdoor_w * (1.0 - brightness_uniformity)
344
+ elif not strong_sky_signal:
345
+ uniformity_score = weights.brightness_non_uniformity_indoor_penalty_w * (1.0 - brightness_uniformity)
346
+
347
+ return {"score": uniformity_score}
348
+
349
+ def _analyze_light_sources(self, features: Dict[str, Any], strong_sky_signal: bool,
350
+ diagnostics: Dict[str, Any]) -> Dict[str, Any]:
351
+ """Analyze artificial light source patterns."""
352
+ light_score = 0.0
353
+
354
+ # Extract light features
355
+ indoor_light_score = features.get("indoor_light_score", 0.0)
356
+ circular_light_count = features.get("circular_light_count", 0)
357
+ bright_spot_count = features.get("bright_spot_count", 0)
358
+ avg_brightness = features.get("avg_brightness", 100.0)
359
+ gradient_ratio = features.get("gradient_ratio_vertical_horizontal", 1.0)
360
+ edges_density = features.get("edges_density", 0.0)
361
+
362
+ # Get configuration
363
+ thresholds = self.config_manager.indoor_outdoor_thresholds
364
+ weights = self.config_manager.weighting_factors
365
+ override_factors = self.config_manager.override_factors
366
+
367
+ # Circular lights detection
368
+ if circular_light_count >= 1 and not strong_sky_signal:
369
+ light_score += weights.circular_lights_w * circular_light_count
370
+
371
+ # Indoor light score
372
+ elif indoor_light_score > 0.55 and not strong_sky_signal:
373
+ light_score += weights.indoor_light_score_w * indoor_light_score
374
+
375
+ # Many bright spots in dim scenes
376
+ elif (bright_spot_count > thresholds.many_bright_spots_thresh and
377
+ avg_brightness < thresholds.dim_scene_for_spots_thresh and
378
+ not strong_sky_signal):
379
+ light_score += weights.many_bright_spots_indoor_w * min(bright_spot_count / 10.0, 1.5)
380
+
381
+ # Street structure detection
382
+ is_likely_street_structure = (0.7 < gradient_ratio < 1.5) and edges_density > 0.15
383
+
384
+ if is_likely_street_structure and bright_spot_count > 3 and not strong_sky_signal:
385
+ light_score *= 0.2
386
+ diagnostics["street_lights_heuristic_visual"] = True
387
+ elif strong_sky_signal:
388
+ light_score *= override_factors.sky_override_factor_lights
389
+
390
+ return {"score": light_score}
391
+
392
+ def _analyze_color_atmosphere(self, features: Dict[str, Any], strong_sky_signal: bool,
393
+ diagnostics: Dict[str, Any]) -> Dict[str, Any]:
394
+ """Analyze color atmosphere patterns."""
395
+ atmosphere_score = 0.0
396
+
397
+ # Extract features
398
+ color_atmosphere = features.get("color_atmosphere", "neutral")
399
+ avg_brightness = features.get("avg_brightness", 100.0)
400
+ avg_saturation = features.get("avg_saturation", 100.0)
401
+ gradient_ratio = features.get("gradient_ratio_vertical_horizontal", 1.0)
402
+ edges_density = features.get("edges_density", 0.0)
403
+ indoor_light_score = features.get("indoor_light_score", 0.0)
404
+
405
+ # Get configuration
406
+ thresholds = self.config_manager.indoor_outdoor_thresholds
407
+ weights = self.config_manager.weighting_factors
408
+
409
+ # Warm atmosphere analysis
410
+ if (color_atmosphere == "warm" and
411
+ avg_brightness < thresholds.warm_indoor_max_brightness_thresh):
412
+
413
+ # Check exclusion conditions
414
+ is_likely_street_structure = (0.7 < gradient_ratio < 1.5) and edges_density > 0.15
415
+ is_complex_urban_top = diagnostics.get("complex_urban_top_visual", False)
416
+
417
+ if (not strong_sky_signal and not is_complex_urban_top and
418
+ not (is_likely_street_structure and avg_brightness > 80) and
419
+ avg_saturation < 160):
420
+
421
+ if indoor_light_score > 0.05:
422
+ atmosphere_score = weights.warm_atmosphere_indoor_w
423
+
424
+ return {"score": atmosphere_score}
425
+
426
+ def _analyze_home_environment_pattern(self, features: Dict[str, Any], strong_sky_signal: bool,
427
+ diagnostics: Dict[str, Any]) -> Dict[str, Any]:
428
+ """Analyze home/residential environment patterns."""
429
+ home_score = 0.0
430
+
431
+ if strong_sky_signal:
432
+ diagnostics["skipped_home_env_visual_due_to_sky"] = True
433
+ return {"score": 0.0}
434
+
435
+ # Calculate bedroom/home indicators
436
+ bedroom_indicators = 0.0
437
+ brightness_uniformity = features.get("brightness_uniformity", 0.0)
438
+ boundary_clarity = features.get("boundary_clarity", 0.0)
439
+ ceiling_likelihood = features.get("ceiling_likelihood", 0.0)
440
+ bright_spot_count = features.get("bright_spot_count", 0)
441
+ circular_light_count = features.get("circular_light_count", 0)
442
+ warm_ratio = features.get("warm_ratio", 0.0)
443
+ avg_saturation = features.get("avg_saturation", 100.0)
444
+
445
+ # Accumulate indicators
446
+ if brightness_uniformity > 0.65 and boundary_clarity > 0.40:
447
+ bedroom_indicators += 1.1
448
+
449
+ if ceiling_likelihood > 0.35 and (bright_spot_count > 0 or circular_light_count > 0):
450
+ bedroom_indicators += 1.1
451
+
452
+ if warm_ratio > 0.55 and brightness_uniformity > 0.65:
453
+ bedroom_indicators += 1.0
454
+
455
+ if brightness_uniformity > 0.70 and avg_saturation < 60:
456
+ bedroom_indicators += 0.7
457
+
458
+ # Get configuration
459
+ thresholds = self.config_manager.indoor_outdoor_thresholds
460
+ weights = self.config_manager.weighting_factors
461
+
462
+ # Apply scoring based on indicator strength
463
+ if bedroom_indicators >= thresholds.home_pattern_thresh_strong:
464
+ home_score = weights.home_env_strong_w
465
+ elif bedroom_indicators >= thresholds.home_pattern_thresh_moderate:
466
+ home_score = weights.home_env_moderate_w
467
+
468
+ if bedroom_indicators > 0:
469
+ diagnostics["home_environment_pattern_visual_indicators"] = round(bedroom_indicators, 1)
470
+
471
+ return {"score": home_score}
472
+
473
+ def _analyze_aerial_street_pattern(self, features: Dict[str, Any], strong_sky_signal: bool,
474
+ contributions: Dict[str, float],
475
+ diagnostics: Dict[str, Any]) -> Dict[str, Any]:
476
+ """Analyze aerial view street patterns."""
477
+ aerial_score = 0.0
478
+
479
+ # Extract features
480
+ sky_brightness_ratio = features.get("sky_region_brightness_ratio", 1.0)
481
+ texture_complexity = features.get("top_region_texture_complexity", 0.5)
482
+ avg_brightness = features.get("avg_brightness", 100.0)
483
+
484
+ # Get configuration
485
+ thresholds = self.config_manager.indoor_outdoor_thresholds
486
+ weights = self.config_manager.weighting_factors
487
+
488
+ # Aerial street pattern detection
489
+ if (sky_brightness_ratio < thresholds.aerial_top_dark_ratio_thresh and
490
+ texture_complexity > thresholds.aerial_top_complex_thresh and
491
+ avg_brightness > thresholds.aerial_min_avg_brightness_thresh and
492
+ not strong_sky_signal):
493
+
494
+ aerial_score = -weights.aerial_street_w
495
+ diagnostics["aerial_street_pattern_visual_detected"] = True
496
+
497
+ # Reduce enclosure features if aerial pattern detected
498
+ if ("enclosure_features" in contributions and
499
+ contributions["enclosure_features"] > 0):
500
+
501
+ reduction_factor = self.config_manager.override_factors.aerial_enclosure_reduction_factor
502
+ positive_enclosure_score = max(0, contributions["enclosure_features"])
503
+ reduction_amount = positive_enclosure_score * reduction_factor
504
+
505
+ contributions["enclosure_features_reduced_by_aerial"] = round(-reduction_amount, 2)
506
+ contributions["enclosure_features"] = round(
507
+ contributions["enclosure_features"] - reduction_amount, 2
508
+ )
509
+
510
+ return {"score": aerial_score}
511
+
512
+ def _analyze_places365_influence(self, p365_context: Dict[str, Any],
513
+ strong_sky_signal: bool,
514
+ diagnostics: Dict[str, Any]) -> Dict[str, Any]:
515
+ """Analyze Places365 influence on classification."""
516
+ p365_influence_score = 0.0
517
+
518
+ if not p365_context or p365_context["confidence"] < self.P365_MODERATE_CONF_THRESHOLD:
519
+ return {"influence_score": 0.0}
520
+
521
+ # Places365 direct classification influence
522
+ if p365_context["is_indoor_from_classification"] is not None:
523
+ p365_influence_score += self._compute_direct_classification_influence(
524
+ p365_context, strong_sky_signal, diagnostics
525
+ )
526
+
527
+ # Places365 scene context influence
528
+ elif p365_context["confidence"] >= self.P365_MODERATE_CONF_THRESHOLD:
529
+ p365_influence_score += self._compute_scene_context_influence(
530
+ p365_context, strong_sky_signal, diagnostics
531
+ )
532
+
533
+ # Places365 attributes influence
534
+ if p365_context["attributes"] and p365_context["confidence"] > 0.5:
535
+ p365_influence_score += self._compute_attributes_influence(
536
+ p365_context, strong_sky_signal, diagnostics
537
+ )
538
+
539
+ # High confidence street scene boost
540
+ if (p365_context["confidence"] >= 0.85 and
541
+ any(kw in p365_context["mapped_scene"] for kw in ["intersection", "crosswalk", "street", "road"])):
542
+
543
+ additional_outdoor_push = -3.0 * p365_context["confidence"]
544
+ p365_influence_score += additional_outdoor_push
545
+ diagnostics["p365_street_scene_boost"] = (
546
+ f"Additional outdoor push: {additional_outdoor_push:.2f} for street scene: "
547
+ f"{p365_context['mapped_scene']}"
548
+ )
549
+ self.logger.debug(f"High confidence street scene detected - "
550
+ f"{p365_context['mapped_scene']} with confidence {p365_context['confidence']:.3f}")
551
+
552
+ return {"influence_score": p365_influence_score}
553
+
554
+ def _compute_direct_classification_influence(self, p365_context: Dict[str, Any],
555
+ strong_sky_signal: bool,
556
+ diagnostics: Dict[str, Any]) -> float:
557
+ """Compute influence from Places365 direct indoor/outdoor classification."""
558
+ P365_DIRECT_INDOOR_WEIGHT = 3.5
559
+ P365_DIRECT_OUTDOOR_WEIGHT = 4.0
560
+
561
+ confidence = p365_context["confidence"]
562
+ is_indoor = p365_context["is_indoor_from_classification"]
563
+ mapped_scene = p365_context["mapped_scene"]
564
+
565
+ if is_indoor is True:
566
+ current_contrib = P365_DIRECT_INDOOR_WEIGHT * confidence
567
+ diagnostics["p365_influence_source"] = (
568
+ f"P365_DirectIndoor(True,Conf:{confidence:.2f},Scene:{mapped_scene})"
569
+ )
570
+ else:
571
+ current_contrib = -P365_DIRECT_OUTDOOR_WEIGHT * confidence
572
+ diagnostics["p365_influence_source"] = (
573
+ f"P365_DirectIndoor(False,Conf:{confidence:.2f},Scene:{mapped_scene})"
574
+ )
575
+
576
+ # Apply sky override for indoor predictions
577
+ if strong_sky_signal and current_contrib > 0:
578
+ sky_override_factor = self.config_manager.override_factors.sky_override_factor_p365_indoor_decision
579
+ current_contrib *= sky_override_factor
580
+ diagnostics["p365_indoor_push_reduced_by_visual_sky"] = f"Reduced to {current_contrib:.2f}"
581
+
582
+ return current_contrib
583
+
584
+ def _compute_scene_context_influence(self, p365_context: Dict[str, Any],
585
+ strong_sky_signal: bool,
586
+ diagnostics: Dict[str, Any]) -> float:
587
+ """Compute influence from Places365 scene context."""
588
+ P365_SCENE_CONTEXT_INDOOR_WEIGHT = 2.0
589
+ P365_SCENE_CONTEXT_OUTDOOR_WEIGHT = 2.5
590
+
591
+ confidence = p365_context["confidence"]
592
+ mapped_scene = p365_context["mapped_scene"]
593
+
594
+ is_def_indoor = any(kw in mapped_scene for kw in self.DEFINITELY_INDOOR_KEYWORDS_P365)
595
+ is_def_outdoor = any(kw in mapped_scene for kw in self.DEFINITELY_OUTDOOR_KEYWORDS_P365)
596
+
597
+ current_contrib = 0.0
598
+
599
+ if is_def_indoor and not is_def_outdoor:
600
+ current_contrib = P365_SCENE_CONTEXT_INDOOR_WEIGHT * confidence
601
+ diagnostics["p365_influence_source"] = (
602
+ f"P365_SceneContext(Indoor: {mapped_scene}, Conf:{confidence:.2f})"
603
+ )
604
+ elif is_def_outdoor and not is_def_indoor:
605
+ current_contrib = -P365_SCENE_CONTEXT_OUTDOOR_WEIGHT * confidence
606
+ diagnostics["p365_influence_source"] = (
607
+ f"P365_SceneContext(Outdoor: {mapped_scene}, Conf:{confidence:.2f})"
608
+ )
609
+
610
+ # Apply sky override for indoor predictions
611
+ if strong_sky_signal and current_contrib > 0:
612
+ sky_override_factor = self.config_manager.override_factors.sky_override_factor_p365_indoor_decision
613
+ current_contrib *= sky_override_factor
614
+ diagnostics["p365_context_indoor_push_reduced_by_visual_sky"] = f"Reduced to {current_contrib:.2f}"
615
+
616
+ return current_contrib
617
+
618
+ def _compute_attributes_influence(self, p365_context: Dict[str, Any],
619
+ strong_sky_signal: bool,
620
+ diagnostics: Dict[str, Any]) -> float:
621
+ """Compute influence from Places365 attributes."""
622
+ P365_ATTRIBUTE_INDOOR_WEIGHT = 1.0
623
+ P365_ATTRIBUTE_OUTDOOR_WEIGHT = 1.5
624
+
625
+ confidence = p365_context["confidence"]
626
+ attributes = p365_context["attributes"]
627
+
628
+ attr_contrib = 0.0
629
+
630
+ if "indoor" in attributes and "outdoor" not in attributes:
631
+ attr_contrib += P365_ATTRIBUTE_INDOOR_WEIGHT * (confidence * 0.5)
632
+ diagnostics["p365_attr_influence"] = f"+{attr_contrib:.2f} (indoor attr)"
633
+ elif "outdoor" in attributes and "indoor" not in attributes:
634
+ attr_contrib -= P365_ATTRIBUTE_OUTDOOR_WEIGHT * (confidence * 0.5)
635
+ diagnostics["p365_attr_influence"] = f"{attr_contrib:.2f} (outdoor attr)"
636
+
637
+ # Apply sky override for indoor attributes
638
+ if strong_sky_signal and attr_contrib > 0:
639
+ sky_override_factor = self.config_manager.override_factors.sky_override_factor_p365_indoor_decision
640
+ attr_contrib *= sky_override_factor
641
+
642
+ return attr_contrib
643
+
644
+ def _compute_final_classification(self, final_indoor_score: float, visual_score: float,
645
+ p365_influence_score: float, diagnostics: Dict[str, Any]) -> Dict[str, Any]:
646
+ """Compute final classification probability and decision."""
647
+ # Record score breakdown
648
+ diagnostics["final_indoor_score_value"] = round(final_indoor_score, 3)
649
+ diagnostics["final_score_breakdown"] = (
650
+ f"VisualScore: {visual_score:.2f}, P365Influence: {p365_influence_score:.2f}"
651
+ )
652
+
653
+ # Apply sigmoid transformation
654
+ sigmoid_scale = self.config_manager.algorithm_parameters.indoor_score_sigmoid_scale
655
+ indoor_probability = 1 / (1 + np.exp(-final_indoor_score * sigmoid_scale))
656
+
657
+ # Make decision
658
+ decision_threshold = self.config_manager.algorithm_parameters.indoor_decision_threshold
659
+ is_indoor = indoor_probability > decision_threshold
660
+
661
+ return {
662
+ "is_indoor": is_indoor,
663
+ "indoor_probability": indoor_probability,
664
+ "final_score": final_indoor_score
665
+ }
666
+
667
+ def _apply_places365_override(self, classification_result: Dict[str, Any],
668
+ p365_context: Dict[str, Any],
669
+ diagnostics: Dict[str, Any]) -> Dict[str, Any]:
670
+ """Apply Places365 high-confidence override if conditions are met."""
671
+ is_indoor = classification_result["is_indoor"]
672
+ indoor_probability = classification_result["indoor_probability"]
673
+ final_score = classification_result["final_score"]
674
+
675
+ # Check for override conditions
676
+ if not p365_context or p365_context["confidence"] < 0.5:
677
+ diagnostics["final_indoor_probability_calculated"] = round(indoor_probability, 3)
678
+ diagnostics["final_is_indoor_decision"] = bool(is_indoor)
679
+ return classification_result
680
+
681
+ p365_is_indoor_decision = p365_context.get("is_indoor", None)
682
+ confidence = p365_context["confidence"]
683
+
684
+ self.logger.debug(f"Override check: is_indoor={is_indoor}, p365_conf={confidence}, "
685
+ f"p365_raw_is_indoor={p365_is_indoor_decision}")
686
+
687
+ # Apply override for high confidence Places365 decisions
688
+ if p365_is_indoor_decision is not None:
689
+ if p365_is_indoor_decision == False:
690
+ self.logger.debug(f"Applying outdoor override. Original: {is_indoor}")
691
+ original_decision = f"Indoor:{is_indoor}, Prob:{indoor_probability:.3f}, Score:{final_score:.2f}"
692
+
693
+ is_indoor = False
694
+ indoor_probability = 0.02
695
+ final_score = -8.0
696
+
697
+ diagnostics["p365_force_override_applied"] = (
698
+ f"P365 FORCED OUTDOOR (is_indoor: {p365_is_indoor_decision}, Conf: {confidence:.3f})"
699
+ )
700
+ diagnostics["p365_override_original_decision"] = original_decision
701
+ self.logger.info(f"Places365 FORCED OUTDOOR override applied. New is_indoor: {is_indoor}")
702
+
703
+ elif p365_is_indoor_decision == True:
704
+ self.logger.debug(f"Applying indoor override. Original: {is_indoor}")
705
+ original_decision = f"Indoor:{is_indoor}, Prob:{indoor_probability:.3f}, Score:{final_score:.2f}"
706
+
707
+ is_indoor = True
708
+ indoor_probability = 0.98
709
+ final_score = 8.0
710
+
711
+ diagnostics["p365_force_override_applied"] = (
712
+ f"P365 FORCED INDOOR (is_indoor: {p365_is_indoor_decision}, Conf: {confidence:.3f})"
713
+ )
714
+ diagnostics["p365_override_original_decision"] = original_decision
715
+ self.logger.info(f"Places365 FORCED INDOOR override applied. New is_indoor: {is_indoor}")
716
+
717
+ # Record final values
718
+ diagnostics["final_indoor_probability_calculated"] = round(indoor_probability, 3)
719
+ diagnostics["final_is_indoor_decision"] = bool(is_indoor)
720
+
721
+ self.logger.debug(f"Final classification: is_indoor={is_indoor}, score={final_score}, prob={indoor_probability}")
722
+
723
+ return {
724
+ "is_indoor": is_indoor,
725
+ "indoor_probability": indoor_probability,
726
+ "final_score": final_score
727
+ }
728
+
729
+ def _ensure_default_contributions(self, feature_contributions: Dict[str, float]) -> None:
730
+ """Ensure all expected feature contribution keys have default values."""
731
+ default_keys = [
732
+ "sky_openness_features", "enclosure_features",
733
+ "brightness_uniformity_contribution", "light_source_features"
734
+ ]
735
+
736
+ for key in default_keys:
737
+ if key not in feature_contributions:
738
+ feature_contributions[key] = 0.0
739
+
740
+ def _get_default_classification_result(self) -> Dict[str, Any]:
741
+ """Return default classification result in case of errors."""
742
+ return {
743
+ "is_indoor": False,
744
+ "indoor_probability": 0.5,
745
+ "indoor_score_raw": 0.0,
746
+ "feature_contributions": {
747
+ "sky_openness_features": 0.0,
748
+ "enclosure_features": 0.0,
749
+ "brightness_uniformity_contribution": 0.0,
750
+ "light_source_features": 0.0
751
+ },
752
+ "diagnostics": {
753
+ "error": "Classification failed, using default values"
754
+ }
755
+ }
landmark_data_manager.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import logging
3
+ import traceback
4
+ from typing import List, Dict, Tuple, Optional, Union, Any
5
+
6
+ from landmark_data import ALL_LANDMARKS, get_all_landmark_prompts
7
+ from landmark_activities import LANDMARK_ACTIVITIES
8
+
9
+ class LandmarkDataManager:
10
+ """
11
+ 專門處理地標數據的載入、管理和查詢功能,包括地標信息、提示詞和活動建議
12
+ """
13
+
14
+ def __init__(self):
15
+ """
16
+ initialize landmark related
17
+ """
18
+ self.logger = logging.getLogger(__name__)
19
+ self.landmark_data = {}
20
+ self.landmark_prompts = []
21
+ self.landmark_id_to_index = {}
22
+ self.is_enabled = False
23
+
24
+ self._load_landmark_data()
25
+
26
+ def _load_landmark_data(self):
27
+ """
28
+ 載入地標數據和相關資訊
29
+ """
30
+ try:
31
+ self.landmark_data = ALL_LANDMARKS
32
+ self.landmark_prompts = get_all_landmark_prompts()
33
+ self.logger.info(f"Loaded {len(self.landmark_prompts)} landmark prompts for classification")
34
+
35
+ # 創建地標ID到索引的映射,可快速查找
36
+ self.landmark_id_to_index = {landmark_id: i for i, landmark_id in enumerate(ALL_LANDMARKS.keys())}
37
+
38
+ self.is_enabled = True
39
+ self.logger.info(f"Successfully loaded landmark data with {len(self.landmark_data)} landmarks")
40
+
41
+ except ImportError:
42
+ self.logger.warning("landmark_data.py not found. Landmark classification will be limited")
43
+ self.landmark_data = {}
44
+ self.landmark_prompts = []
45
+ self.landmark_id_to_index = {}
46
+ self.is_enabled = False
47
+ except Exception as e:
48
+ self.logger.error(f"Error loading landmark data: {e}")
49
+ self.logger.error(traceback.format_exc())
50
+ self.landmark_data = {}
51
+ self.landmark_prompts = []
52
+ self.landmark_id_to_index = {}
53
+ self.is_enabled = False
54
+
55
def get_landmark_prompts(self) -> List[str]:
    """Return all landmark prompts held by this manager.

    Returns:
        List[str]: The ``landmark_prompts`` list itself (not a copy).
    """
    prompts = self.landmark_prompts
    return prompts
63
+
64
def get_landmark_by_id(self, landmark_id: str) -> Dict[str, Any]:
    """Look up a landmark record by its ID.

    Args:
        landmark_id: Landmark ID.

    Returns:
        Dict[str, Any]: The stored landmark record, or an empty dict when the
        ID is unknown.
    """
    not_found: Dict[str, Any] = {}
    return self.landmark_data.get(landmark_id, not_found)
75
+
76
def get_landmark_by_index(self, index: int) -> Tuple[str, Dict[str, Any]]:
    """Look up a landmark by its position in the landmark data.

    Args:
        index: Zero-based position of the landmark; negative values are
            treated as out of range.

    Returns:
        Tuple[str, Dict[str, Any]]: (landmark ID, landmark info). When the
        index is out of range or the lookup fails, ``(None, {})`` is returned
        and the problem is logged.
    """
    try:
        ordered_ids = list(self.landmark_data.keys())
        if not 0 <= index < len(ordered_ids):
            self.logger.warning(f"Index {index} out of range for landmark data")
            return None, {}

        chosen_id = ordered_ids[index]
        return chosen_id, self.landmark_data[chosen_id]
    except Exception as e:
        self.logger.error(f"Error getting landmark by index {index}: {e}")
        self.logger.error(traceback.format_exc())
        return None, {}
98
+
99
def get_landmark_index(self, landmark_id: str) -> Optional[int]:
    """Return the index recorded for a landmark ID.

    Args:
        landmark_id: Landmark ID.

    Returns:
        Optional[int]: The index, or None when the ID is not in the map.
    """
    index_by_id = self.landmark_id_to_index
    return index_by_id.get(landmark_id)
110
+
111
def determine_landmark_type(self, landmark_id: str) -> str:
    """
    Guess a landmark's type from its ID, name and aliases.

    Each candidate type owns a keyword list; the type whose keywords occur
    most often as substrings of the combined lower-cased text wins, with the
    earliest-listed type winning ties. Two special cases apply afterwards:
    any "leaning"-style term forces ``"distinctive"``, and a ``"tower"``
    result for a landmark located in a listed Asian region becomes
    ``"skyscraper"``.

    Args:
        landmark_id: Landmark ID.

    Returns:
        str: The landmark type (used for threshold tuning); ``"building"``
        when the ID is empty, nothing matches, or an error occurs.
    """
    fallback_type = "building"  # default type
    if not landmark_id:
        return fallback_type

    try:
        info = self.landmark_data.get(landmark_id, {})

        # Gather the lower-cased text fragments describing this landmark.
        id_text = landmark_id.lower()
        name_text = info.get("name", "").lower()
        location_text = info.get("location", "").lower()
        alias_texts = [alias.lower() for alias in info.get("aliases", [])]
        searchable = " ".join([id_text, name_text] + alias_texts)

        # Characteristic keywords per landmark type (entries repeated in a
        # list are counted once per occurrence in that list).
        type_keywords = {
            "skyscraper": ["skyscraper", "tall", "tower", "高樓", "摩天", "大厦", "タワー"],
            "tower": ["tower", "bell", "clock", "塔", "鐘樓", "タワー", "campanile"],
            "monument": ["monument", "memorial", "statue", "紀念", "雕像", "像", "memorial"],
            "natural": ["mountain", "lake", "canyon", "falls", "beach", "山", "湖", "峽谷", "瀑布", "海灘"],
            "temple": ["temple", "shrine", "寺", "神社", "廟"],
            "palace": ["palace", "castle", "宮", "城", "皇宮", "宫殿"],
            "distinctive": ["unique", "leaning", "slanted", "傾斜", "斜", "獨特", "傾く"]
        }

        # Score every type; max() keeps the first type among equal scores.
        scores = {
            type_name: sum(keyword in searchable for keyword in keywords)
            for type_name, keywords in type_keywords.items()
        }
        best_type = max(scores, key=scores.get)
        max_matches = scores[best_type]
        if max_matches == 0:
            best_type = None

        # Special case: leaning structures always need dedicated handling.
        leaning_terms = ["leaning", "slanted", "tilt", "inclined", "斜", "傾斜"]
        for term in leaning_terms:
            if term in searchable:
                return "distinctive"

        # Special case: tower-type structures in Asian regions are handled as skyscrapers.
        if best_type == "tower":
            asian_regions = ["china", "japan", "korea", "taiwan", "singapore", "vietnam", "thailand",
                             "hong kong", "中國", "日本", "韓國", "台灣", "新加坡", "越南", "泰國", "香港"]
            if any(region in location_text for region in asian_regions):
                best_type = "skyscraper"

        if best_type and max_matches > 0:
            return best_type
        return fallback_type  # default to a generic building

    except Exception as e:
        self.logger.error(f"Error determining landmark type for {landmark_id}: {e}")
        self.logger.error(traceback.format_exc())
        return "building"
178
+
179
def extract_landmark_specific_info(self, landmark_id: str) -> Dict[str, Any]:
    """
    Collect the details known about one landmark, including feature
    templates and activity suggestions.

    Args:
        landmark_id: Landmark ID. An empty value or ``"unknown"`` yields the
            minimal result immediately.

    Returns:
        Dict[str, Any]: Always contains ``"has_specific_activities"``. When
        the landmark is present in ``self.landmark_data`` it may also contain
        ``"landmark_name"``, ``"feature_templates"`` (at most five prompts),
        ``"primary_template"`` (first prompt, only when prompts exist),
        ``"aliases"``, ``"location"``, ``"year_built"``,
        ``"architectural_style"``, ``"significance"`` and ``"description"``.
        ``"landmark_specific_activities"`` is added when the ID is found in
        ``LANDMARK_ACTIVITIES``.
    """
    if not landmark_id or landmark_id == "unknown":
        return {"has_specific_activities": False}

    specific_info = {"has_specific_activities": False}

    try:
        landmark_data_source = self.landmark_data.get(landmark_id)

        if landmark_data_source:
            if "name" in landmark_data_source:
                specific_info["landmark_name"] = landmark_data_source["name"]

            # Use the available prompts as feature templates.
            if "prompts" in landmark_data_source:
                prompts = landmark_data_source["prompts"]
                specific_info["feature_templates"] = prompts[:5]
                # Guard the empty list: indexing [0] used to raise IndexError,
                # which aborted every extraction step below.
                if prompts:
                    specific_info["primary_template"] = prompts[0]

            if "aliases" in landmark_data_source:
                specific_info["aliases"] = landmark_data_source["aliases"]

            if "location" in landmark_data_source:
                specific_info["location"] = landmark_data_source["location"]

            # Copy any additional descriptive fields that are present.
            for key in ["year_built", "architectural_style", "significance", "description"]:
                if key in landmark_data_source:
                    specific_info[key] = landmark_data_source[key]

        # Try to attach activity suggestions. LANDMARK_ACTIVITIES is expected
        # to be provided at module level; any lookup failure is logged and
        # treated as "no activities".
        try:
            if landmark_id in LANDMARK_ACTIVITIES:
                activities = LANDMARK_ACTIVITIES[landmark_id]
                specific_info["landmark_specific_activities"] = activities
                specific_info["has_specific_activities"] = True
                self.logger.info(f"Found {len(activities)} specific activities for landmark {landmark_id}")
            else:
                self.logger.info(f"No specific activities found for landmark {landmark_id} in LANDMARK_ACTIVITIES")
                specific_info["has_specific_activities"] = False
        except ImportError:
            self.logger.warning("Could not import LANDMARK_ACTIVITIES from landmark_activities")
            specific_info["has_specific_activities"] = False
        except Exception as e:
            self.logger.error(f"Error loading landmark activities for {landmark_id}: {e}")
            self.logger.error(traceback.format_exc())
            specific_info["has_specific_activities"] = False

    except Exception as e:
        # Best-effort: return whatever was gathered before the failure.
        self.logger.error(f"Error extracting landmark specific info for {landmark_id}: {e}")
        self.logger.error(traceback.format_exc())

    return specific_info
245
+
246
def get_landmark_count(self) -> int:
    """
    Report how many landmarks are loaded.

    Returns:
        int: Number of entries in ``self.landmark_data``.
    """
    total = len(self.landmark_data)
    return total
254
+
255
def is_landmark_enabled(self) -> bool:
    """
    Tell whether the landmark feature is enabled.

    Returns:
        bool: The current value of ``self.is_enabled``.
    """
    enabled = self.is_enabled
    return enabled
263
+
264
def get_all_landmark_ids(self) -> List[str]:
    """
    List the IDs of every loaded landmark.

    Returns:
        List[str]: A new list with the keys of ``self.landmark_data`` in
        their iteration order.
    """
    return [*self.landmark_data.keys()]
272
+
273
def validate_landmark_id(self, landmark_id: str) -> bool:
    """
    Check whether a landmark ID is known.

    Args:
        landmark_id: Landmark ID to validate.

    Returns:
        bool: ``True`` when the ID is a key of ``self.landmark_data``.
    """
    known_landmarks = self.landmark_data
    return landmark_id in known_landmarks
landmark_processing_manager.py ADDED
@@ -0,0 +1,512 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import logging
3
+ import traceback
4
+ import numpy as np
5
+ from typing import Dict, List, Tuple, Optional, Any
6
+ from PIL import Image
7
+
8
+ from clip_zero_shot_classifier import CLIPZeroShotClassifier
9
+ from landmark_activities import LANDMARK_ACTIVITIES
10
+ from landmark_data import ALL_LANDMARKS
11
+
12
+
13
class LandmarkProcessingManager:
    """
    Handle all landmark-related detection and post-processing.

    This covers landmark recognition for regions the object detector left
    unidentified or scored with low confidence, creation of landmark object
    dicts, and removal of landmark references from generated text.
    """

    # Detector class IDs the original implementation labels "common building
    # classes". NOTE(review): confirm these IDs against the detector's class map.
    _BUILDING_CLASS_IDS = frozenset(
        {11, 12, 13, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65}
    )

    # Base confidence threshold, passed to the classifier search and reused
    # when filtering its results.
    _BASE_THRESHOLD = 0.25

    # Threshold assumed for landmark types without a specific value; also the
    # reference value the per-type threshold is scaled against.
    _DEFAULT_TYPE_THRESHOLD = 0.5

    def __init__(self, enable_landmark: bool = True, use_clip: bool = True):
        """
        Initialise the landmark processing manager.

        Args:
            enable_landmark: Whether landmark detection is enabled.
            use_clip: Whether CLIP analysis is enabled.
        """
        self.logger = logging.getLogger(__name__)
        self.enable_landmark = enable_landmark
        self.use_clip = use_clip

        # Landmark search stays off until
        # update_use_landmark_detection_status() turns it on. This was the
        # effective behaviour before, via a hasattr() guard on an attribute
        # that was never initialised.
        self.use_landmark_detection = False

        # Landmark lookup tables.
        self.landmark_activities = {}
        self.all_landmarks = {}
        self._load_landmark_data()

        # The landmark classifier is created on demand.
        self.landmark_classifier = None

    def _load_landmark_data(self):
        """Bind the module-level landmark tables to this instance."""
        # Both names are imported at module level, so a plain reference cannot
        # raise ImportError; the former try/except blocks were dead code.
        self.landmark_activities = LANDMARK_ACTIVITIES
        self.logger.info("Loaded LANDMARK_ACTIVITIES successfully")

        self.all_landmarks = ALL_LANDMARKS
        self.logger.info("Loaded ALL_LANDMARKS successfully")

    def set_landmark_classifier(self, landmark_classifier):
        """
        Set the landmark classifier instance.

        Args:
            landmark_classifier: A CLIPZeroShotClassifier instance.
        """
        self.landmark_classifier = landmark_classifier

    def process_unknown_objects(self, detection_result, detected_objects, clip_analyzer=None):
        """
        Run landmark detection on regions the detector missed or scored low.

        Args:
            detection_result: Detector result; its ``orig_img`` and ``boxes``
                attributes are read when present.
            detected_objects: List of already recognised object dicts. Newly
                found landmark objects are appended to this list in place.
            clip_analyzer: Optional CLIP analyzer; its ``get_clip_instance()``
                is used to create the landmark classifier on demand.

        Returns:
            tuple: ``(objects, landmark_objects)``. When landmark detection is
            disabled, ``objects`` is a filtered copy without landmark entries
            and ``landmark_objects`` is empty. On any failure the original
            ``detected_objects`` and an empty list are returned.
        """
        landmark_active = (
            self.enable_landmark
            and self.use_clip
            and getattr(self, "use_landmark_detection", False)
        )
        if not landmark_active:
            # With landmark recognition off, make sure no landmark objects leak through.
            cleaned_objects = [obj for obj in detected_objects if not obj.get("is_landmark", False)]
            return cleaned_objects, []

        try:
            original_image = getattr(detection_result, "orig_img", None)
            if original_image is None:
                self.logger.warning("Original image not available for landmark detection")
                return detected_objects, []

            original_image = self._to_pil_image(original_image)
            if original_image is None:
                return detected_objects, []
            w, h = original_image.size

            search_boxes = self._collect_search_boxes(detection_result, detected_objects, w, h)

            if not self._ensure_landmark_classifier(clip_analyzer):
                return detected_objects, []

            try:
                if not search_boxes:
                    # Nothing qualified: search the whole image instead.
                    search_boxes.append([0, 0, w, h])

                landmark_results = self.landmark_classifier.intelligent_landmark_search(
                    original_image,
                    yolo_boxes=search_boxes,
                    base_threshold=self._BASE_THRESHOLD
                )
            except Exception as e:
                self.logger.error(f"Error in intelligent_landmark_search: {e}")
                self.logger.error(traceback.format_exc())
                return detected_objects, []

            if not (landmark_results and landmark_results.get("is_landmark_scene", False)):
                return detected_objects, []

            landmark_objects = []
            for landmark_info in landmark_results.get("detected_landmarks", []):
                try:
                    landmark_type, type_threshold = self._resolve_landmark_type(landmark_info)
                    # Scale the base threshold by how strict this landmark type is.
                    effective_threshold = self._BASE_THRESHOLD * (
                        type_threshold / self._DEFAULT_TYPE_THRESHOLD
                    )

                    if landmark_info.get("confidence", 0) > effective_threshold:
                        landmark_obj = self._build_landmark_object(landmark_info, landmark_type, w, h)
                        detected_objects.append(landmark_obj)
                        landmark_objects.append(landmark_obj)
                        self.logger.info(f"Detected landmark: {landmark_info.get('landmark_name', 'Unknown')} with confidence {landmark_info.get('confidence', 0.0):.2f}")
                except Exception as e:
                    # One malformed result must not discard the others.
                    self.logger.error(f"Error processing landmark: {e}")
                    continue

            return detected_objects, landmark_objects

        except Exception as e:
            self.logger.error(f"Error in landmark detection: {e}")
            self.logger.error(traceback.format_exc())
            return detected_objects, []

    def _to_pil_image(self, image):
        """Return ``image`` as a PIL image, or ``None`` (after a warning) if it cannot be converted."""
        if isinstance(image, Image.Image):
            return image

        if not isinstance(image, np.ndarray):
            self.logger.warning(f"Cannot process image of type {type(image)}")
            return None

        try:
            if image.ndim == 3 and image.shape[2] == 4:
                image = image[:, :, :3]  # drop the fourth (alpha) channel
            if image.ndim == 2:
                return Image.fromarray(image).convert("RGB")  # grayscale input
            # NOTE(review): the channel order is used as-is. If callers pass
            # OpenCV-style BGR arrays the channels should be reversed here;
            # the previous `mode == 'BGR'` check could never be true and has
            # been removed. Confirm the channel order against the caller.
            return Image.fromarray(image)
        except Exception as e:
            self.logger.warning(f"Error converting image for landmark detection: {e}")
            return None

    @staticmethod
    def _to_numpy(values):
        """Return ``values.cpu().numpy()`` when ``values`` has a ``cpu`` method, else ``values`` unchanged."""
        return values.cpu().numpy() if hasattr(values, "cpu") else values

    def _collect_search_boxes(self, detection_result, detected_objects, w, h):
        """Gather the image regions (xyxy boxes) worth searching for landmarks."""
        boxes = []

        if len(detected_objects) == 0:
            # No detections at all: analyse the full frame, a 2x2 grid and a
            # large centre crop to raise the chance of a hit.
            boxes.append([0, 0, w, h])

            grid_size = 2
            for row in range(grid_size):
                for col in range(grid_size):
                    boxes.append([
                        col * w / grid_size,
                        row * h / grid_size,
                        (col + 1) * w / grid_size,
                        (row + 1) * h / grid_size
                    ])

            # Centre box covering the middle 70% of each dimension.
            boxes.append([w * 0.15, h * 0.15, w * 0.85, h * 0.85])

            self.logger.info("No YOLO detections, attempting detailed landmark analysis with multiple regions")
            return boxes

        try:
            raw_boxes = getattr(detection_result, "boxes", None)
            if all(hasattr(raw_boxes, attr) for attr in ("xyxy", "conf", "cls")):
                all_boxes = self._to_numpy(raw_boxes.xyxy)
                all_confs = self._to_numpy(raw_boxes.conf)
                all_cls = self._to_numpy(raw_boxes.cls)

                # Keep low-confidence regions and regions of building-like classes.
                for box, conf, cls in zip(all_boxes, all_confs, all_cls):
                    is_low_conf = 0.1 < conf < 0.4
                    is_building = int(cls) in self._BUILDING_CLASS_IDS
                    if not (is_low_conf or is_building):
                        continue
                    if isinstance(box, (list, tuple, np.ndarray)) and len(box) >= 4:
                        boxes.append(box)
        except Exception as e:
            # Keep whatever was collected before the failure.
            self.logger.error(f"Error processing YOLO detections: {e}")
            self.logger.error(traceback.format_exc())

        return boxes

    def _ensure_landmark_classifier(self, clip_analyzer):
        """Create the landmark classifier on demand; return ``True`` when one is available."""
        if self.landmark_classifier:
            return True

        if not (clip_analyzer and hasattr(clip_analyzer, 'get_clip_instance')):
            self.logger.warning("landmark_classifier not available and cannot be initialized")
            return False

        try:
            self.logger.info("Initializing landmark classifier for process_unknown_objects")
            _model, _preprocess, device = clip_analyzer.get_clip_instance()
            self.landmark_classifier = CLIPZeroShotClassifier(device=device)
        except Exception as e:
            self.logger.error(f"Error initializing landmark classifier: {e}")
            return False
        return True

    def _resolve_landmark_type(self, landmark_info):
        """Return ``(landmark_type, type_threshold)`` for one classifier result."""
        landmark_type = "architectural"  # default type
        type_threshold = self._DEFAULT_TYPE_THRESHOLD

        raw_id = landmark_info.get("landmark_id")
        classifier = self.landmark_classifier
        local_resolver = getattr(self, "_determine_landmark_type", None)

        if hasattr(classifier, '_determine_landmark_type') and raw_id:
            # Preferred: let the classifier decide and supply its own thresholds.
            landmark_type = classifier._determine_landmark_type(raw_id)
            thresholds = getattr(classifier, 'landmark_type_thresholds', {})
            type_threshold = thresholds.get(landmark_type, self._DEFAULT_TYPE_THRESHOLD)
        elif local_resolver is not None:
            # Hook for subclasses that provide their own resolver.
            landmark_type = local_resolver(landmark_info.get("landmark_id", ""))
            if landmark_type == "skyscraper":
                type_threshold = 0.4
            elif landmark_type == "natural":
                type_threshold = 0.6
        else:
            # Last resort: infer the type from keywords in the landmark ID.
            id_text = raw_id.lower() if isinstance(raw_id, str) else ""
            if any(term in id_text for term in ["mountain", "canyon", "waterfall", "lake", "river", "natural"]):
                landmark_type, type_threshold = "natural", 0.6
            elif any(term in id_text for term in ["skyscraper", "building", "tower", "tall"]):
                landmark_type, type_threshold = "skyscraper", 0.4
            elif any(term in id_text for term in ["monument", "memorial", "statue", "historical"]):
                landmark_type, type_threshold = "monument", 0.5

        return landmark_type, type_threshold

    def _build_landmark_object(self, landmark_info, landmark_type, w, h):
        """Build the object dict describing one accepted landmark."""
        if "box" in landmark_info:
            box = landmark_info["box"]
        else:
            # No box supplied: use the image minus a 5% margin on every side.
            margin_x, margin_y = w * 0.05, h * 0.05
            box = [margin_x, margin_y, w - margin_x, h - margin_y]

        box_w = box[2] - box[0]
        box_h = box[3] - box[1]
        center_x = (box[0] + box[2]) / 2
        center_y = (box[1] + box[3]) / 2

        raw_id = landmark_info.get("landmark_id", "")
        landmark_obj = {
            # Long IDs are truncated; non-string IDs fall back to "-100".
            "class_id": raw_id[:15] if isinstance(raw_id, str) else "-100",
            "class_name": landmark_info.get("landmark_name", "Unknown Landmark"),
            "confidence": landmark_info.get("confidence", 0.0),
            "box": box,
            "center": (center_x, center_y),
            "normalized_center": (
                center_x / w if w > 0 else 0.5,
                center_y / h if h > 0 else 0.5
            ),
            "size": (box_w, box_h),
            "normalized_size": (
                box_w / w if w > 0 else 0,
                box_h / h if h > 0 else 0
            ),
            "area": box_w * box_h,
            "normalized_area": (
                box_w * box_h / (w * h) if w * h > 0 else 0
            ),
            # Region is fixed to "center" here; no spatial analysis is done in this class.
            "region": "center",
            "is_landmark": True,
            "landmark_id": raw_id,
            "location": landmark_info.get("location", "Unknown Location")
        }

        # Carry over optional descriptive fields.
        for key in ["year_built", "architectural_style", "significance"]:
            if key in landmark_info:
                landmark_obj[key] = landmark_info[key]

        landmark_obj["landmark_type"] = landmark_type
        return landmark_obj

    @staticmethod
    def _alternation(terms):
        """Return a regex alternation of the usable terms, longest first ('' when none qualify)."""
        usable = {term for term in terms if isinstance(term, str) and len(term) > 2}
        # Longest first so a short term never pre-empts a longer one containing it.
        ordered = sorted(usable, key=lambda term: (-len(term), term))
        return "|".join(re.escape(term) for term in ordered)

    def remove_landmark_references(self, text):
        """
        Remove landmark references from a text.

        Landmark names and aliases are replaced by "tall structure" and
        location mentions by "the urban area", matching whole words only and
        ignoring case. If that dynamic pass fails, a set of generic
        landmark-phrase patterns is applied instead.

        Args:
            text: Input text.

        Returns:
            str: The text with landmark references removed (falsy input is
            returned unchanged).
        """
        if not text:
            return text

        try:
            # Collect every landmark name, alias and location.
            landmark_names = set()
            locations = set()
            for info in self.all_landmarks.values():
                landmark_names.add(info.get("name"))
                landmark_names.update(info.get("aliases", []))

                location = info.get("location")
                if location:
                    locations.add(location)
                    # Also match the first two comma-separated parts on their own.
                    locations.update(part.strip() for part in location.split(",")[:2])

            name_alt = self._alternation(landmark_names)
            if name_alt:
                text = re.sub(r'(?<!\w)(?:' + name_alt + r')(?!\w)',
                              "tall structure", text, flags=re.IGNORECASE)

            location_alt = self._alternation(locations)
            if location_alt:
                # (?!\w) keeps "in Paris" from matching inside "in Parisian".
                location_re = r'(?:' + location_alt + r')(?!\w)'
                text = re.sub(r'in ' + location_re, "in the urban area", text, flags=re.IGNORECASE)
                text = re.sub(r'of ' + location_re, "of the urban area", text, flags=re.IGNORECASE)
                text = re.sub(r'(?<!\w)' + location_re, "the urban area", text, flags=re.IGNORECASE)

        except Exception as e:
            self.logger.warning(f"Error in dynamic landmark reference removal, using generic patterns: {e}")
            # Generic landmark description patterns.
            landmark_patterns = [
                # Landmark-at-location patterns
                (r'an iconic structure in ([A-Z][a-zA-Z\s,]+)', r'an urban structure'),
                (r'a famous (monument|tower|landmark) in ([A-Z][a-zA-Z\s,]+)', r'an urban structure'),
                (r'(the [A-Z][a-zA-Z\s]+ Tower)', r'the tower'),
                (r'(the [A-Z][a-zA-Z\s]+ Building)', r'the building'),
                (r'(the CN Tower)', r'the tower'),
                (r'([A-Z][a-zA-Z\s]+) Tower', r'tall structure'),

                # Positional-relationship patterns
                (r'(centered|built|located|positioned) around the ([A-Z][a-zA-Z\s]+? (Tower|Monument|Landmark))', r'located in this area'),

                # Landmark activity patterns
                (r'(sightseeing|guided tours|cultural tourism) (at|around|near) (this landmark|the [A-Z][a-zA-Z\s]+)', r'\1 in this area'),

                # General landmark adjectives
                (r'this (famous|iconic|historic|well-known) (landmark|monument|tower|structure)', r'this urban structure'),
                (r'landmark scene', r'urban scene'),
                (r'tourist destination', r'urban area'),
                (r'tourist attraction', r'urban area')
            ]

            for pattern, replacement in landmark_patterns:
                text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

        return text

    def get_alternative_scene_type(self, landmark_scene_type, detected_objects, scene_scores):
        """
        Pick a suitable non-landmark replacement for a landmark scene type.

        Args:
            landmark_scene_type: The original landmark scene type.
            detected_objects: List of detected object dicts.
            scene_scores: Mapping of scene type to score.

        Returns:
            str: The replacement scene type.
        """
        # 1. Prefer the best-scoring non-landmark scene above 0.2.
        landmark_types = {"tourist_landmark", "natural_landmark", "historical_monument"}
        alternative_scores = {
            scene: score for scene, score in scene_scores.items()
            if scene not in landmark_types and score > 0.2
        }
        if alternative_scores:
            return max(alternative_scores, key=alternative_scores.get)

        # 2. Otherwise infer the scene from the mix of detected objects.
        class_names = [obj.get("class_name", "") for obj in detected_objects]
        present = set(class_names)
        person_count = class_names.count("person")

        if present & {"car", "truck", "bus"}:
            # Vehicles: a street, or an intersection when traffic controls are visible.
            if present & {"traffic light", "stop sign"}:
                return "intersection"
            return "city_street"

        if "building" in present and person_count > 0:
            return "commercial_district"

        if person_count > 3:
            return "pedestrian_area"

        if present & {"bench", "potted plant"}:
            return "park_area"

        # 3. Fall back on the kind of landmark scene being replaced.
        if landmark_scene_type == "natural_landmark":
            return "outdoor_natural_area"
        if landmark_scene_type == "historical_monument":
            return "urban_architecture"

        # Default: city street.
        return "city_street"

    def extract_landmark_specific_activities(self, landmark_objects):
        """
        Collect the activities associated with recognised landmarks.

        Args:
            landmark_objects: List of landmark object dicts.

        Returns:
            List[str]: Activities for every landmark whose ``landmark_id`` is
            present in ``self.landmark_activities``, in input order.
        """
        landmark_specific_activities = []

        for lm_obj in landmark_objects:
            lm_id = lm_obj.get("landmark_id")
            if lm_id and lm_id in self.landmark_activities:
                landmark_specific_activities.extend(self.landmark_activities[lm_id])

        if landmark_specific_activities:
            # Objects built by process_unknown_objects keep the name under
            # "class_name", so fall back to it for the log message.
            landmark_names = [
                lm.get('landmark_name') or lm.get('class_name', 'unknown')
                for lm in landmark_objects if lm.get('is_landmark', False)
            ]
            self.logger.info(f"Added {len(landmark_specific_activities)} landmark-specific activities for {', '.join(landmark_names)}")

        return landmark_specific_activities

    def update_enable_landmark_status(self, enable_landmark: bool):
        """
        Update whether landmark detection is enabled.

        Args:
            enable_landmark: Whether landmark detection is enabled.
        """
        self.enable_landmark = enable_landmark

    def update_use_landmark_detection_status(self, use_landmark_detection: bool):
        """
        Update whether landmark detection is used.

        Args:
            use_landmark_detection: Whether landmark detection is used.
        """
        self.use_landmark_detection = use_landmark_detection
lighting_analyzer.py CHANGED
The diff for this file is too large to render. See raw diff
 
lighting_condition_analyzer.py ADDED
@@ -0,0 +1,854 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import logging
3
+ import traceback
4
+ from typing import Dict, Any, Optional, List, Tuple
5
+ from configuration_manager import ConfigurationManager
6
+
7
+
8
+ class LightingConditionAnalyzer:
9
+ """
10
+ Determines specific lighting conditions and time of day based on scene analysis.
11
+ 此class 會判斷一些光線的特定場景
12
+
13
+ This class analyzes lighting characteristics including natural and artificial illumination,
14
+ color temperature patterns, and temporal indicators to classify scenes into specific
15
+ lighting categories such as day clear, night with lights, indoor artificial, etc.
16
+ """
17
+
18
def __init__(self, config_manager: ConfigurationManager):
    """
    Set up the lighting condition analyzer.

    Args:
        config_manager: Configuration manager instance for accessing thresholds
            and parameters; stored on the instance as ``config_manager``.
    """
    self.config_manager = config_manager
    self.logger = self._setup_logger()

    # Scene-type keywords (substring hints for outdoor / restaurant-like scenes).
    self.P365_OUTDOOR_SCENE_KEYWORDS = [
        "street", "road", "highway", "park", "beach", "mountain", "forest", "field",
        "outdoor", "sky", "coast", "courtyard", "square", "plaza", "bridge",
        "parking", "playground", "stadium", "construction", "river", "ocean", "desert",
        "garden", "trail", "natural_landmark", "airport_outdoor", "train_station_outdoor",
        "bus_station_outdoor", "intersection", "crosswalk", "sidewalk", "pathway",
    ]
    self.P365_INDOOR_RESTAURANT_KEYWORDS = [
        "restaurant", "bar", "cafe", "dining_room", "pub", "bistro", "eatery",
    ]

    # Internal confidence thresholds for Places365-based analysis.
    (
        self.P365_ATTRIBUTE_CONF_THRESHOLD,
        self.P365_SCENE_MODERATE_CONF_THRESHOLD,
        self.P365_SCENE_HIGH_CONF_THRESHOLD,
    ) = (0.60, 0.45, 0.70)
45
+
46
+ def _setup_logger(self) -> logging.Logger:
47
+ """Set up logger for lighting condition analysis operations."""
48
+ logger = logging.getLogger(f"{__name__}.LightingConditionAnalyzer")
49
+ if not logger.handlers:
50
+ handler = logging.StreamHandler()
51
+ formatter = logging.Formatter(
52
+ '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
53
+ )
54
+ handler.setFormatter(formatter)
55
+ logger.addHandler(handler)
56
+ logger.setLevel(logging.INFO)
57
+ return logger
58
+
59
def analyze_lighting_conditions(self, features: Dict[str, Any], is_indoor: bool,
                              places365_info: Optional[Dict] = None) -> Dict[str, Any]:
    """
    Work out the lighting condition of a scene from its features and context.

    The decision runs in priority order: a confident Places365 attribute
    result (confidence >= 0.75) is returned directly; otherwise visual feature
    analysis is used, merged with any non-decisive attribute result, and then
    passed through the special-lighting refinement. On that second path the
    final confidence is clamped to the range [0.50, 0.95].

    Args:
        features: Dictionary containing extracted image features.
        is_indoor: Whether the scene was previously classified as indoor.
        places365_info: Optional Places365 classification information.

    Returns:
        Dictionary with ``time_of_day``, ``confidence`` and ``diagnostics``.
        If anything raises, the default lighting result is returned instead.
    """
    try:
        scene_kind = 'indoor' if is_indoor else 'outdoor'
        self.logger.debug(f"Starting lighting analysis for {scene_kind} scene")

        diagnostics = {}
        p365_context = self._extract_places365_context(places365_info, diagnostics)

        # Priority 1: trust Places365 attributes when they are highly confident.
        attribute_result = self._analyze_places365_attributes(
            p365_context, is_indoor, features, diagnostics
        )
        if attribute_result["determined"] and attribute_result["confidence"] >= 0.75:
            self.logger.debug(f"High-confidence Places365 attribute determination: {attribute_result['time_of_day']}")
            return {
                "time_of_day": attribute_result["time_of_day"],
                "confidence": attribute_result["confidence"],
                "diagnostics": diagnostics
            }

        # Priority 2: visual features, informed by the Places365 scene context.
        visual_result = self._analyze_visual_features(
            features, is_indoor, p365_context, diagnostics
        )
        time_of_day, confidence = visual_result["time_of_day"], visual_result["confidence"]

        # A non-decisive attribute result still gets blended in.
        if attribute_result["determined"]:
            combined_result = self._combine_attribute_and_visual_results(
                attribute_result, visual_result, diagnostics
            )
            time_of_day, confidence = combined_result["time_of_day"], combined_result["confidence"]

        # Priority 3: special lighting refinement (neon, sodium vapor).
        refined_result = self._apply_special_lighting_refinement(
            time_of_day, confidence, features, is_indoor, p365_context, diagnostics
        )
        time_of_day = refined_result["time_of_day"]

        # Keep the final confidence within [0.50, 0.95].
        confidence = min(0.95, max(0.50, refined_result["confidence"]))

        diagnostics["final_lighting_time_of_day"] = time_of_day
        diagnostics["final_lighting_confidence"] = round(confidence, 3)

        self.logger.debug(f"Lighting analysis complete: {time_of_day} (confidence: {confidence:.3f})")

        return {
            "time_of_day": time_of_day,
            "confidence": confidence,
            "diagnostics": diagnostics
        }

    except Exception as e:
        self.logger.error(f"Error in lighting condition analysis: {str(e)}")
        self.logger.error(f"Traceback: {traceback.format_exc()}")
        return self._get_default_lighting_result()
140
+
141
+ def _extract_places365_context(self, places365_info: Optional[Dict],
142
+ diagnostics: Dict[str, Any]) -> Dict[str, Any]:
143
+ """Extract and validate Places365 context information for lighting analysis."""
144
+ context = {
145
+ "mapped_scene": "unknown",
146
+ "attributes": [],
147
+ "confidence": 0.0
148
+ }
149
+
150
+ if places365_info:
151
+ context["mapped_scene"] = places365_info.get('mapped_scene_type', 'unknown').lower()
152
+ context["attributes"] = [attr.lower() for attr in places365_info.get('attributes', [])]
153
+ context["confidence"] = places365_info.get('confidence', 0.0)
154
+
155
+ diagnostics["p365_context_for_lighting"] = (
156
+ f"P365 Scene: {context['mapped_scene']}, Attrs: {context['attributes']}, "
157
+ f"Conf: {context['confidence']:.2f}"
158
+ )
159
+
160
+ return context
161
+
162
+ def _analyze_places365_attributes(self, p365_context: Dict[str, Any], is_indoor: bool,
163
+ features: Dict[str, Any], diagnostics: Dict[str, Any]) -> Dict[str, Any]:
164
+ """Analyze Places365 attributes for lighting condition determination."""
165
+ if (not p365_context["attributes"] or
166
+ p365_context["confidence"] <= self.P365_ATTRIBUTE_CONF_THRESHOLD):
167
+ return {"determined": False, "time_of_day": "unknown", "confidence": 0.5}
168
+
169
+ confidence = p365_context["confidence"]
170
+ attributes = p365_context["attributes"]
171
+ mapped_scene = p365_context["mapped_scene"]
172
+
173
+ # Outdoor attribute analysis
174
+ if not is_indoor:
175
+ outdoor_result = self._analyze_outdoor_attributes(
176
+ attributes, mapped_scene, confidence, diagnostics
177
+ )
178
+ if outdoor_result["determined"]:
179
+ return outdoor_result
180
+
181
+ # Indoor attribute analysis
182
+ if is_indoor:
183
+ indoor_result = self._analyze_indoor_attributes(
184
+ attributes, mapped_scene, features, confidence, diagnostics
185
+ )
186
+ if indoor_result["determined"]:
187
+ return indoor_result
188
+
189
+ return {"determined": False, "time_of_day": "unknown", "confidence": 0.5}
190
+
191
+ def _analyze_outdoor_attributes(self, attributes: List[str], mapped_scene: str,
192
+ confidence: float, diagnostics: Dict[str, Any]) -> Dict[str, Any]:
193
+ """Analyze Places365 attributes for outdoor lighting conditions."""
194
+ base_confidence_boost = (confidence - self.P365_ATTRIBUTE_CONF_THRESHOLD) * 0.25
195
+
196
+ if "sunny" in attributes or "clear sky" in attributes:
197
+ final_confidence = 0.85 + base_confidence_boost
198
+ diagnostics["reason"] = "P365 attribute: sunny/clear sky (Outdoor)."
199
+ return {
200
+ "determined": True,
201
+ "time_of_day": "day_clear",
202
+ "confidence": final_confidence
203
+ }
204
+
205
+ elif "nighttime" in attributes or "night" in attributes:
206
+ if ("artificial lighting" in attributes or "man-made lighting" in attributes or
207
+ any(kw in mapped_scene for kw in ["street", "city", "road", "urban", "downtown"])):
208
+ final_confidence = 0.82 + base_confidence_boost * 0.8
209
+ diagnostics["reason"] = "P365 attribute: nighttime with artificial/street lights (Outdoor)."
210
+ return {
211
+ "determined": True,
212
+ "time_of_day": "night_with_lights",
213
+ "confidence": final_confidence
214
+ }
215
+ else:
216
+ final_confidence = 0.78 + base_confidence_boost * 0.8
217
+ diagnostics["reason"] = "P365 attribute: nighttime, dark (Outdoor)."
218
+ return {
219
+ "determined": True,
220
+ "time_of_day": "night_dark",
221
+ "confidence": final_confidence
222
+ }
223
+
224
+ elif "cloudy" in attributes or "overcast" in attributes:
225
+ final_confidence = 0.80 + base_confidence_boost
226
+ diagnostics["reason"] = "P365 attribute: cloudy/overcast (Outdoor)."
227
+ return {
228
+ "determined": True,
229
+ "time_of_day": "day_cloudy_overcast",
230
+ "confidence": final_confidence
231
+ }
232
+
233
+ return {"determined": False, "time_of_day": "unknown", "confidence": 0.5}
234
+
235
+ def _analyze_indoor_attributes(self, attributes: List[str], mapped_scene: str,
236
+ features: Dict[str, Any], confidence: float,
237
+ diagnostics: Dict[str, Any]) -> Dict[str, Any]:
238
+ """Analyze Places365 attributes for indoor lighting conditions."""
239
+ base_confidence_boost = (confidence - self.P365_ATTRIBUTE_CONF_THRESHOLD) * 0.20
240
+ avg_brightness = features.get("avg_brightness", 128.0)
241
+
242
+ if "artificial lighting" in attributes or "man-made lighting" in attributes:
243
+ base_indoor_conf = 0.70 + base_confidence_boost
244
+ thresholds = self.config_manager.lighting_thresholds
245
+
246
+ if avg_brightness > thresholds.indoor_bright_thresh:
247
+ time_of_day = "indoor_bright_artificial"
248
+ final_confidence = base_indoor_conf + 0.10
249
+ elif avg_brightness > thresholds.indoor_moderate_thresh:
250
+ time_of_day = "indoor_moderate_artificial"
251
+ final_confidence = base_indoor_conf
252
+ else:
253
+ time_of_day = "indoor_dim_artificial"
254
+ final_confidence = base_indoor_conf - 0.05
255
+
256
+ diagnostics["reason"] = (
257
+ f"P365 attribute: artificial lighting (Indoor), "
258
+ f"brightness based category: {time_of_day}."
259
+ )
260
+ return {
261
+ "determined": True,
262
+ "time_of_day": time_of_day,
263
+ "confidence": final_confidence
264
+ }
265
+
266
+ elif "natural lighting" in attributes:
267
+ is_applicable_scene = (
268
+ self._check_home_environment_pattern(features) or
269
+ any(kw in mapped_scene for kw in ["living_room", "bedroom", "sunroom"])
270
+ )
271
+ if is_applicable_scene:
272
+ final_confidence = 0.80 + base_confidence_boost
273
+ diagnostics["reason"] = "P365 attribute: natural lighting in residential/applicable indoor scene."
274
+ return {
275
+ "determined": True,
276
+ "time_of_day": "indoor_residential_natural",
277
+ "confidence": final_confidence
278
+ }
279
+
280
+ return {"determined": False, "time_of_day": "unknown", "confidence": 0.5}
281
+
282
+ def _analyze_visual_features(self, features: Dict[str, Any], is_indoor: bool,
283
+ p365_context: Dict[str, Any], diagnostics: Dict[str, Any]) -> Dict[str, Any]:
284
+ """Analyze visual features for lighting condition determination."""
285
+ if is_indoor:
286
+ return self._analyze_indoor_visual_features(features, p365_context, diagnostics)
287
+ else:
288
+ return self._analyze_outdoor_visual_features(features, p365_context, diagnostics)
289
+
290
+ def _analyze_indoor_visual_features(self, features: Dict[str, Any], p365_context: Dict[str, Any],
291
+ diagnostics: Dict[str, Any]) -> Dict[str, Any]:
292
+ """Analyze visual features for indoor lighting conditions."""
293
+ avg_brightness = features.get("avg_brightness", 128.0)
294
+ thresholds = self.config_manager.lighting_thresholds
295
+
296
+ # Extract relevant features
297
+ sky_blue_in_sky_region = features.get("sky_region_blue_dominance", 0.0)
298
+ sky_region_is_brighter = features.get("sky_region_brightness_ratio", 1.0) > 1.05
299
+ is_likely_home_environment = self._check_home_environment_pattern(features)
300
+
301
+ # Lighting and structural features
302
+ circular_lights = features.get("circular_light_count", 0)
303
+ bright_spots_overall = features.get("bright_spot_count", 0)
304
+ brightness_uniformity = features.get("brightness_uniformity", 0.0)
305
+ warm_ratio = features.get("warm_ratio", 0.0)
306
+
307
+ # Natural light hints calculation
308
+ natural_light_hints = 0.0
309
+ if sky_blue_in_sky_region > 0.05 and sky_region_is_brighter:
310
+ natural_light_hints += 1.0
311
+ if brightness_uniformity > 0.65 and features.get("brightness_std", 100.0) < 70:
312
+ natural_light_hints += 1.0
313
+ if warm_ratio > 0.15 and avg_brightness > 110:
314
+ natural_light_hints += 0.5
315
+
316
+ # Designer lighting detection
317
+ is_designer_lit = (
318
+ (circular_lights > 0 or bright_spots_overall > 2) and
319
+ brightness_uniformity > 0.6 and warm_ratio > 0.2 and avg_brightness > 90
320
+ )
321
+
322
+ # Brightness-based classification
323
+ if avg_brightness > thresholds.indoor_bright_thresh:
324
+ return self._classify_bright_indoor(
325
+ features, natural_light_hints, is_designer_lit, is_likely_home_environment,
326
+ p365_context, diagnostics
327
+ )
328
+ elif avg_brightness > thresholds.indoor_moderate_thresh:
329
+ return self._classify_moderate_indoor(
330
+ features, is_designer_lit, is_likely_home_environment, p365_context, diagnostics
331
+ )
332
+ else:
333
+ return self._classify_dim_indoor(features, diagnostics)
334
+
335
+ def _classify_bright_indoor(self, features: Dict[str, Any], natural_light_hints: float,
336
+ is_designer_lit: bool, is_likely_home_environment: bool,
337
+ p365_context: Dict[str, Any], diagnostics: Dict[str, Any]) -> Dict[str, Any]:
338
+ """Classify bright indoor lighting conditions."""
339
+ mapped_scene = p365_context["mapped_scene"]
340
+ sky_blue_in_sky_region = features.get("sky_region_blue_dominance", 0.0)
341
+ sky_region_is_brighter = features.get("sky_region_brightness_ratio", 1.0) > 1.05
342
+
343
+ # Natural residential lighting
344
+ if (natural_light_hints >= 1.5 and
345
+ (is_likely_home_environment or any(kw in mapped_scene for kw in ["home", "residential", "living", "bedroom"]))):
346
+ return {
347
+ "time_of_day": "indoor_residential_natural",
348
+ "confidence": 0.82
349
+ }
350
+
351
+ # Designer residential lighting
352
+ elif (is_designer_lit and
353
+ (is_likely_home_environment or any(kw in mapped_scene for kw in ["home", "designer", "modern_interior"]))):
354
+ return {
355
+ "time_of_day": "indoor_designer_residential",
356
+ "confidence": 0.85
357
+ }
358
+
359
+ # Mixed natural/artificial lighting
360
+ elif sky_blue_in_sky_region > 0.03 and sky_region_is_brighter:
361
+ return {
362
+ "time_of_day": "indoor_bright_natural_mix",
363
+ "confidence": 0.78
364
+ }
365
+
366
+ # Pure artificial lighting
367
+ else:
368
+ return {
369
+ "time_of_day": "indoor_bright_artificial",
370
+ "confidence": 0.75
371
+ }
372
+
373
+ def _classify_moderate_indoor(self, features: Dict[str, Any], is_designer_lit: bool,
374
+ is_likely_home_environment: bool, p365_context: Dict[str, Any],
375
+ diagnostics: Dict[str, Any]) -> Dict[str, Any]:
376
+ """Classify moderate brightness indoor lighting conditions."""
377
+ mapped_scene = p365_context["mapped_scene"]
378
+ confidence = p365_context["confidence"]
379
+ warm_ratio = features.get("warm_ratio", 0.0)
380
+ yellow_orange_ratio = features.get("yellow_orange_ratio", 0.0)
381
+
382
+ # Designer residential lighting
383
+ if (is_designer_lit and
384
+ (is_likely_home_environment or any(kw in mapped_scene for kw in ["home", "designer"]))):
385
+ return {
386
+ "time_of_day": "indoor_designer_residential",
387
+ "confidence": 0.78
388
+ }
389
+
390
+ # Restaurant/bar lighting
391
+ elif warm_ratio > 0.35 and yellow_orange_ratio > 0.1:
392
+ return self._classify_restaurant_bar_lighting(
393
+ p365_context, features, diagnostics
394
+ )
395
+
396
+ # Standard moderate artificial
397
+ else:
398
+ return {
399
+ "time_of_day": "indoor_moderate_artificial",
400
+ "confidence": 0.70
401
+ }
402
+
403
+ def _classify_restaurant_bar_lighting(self, p365_context: Dict[str, Any],
404
+ features: Dict[str, Any],
405
+ diagnostics: Dict[str, Any]) -> Dict[str, Any]:
406
+ """Classify restaurant/bar specific lighting conditions."""
407
+ mapped_scene = p365_context["mapped_scene"]
408
+ confidence = p365_context["confidence"]
409
+
410
+ # Strong P365 restaurant/bar confirmation
411
+ if (any(kw in mapped_scene for kw in self.P365_INDOOR_RESTAURANT_KEYWORDS) and
412
+ confidence > self.P365_SCENE_MODERATE_CONF_THRESHOLD):
413
+ diagnostics["visual_analysis_reason"] = (
414
+ "Visual: Moderate warm tones. P365 context confirms restaurant/bar."
415
+ )
416
+ return {
417
+ "time_of_day": "indoor_restaurant_bar",
418
+ "confidence": 0.80 + confidence * 0.15
419
+ }
420
+
421
+ # P365 outdoor conflict detection
422
+ elif (any(kw in mapped_scene for kw in self.P365_OUTDOOR_SCENE_KEYWORDS) and
423
+ confidence > self.P365_SCENE_MODERATE_CONF_THRESHOLD):
424
+ diagnostics["visual_analysis_reason"] = (
425
+ "Visual: Moderate warm. CONFLICT: LA says indoor but P365 scene is outdoor. "
426
+ "Defaulting to general indoor artificial."
427
+ )
428
+ diagnostics["conflict_is_indoor_vs_p365_scene_for_restaurant_bar"] = True
429
+ return {
430
+ "time_of_day": "indoor_moderate_artificial",
431
+ "confidence": 0.55
432
+ }
433
+
434
+ # Neutral P365 context
435
+ else:
436
+ diagnostics["visual_analysis_reason"] = (
437
+ "Visual: Moderate warm tones, typical of restaurant/bar. P365 context neutral or weak."
438
+ )
439
+ return {
440
+ "time_of_day": "indoor_restaurant_bar",
441
+ "confidence": 0.70
442
+ }
443
+
444
+ def _classify_dim_indoor(self, features: Dict[str, Any],
445
+ diagnostics: Dict[str, Any]) -> Dict[str, Any]:
446
+ """Classify dim indoor lighting conditions."""
447
+ warm_ratio = features.get("warm_ratio", 0.0)
448
+ yellow_orange_ratio = features.get("yellow_orange_ratio", 0.0)
449
+
450
+ if warm_ratio > 0.45 and yellow_orange_ratio > 0.15:
451
+ return {
452
+ "time_of_day": "indoor_dim_warm",
453
+ "confidence": 0.75
454
+ }
455
+ else:
456
+ return {
457
+ "time_of_day": "indoor_dim_general",
458
+ "confidence": 0.70
459
+ }
460
+
461
+ def _analyze_outdoor_visual_features(self, features: Dict[str, Any], p365_context: Dict[str, Any],
462
+ diagnostics: Dict[str, Any]) -> Dict[str, Any]:
463
+ """Analyze visual features for outdoor lighting conditions."""
464
+ avg_brightness = features.get("avg_brightness", 128.0)
465
+ thresholds = self.config_manager.lighting_thresholds
466
+
467
+ # P365 enhanced street scene analysis
468
+ street_result = self._analyze_p365_enhanced_street_scenes(
469
+ features, p365_context, diagnostics
470
+ )
471
+ if street_result["determined"]:
472
+ return street_result
473
+
474
+ # Brightness-based outdoor classification
475
+ if avg_brightness < thresholds.outdoor_night_thresh_brightness:
476
+ return self._classify_night_outdoor(features, diagnostics)
477
+ elif (avg_brightness < thresholds.outdoor_dusk_dawn_thresh_brightness and
478
+ self._check_warm_sunset_conditions(features)):
479
+ return self._classify_sunset_sunrise(features, p365_context, diagnostics)
480
+ elif avg_brightness > thresholds.outdoor_day_bright_thresh:
481
+ return self._classify_bright_day_outdoor(features, diagnostics)
482
+ elif avg_brightness > thresholds.outdoor_day_cloudy_thresh:
483
+ return self._classify_cloudy_day_outdoor(features, diagnostics)
484
+ else:
485
+ return self._classify_general_outdoor(features, diagnostics)
486
+
487
+ def _analyze_p365_enhanced_street_scenes(self, features: Dict[str, Any], p365_context: Dict[str, Any],
488
+ diagnostics: Dict[str, Any]) -> Dict[str, Any]:
489
+ """Analyze outdoor scenes with Places365 street context enhancement."""
490
+ mapped_scene = p365_context["mapped_scene"]
491
+ confidence = p365_context["confidence"]
492
+ thresholds = self.config_manager.lighting_thresholds
493
+
494
+ # Check for street scene with warm lighting
495
+ is_street_scene = (
496
+ any(kw in mapped_scene for kw in ["street", "city", "road", "urban", "downtown", "intersection"]) and
497
+ confidence > self.P365_SCENE_MODERATE_CONF_THRESHOLD and
498
+ features.get("color_atmosphere") == "warm"
499
+ )
500
+
501
+ if not is_street_scene:
502
+ return {"determined": False, "time_of_day": "unknown", "confidence": 0.5}
503
+
504
+ avg_brightness = features.get("avg_brightness", 128.0)
505
+ bright_spots_overall = features.get("bright_spot_count", 0)
506
+
507
+ # Night with street lights
508
+ if (avg_brightness < thresholds.outdoor_night_thresh_brightness and
509
+ bright_spots_overall > thresholds.outdoor_night_lights_thresh):
510
+ diagnostics["visual_analysis_reason"] = (
511
+ f"P365 outdoor scene '{mapped_scene}' + visual low-warm light with spots -> night_with_lights."
512
+ )
513
+ return {
514
+ "determined": True,
515
+ "time_of_day": "night_with_lights",
516
+ "confidence": 0.88 + confidence * 0.1
517
+ }
518
+
519
+ # Sunset/sunrise conditions
520
+ elif avg_brightness >= thresholds.outdoor_night_thresh_brightness:
521
+ diagnostics["visual_analysis_reason"] = (
522
+ f"P365 outdoor scene '{mapped_scene}' + visual moderate-warm light -> sunset/sunrise."
523
+ )
524
+ return {
525
+ "determined": True,
526
+ "time_of_day": "sunset_sunrise",
527
+ "confidence": 0.88 + confidence * 0.1
528
+ }
529
+
530
+ # Very dark conditions
531
+ else:
532
+ diagnostics["visual_analysis_reason"] = (
533
+ f"P365 outdoor scene '{mapped_scene}' + visual very low light -> night_dark."
534
+ )
535
+ return {
536
+ "determined": True,
537
+ "time_of_day": "night_dark",
538
+ "confidence": 0.75 + confidence * 0.1
539
+ }
540
+
541
+ def _classify_night_outdoor(self, features: Dict[str, Any],
542
+ diagnostics: Dict[str, Any]) -> Dict[str, Any]:
543
+ """Classify nighttime outdoor conditions."""
544
+ bright_spots_overall = features.get("bright_spot_count", 0)
545
+ dark_pixel_ratio = features.get("dark_pixel_ratio", 0.0)
546
+ thresholds = self.config_manager.lighting_thresholds
547
+
548
+ if bright_spots_overall > thresholds.outdoor_night_lights_thresh:
549
+ confidence = 0.82 + min(0.13, dark_pixel_ratio / 2.5)
550
+ diagnostics["visual_analysis_reason"] = "Visual: Low brightness with light sources (street/car lights)."
551
+ return {
552
+ "time_of_day": "night_with_lights",
553
+ "confidence": confidence
554
+ }
555
+ else:
556
+ confidence = 0.78 + min(0.17, dark_pixel_ratio / 1.8)
557
+ diagnostics["visual_analysis_reason"] = "Visual: Very low brightness outdoor, deep night."
558
+ return {
559
+ "time_of_day": "night_dark",
560
+ "confidence": confidence
561
+ }
562
+
563
+ def _classify_sunset_sunrise(self, features: Dict[str, Any], p365_context: Dict[str, Any],
564
+ diagnostics: Dict[str, Any]) -> Dict[str, Any]:
565
+ """Classify sunset/sunrise outdoor conditions."""
566
+ yellow_orange_ratio = features.get("yellow_orange_ratio", 0.0)
567
+ confidence = 0.75 + min(0.20, yellow_orange_ratio / 1.5)
568
+
569
+ diagnostics["visual_analysis_reason"] = "Visual: Moderate brightness, warm tones -> sunset/sunrise."
570
+
571
+ # P365 natural scene boost
572
+ mapped_scene = p365_context["mapped_scene"]
573
+ p365_confidence = p365_context["confidence"]
574
+
575
+ if (any(kw in mapped_scene for kw in ["beach", "mountain", "lake", "ocean", "desert", "field", "natural_landmark", "sky"]) and
576
+ p365_confidence > self.P365_SCENE_MODERATE_CONF_THRESHOLD):
577
+ confidence = min(0.95, confidence + 0.15)
578
+ diagnostics["visual_analysis_reason"] += f" P365 natural scene '{mapped_scene}' supports."
579
+
580
+ return {
581
+ "time_of_day": "sunset_sunrise",
582
+ "confidence": confidence
583
+ }
584
+
585
+ def _classify_bright_day_outdoor(self, features: Dict[str, Any],
586
+ diagnostics: Dict[str, Any]) -> Dict[str, Any]:
587
+ """Classify bright daytime outdoor conditions."""
588
+ sky_like_blue_in_sky_region = features.get("sky_region_blue_dominance", 0.0)
589
+ sky_region_brightness_ratio = features.get("sky_region_brightness_ratio", 1.0)
590
+ texture_complexity = features.get("top_region_texture_complexity", 0.5)
591
+ thresholds = self.config_manager.lighting_thresholds
592
+
593
+ # Clear sky conditions
594
+ if (sky_like_blue_in_sky_region > thresholds.outdoor_day_blue_thresh or
595
+ (sky_region_brightness_ratio > 1.05 and texture_complexity < 0.4)):
596
+
597
+ confidence = 0.80 + min(0.15, sky_like_blue_in_sky_region * 2 +
598
+ (sky_like_blue_in_sky_region * 1.5 if sky_region_brightness_ratio > 1.05 else 0))
599
+ diagnostics["visual_analysis_reason"] = "Visual: High brightness with blue/sky tones or bright smooth top."
600
+
601
+ return {
602
+ "time_of_day": "day_clear",
603
+ "confidence": confidence
604
+ }
605
+
606
+ # Stadium/floodlit detection
607
+ brightness_uniformity = features.get("brightness_uniformity", 0.0)
608
+ bright_spots_overall = features.get("bright_spot_count", 0)
609
+
610
+ if (brightness_uniformity > 0.70 and
611
+ bright_spots_overall > thresholds.stadium_min_spots_thresh):
612
+ diagnostics["visual_analysis_reason"] = (
613
+ "Visual: Very bright, uniform lighting with multiple sources, suggests floodlights (Outdoor)."
614
+ )
615
+ return {
616
+ "time_of_day": "stadium_or_floodlit_area",
617
+ "confidence": 0.78
618
+ }
619
+
620
+ # General bright day
621
+ diagnostics["visual_analysis_reason"] = "Visual: High brightness outdoor, specific sky features unclear."
622
+ return {
623
+ "time_of_day": "day_bright_general",
624
+ "confidence": 0.68
625
+ }
626
+
627
+ def _classify_cloudy_day_outdoor(self, features: Dict[str, Any],
628
+ diagnostics: Dict[str, Any]) -> Dict[str, Any]:
629
+ """Classify cloudy daytime outdoor conditions."""
630
+ sky_region_brightness_ratio = features.get("sky_region_brightness_ratio", 1.0)
631
+ texture_complexity = features.get("top_region_texture_complexity", 0.5)
632
+ avg_saturation = features.get("avg_saturation", 100.0)
633
+ gray_ratio = features.get("gray_ratio", 0.0)
634
+ brightness_uniformity = features.get("brightness_uniformity", 0.0)
635
+ thresholds = self.config_manager.lighting_thresholds
636
+
637
+ # Overcast conditions
638
+ if (sky_region_brightness_ratio > 1.05 and texture_complexity < 0.45 and avg_saturation < 70):
639
+ confidence = 0.75 + min(0.20, gray_ratio / 1.5 + (brightness_uniformity - 0.5) / 1.5)
640
+ diagnostics["visual_analysis_reason"] = (
641
+ "Visual: Good brightness, uniform bright top, lower saturation -> overcast."
642
+ )
643
+ return {
644
+ "time_of_day": "day_cloudy_overcast",
645
+ "confidence": confidence
646
+ }
647
+
648
+ # Gray cloudy conditions
649
+ elif gray_ratio > thresholds.outdoor_day_gray_thresh:
650
+ confidence = 0.72 + min(0.23, gray_ratio / 1.8)
651
+ diagnostics["visual_analysis_reason"] = "Visual: Good brightness with higher gray tones."
652
+ return {
653
+ "time_of_day": "day_cloudy_gray",
654
+ "confidence": confidence
655
+ }
656
+
657
+ # General bright outdoor
658
+ else:
659
+ diagnostics["visual_analysis_reason"] = "Visual: Bright outdoor, specific type less clear."
660
+ return {
661
+ "time_of_day": "day_bright_general",
662
+ "confidence": 0.68
663
+ }
664
+
665
+ def _classify_general_outdoor(self, features: Dict[str, Any],
666
+ diagnostics: Dict[str, Any]) -> Dict[str, Any]:
667
+ """Classify general outdoor conditions when specific patterns are unclear."""
668
+ color_atmosphere = features.get("color_atmosphere", "neutral")
669
+ yellow_orange_ratio = features.get("yellow_orange_ratio", 0.0)
670
+ sky_like_blue_in_sky_region = features.get("sky_region_blue_dominance", 0.0)
671
+
672
+ # Potential sunset/sunrise with low confidence
673
+ if color_atmosphere == "warm" and yellow_orange_ratio > 0.08:
674
+ diagnostics["visual_analysis_reason"] = (
675
+ "Visual: Outdoor, specific conditions less clear; broader visual cues suggest warm lighting."
676
+ )
677
+ return {
678
+ "time_of_day": "sunset_sunrise_low_confidence",
679
+ "confidence": 0.62
680
+ }
681
+
682
+ # Potential hazy day conditions
683
+ elif sky_like_blue_in_sky_region > 0.02:
684
+ diagnostics["visual_analysis_reason"] = (
685
+ "Visual: Outdoor, specific conditions less clear; some blue tones suggest daylight."
686
+ )
687
+ return {
688
+ "time_of_day": "day_hazy_or_partly_cloudy",
689
+ "confidence": 0.62
690
+ }
691
+
692
+ # Unknown outdoor daylight
693
+ else:
694
+ diagnostics["visual_analysis_reason"] = (
695
+ "Visual: Outdoor, specific conditions less clear; broader visual cues."
696
+ )
697
+ return {
698
+ "time_of_day": "outdoor_unknown_daylight",
699
+ "confidence": 0.58
700
+ }
701
+
702
+ def _apply_commercial_indoor_refinement(self, features: Dict[str, Any], p365_context: Dict[str, Any],
703
+ time_of_day: str, confidence: float) -> Dict[str, Any]:
704
+ """Apply commercial indoor lighting refinement if conditions are met."""
705
+ # Skip if already classified as residential, restaurant, or bar
706
+ if any(category in time_of_day for category in ["residential", "restaurant", "bar"]):
707
+ return {"time_of_day": time_of_day, "confidence": confidence}
708
+
709
+ # Skip if P365 suggests home environment
710
+ mapped_scene = p365_context["mapped_scene"]
711
+ if any(kw in mapped_scene for kw in ["home", "residential"]):
712
+ return {"time_of_day": time_of_day, "confidence": confidence}
713
+
714
+ # Check commercial lighting indicators
715
+ avg_brightness = features.get("avg_brightness", 100.0)
716
+ bright_spots_overall = features.get("bright_spot_count", 0)
717
+ light_dist_uniformity = features.get("light_distribution_uniformity", 0.5)
718
+ ceiling_likelihood = features.get("ceiling_likelihood", 0.0)
719
+ thresholds = self.config_manager.lighting_thresholds
720
+
721
+ if (avg_brightness > thresholds.commercial_min_brightness_thresh and
722
+ bright_spots_overall > thresholds.commercial_min_spots_thresh and
723
+ (light_dist_uniformity > 0.5 or ceiling_likelihood > 0.4)):
724
+
725
+ refined_confidence = 0.70 + min(0.2, bright_spots_overall * 0.02)
726
+ return {
727
+ "time_of_day": "indoor_commercial",
728
+ "confidence": refined_confidence
729
+ }
730
+
731
+ return {"time_of_day": time_of_day, "confidence": confidence}
732
+
733
+ def _apply_special_lighting_refinement(self, time_of_day: str, confidence: float,
734
+ features: Dict[str, Any], is_indoor: bool,
735
+ p365_context: Dict[str, Any],
736
+ diagnostics: Dict[str, Any]) -> Dict[str, Any]:
737
+ """Apply special lighting refinement for neon and sodium vapor lighting."""
738
+ # Apply commercial refinement for indoor scenes first
739
+ if is_indoor:
740
+ commercial_result = self._apply_commercial_indoor_refinement(
741
+ features, p365_context, time_of_day, confidence
742
+ )
743
+ time_of_day = commercial_result["time_of_day"]
744
+ confidence = commercial_result["confidence"]
745
+
746
+ # Check for neon/sodium vapor lighting conditions
747
+ is_current_night_or_dim_warm = "night" in time_of_day or time_of_day == "indoor_dim_warm"
748
+
749
+ if not is_current_night_or_dim_warm:
750
+ return {"time_of_day": time_of_day, "confidence": confidence}
751
+
752
+ # Extract features for neon detection
753
+ yellow_orange_ratio = features.get("yellow_orange_ratio", 0.0)
754
+ bright_spots_overall = features.get("bright_spot_count", 0)
755
+ color_atmosphere = features.get("color_atmosphere", "neutral")
756
+ avg_saturation = features.get("avg_saturation", 0.0)
757
+
758
+ # Get neon detection thresholds
759
+ thresholds = self.config_manager.lighting_thresholds
760
+
761
+ # Check neon lighting conditions
762
+ if (yellow_orange_ratio > thresholds.neon_yellow_orange_thresh and
763
+ bright_spots_overall > thresholds.neon_bright_spots_thresh and
764
+ color_atmosphere == "warm" and
765
+ avg_saturation > thresholds.neon_avg_saturation_thresh):
766
+
767
+ old_time_of_day = time_of_day
768
+ old_confidence = confidence
769
+
770
+ # Check P365 context for neon scenes
771
+ mapped_scene = p365_context["mapped_scene"]
772
+ attributes = p365_context["attributes"]
773
+ is_p365_neon_context = (
774
+ any(kw in mapped_scene for kw in ["neon", "nightclub", "bar_neon"]) or
775
+ "neon" in attributes
776
+ )
777
+
778
+ if is_indoor:
779
+ if (is_p365_neon_context or
780
+ any(kw in mapped_scene for kw in self.P365_INDOOR_RESTAURANT_KEYWORDS)):
781
+ time_of_day = "indoor_neon_lit"
782
+ confidence = max(confidence, 0.80)
783
+ else:
784
+ time_of_day = "indoor_dim_warm_neon_accent"
785
+ confidence = max(confidence, 0.77)
786
+ else:
787
+ if (is_p365_neon_context or
788
+ any(kw in mapped_scene for kw in ["street_night", "city_night", "downtown_night"])):
789
+ time_of_day = "neon_or_sodium_vapor_night"
790
+ confidence = max(confidence, 0.82)
791
+ else:
792
+ time_of_day = "night_with_neon_lights"
793
+ confidence = max(confidence, 0.79)
794
+
795
+ # Record the refinement
796
+ diagnostics["special_lighting_detected"] = (
797
+ f"Refined from {old_time_of_day} (Conf:{old_confidence:.2f}) "
798
+ f"to {time_of_day} (Conf:{confidence:.2f}) due to neon/sodium vapor light characteristics. "
799
+ f"P365 Context: {mapped_scene if is_p365_neon_context else 'N/A'}."
800
+ )
801
+
802
+ return {"time_of_day": time_of_day, "confidence": confidence}
803
+
804
+ def _combine_attribute_and_visual_results(self, attribute_result: Dict[str, Any],
805
+ visual_result: Dict[str, Any],
806
+ diagnostics: Dict[str, Any]) -> Dict[str, Any]:
807
+ """Combine Places365 attribute and visual analysis results."""
808
+ # If visual analysis provided a different and potentially more nuanced result
809
+ if (attribute_result["time_of_day"] != visual_result["time_of_day"] and
810
+ visual_result["confidence"] > 0.65):
811
+
812
+ diagnostics["final_decision_source"] = "Visual features (potentially P365-context-refined)."
813
+ diagnostics["p365_attr_overridden_by_visual"] = (
814
+ f"P365 Attr ToD {attribute_result['time_of_day']} "
815
+ f"(Conf {attribute_result['confidence']:.2f}) was less certain or overridden by "
816
+ f"visual logic result {visual_result['time_of_day']} (Conf {visual_result['confidence']:.2f})."
817
+ )
818
+ return visual_result
819
+
820
+ # Use attribute result if it was more confident
821
+ elif attribute_result["confidence"] >= visual_result["confidence"]:
822
+ diagnostics["final_decision_source"] = "High-confidence P365 attribute."
823
+ return attribute_result
824
+
825
+ # Use visual result
826
+ else:
827
+ diagnostics["final_decision_source"] = "Visual features (potentially P365-context-refined)."
828
+ return visual_result
829
+
830
+ def _check_home_environment_pattern(self, features: Dict[str, Any]) -> bool:
831
+ """Check if features indicate a home/residential environment pattern."""
832
+ thresholds = self.config_manager.indoor_outdoor_thresholds
833
+ return features.get("home_environment_pattern", 0.0) > thresholds.home_pattern_thresh_moderate * 0.7
834
+
835
+ def _check_warm_sunset_conditions(self, features: Dict[str, Any]) -> bool:
836
+ """Check if features indicate warm sunset/sunrise lighting conditions."""
837
+ thresholds = self.config_manager.lighting_thresholds
838
+ yellow_orange_ratio = features.get("yellow_orange_ratio", 0.0)
839
+ color_atmosphere = features.get("color_atmosphere", "neutral")
840
+ sky_brightness_ratio = features.get("sky_region_brightness_ratio", 1.0)
841
+
842
+ return (yellow_orange_ratio > thresholds.outdoor_dusk_dawn_color_thresh and
843
+ color_atmosphere == "warm" and
844
+ sky_brightness_ratio < 1.5)
845
+
846
+ def _get_default_lighting_result(self) -> Dict[str, Any]:
847
+ """Return default lighting analysis result in case of errors."""
848
+ return {
849
+ "time_of_day": "unknown",
850
+ "confidence": 0.5,
851
+ "diagnostics": {
852
+ "error": "Lighting analysis failed, using default values"
853
+ }
854
+ }
llm_enhancer.py CHANGED
@@ -1,1266 +1,504 @@
1
- import re
2
- import os
3
- import torch
4
- from typing import Dict, List, Tuple, Any, Optional
5
  import logging
 
 
 
 
 
 
 
 
6
 
7
  class LLMEnhancer:
8
  """
9
- 負責使用LLM (Large Language Model) 增強場景理解和描述。
10
- 未來可以再整合Llama或其他LLM模型進行場景描述的生成和豐富化。
11
  """
12
 
13
  def __init__(self,
14
- model_path: Optional[str] = None,
15
- tokenizer_path: Optional[str] = None,
16
- device: Optional[str] = None,
17
- max_length: int = 2048,
18
- temperature: float = 0.3,
19
- top_p: float = 0.85):
20
  """
21
- 初始化LLM增強器
 
22
  Args:
23
- model_path: LLM模型的路徑或HuggingFace log in,默認使用Llama 3.2
24
- tokenizer_path: token處理器的路徑,通常與model_path相同
25
- device: 設備檢查 ('cpu'或'cuda')
26
- max_length: 生成文本的最大長度
27
- temperature: 生成文本的溫度(較高比較有創意,較低會偏保守)
28
  top_p: 生成文本時的核心採樣機率閾值
29
  """
30
- self.logger = logging.getLogger("LLMEnhancer")
31
- self.logger.setLevel(logging.INFO)
32
- handler = logging.StreamHandler()
33
- handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
34
- self.logger.addHandler(handler)
 
 
 
35
 
36
- # 默認用 Llama3.2
37
- self.model_path = model_path or "meta-llama/Llama-3.2-3B-Instruct"
38
- self.tokenizer_path = tokenizer_path or self.model_path
 
 
 
 
 
 
 
39
 
40
- # check device
41
- self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
42
- self.logger.info(f"Using device: {self.device}")
43
 
44
- # create parameters
45
- self.max_length = max_length
46
- self.temperature = temperature
47
- self.top_p = top_p
48
 
49
- self.model = None
50
- self.tokenizer = None
51
 
52
- # 追蹤模型調用次數
53
- self.call_count = 0
 
 
 
54
 
55
- self._initialize_prompts()
 
 
56
 
57
- # only if need to load the model
58
- self._model_loaded = False
 
59
 
 
 
 
60
  try:
61
- self.hf_token = os.environ.get("HF_TOKEN")
62
- if self.hf_token:
63
- self.logger.info("Logging in to Hugging Face with token")
64
- from huggingface_hub import login
65
- login(token=self.hf_token)
66
- else:
67
- self.logger.warning("HF_TOKEN not found in environment variables. Access to gated models may be limited.")
68
- except Exception as e:
69
- self.logger.error(f"Error during Hugging Face login: {e}")
70
 
71
def _load_model(self):
    """
    Load the tokenizer and the model on first use.

    Does nothing when ``self._model_loaded`` is already True. The model is
    loaded with an 8-bit ``BitsAndBytesConfig`` to save memory, and the
    tokenizer's pad token is set to its EOS token.

    Raises:
        Exception: Any error raised while loading is logged with its
            traceback and re-raised.
    """
    if self._model_loaded:
        return

    try:
        self.logger.info(f"Loading LLM model from {self.model_path} with 8-bit quantization")
        from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            # This is the device's total memory, not the free amount.
            total_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
            self.logger.info(f"Total GPU memory: {total_gb:.2f} GB")

        # 8-bit configuration (saves memory)
        quantization_config = BitsAndBytesConfig(
            load_in_8bit=True,
            llm_int8_enable_fp32_cpu_offload=True
        )

        self.tokenizer = AutoTokenizer.from_pretrained(
            self.tokenizer_path,
            padding_side="left",
            use_fast=False,
            token=self.hf_token
        )

        # Special tokens: reuse EOS as the padding token
        self.tokenizer.pad_token = self.tokenizer.eos_token

        # Load the 8-bit quantized model
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_path,
            quantization_config=quantization_config,
            device_map="auto",
            low_cpu_mem_usage=True,
            token=self.hf_token
        )

        self.logger.info("Model loaded successfully with 8-bit quantization")
        self._model_loaded = True

    except Exception as e:
        # logger.exception records the traceback through the class logger
        # instead of printing it to stderr separately.
        self.logger.exception(f"Error loading LLM model: {e}")
        raise
119
-
120
def _initialize_prompts(self):
    """
    Define the prompt templates used by this class.

    Sets three ``str.format`` templates on the instance and returns nothing:
    ``enhance_description_template``, ``verify_detection_template`` and
    ``no_detection_template``.

    NOTE(review): leading whitespace inside the template text could not be
    recovered from the diff view this block was taken from; confirm the
    exact layout against the repository.
    """
    # Main prompt: rewrites a scene description without adding facts.
    # Placeholders: {original_description}, {object_list}.
    self.enhance_description_template = """
<|system|>
You are an expert visual analyst. Your task is to improve the readability and fluency of scene descriptions using STRICT factual accuracy.
Your **top priority is to avoid hallucination** or fabrication. You are working in a computer vision pipeline using object detection (YOLO) and image embeddings. You MUST treat the input object list as a whitelist. Do not speculate beyond this list.
</|system|>
<|user|>
Rewrite the following scene description to be fluent and clear. DO NOT add any objects, events, or spatial relationships that are not explicitly present in the original or object list.
ORIGINAL:
{original_description}
CRITICAL RULES:
1. NEVER assume room type, object function, or scene purpose unless directly stated.
2. NEVER invent object types. You are limited to: {object_list}
3. NEVER speculate on object quantity. If the description says "10 people" , DO NOT say "dozens" or "many". Maintain the original quantity unless specified.
4. Use terms like "in the scene", "visible in the background", or "positioned in the lower left" instead of assuming direction or layout logic.
5. You MAY describe confirmed materials, colors, and composition style if visually obvious and non-speculative.
6. Write 2–4 complete, well-structured sentences with punctuation.
7. Final output MUST be a single fluent paragraph of 60–200 words (not longer).
8. Begin your response directly with the scene description. Do NOT include any introductory phrases, explanations, or formatting indicators.
9. Ensure grammatical completeness in all sentences. Each sentence must have a complete subject and predicate structure.
10. Vary sentence structures naturally while maintaining grammatical accuracy. Avoid incomplete phrases or dangling modifiers.
11. Limit repetition of descriptive verbs and spatial indicators to maintain text diversity and readability.
12. Create natural spatial flow by connecting object descriptions organically rather than listing positions mechanically.
13. Use transitional phrases to connect ideas smoothly, varying expression patterns throughout the description.
14. End with a conclusive observation about atmosphere, style, or overall impression rather than restating layout information.
15. When describing quantities or arrangements, use only information explicitly confirmed by the object detection system.
</|user|>
<|assistant|>
"""

    # Prompt for checking detections for errors.
    # Placeholders: {scene_type}, {scene_name}, {confidence:.2f},
    # {detected_objects}, {clip_analysis}.
    self.verify_detection_template = """
Task: You are an advanced vision system that verifies computer vision detections for accuracy.
Analyze the following detection results and identify any potential errors or inconsistencies:
SCENE TYPE: {scene_type}
SCENE NAME: {scene_name}
CONFIDENCE: {confidence:.2f}
DETECTED OBJECTS: {detected_objects}
CLIP ANALYSIS RESULTS:
{clip_analysis}
Possible Errors to Check:
1. Objects misidentified (e.g., architectural elements labeled as vehicles)
2. Cultural elements misunderstood (e.g., Asian temple structures labeled as boats)
3. Objects that seem out of place for this type of scene
4. Inconsistencies between different detection systems
If you find potential errors, list them clearly with explanations. If the detections seem reasonable, state that they appear accurate.
Verification Results:
"""

    # Prompt used when object detection found nothing.
    # Placeholders: {top_scene}, {top_confidence:.2f}, {viewpoint},
    # {lighting_condition}, {cultural_analysis}.
    self.no_detection_template = """
Task: You are an advanced scene understanding system analyzing an image where standard object detection failed to identify specific objects.
Based on advanced image embeddings (CLIP analysis), we have the following information:
MOST LIKELY SCENE: {top_scene} (confidence: {top_confidence:.2f})
VIEWPOINT: {viewpoint}
LIGHTING: {lighting_condition}
CULTURAL ANALYSIS: {cultural_analysis}
Create a detailed description of what might be in this scene, considering:
1. The most likely type of location or setting
2. Possible architectural or natural elements present
3. The lighting and atmosphere
4. Potential cultural or regional characteristics
Your description should be natural, flowing, and offer insights into what the image likely contains despite the lack of specific object detection.
Scene Description:
"""
187
-
188
- def _clean_llama_response(self, response: str) -> str:
189
- """處理 Llama 模型特有的輸出格式問題"""
190
- # 首先應用通用清理
191
- response = self._clean_model_response(response)
192
-
193
- # 移除 Llama 常見的前綴短語
194
- prefixes_to_remove = [
195
- "Here's the enhanced description:",
196
- "Enhanced description:",
197
- "Here is the enhanced scene description:",
198
- "I've enhanced the description while preserving all factual details:"
199
- ]
200
-
201
- for prefix in prefixes_to_remove:
202
- if response.lower().startswith(prefix.lower()):
203
- response = response[len(prefix):].strip()
204
-
205
- # 移除可能的後綴說明
206
- suffixes_to_remove = [
207
- "I've maintained all the key factual elements",
208
- "I've preserved all the factual details",
209
- "All factual elements have been maintained"
210
- ]
211
-
212
- for suffix in suffixes_to_remove:
213
- if response.lower().endswith(suffix.lower()):
214
- response = response[:response.rfind(suffix)].strip()
215
-
216
- return response
217
-
218
- # For Future Usage
219
- def _detect_scene_type(self, detected_objects: List[Dict]) -> str:
220
- """
221
- Detect scene type based on object distribution and patterns
222
- """
223
- # Default scene type
224
- scene_type = "intersection"
225
 
226
- # Count objects by class
227
- object_counts = {}
228
- for obj in detected_objects:
229
- class_name = obj.get("class_name", "")
230
- if class_name not in object_counts:
231
- object_counts[class_name] = 0
232
- object_counts[class_name] += 1
233
 
234
- # 辨識人
235
- people_count = object_counts.get("person", 0)
 
 
 
 
 
236
 
237
- # 交通工具的
238
- car_count = object_counts.get("car", 0)
239
- bus_count = object_counts.get("bus", 0)
240
- truck_count = object_counts.get("truck", 0)
241
- total_vehicles = car_count + bus_count + truck_count
 
 
 
 
 
 
 
 
 
 
 
 
 
242
 
243
- # Simple scene type detection logic
244
- if people_count > 8 and total_vehicles < 2:
245
- scene_type = "pedestrian_crossing"
246
- elif people_count > 5 and total_vehicles > 2:
247
- scene_type = "busy_intersection"
248
- elif people_count < 3 and total_vehicles > 3:
249
- scene_type = "traffic_junction"
250
 
251
- return scene_type
 
 
 
252
 
253
- def _clean_scene_type(self, scene_type: str) -> str:
254
- """清理場景類型,使其更適合用於提示詞"""
255
- if not scene_type:
256
- return "scene"
257
 
258
- # replace underline to space or sometime capital letter
259
- if '_' in scene_type:
260
- return ' '.join(word.capitalize() for word in scene_type.split('_'))
 
 
261
 
262
- return scene_type
 
 
263
 
264
- def _clean_model_response(self, response: str) -> str:
265
- """清理模型回應以移除常見的標記和前綴"""
266
- # 移除任何可能殘留的系統樣式標記
267
- response = re.sub(r'<\|.*?\|>', '', response)
268
 
269
- # 移除任何 "This european_plaza" 或類似前綴
270
- response = re.sub(r'^This [a-z_]+\s+', '', response)
 
 
 
 
 
 
 
 
 
 
271
 
272
- # 確保響應以大寫字母開頭
273
- if response and not response[0].isupper():
274
- response = response[0].upper() + response[1:]
275
 
276
- return response.strip()
 
 
 
 
277
 
278
def reset_context(self):
    """
    Reset the model context before a new image is processed.

    When the model has been loaded, the CUDA cache is emptied; either way
    an info message is logged saying what happened.
    """
    if not self._model_loaded:
        self.logger.info("Model not loaded, no context to reset")
        return

    # Release cached GPU memory
    torch.cuda.empty_cache()
    self.logger.info("Model context reset")
286
-
287
- def _remove_introduction_sentences(self, response: str) -> str:
288
- """remove introduction sentences"""
289
- # 識別常見的介紹性模式
290
- intro_patterns = [
291
- r'^Here is the (?:rewritten|enhanced) .*?description:',
292
- r'^The (?:rewritten|enhanced) description:',
293
- r'^Here\'s the (?:rewritten|enhanced) description of .*?:'
294
- ]
295
-
296
- for pattern in intro_patterns:
297
- if re.match(pattern, response, re.IGNORECASE):
298
- # 找到冒號後的內容
299
- parts = re.split(r':', response, 1)
300
- if len(parts) > 1:
301
- return parts[1].strip()
302
-
303
- return response
304
 
305
def enhance_description(self, scene_data: Dict[str, Any]) -> str:
    """
    Rewrite a scene description with the LLM while keeping it factual.

    This is the main entry point: it builds the prompt from the scene data,
    generates a response (retrying up to three times when the response looks
    incomplete), then cleans and fact-checks the result.

    Args:
        scene_data: Scene information. Keys read here: "original_description",
            "scene_type", "detected_objects", "object_statistics",
            "lighting_info", "scene_name" and "primary_landmark". For
            landmark-only scenes with a very short description,
            "original_description" is overwritten with a generated one.

    Returns:
        The enhanced description. The original description is returned when
        the response is empty/too short or when any step raises, and a fixed
        message is returned when no original description was supplied.
    """
    # Read the fallback value before the try block so the except clause can
    # always return it, even when resetting/loading the model is what failed.
    original_desc = scene_data.get("original_description", "")

    def _looks_incomplete(text: str) -> bool:
        """Heuristic: too short, no sentence end near the tail, or dangling phrase."""
        return (
            len(text) < 100 or
            (len(text) < 200 and "." not in text[-30:]) or
            any(text.endswith(phrase) for phrase in ["in the", "with the", "and the"])
        )

    try:
        # Reset the context
        self.reset_context()

        # Make sure the model is loaded
        if not self._model_loaded:
            self._load_model()

        if not original_desc:
            return "No original description provided."

        # Get the scene type and normalize it
        scene_type = scene_data.get("scene_type", "unknown scene")
        scene_type = self._clean_scene_type(scene_type)

        # Keep only confidently detected objects
        detected_objects = scene_data.get("detected_objects", [])
        filtered_objects = []
        high_confidence_threshold = 0.65
        # These classes need a higher confidence before they are trusted
        special_classes = ["airplane", "helicopter", "boat"]

        for obj in detected_objects:
            confidence = obj.get("confidence", 0)
            class_name = obj.get("class_name", "")

            if class_name in special_classes and confidence < 0.75:
                continue

            if confidence >= high_confidence_threshold:
                filtered_objects.append(obj)

        # Prefer the supplied object statistics; count manually otherwise
        object_statistics = scene_data.get("object_statistics", {})
        object_counts = {}

        if object_statistics:
            for class_name, stats in object_statistics.items():
                if stats.get("count", 0) > 0 and stats.get("avg_confidence", 0) >= high_confidence_threshold:
                    object_counts[class_name] = stats["count"]
        else:
            # Fall back to counting the filtered detections
            for obj in filtered_objects:
                class_name = obj.get("class_name", "")
                if class_name not in object_counts:
                    object_counts[class_name] = 0
                object_counts[class_name] += 1

        # Format the counts as "<n> <class>[s]"
        high_confidence_objects = ", ".join([
            f"{count} {obj}{'s' if count > 1 else ''}"
            for obj, count in object_counts.items()
        ])

        # Without confident objects, fall back to mentions in the description
        if not high_confidence_objects:
            object_keywords = self._extract_objects_from_description(original_desc)
            high_confidence_objects = ", ".join(object_keywords) if object_keywords else "objects visible in the scene"

        # Keep the viewpoint wording from the original description
        perspective = self._extract_perspective_from_description(original_desc)

        # Lighting information
        lighting_description = "unknown lighting"
        if "lighting_info" in scene_data:
            lighting_info = scene_data.get("lighting_info", {})
            time_of_day = lighting_info.get("time_of_day", "unknown")
            is_indoor = lighting_info.get("is_indoor", False)
            lighting_description = f"{'indoor' if is_indoor else 'outdoor'} {time_of_day} lighting"

        # Build the prompt (unused keyword arguments are ignored by format)
        prompt = self.enhance_description_template.format(
            scene_type=scene_type,
            object_list=high_confidence_objects,
            original_description=original_desc,
            perspective=perspective,
            lighting_description=lighting_description
        )

        self.logger.info("Generating LLM response...")
        response = self._generate_llm_response(prompt)

        is_landmark_only = (
            scene_data.get("scene_type") in ["tourist_landmark", "natural_landmark", "historical_monument"] and
            (not scene_data.get("detected_objects") or len(scene_data.get("detected_objects", [])) <= 1)
        )

        # Landmark-only scenes: make sure there is a usable base description
        if is_landmark_only:
            original_desc = scene_data.get("original_description", "")
            if not original_desc or len(original_desc.strip()) < 10:
                # Build a basic description from the scene type and landmark
                scene_type = scene_data.get("scene_type", "unknown")
                scene_name = scene_data.get("scene_name", "Unknown")
                if "primary_landmark" in scene_data:
                    landmark_name = scene_data["primary_landmark"].get("name", "unnamed landmark")
                    original_desc = f"A {scene_type.replace('_', ' ')} scene featuring {landmark_name}."
                else:
                    original_desc = f"A {scene_type.replace('_', ' ')} scene."

                # Update the scene data
                scene_data["original_description"] = original_desc

        # Regenerate while the response looks incomplete
        max_retries = 3
        attempts = 0
        while attempts < max_retries and _looks_incomplete(response):
            self.logger.warning(f"Generated incomplete response, retrying... Attempt {attempts+1}/{max_retries}")
            response = self._generate_llm_response(prompt)
            attempts += 1

        if not response or len(response.strip()) < 10:
            self.logger.warning("Generated response was empty or too short, returning original description")
            return original_desc

        # Use the cleanup that matches the model family
        if "llama" in self.model_path.lower():
            result = self._clean_llama_response(response)
        else:
            result = self._clean_model_response(response)

        # Remove introductory sentences
        result = self._remove_introduction_sentences(result)

        # Remove explanatory notes
        result = self._remove_explanatory_notes(result)

        # Fact check
        result = self._verify_factual_accuracy(original_desc, result, high_confidence_objects)

        # Keep scene type and viewpoint consistent
        result = self._ensure_scene_type_consistency(result, scene_type, original_desc)
        if perspective and perspective.lower() not in result.lower():
            result = f"{perspective}, {result[0].lower()}{result[1:]}"

        final_result = str(result)
        if not final_result or len(final_result.strip()) < 20:
            self.logger.warning(f"WARNING: LLM enhanced description is empty or too short!")
            self.logger.info(f"Original description: {original_desc[:50]}...")
            self.logger.info(f"Input data: scene_type={scene_data.get('scene_type')}, objects={len(scene_data.get('detected_objects', []))}")
        else:
            self.logger.info(f"LLM enhanced description generated successfully ({len(final_result)} chars)")

        return final_result

    except Exception as e:
        self.logger.error(f"Enhancement failed: {str(e)}")
        import traceback
        self.logger.error(traceback.format_exc())
        return original_desc  # Fall back to the original description on any error
480
-
481
- def _verify_factual_accuracy(self, original: str, generated: str, object_list: str) -> str:
482
- """驗證生成的描述不包含原始描述或物體列表中沒有的信息,並檢測重複用詞問題"""
483
-
484
- # 將原始描述和物體列表合併為授權詞彙源
485
- authorized_content = original.lower() + " " + object_list.lower()
486
-
487
- # 提取生成描述中具有實質意義的名詞
488
- # 創建常見地點、文化和地域詞彙的列表
489
- location_terms = ["plaza", "square", "market", "mall", "avenue", "boulevard"]
490
- cultural_terms = ["european", "asian", "american", "african", "western", "eastern"]
491
-
492
- # 檢查生成文本中的每個詞
493
- for term in location_terms + cultural_terms:
494
- # 僅當該詞出現在生成文本但不在授權內容中時進行替換
495
- if term in generated.lower() and term not in authorized_content:
496
- # 根據詞語類型選擇適當的替換詞
497
- if term in location_terms:
498
- replacement = "area"
499
- else:
500
- replacement = "scene"
501
-
502
- # 使用正則表達式進行完整詞匹配替換
503
- pattern = re.compile(r'\b' + term + r'\b', re.IGNORECASE)
504
- generated = pattern.sub(replacement, generated)
505
-
506
- # 檢查描述性詞彙重複問題
507
- repetitive_patterns = [
508
- (r'\b(visible)\b.*?\b(visible)\b', 'Multiple uses of "visible" detected'),
509
- (r'\b(positioned)\b.*?\b(positioned)\b', 'Multiple uses of "positioned" detected'),
510
- (r'\b(located)\b.*?\b(located)\b', 'Multiple uses of "located" detected'),
511
- (r'\b(situated)\b.*?\b(situated)\b', 'Multiple uses of "situated" detected'),
512
- (r'\b(appears)\b.*?\b(appears)\b', 'Multiple uses of "appears" detected'),
513
- (r'\b(features)\b.*?\b(features)\b', 'Multiple uses of "features" detected'),
514
- (r'\bThis\s+(\w+)\s+.*?\bThis\s+\1\b', 'Repetitive sentence structure detected')
515
- ]
516
-
517
- # 定義替換詞典,提供多樣化的表達方式
518
- replacement_dict = {
519
- 'visible': ['present', 'evident', 'apparent', 'observable'],
520
- 'positioned': ['arranged', 'placed', 'set', 'organized'],
521
- 'located': ['found', 'placed', 'situated', 'established'],
522
- 'situated': ['placed', 'positioned', 'arranged', 'set'],
523
- 'appears': ['seems', 'looks', 'presents', 'exhibits'],
524
- 'features': ['includes', 'contains', 'displays', 'showcases']
525
- }
526
-
527
- for pattern, issue in repetitive_patterns:
528
- matches = list(re.finditer(pattern, generated, re.IGNORECASE | re.DOTALL))
529
- if matches:
530
- self.logger.warning(f"Text quality issue detected: {issue}")
531
-
532
- # 針對特定重複詞彙進行替換
533
- for word in replacement_dict.keys():
534
- if word in issue.lower():
535
- word_pattern = re.compile(r'\b' + word + r'\b', re.IGNORECASE)
536
- word_matches = list(word_pattern.finditer(generated))
537
-
538
- # 保留第一次出現,替換後續出現
539
- for i, match in enumerate(word_matches[1:], 1):
540
- if i <= len(replacement_dict[word]):
541
- replacement = replacement_dict[word][(i-1) % len(replacement_dict[word])]
542
-
543
- # 保持原始大小寫格式
544
- if match.group().isupper():
545
- replacement = replacement.upper()
546
- elif match.group().istitle():
547
- replacement = replacement.capitalize()
548
-
549
- # 執行替換
550
- generated = generated[:match.start()] + replacement + generated[match.end():]
551
- # 重新計算後續匹配位置
552
- word_matches = list(word_pattern.finditer(generated))
553
- break
554
-
555
- return generated
556
-
557
 
558
def verify_detection(self,
                     detected_objects: List[Dict],
                     clip_analysis: Dict[str, Any],
                     scene_type: str,
                     scene_name: str,
                     confidence: float) -> Dict[str, Any]:
    """
    Ask the LLM to verify (and possibly question) the YOLO detections.

    Args:
        detected_objects: Objects detected by YOLO.
        clip_analysis: CLIP analysis results.
        scene_type: Identified scene type.
        scene_name: Scene name.
        confidence: Confidence of the scene classification.

    Returns:
        Dict with "verification_text" (the LLM output), "has_errors" (True
        unless the output contains "appear accurate", ignoring case) and
        "corrected_objects" (always None).
    """
    # Make sure the model is loaded
    self._load_model()

    # Format the inputs and fill the prompt template
    prompt_fields = {
        "scene_type": scene_type,
        "scene_name": scene_name,
        "confidence": confidence,
        "detected_objects": self._format_objects_for_prompt(detected_objects),
        "clip_analysis": self._format_clip_results(clip_analysis),
    }
    filled_prompt = self.verify_detection_template.format(**prompt_fields)

    # Run the verification through the LLM
    llm_verdict = self._generate_llm_response(filled_prompt)

    looks_accurate = "appear accurate" in llm_verdict.lower()
    return {
        "verification_text": llm_verdict,
        "has_errors": not looks_accurate,
        "corrected_objects": None
    }
602
-
603
- def _remove_explanatory_notes(self, response: str) -> str:
604
- """移除解釋性注釋、說明和其他非描述性內容"""
605
-
606
- # 識別常見的注釋和解釋模式
607
- note_patterns = [
608
- r'(?:^|\n)Note:.*?(?:\n|$)',
609
- r'(?:^|\n)I have (?:followed|adhered to|ensured).*?(?:\n|$)',
610
- r'(?:^|\n)This description (?:follows|adheres to|maintains).*?(?:\n|$)',
611
- r'(?:^|\n)The enhanced description (?:maintains|preserves).*?(?:\n|$)'
612
- ]
613
-
614
- # 尋找第一段完整的描述內容
615
- paragraphs = [p.strip() for p in response.split('\n\n') if p.strip()]
616
-
617
- # 如果只有一個段落,檢查並清理它
618
- if len(paragraphs) == 1:
619
- for pattern in note_patterns:
620
- paragraphs[0] = re.sub(pattern, '', paragraphs[0], flags=re.IGNORECASE)
621
- return paragraphs[0].strip()
622
-
623
- # 如果有多個段落,識別並移除注釋段落
624
- content_paragraphs = []
625
- for paragraph in paragraphs:
626
- is_note = False
627
- for pattern in note_patterns:
628
- if re.search(pattern, paragraph, flags=re.IGNORECASE):
629
- is_note = True
630
- break
631
-
632
- # 檢查段落是否以常見的注釋詞開頭
633
- if paragraph.lower().startswith(('note:', 'please note:', 'remember:')):
634
- is_note = True
635
-
636
- if not is_note:
637
- content_paragraphs.append(paragraph)
638
-
639
- # 返回清理後的內容
640
- return '\n\n'.join(content_paragraphs).strip()
641
 
642
def handle_no_detection(self, clip_analysis: Dict[str, Any]) -> str:
    """
    Produce a scene description when YOLO detected no objects.

    Args:
        clip_analysis: CLIP analysis results. "top_scene", "viewpoint" and
            "lighting_condition" are read as (label, score) pairs, and
            "cultural_analysis" as a dict; each has a default when missing.

    Returns:
        str: The generated scene description.
    """
    # Make sure the model is loaded
    self._load_model()

    # Pull the CLIP results
    scene_guess = clip_analysis.get("top_scene", ("unknown", 0))
    top_scene, top_confidence = scene_guess
    viewpoint_label = clip_analysis.get("viewpoint", ("standard", 0))[0]
    lighting_label = clip_analysis.get("lighting_condition", ("unknown", 0))[0]

    # Format the cultural analysis
    culture_text = self._format_cultural_analysis(clip_analysis.get("cultural_analysis", {}))

    # Build the prompt
    filled_prompt = self.no_detection_template.format(
        top_scene=top_scene,
        top_confidence=top_confidence,
        viewpoint=viewpoint_label,
        lighting_condition=lighting_label,
        cultural_analysis=culture_text
    )

    # Generate the description and tidy the output
    # NOTE(review): _clean_llm_response is not defined in the visible part of
    # this class - confirm it exists (other cleanup helpers are named
    # _clean_llama_response and _clean_model_response).
    raw_description = self._generate_llm_response(filled_prompt)
    return self._clean_llm_response(raw_description)
 
 
 
675
 
676
- def _clean_input_text(self, text: str) -> str:
 
 
 
 
 
 
 
 
677
  """
678
- 對輸入文本進行通用的格式清理,處理常見的格式問題。
679
- Args:
680
- text: 輸入文本
681
  Returns:
682
- 清理後的文本
683
  """
684
- if not text:
685
- return ""
686
-
687
- # 清理格式的問題
688
- # 1. 處理連續標點符號問題
689
- text = re.sub(r'([.,;:!?])\1+', r'\1', text)
690
-
691
- # 2. 修復不完整句子的標點(如 "Something," 後沒有繼續接續下去)
692
- text = re.sub(r',\s*$', '.', text)
693
-
694
- # 3. 修復如 "word." 後未加空格即接下一句的問題
695
- text = re.sub(r'([.!?])([A-Z])', r'\1 \2', text)
696
 
697
- # 4. 移除多餘空格
698
- text = re.sub(r'\s+', ' ', text).strip()
699
-
700
- # 5. 確保句子正確結束(句尾加句號)
701
- if text and not text[-1] in '.!?':
702
- text += '.'
703
 
704
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
705
 
706
- def _fact_check_description(self, original_desc: str, enhanced_desc: str, scene_type: str, detected_objects: List[str]) -> str:
707
  """
708
- 驗證並可能修正增強後的描述,確保有保持事實準確性。
709
- Args:
710
- original_desc: 原始場景描述
711
- enhanced_desc: 增強後的描述待驗證
712
- scene_type: 場景類型
713
- detected_objects: 檢測到的物體名稱列表
714
  Returns:
715
- 經過事實檢查的描述
716
  """
717
- # 如果增強描述為空或太短,返回原始描述
718
- if not enhanced_desc or len(enhanced_desc) < 30:
719
- return original_desc
720
-
721
- # 1. 檢查數值一致性(如人數、物體數量等)
722
- # 從原始描述中提取數字和相關名詞
723
- number_patterns = [
724
- (r'(\d+)\s+(people|person|pedestrians|individuals)', r'\1', r'\2'), # 人數
725
- (r'(\d+)\s+(cars|vehicles|automobiles)', r'\1', r'\2'), # 車輛數
726
- (r'(\d+)\s+(buildings|structures)', r'\1', r'\2') # 建築數
727
- ]
728
-
729
- # 檢查原始描述中的每個數字
730
- for pattern, num_group, word_group in number_patterns:
731
- original_matches = re.finditer(pattern, original_desc, re.IGNORECASE)
732
- for match in original_matches:
733
- number = match.group(1)
734
- noun = match.group(2)
735
-
736
- # 檢查增強描述中是否保留了這個數字
737
- # 創建一個更通用的模式來檢查增強描述中是否包含此數字和對象類別
738
- enhanced_pattern = r'(\d+)\s+(' + re.escape(noun) + r'|' + re.escape(noun.rstrip('s')) + r'|' + re.escape(noun + 's') + r')'
739
- enhanced_matches = list(re.finditer(enhanced_pattern, enhanced_desc, re.IGNORECASE))
740
-
741
- if not enhanced_matches:
742
- # 數字+名詞未在增強描述中找到
743
- plural_form = noun if noun.endswith('s') or number == '1' else noun + 's'
744
- if enhanced_desc.startswith("This") or enhanced_desc.startswith("The"):
745
- enhanced_desc = enhanced_desc.replace("This ", f"This scene with {number} {plural_form} ", 1)
746
- enhanced_desc = enhanced_desc.replace("The ", f"The scene with {number} {plural_form} ", 1)
747
- else:
748
- enhanced_desc = f"The scene includes {number} {plural_form}. " + enhanced_desc
749
- elif enhanced_matches and match.group(1) != number:
750
- # 存在但數字不一致,就要更正數字
751
- for ematch in enhanced_matches:
752
- wrong_number = ematch.group(1)
753
- enhanced_desc = enhanced_desc.replace(f"{wrong_number} {ematch.group(2)}", f"{number} {ematch.group(2)}")
754
-
755
- # 2. 檢查視角的一致性
756
- perspective_terms = {
757
- "aerial": ["aerial", "bird's-eye", "overhead", "top-down", "above", "looking down"],
758
- "ground": ["street-level", "ground level", "eye-level", "standing"],
759
- "indoor": ["inside", "interior", "indoor", "within"],
760
- "close-up": ["close-up", "detailed view", "close shot"]
761
- }
762
-
763
- # 確定原始視角
764
- original_perspective = None
765
- for persp, terms in perspective_terms.items():
766
- if any(term in original_desc.lower() for term in terms):
767
- original_perspective = persp
768
- break
769
-
770
- # 檢查是否保留了視角方面
771
- if original_perspective:
772
- enhanced_has_perspective = any(term in enhanced_desc.lower() for term in perspective_terms[original_perspective])
773
-
774
- if not enhanced_has_perspective:
775
- # 添加之前缺的視角方面
776
- perspective_prefixes = {
777
- "aerial": "From an aerial perspective, ",
778
- "ground": "From street level, ",
779
- "indoor": "In this indoor setting, ",
780
- "close-up": "In this close-up view, "
781
- }
782
-
783
- prefix = perspective_prefixes.get(original_perspective, "")
784
- if prefix:
785
- if enhanced_desc[0].isupper():
786
- enhanced_desc = prefix + enhanced_desc[0].lower() + enhanced_desc[1:]
787
- else:
788
- enhanced_desc = prefix + enhanced_desc
789
-
790
- # 3. 檢查場景類型一致性
791
- if scene_type and scene_type.lower() != "unknown" and scene_type.lower() not in enhanced_desc.lower():
792
- # 添加場景類型
793
- if enhanced_desc.startswith("This ") or enhanced_desc.startswith("The "):
794
- # 避免產生 "This scene" 和 "This intersection" 的重複
795
- if "scene" in enhanced_desc[:15].lower():
796
- fixed_type = scene_type.lower()
797
- enhanced_desc = enhanced_desc.replace("scene", fixed_type, 1)
798
- else:
799
- enhanced_desc = enhanced_desc.replace("This ", f"This {scene_type} ", 1)
800
- enhanced_desc = enhanced_desc.replace("The ", f"The {scene_type} ", 1)
801
- else:
802
- enhanced_desc = f"This {scene_type} " + enhanced_desc
803
 
804
- # 4. 確保文字長度適當,這邊的限制要與prompt相同,否則會產生矛盾
805
- words = enhanced_desc.split()
806
- if len(words) > 200:
807
- # 找尋接近字數限制的句子結束處
808
- truncated = ' '.join(words[:200])
809
- last_period = max(truncated.rfind('.'), truncated.rfind('!'), truncated.rfind('?'))
810
 
811
- if last_period > 0:
812
- enhanced_desc = truncated[:last_period+1]
813
- else:
814
- enhanced_desc = truncated + '.'
815
-
816
- return enhanced_desc
817
-
818
- def _extract_perspective_from_description(self, description: str) -> str:
819
- """從原始描述中提取視角/透視信息"""
820
- perspective_terms = {
821
- "aerial": ["aerial perspective", "aerial view", "bird's-eye view", "overhead view", "from above"],
822
- "ground": ["ground level", "eye level", "street level"],
823
- "indoor": ["indoor setting", "inside", "interior"]
824
- }
825
-
826
- for persp_type, terms in perspective_terms.items():
827
- for term in terms:
828
- if term.lower() in description.lower():
829
- return term
830
-
831
- return ""
832
-
833
- def _extract_objects_from_description(self, description: str) -> List[str]:
834
- """從原始描述中提取物件提及"""
835
- # 常見物件正則表達式模式
836
- object_patterns = [
837
- r'(\d+)\s+(people|persons|pedestrians|individuals)',
838
- r'(\d+)\s+(cars|vehicles|automobiles)',
839
- r'(\d+)\s+(buildings|structures)',
840
- r'(\d+)\s+(plants|potted plants|flowers)',
841
- r'(\d+)\s+(beds|furniture|tables|chairs)'
842
- ]
843
-
844
- extracted_objects = []
845
-
846
- for pattern in object_patterns:
847
- matches = re.finditer(pattern, description, re.IGNORECASE)
848
- for match in matches:
849
- number = match.group(1)
850
- object_type = match.group(2)
851
- extracted_objects.append(f"{number} {object_type}")
852
-
853
- return extracted_objects
854
-
855
- def _ensure_scene_type_consistency(self, description: str, scene_type: str, original_desc: str) -> str:
856
- """確保描述中的場景類型與指定的場景類型一致"""
857
- # 禁止使用的錯誤場景詞列表
858
- prohibited_scene_words = ["plaza", "square", "european", "asian", "american"]
859
-
860
- # 檢查是否包含禁止的場景詞
861
- for word in prohibited_scene_words:
862
- if word in description.lower() and word not in original_desc.lower() and word not in scene_type.lower():
863
- # 替換錯誤場景詞為正確場景類型
864
- pattern = re.compile(r'\b' + word + r'\b', re.IGNORECASE)
865
- description = pattern.sub(scene_type, description)
866
-
867
- # 確保場景類型在描述中被提及
868
- if scene_type.lower() not in description.lower():
869
- # 尋找通用場景詞並替換
870
- for general_term in ["scene", "area", "place", "location"]:
871
- if general_term in description.lower():
872
- pattern = re.compile(r'\b' + general_term + r'\b', re.IGNORECASE)
873
- description = pattern.sub(scene_type, description, count=1)
874
- break
875
- else:
876
- # 如果沒有找到通用詞,在開頭添加場景類型
877
- if description.startswith("The "):
878
- description = description.replace("The ", f"The {scene_type} ", 1)
879
- elif description.startswith("This "):
880
- description = description.replace("This ", f"This {scene_type} ", 1)
881
- else:
882
- description = f"This {scene_type} " + description
883
 
884
- return description
 
 
885
 
886
- def _generate_llm_response(self, prompt: str) -> str:
887
- """生成 LLM 的回應"""
888
- self._load_model()
889
 
 
 
 
890
  try:
891
- self.call_count += 1
892
- self.logger.info(f"LLM call #{self.call_count}")
893
-
894
- # 清除 GPU 緩存
895
- torch.cuda.empty_cache()
896
-
897
- # 設置固定種子以提高一致性
898
- torch.manual_seed(42)
899
 
900
- # 準備輸入
901
- inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=self.max_length).to(self.device)
 
 
 
 
 
902
 
903
- # 根據模型類型調整參數
904
- generation_params = {
905
- "max_new_tokens": 120,
906
- "pad_token_id": self.tokenizer.eos_token_id,
907
- "attention_mask": inputs.attention_mask,
908
- "use_cache": True,
909
- }
910
 
911
- # 為 Llama 模型設置特定參數
912
- if "llama" in self.model_path.lower():
913
- generation_params.update({
914
- "temperature": 0.35, # 不要太高, 否則模型可能會太有主觀意見
915
- "max_new_tokens": 600,
916
- "do_sample": True,
917
- "top_p": 0.75,
918
- "repetition_penalty": 1.5, # 重複的懲罰權重,可避免掉重複字
919
- "num_beams": 5 ,
920
- "length_penalty": 1,
921
- "no_repeat_ngram_size": 3
922
- })
923
 
924
- else:
925
- # 如果用其他模型的參數
926
- generation_params.update({
927
- "temperature": 0.6,
928
- "max_new_tokens": 300,
929
- "top_p": 0.9,
930
- "do_sample": True,
931
- "num_beams": 1,
932
- "repetition_penalty": 1.05
933
- })
934
-
935
- # 生成回應
936
- with torch.no_grad():
937
- outputs = self.model.generate(inputs.input_ids, **generation_params)
938
-
939
- # 解碼完整輸出
940
- full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
941
-
942
- # 提取生成的響應部分
943
- assistant_tag = "<|assistant|>"
944
- if assistant_tag in full_response:
945
- response = full_response.split(assistant_tag)[-1].strip()
946
-
947
- # 檢查是否有未閉合的 <|assistant|>
948
- user_tag = "<|user|>"
949
- if user_tag in response:
950
- response = response.split(user_tag)[0].strip()
951
- else:
952
- # 移除輸入提示
953
- input_text = self.tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)
954
- response = full_response
955
- if response.startswith(input_text):
956
- response = response[len(input_text):].strip()
957
 
958
- # 確保不返回空的回應
959
- if not response or len(response.strip()) < 10:
960
- self.logger.warning("response is too short or empty")
961
- return "No detailed description could be generated."
962
-
963
- return response
964
 
965
  except Exception as e:
966
- self.logger.error(f"生成 LLM 響應時出錯: {str(e)}")
967
- import traceback
968
- self.logger.error(traceback.format_exc())
969
- return "Unable to generate enhanced description."
970
-
971
- def _clean_llm_response(self, response: str) -> str:
972
- """
973
- Clean the LLM response to ensure the output contains only clean descriptive text.
974
- Sometimes it will not only display the description but display tags, notes...etc
975
- Args:
976
- response: Original response from the LLM
977
- Returns:
978
- Cleaned description text
979
- """
980
- if not response:
981
- return ""
982
-
983
- # Save original response as backup
984
- original_response = response
985
-
986
- # 1. Extract content between markers (if present)
987
- output_start = response.find("[OUTPUT_START]")
988
- output_end = response.find("[OUTPUT_END]")
989
- if output_start != -1 and output_end != -1 and output_end > output_start:
990
- response = response[output_start + len("[OUTPUT_START]"):output_end].strip()
991
-
992
- # 2. Remove all remaining section markers and instructions
993
- section_markers = [
994
- r'\[.*?\]', # [any text]
995
- r'OUTPUT_START\s*:|OUTPUT_END\s*:', # OUTPUT_START: or OUTPUT_END:
996
- r'ENHANCED DESCRIPTION\s*:', # ENHANCED DESCRIPTION:
997
- r'Scene Type\s*:.*?(?=\n|$)', # Scene Type: text
998
- r'Original Description\s*:.*?(?=\n|$)', # Original Description: text
999
- r'GOOD\s*:|BAD\s*:', # GOOD: or BAD:
1000
- r'PROBLEM\s*:.*?(?=\n|$)', # PROBLEM: text
1001
- r'</?\|(?:assistant|system|user)\|>', # Dialog markers
1002
- r'\(Note:.*?\)', # Notes in parentheses
1003
- r'\(.*?I\'ve.*?\)', # Common explanatory content
1004
- r'\(.*?as per your request.*?\)' # References to instructions
1005
- ]
1006
-
1007
- for marker in section_markers:
1008
- response = re.sub(marker, '', response, flags=re.IGNORECASE)
1009
-
1010
- # 2.5. Deal with Here is...
1011
- intro_prefixes = [
1012
- r'^Here\s+is\s+(?:a\s+|the\s+)?(?:rewritten\s+|enhanced\s+)?scene\s+description.*?:\s*',
1013
- r'^The\s+(?:rewritten\s+|enhanced\s+)?(?:scene\s+)?description\s+is.*?:\s*',
1014
- r'^Here\'s\s+(?:a\s+|the\s+)?(?:rewritten\s+|enhanced\s+)?description.*?:\s*'
1015
- ]
1016
-
1017
- for prefix_pattern in intro_prefixes:
1018
- response = re.sub(prefix_pattern, '', response, flags=re.IGNORECASE)
1019
-
1020
- # 3. Remove common prefixes and suffixes
1021
- prefixes_to_remove = [
1022
- "Enhanced Description:",
1023
- "Scene Description:",
1024
- "Description:",
1025
- "Here is the enhanced description:",
1026
- "Here's the enhanced description:",
1027
- "Here is a rewritten scene description that adheres to the provided critical rules:",
1028
- "Here is the rewritten scene description:",
1029
- "Here's a rewritten scene description:",
1030
- "The rewritten scene description is as follows:"
1031
- ]
1032
-
1033
- for prefix in prefixes_to_remove:
1034
- if response.lower().startswith(prefix.lower()):
1035
- response = response[len(prefix):].strip()
1036
-
1037
- # 4. Remove any Context tags or text containing Context
1038
- response = re.sub(r'<\s*Context:.*?>', '', response)
1039
- response = re.sub(r'Context:.*?(?=\n|$)', '', response)
1040
- response = re.sub(r'Note:.*?(?=\n|$)', '', response, flags=re.IGNORECASE)
1041
-
1042
- # 5. Clean improper scene type references
1043
- scene_type_pattern = r'This ([a-zA-Z_]+) (features|shows|displays|contains)'
1044
- match = re.search(scene_type_pattern, response)
1045
- if match and '_' in match.group(1):
1046
- fixed_text = f"This scene {match.group(2)}"
1047
- response = re.sub(scene_type_pattern, fixed_text, response)
1048
-
1049
- # 6. Reduce dash usage for more natural punctuation
1050
- response = re.sub(r'—', ', ', response)
1051
- response = re.sub(r' - ', ', ', response)
1052
-
1053
- # 7. Remove excess whitespace and line breaks
1054
- response = response.replace('\r', ' ')
1055
- response = re.sub(r'\n+', ' ', response) # 將所有換行符替換為空格
1056
- response = re.sub(r'\s{2,}', ' ', response) # 將多個空格替換為單個空格
1057
-
1058
- # 8. Remove Markdown formatting
1059
- response = re.sub(r'\*\*|\*|__|\|', '', response) # Remove Markdown indicators
1060
-
1061
- # 9. Detect and remove sentence duplicates
1062
- sentences = re.split(r'(?<=[.!?])\s+', response)
1063
- unique_sentences = []
1064
- seen_content = set()
1065
-
1066
- for sentence in sentences:
1067
- # Skip empty sentences
1068
- if not sentence.strip():
1069
- continue
1070
-
1071
- # Create simplified version for comparison (lowercase, no punctuation)
1072
- simplified = re.sub(r'[^\w\s]', '', sentence.lower())
1073
- simplified = ' '.join(simplified.split()) # Standardize whitespace
1074
-
1075
- # Check if we've seen a similar sentence
1076
- is_duplicate = False
1077
- for existing in seen_content:
1078
- if len(simplified) > 10 and (existing in simplified or simplified in existing):
1079
- is_duplicate = True
1080
- break
1081
-
1082
- if not is_duplicate and simplified:
1083
- unique_sentences.append(sentence)
1084
- seen_content.add(simplified)
1085
-
1086
- # Recombine unique sentences
1087
- response = ' '.join(unique_sentences)
1088
-
1089
- # 9.5. Advanced repetition detection and replacement
1090
- repetitive_descriptors = ['visible', 'positioned', 'located', 'situated', 'appears', 'features', 'shows', 'displays']
1091
- word_usage_count = {}
1092
-
1093
- # Count occurrences of each repetitive descriptor
1094
- for word in repetitive_descriptors:
1095
- count = len(re.findall(r'\b' + word + r'\b', response, re.IGNORECASE))
1096
- if count > 1:
1097
- word_usage_count[word] = count
1098
-
1099
- # Replace excessive repetitions with varied alternatives
1100
- replacement_alternatives = {
1101
- 'visible': ['present', 'evident', 'apparent', 'observable'],
1102
- 'positioned': ['arranged', 'placed', 'set', 'organized'],
1103
- 'located': ['found', 'placed', 'situated', 'established'],
1104
- 'situated': ['placed', 'positioned', 'arranged', 'set'],
1105
- 'appears': ['seems', 'looks', 'presents', 'exhibits'],
1106
- 'features': ['includes', 'contains', 'displays', 'showcases'],
1107
- 'shows': ['reveals', 'presents', 'exhibits', 'demonstrates'],
1108
- 'displays': ['presents', 'exhibits', 'shows', 'reveals']
1109
- }
1110
-
1111
- for word, count in word_usage_count.items():
1112
- if count > 1 and word in replacement_alternatives:
1113
- # Find all occurrences
1114
- pattern = re.compile(r'\b' + word + r'\b', re.IGNORECASE)
1115
- matches = list(pattern.finditer(response))
1116
-
1117
- # Replace subsequent occurrences (keep first one)
1118
- for i, match in enumerate(matches[1:], 1):
1119
- if i <= len(replacement_alternatives[word]):
1120
- replacement = replacement_alternatives[word][(i-1) % len(replacement_alternatives[word])]
1121
- # Maintain original case pattern
1122
- if match.group().isupper():
1123
- replacement = replacement.upper()
1124
- elif match.group().istitle():
1125
- replacement = replacement.capitalize()
1126
-
1127
- response = response[:match.start()] + replacement + response[match.end():]
1128
- # Update remaining matches positions
1129
- offset = len(replacement) - len(match.group())
1130
- matches = list(pattern.finditer(response))
1131
-
1132
- # 10. Ensure word count is within limits (50-150 words)
1133
- words = response.split()
1134
- if len(words) > 200:
1135
- # Find sentence ending near the word limit
1136
- truncated = ' '.join(words[:200])
1137
- last_period = max(truncated.rfind('.'), truncated.rfind('!'), truncated.rfind('?'))
1138
-
1139
- if last_period > 0:
1140
- response = truncated[:last_period+1]
1141
- else:
1142
- response = truncated + "."
1143
-
1144
- # 11. Check sentence completeness
1145
- if response and not response.strip()[-1] in ['.', '!', '?']:
1146
- # Find the last preposition or conjunction
1147
- common_prepositions = ["into", "onto", "about", "above", "across", "after", "along", "around", "at", "before", "behind", "below", "beneath", "beside", "between", "beyond", "by", "down", "during", "except", "for", "from", "in", "inside", "near", "of", "off", "on", "over", "through", "to", "toward", "under", "up", "upon", "with", "within"]
1148
-
1149
- # Check if ending with preposition or conjunction
1150
- last_word = response.strip().split()[-1].lower() if response.strip().split() else ""
1151
- if last_word in common_prepositions or last_word in ["and", "or", "but"]:
1152
- # Find the last complete sentence
1153
- last_period = max(response.rfind('.'), response.rfind('!'), response.rfind('?'))
1154
- if last_period > 0:
1155
- response = response[:last_period+1]
1156
- else:
1157
- # If no complete sentence found, modify the ending
1158
- words = response.strip().split()
1159
- if words:
1160
- # Remove the last preposition or conjunction
1161
- response = " ".join(words[:-1]) + "."
1162
-
1163
- # 12. Grammar completeness check
1164
- incomplete_patterns = [
1165
- r'\b(fine|the)\s+(the\s+)?(?:urban|area|scene)\b(?!\s+\w)', # 檢測不完整的片語
1166
- r'\b(and|or|but|with|from|in|at|on)\s*[.!?]', # 介詞後直接結束
1167
- r'\b\w+\s+\1\b' # 重複詞語檢測
1168
- ]
1169
-
1170
- for pattern in incomplete_patterns:
1171
- if re.search(pattern, response, re.IGNORECASE):
1172
- # 移除有問題的片段或進行修正
1173
- response = re.sub(pattern, '', response, flags=re.IGNORECASE)
1174
- response = re.sub(r'\s{2,}', ' ', response) # 清理多餘空格
1175
-
1176
- # 13. Ensure haven't over-filtered
1177
- if not response or len(response) < 40:
1178
- # Try to get the first meaningful paragraph from the original response
1179
- paragraphs = [p for p in original_response.split('\n\n') if p.strip()]
1180
- if paragraphs:
1181
- # Choose the longest paragraph as it's most likely the actual description
1182
- best_para = max(paragraphs, key=len)
1183
- # Clean it using a subset of the above rules
1184
- best_para = re.sub(r'\[.*?\]', '', best_para) # Remove [SECTION] markers
1185
- best_para = re.sub(r'\s{2,}', ' ', best_para).strip() # Clean whitespace
1186
-
1187
- if len(best_para) >= 40:
1188
- return best_para
1189
-
1190
- # If still no good content, return a simple message
1191
- return "Unable to generate a valid enhanced description."
1192
-
1193
- # 14. Final cleaning - catch any missed special cases
1194
- response = re.sub(r'</?\|.*?\|>', '', response) # Any remaining tags
1195
- response = re.sub(r'\(.*?\)', '', response) # Any remaining parenthetical content
1196
- response = re.sub(r'Note:.*?(?=\n|$)', '', response, flags=re.IGNORECASE) # Any remaining notes
1197
-
1198
- # Ensure proper spacing after punctuation
1199
- response = re.sub(r'([.!?])([A-Z])', r'\1 \2', response)
1200
-
1201
- # Ensure first letter is capitalized
1202
- if response and response[0].islower():
1203
- response = response[0].upper() + response[1:]
1204
-
1205
- # 15. 統一格式 - 確保輸出始終是單一段落
1206
- response = re.sub(r'\s*\n\s*', ' ', response) # 將所有換行符替換為空格
1207
- response = ' '.join(response.split())
1208
-
1209
- return response.strip()
1210
-
1211
- def _format_objects_for_prompt(self, objects: List[Dict]) -> str:
1212
- """格式化物體列表以用於提示"""
1213
- if not objects:
1214
- return "No objects detected"
1215
-
1216
- formatted = []
1217
- for obj in objects:
1218
- formatted.append(f"{obj['class_name']} (confidence: {obj['confidence']:.2f})")
1219
-
1220
- return "\n- " + "\n- ".join(formatted)
1221
-
1222
-
1223
- def _format_clip_results(self, clip_analysis: Dict) -> str:
1224
- """格式化CLIP分析結果以用於提示"""
1225
- if not clip_analysis or "error" in clip_analysis:
1226
- return "No CLIP analysis available"
1227
-
1228
- parts = ["CLIP Analysis Results:"]
1229
-
1230
- # 加上頂級場景
1231
- top_scene, confidence = clip_analysis.get("top_scene", ("unknown", 0))
1232
- parts.append(f"- Most likely scene: {top_scene} (confidence: {confidence:.2f})")
1233
-
1234
- # 加上視角
1235
- viewpoint, vp_conf = clip_analysis.get("viewpoint", ("standard", 0))
1236
- parts.append(f"- Camera viewpoint: {viewpoint} (confidence: {vp_conf:.2f})")
1237
-
1238
- # 加上物體組合
1239
- if "object_combinations" in clip_analysis:
1240
- combos = []
1241
- for combo, score in clip_analysis["object_combinations"][:3]:
1242
- combos.append(f"{combo} ({score:.2f})")
1243
- parts.append(f"- Object combinations: {', '.join(combos)}")
1244
-
1245
- # 加上文化分析
1246
- if "cultural_analysis" in clip_analysis:
1247
- parts.append("- Cultural analysis:")
1248
- for culture_type, data in clip_analysis["cultural_analysis"].items():
1249
- best_desc = data.get("best_description", "")
1250
- desc_conf = data.get("confidence", 0)
1251
- parts.append(f" * {culture_type}: {best_desc} ({desc_conf:.2f})")
1252
-
1253
- return "\n".join(parts)
1254
-
1255
- def _format_cultural_analysis(self, cultural_analysis: Dict) -> str:
1256
- """格式化文化分析結果"""
1257
- if not cultural_analysis:
1258
- return "No specific cultural elements detected"
1259
-
1260
- parts = []
1261
- for culture_type, data in cultural_analysis.items():
1262
- best_desc = data.get("best_description", "")
1263
- desc_conf = data.get("confidence", 0)
1264
- parts.append(f"{culture_type}: {best_desc} (confidence: {desc_conf:.2f})")
1265
-
1266
- return "\n".join(parts)
 
 
 
 
 
1
  import logging
2
+ import traceback
3
+ from typing import Dict, List, Any, Optional
4
+
5
+ from model_manager import ModelManager
6
+ from prompt_template_manager import PromptTemplateManager
7
+ from response_processor import ResponseProcessor
8
+ from text_quality_validator import TextQualityValidator
9
+ from landmark_data import ALL_LANDMARKS
10
 
11
class LLMEnhancer:
    """
    Facade for LLM-based scene description enhancement.

    Coordinates four components: a ModelManager (generation), a
    PromptTemplateManager (prompt building), a ResponseProcessor (cleaning)
    and a TextQualityValidator (validation). Offers one interface for
    enhancing descriptions, verifying detections and describing scenes in
    which nothing was detected.
    """

    # Plurals that the suffix rules in _pluralize would get wrong. Labels
    # that are already plural map to themselves.
    _IRREGULAR_PLURALS = {
        "knife": "knives",
        "mouse": "mice",
        "sheep": "sheep",
        "skis": "skis",
        "scissors": "scissors",
    }

    def __init__(self,
                 model_path: Optional[str] = None,
                 tokenizer_path: Optional[str] = None,
                 device: Optional[str] = None,
                 max_length: int = 2048,
                 temperature: float = 0.3,
                 top_p: float = 0.85):
        """
        Initialize the facade and its four components.

        Args:
            model_path: Path or HuggingFace name of the LLM; when None the
                facade records "meta-llama/Llama-3.2-3B-Instruct".
            tokenizer_path: Path of the tokenizer, forwarded to ModelManager.
            device: Device to run on, forwarded to ModelManager.
            max_length: Maximum input length, forwarded to ModelManager.
            temperature: Sampling temperature, forwarded to ModelManager.
            top_p: Nucleus sampling threshold, forwarded to ModelManager.

        Raises:
            RuntimeError: If any component fails to initialize; the original
                exception is attached as the cause.
        """
        # Dedicated logger; a handler is attached only once per logger name.
        self.logger = logging.getLogger(self.__class__.__name__)
        if not self.logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)
            self.logger.setLevel(logging.INFO)

        try:
            # The four core components.
            self.model_manager = ModelManager(
                model_path=model_path,
                tokenizer_path=tokenizer_path,
                device=device,
                max_length=max_length,
                temperature=temperature,
                top_p=top_p
            )

            self.prompt_manager = PromptTemplateManager()
            self.response_processor = ResponseProcessor()
            self.quality_validator = TextQualityValidator()

            # Remembered so it can be passed to the response processor.
            self.model_path = model_path or "meta-llama/Llama-3.2-3B-Instruct"

            self.logger.info("LLMEnhancer facade initialized successfully")

        except Exception as e:
            error_msg = f"Failed to initialize LLMEnhancer facade: {str(e)}"
            self.logger.error(error_msg)
            self.logger.error(traceback.format_exc())
            # RuntimeError is still caught by callers handling Exception.
            raise RuntimeError(error_msg) from e

    def enhance_description(self, scene_data: Dict[str, Any]) -> str:
        """
        Main entry point: enhance a scene description with the LLM.

        Args:
            scene_data: Scene information. Keys read here are
                "original_description" and "scene_type"; the helpers also
                read "object_statistics", "detected_objects", "landmark_id"
                and "scene_name".

        Returns:
            str: The enhanced description. The original description is
            returned when the fallback attempt also yields too little text,
            or when any step raises.
        """
        try:
            self.logger.info("Starting scene description enhancement")

            # 1. Reset the model context
            self.model_manager.reset_context()

            # 2. Fetch the original description
            original_desc = scene_data.get("original_description", "")
            if not original_desc:
                self.logger.warning("No original description provided")
                return "No original description provided."

            # 3. Prepare object statistics; fall back to objects mentioned
            #    in the original description when there are none
            object_list = self._prepare_object_statistics(scene_data)
            if not object_list:
                object_keywords = self.quality_validator.extract_objects_from_description(original_desc)
                object_list = ", ".join(object_keywords) if object_keywords else "objects visible in the scene"

            # 4. Look up landmark information
            landmark_info = self._extract_landmark_info(scene_data)

            # 5. Add it to a copy of scene_data
            enhanced_scene_data = scene_data.copy()
            if landmark_info:
                enhanced_scene_data["landmark_location_info"] = landmark_info

            # 6. Build the prompt
            prompt = self.prompt_manager.format_enhancement_prompt_with_landmark(
                scene_data=enhanced_scene_data,
                object_list=object_list,
                original_description=original_desc
            )

            # 7. Generate the LLM response
            self.logger.info("Generating LLM response")
            response = self.model_manager.generate_response(prompt)

            # 8. Retry when the response is incomplete
            response = self._handle_incomplete_response(response, prompt, original_desc)

            # 9. Clean the LLM response
            model_type = self.model_path
            raw_cleaned = self.response_processor.clean_response(response, model_type)

            # 10. Remove explanatory notes
            cleaned_response = self.response_processor.remove_explanatory_notes(raw_cleaned)

            # 11. Factual accuracy check (best effort: on failure the
            #     unverified text is used)
            try:
                cleaned_response = self.quality_validator.verify_factual_accuracy(
                    original_desc, cleaned_response, object_list
                )
            except Exception as verify_error:
                self.logger.warning(
                    f"Fact verification failed; using response without verification: {verify_error}"
                )

            # 12. Make sure the scene type is mentioned
            scene_type = scene_data.get("scene_type", "unknown scene")
            word_count = len(cleaned_response.split())
            if word_count >= 5 and scene_type.lower() not in cleaned_response.lower():
                cleaned_response = self.quality_validator.ensure_scene_type_consistency(
                    cleaned_response, scene_type, original_desc
                )

            # 13. Carry over the viewpoint of the original description.
            #     Skipped for an empty response: indexing it would raise and
            #     bypass the fallback attempt in step 14.
            perspective = self.quality_validator.extract_perspective_from_description(original_desc)
            if perspective and cleaned_response and perspective.lower() not in cleaned_response.lower():
                cleaned_response = f"{perspective}, {cleaned_response[0].lower()}{cleaned_response[1:]}"

            # 14. Final check: retry with a fallback prompt when too short
            final_result = cleaned_response.strip()
            if not final_result or len(final_result) < 20:
                self.logger.warning("Enhanced description too short; attempting fallback")

                fallback_scene_data = enhanced_scene_data.copy()
                fallback_scene_data["is_fallback"] = True
                fallback_prompt = self.prompt_manager.format_enhancement_prompt_with_landmark(
                    scene_data=fallback_scene_data,
                    object_list=object_list,
                    original_description=original_desc
                )

                fallback_resp = self.model_manager.generate_response(fallback_prompt)
                fallback_cleaned = self.response_processor.clean_response(fallback_resp, model_type)
                fallback_cleaned = self.response_processor.remove_explanatory_notes(fallback_cleaned)

                final_result = fallback_cleaned.strip()
                if not final_result or len(final_result) < 20:
                    self.logger.warning("Fallback also insufficient; returning original")
                    return original_desc

            # 15. Return the enhanced description
            self.logger.info(f"Scene description enhancement completed successfully ({len(final_result)} chars)")
            return final_result

        except Exception as e:
            error_msg = f"Enhancement failed: {str(e)}"
            self.logger.error(error_msg)
            self.logger.error(traceback.format_exc())
            return scene_data.get("original_description", "Unable to enhance description")

    def _extract_landmark_info(self, scene_data: Dict[str, Any]) -> Optional[Dict[str, str]]:
        """
        Extract landmark information without building any prompt text.

        The landmark id is taken from scene_data["landmark_id"] or, failing
        that, from the first detected object flagged "is_landmark" that
        carries a "landmark_id".

        Args:
            scene_data: Scene data dictionary.

        Returns:
            Optional[Dict[str, str]]: Dict with "name", "location" and
            "landmark_id"; None when no landmark is found, the id is not in
            ALL_LANDMARKS, the entry has no location, or an error occurs.
        """
        try:
            lm_id_in_data = scene_data.get("landmark_id")
            if not lm_id_in_data:
                # Look for a landmark among the detected objects.
                for obj in scene_data.get("detected_objects", []):
                    if obj.get("is_landmark") and obj.get("landmark_id"):
                        lm_id_in_data = obj["landmark_id"]
                        break

            if not lm_id_in_data:
                return None

            if lm_id_in_data in ALL_LANDMARKS:
                lm_info = ALL_LANDMARKS[lm_id_in_data]
                landmark_name = scene_data.get("scene_name", lm_info.get("name", lm_id_in_data))
                landmark_location = lm_info.get("location", "")

                if landmark_location:
                    return {
                        "name": landmark_name,
                        "location": landmark_location,
                        "landmark_id": lm_id_in_data
                    }

            return None

        except Exception as e:
            self.logger.error(f"Error extracting landmark info: {str(e)}")
            return None

    @staticmethod
    def _pluralize(noun: str) -> str:
        """Return the plural of an object class name (simple English rules)."""
        irregular = LLMEnhancer._IRREGULAR_PLURALS.get(noun)
        if irregular:
            return irregular
        if noun.endswith(("s", "x", "z", "ch", "sh")):
            return noun + "es"
        if len(noun) > 1 and noun.endswith("y") and noun[-2] not in "aeiou":
            return noun[:-1] + "ies"
        return noun + "s"

    def _prepare_object_statistics(self, scene_data: Dict[str, Any]) -> str:
        """
        Build the "N objects" summary used in the prompt.

        Precomputed scene_data["object_statistics"] is preferred: a class is
        kept when its count is positive and its average confidence is at
        least 0.65. Otherwise scene_data["detected_objects"] is counted,
        keeping detections with confidence >= 0.65 (>= 0.75 for airplane,
        helicopter and boat).

        Args:
            scene_data: Scene data dictionary.

        Returns:
            str: Comma-separated entries such as "2 buses, 1 car"; an empty
            string when nothing qualifies; "objects visible in the scene"
            when an error occurs.
        """
        try:
            high_confidence_threshold = 0.65
            # Classes that need a higher confidence to be reported.
            special_classes = ("airplane", "helicopter", "boat")
            special_threshold = 0.75

            object_statistics = scene_data.get("object_statistics", {})
            object_counts = {}

            if object_statistics:
                for class_name, stats in object_statistics.items():
                    if stats.get("count", 0) > 0 and stats.get("avg_confidence", 0) >= high_confidence_threshold:
                        object_counts[class_name] = stats["count"]
            else:
                # Fall back to counting the raw detections.
                for obj in scene_data.get("detected_objects", []):
                    confidence = obj.get("confidence", 0)
                    class_name = obj.get("class_name", "")
                    required = special_threshold if class_name in special_classes else high_confidence_threshold
                    if confidence >= required:
                        object_counts[class_name] = object_counts.get(class_name, 0) + 1

            return ", ".join(
                f"{count} {self._pluralize(name) if count > 1 else name}"
                for name, count in object_counts.items()
            )

        except Exception as e:
            self.logger.error(f"Object statistics preparation failed: {str(e)}")
            return "objects visible in the scene"

    def _handle_incomplete_response(self, response: str, prompt: str, original_desc: str) -> str:
        """
        Regenerate the response while the validator reports it incomplete.

        Args:
            response: The first response.
            prompt: The prompt that produced it, reused for each retry.
            original_desc: The original description.

        Returns:
            str: The last response after at most three retries; the original
            description when that response is empty or shorter than 10
            characters; the most recent response when an error occurs.
        """
        try:
            is_complete, issue = self.quality_validator.validate_response_completeness(response)

            max_retries = 3
            attempts = 0

            while not is_complete and attempts < max_retries:
                self.logger.warning(f"Incomplete response detected ({issue}), retrying... Attempt {attempts+1}/{max_retries}")

                response = self.model_manager.generate_response(prompt)
                is_complete, issue = self.quality_validator.validate_response_completeness(response)
                attempts += 1

            if not response or len(response.strip()) < 10:
                self.logger.warning("Generated response was empty or too short, returning original description")
                return original_desc

            return response

        except Exception as e:
            self.logger.error(f"Incomplete response handling failed: {str(e)}")
            return response  # best effort: hand back what we have

    def verify_detection(self,
                         detected_objects: List[Dict],
                         clip_analysis: Dict[str, Any],
                         scene_type: str,
                         scene_name: str,
                         confidence: float) -> Dict[str, Any]:
        """
        Ask the LLM to review the detection results.

        Args:
            detected_objects: Objects detected by YOLO.
            clip_analysis: CLIP analysis results.
            scene_type: Identified scene type.
            scene_name: Scene name.
            confidence: Confidence of the scene classification.

        Returns:
            Dict: "verification_text" (cleaned LLM reply), "has_errors"
            (True unless the reply contains "appear accurate") and
            "corrected_objects" (always None). On failure the text is
            "Verification failed" and "has_errors" is False.
        """
        try:
            self.logger.info("Starting detection verification")

            prompt = self.prompt_manager.format_verification_prompt(
                detected_objects=detected_objects,
                clip_analysis=clip_analysis,
                scene_type=scene_type,
                scene_name=scene_name,
                confidence=confidence
            )

            verification_result = self.model_manager.generate_response(prompt)
            cleaned_result = self.response_processor.clean_response(verification_result, self.model_path)

            result = {
                "verification_text": cleaned_result,
                "has_errors": "appear accurate" not in cleaned_result.lower(),
                "corrected_objects": None
            }

            self.logger.info("Detection verification completed")
            return result

        except Exception as e:
            error_msg = f"Detection verification failed: {str(e)}"
            self.logger.error(error_msg)
            self.logger.error(traceback.format_exc())
            return {
                "verification_text": "Verification failed",
                "has_errors": False,
                "corrected_objects": None
            }

    def handle_no_detection(self, clip_analysis: Dict[str, Any]) -> str:
        """
        Describe a scene in which YOLO detected no objects.

        Args:
            clip_analysis: CLIP analysis results.

        Returns:
            str: The cleaned generated description, or
            "Unable to generate scene description" when an error occurs.
        """
        try:
            self.logger.info("Handling no detection scenario")

            prompt = self.prompt_manager.format_no_detection_prompt(clip_analysis)
            description = self.model_manager.generate_response(prompt)
            cleaned_description = self.response_processor.clean_response(description, self.model_path)

            self.logger.info("No detection handling completed")
            return cleaned_description

        except Exception as e:
            error_msg = f"No detection handling failed: {str(e)}"
            self.logger.error(error_msg)
            self.logger.error(traceback.format_exc())
            return "Unable to generate scene description"

    def reset_context(self):
        """Reset the LLM context; failures are logged, not raised."""
        try:
            self.model_manager.reset_context()
            self.logger.info("LLM context reset completed")
        except Exception as e:
            self.logger.error(f"Context reset failed: {str(e)}")

    def get_call_count(self) -> int:
        """
        Return the call count reported by the model manager.

        Returns:
            int: Number of calls.
        """
        return self.model_manager.get_call_count()

    def get_model_info(self) -> Dict[str, Any]:
        """
        Collect status information from all components.

        Returns:
            Dict[str, Any]: One entry per component plus "facade_status";
            on failure only "facade_status": "error" and "error_message".
        """
        try:
            return {
                "model_manager": self.model_manager.get_model_info(),
                "prompt_manager": self.prompt_manager.get_template_info(),
                "response_processor": self.response_processor.get_processor_info(),
                "quality_validator": self.quality_validator.get_validator_info(),
                "facade_status": "initialized"
            }
        except Exception as e:
            self.logger.error(f"Failed to get component info: {str(e)}")
            return {"facade_status": "error", "error_message": str(e)}

    def is_model_loaded(self) -> bool:
        """
        Report whether the model manager has its model loaded.

        Returns:
            bool: Model loading state.
        """
        return self.model_manager.is_model_loaded()

    def get_current_device(self) -> str:
        """
        Return the device reported by the model manager.

        Returns:
            str: Current device name.
        """
        return self.model_manager.get_current_device()

    def _detect_scene_type(self, detected_objects: List[Dict]) -> str:
        """
        Guess a street scene type from people and vehicle counts.

        Args:
            detected_objects: Detected objects; only "class_name" is read.

        Returns:
            str: "pedestrian_crossing", "busy_intersection",
            "traffic_junction", or "intersection" (the default, also used
            when an error occurs).
        """
        try:
            object_counts = {}
            for obj in detected_objects:
                class_name = obj.get("class_name", "")
                object_counts[class_name] = object_counts.get(class_name, 0) + 1

            people_count = object_counts.get("person", 0)
            total_vehicles = sum(object_counts.get(kind, 0) for kind in ("car", "bus", "truck"))

            # Simple threshold rules; the first match wins.
            if people_count > 8 and total_vehicles < 2:
                return "pedestrian_crossing"
            if people_count > 5 and total_vehicles > 2:
                return "busy_intersection"
            if people_count < 3 and total_vehicles > 3:
                return "traffic_junction"
            return "intersection"

        except Exception as e:
            self.logger.error(f"Scene type detection failed: {str(e)}")
            return "intersection"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
model_manager.py ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import logging
4
+ from typing import Dict, Optional, Any
5
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
6
+ from huggingface_hub import login
7
+
8
class ModelLoadingError(Exception):
    """Raised when loading the model or its tokenizer fails."""
11
+
12
+
13
class ModelGenerationError(Exception):
    """Raised when text generation with the model fails."""
16
+
17
+
18
class ModelManager:
    """
    Load an LLM, manage its device placement, and generate text with it.

    The model and tokenizer are loaded lazily on the first call to
    generate_response(), using 8-bit quantization to reduce memory use.
    """

    def __init__(self,
                 model_path: Optional[str] = None,
                 tokenizer_path: Optional[str] = None,
                 device: Optional[str] = None,
                 max_length: int = 2048,
                 temperature: float = 0.3,
                 top_p: float = 0.85):
        """
        Initialize the model manager (the model itself is not loaded yet).

        Args:
            model_path: Path or HuggingFace name of the LLM; defaults to
                "meta-llama/Llama-3.2-3B-Instruct".
            tokenizer_path: Tokenizer path; defaults to model_path.
            device: Device to run on ('cpu' or 'cuda'); auto-detected when None.
            max_length: Maximum number of prompt tokens (longer prompts are truncated).
            temperature: Sampling temperature stored on the instance.
            top_p: Nucleus-sampling threshold stored on the instance.

        NOTE(review): _prepare_generation_params() overrides temperature and
        top_p with per-model presets in both of its branches, so the values
        given here currently only take effect when passed again as
        generation kwargs — confirm whether that is intended.
        """
        # Dedicated logger; attach a stream handler only once per logger name.
        self.logger = logging.getLogger(self.__class__.__name__)
        if not self.logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)
            self.logger.setLevel(logging.INFO)

        # Model configuration
        self.model_path = model_path or "meta-llama/Llama-3.2-3B-Instruct"
        self.tokenizer_path = tokenizer_path or self.model_path

        # Device management
        self.device = self._detect_device(device)
        self.logger.info(f"Device selected: {self.device}")

        # Generation parameters
        self.max_length = max_length
        self.temperature = temperature
        self.top_p = top_p

        # Model state
        self.model = None
        self.tokenizer = None
        self._model_loaded = False
        self.call_count = 0

        # HuggingFace authentication
        self.hf_token = self._setup_huggingface_auth()

    def _detect_device(self, device: Optional[str]) -> str:
        """
        Resolve the device to run on.

        Args:
            device: Device requested by the caller; auto-detected when None/empty.

        Returns:
            str: The requested device, or 'cpu' when 'cuda' was requested but
                is unavailable, or the auto-detected 'cuda'/'cpu'.
        """
        if device:
            if device == 'cuda' and not torch.cuda.is_available():
                self.logger.warning("CUDA requested but not available, falling back to CPU")
                return 'cpu'
            return device

        detected_device = 'cuda' if torch.cuda.is_available() else 'cpu'

        if detected_device == 'cuda':
            gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
            self.logger.info(f"CUDA detected with {gpu_memory:.2f} GB GPU memory")

        return detected_device

    def _setup_huggingface_auth(self) -> Optional[str]:
        """
        Log in to HuggingFace using the HF_TOKEN environment variable.

        Returns:
            Optional[str]: The token when login succeeded, otherwise None
                (token missing or login failed).
        """
        hf_token = os.environ.get("HF_TOKEN")

        if hf_token:
            try:
                login(token=hf_token)
                self.logger.info("Successfully authenticated with HuggingFace")
                return hf_token
            except Exception as e:
                self.logger.error(f"HuggingFace authentication failed: {e}")
                return None
        else:
            self.logger.warning("HF_TOKEN not found. Access to gated models may be limited")
            return None

    def _load_model(self):
        """
        Load the LLM and tokenizer with 8-bit quantization; no-op when already loaded.

        Raises:
            ModelLoadingError: If loading the tokenizer or the model fails.
        """
        if self._model_loaded:
            return

        try:
            self.logger.info(f"Loading model from {self.model_path} with 8-bit quantization")

            # Free cached GPU memory before the large allocation
            self._clear_gpu_cache()

            # 8-bit quantization configuration
            quantization_config = BitsAndBytesConfig(
                load_in_8bit=True,
                llm_int8_enable_fp32_cpu_offload=True
            )

            # Load the tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.tokenizer_path,
                padding_side="left",
                use_fast=False,
                token=self.hf_token
            )

            # Fall back to the EOS token when the tokenizer defines no pad token
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            # Load the model; device placement is delegated to device_map="auto"
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_path,
                quantization_config=quantization_config,
                device_map="auto",
                low_cpu_mem_usage=True,
                token=self.hf_token
            )

            self._model_loaded = True
            self.logger.info("Model loaded successfully")

        except Exception as e:
            error_msg = f"Failed to load model: {str(e)}"
            self.logger.error(error_msg)
            raise ModelLoadingError(error_msg) from e

    def _clear_gpu_cache(self):
        """Release cached GPU memory when CUDA is available."""
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            self.logger.debug("GPU cache cleared")

    def generate_response(self, prompt: str, **generation_kwargs) -> str:
        """
        Generate a response from the LLM, loading the model on first use.

        Args:
            prompt: Input prompt.
            **generation_kwargs: Extra generation parameters that override the presets.

        Returns:
            str: The generated response text.

        Raises:
            ModelLoadingError: If the model has to be loaded and loading fails.
            ModelGenerationError: If generation fails or the response is
                shorter than 10 characters.
        """
        # Make sure the model is loaded
        if not self._model_loaded:
            self._load_model()

        try:
            self.call_count += 1
            self.logger.info(f"Generating response (call #{self.call_count})")

            self._clear_gpu_cache()

            # Fixed seed for more consistent output across calls
            torch.manual_seed(42)

            # The model is placed by device_map="auto", which may differ from
            # self.device, so inputs follow the model's own device when it
            # reports one.
            model_device = getattr(self.model, "device", None)
            target_device = model_device if model_device is not None else self.device

            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                max_length=self.max_length
            ).to(target_device)

            # Generation parameters (presets, then caller overrides)
            generation_params = self._prepare_generation_params(**generation_kwargs)
            generation_params.update({
                "pad_token_id": self.tokenizer.eos_token_id,
                "attention_mask": inputs.attention_mask,
                "use_cache": True,
            })

            with torch.no_grad():
                outputs = self.model.generate(inputs.input_ids, **generation_params)

            # Decode and strip the echoed prompt
            full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            response = self._extract_generated_response(full_response, prompt)

            if not response or len(response.strip()) < 10:
                raise ModelGenerationError("Generated response is too short or empty")

            self.logger.info(f"Response generated successfully ({len(response)} characters)")
            return response

        except Exception as e:
            error_msg = f"Text generation failed: {str(e)}"
            self.logger.error(error_msg)
            raise ModelGenerationError(error_msg) from e

    def _prepare_generation_params(self, **kwargs) -> Dict[str, Any]:
        """
        Build the generation parameters, applying a per-model preset.

        Args:
            **kwargs: Caller-supplied parameters; they override the preset.

        Returns:
            Dict[str, Any]: The complete generation parameter dictionary.
        """
        # Base parameters. NOTE(review): max_new_tokens, temperature and top_p
        # are replaced by the preset in both branches below.
        params = {
            "max_new_tokens": 120,
            "temperature": self.temperature,
            "top_p": self.top_p,
            "do_sample": True,
        }

        if "llama" in self.model_path.lower():
            # Preset for Llama models
            params.update({
                "max_new_tokens": 600,
                "temperature": 0.35,
                "top_p": 0.75,
                "repetition_penalty": 1.5,
                "num_beams": 5,
                "length_penalty": 1,
                "no_repeat_ngram_size": 3
            })
        else:
            params.update({
                "max_new_tokens": 300,
                "temperature": 0.6,
                "top_p": 0.9,
                "num_beams": 1,
                "repetition_penalty": 1.05
            })

        # Caller parameters override the preset
        params.update(kwargs)

        return params

    def _extract_generated_response(self, full_response: str, prompt: str) -> str:
        """
        Extract the newly generated part from the decoded model output.

        Args:
            full_response: Complete decoded output of the model.
            prompt: The original prompt.

        Returns:
            str: Text after the last "<|assistant|>" tag (cut at a following
                "<|user|>" tag), or the output with the leading prompt
                removed, or the stripped output when neither applies.
        """
        assistant_tag = "<|assistant|>"
        if assistant_tag in full_response:
            response = full_response.split(assistant_tag)[-1].strip()

            # Drop anything after a dangling user tag
            user_tag = "<|user|>"
            if user_tag in response:
                response = response.split(user_tag)[0].strip()

            return response

        # Remove the echoed prompt
        if full_response.startswith(prompt):
            return full_response[len(prompt):].strip()

        return full_response.strip()

    def reset_context(self):
        """Clear the GPU cache when a model is loaded; otherwise only log."""
        if self._model_loaded:
            self._clear_gpu_cache()
            self.logger.info("Model context reset")
        else:
            self.logger.info("Model not loaded, no context to reset")

    def get_current_device(self) -> str:
        """
        Return the device selected at construction time.

        Returns:
            str: The device name.
        """
        return self.device

    def is_model_loaded(self) -> bool:
        """
        Report whether the model has been loaded.

        Returns:
            bool: True once _load_model() has succeeded.
        """
        return self._model_loaded

    def get_call_count(self) -> int:
        """
        Return how many generation calls have been started.

        Returns:
            int: The call counter.
        """
        return self.call_count

    def get_model_info(self) -> Dict[str, Any]:
        """
        Summarize the manager's state.

        Returns:
            Dict[str, Any]: Model path, device, load status, call count and
                whether a HuggingFace token is available.
        """
        return {
            "model_path": self.model_path,
            "device": self.device,
            "is_loaded": self._model_loaded,
            "call_count": self.call_count,
            "has_hf_token": self.hf_token is not None
        }
object_description_generator.py ADDED
@@ -0,0 +1,1266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import traceback
3
+ from typing import Dict, List, Tuple, Optional, Any
4
+ import numpy as np
5
+
6
class ObjectDescriptionError(Exception):
    """Custom exception for failures during object description generation."""
9
+
10
+
11
+ class ObjectDescriptionGenerator:
12
+ """
13
+ 物件描述生成器 - 負責將檢測到的物件轉換為自然語言描述
14
+
15
+ 該類別處理物件相關的所有描述生成邏輯,包括重要物件的識別、
16
+ 空間位置描述、物件列表格式化以及描述文本的優化。
17
+ """
18
+
19
+ def __init__(self,
20
+ min_prominence_score: float = 0.1,
21
+ max_categories_to_return: int = 5,
22
+ max_total_objects: int = 7,
23
+ confidence_threshold_for_description: float = 0.25,
24
+ region_analyzer: Optional[Any] = None):
25
+ """
26
+ 初始化物件描述生成器
27
+
28
+ Args:
29
+ min_prominence_score: 物件顯著性的最低分數閾值
30
+ max_categories_to_return: 返回的物件類別最大數量
31
+ max_total_objects: 返回的物件總數上限
32
+ confidence_threshold_for_description: 用於描述的置信度閾值
33
+ """
34
+ self.logger = logging.getLogger(self.__class__.__name__)
35
+
36
+ self.min_prominence_score = min_prominence_score
37
+ self.max_categories_to_return = max_categories_to_return
38
+ self.max_total_objects = max_total_objects
39
+ self.confidence_threshold_for_description = confidence_threshold_for_description
40
+ self.region_analyzer = region_analyzer
41
+
42
+ self.logger.info("ObjectDescriptionGenerator initialized with prominence_score=%.2f, "
43
+ "max_categories=%d, max_objects=%d, confidence_threshold=%.2f",
44
+ min_prominence_score, max_categories_to_return,
45
+ max_total_objects, confidence_threshold_for_description)
46
+
47
def get_prominent_objects(self, detected_objects: List[Dict],
                          min_prominence_score: float = 0.5,
                          max_categories_to_return: Optional[int] = None) -> List[Dict]:
    """
    Select the most prominent objects and rank them by prominence score.

    Args:
        detected_objects: Detected objects to evaluate.
        min_prominence_score: Objects scoring below this value are dropped.
        max_categories_to_return: Optional cap on the number of distinct
            class names kept; ignored when None or not positive.

    Returns:
        List[Dict]: Copies of the retained objects, each with an added
            'prominence_score' key, sorted from highest to lowest score.
            An empty list is returned on empty input or on error.
    """
    try:
        if not detected_objects:
            return []

        # Score every object and keep copies of those reaching the threshold.
        ranked = []
        for candidate in detected_objects:
            score = self._calculate_prominence_score(candidate)
            if score >= min_prominence_score:
                annotated = candidate.copy()
                annotated['prominence_score'] = score
                ranked.append(annotated)

        # Highest score first.
        ranked.sort(key=lambda entry: entry.get('prominence_score', 0), reverse=True)

        if max_categories_to_return is None or not (max_categories_to_return > 0):
            return ranked

        # Admit classes in ranking order until the cap is reached; objects of
        # an admitted class are always kept, all others are dropped.
        admitted_classes = set()
        kept = []
        for entry in ranked:
            label = entry.get("class_name", "unknown")
            if label in admitted_classes:
                kept.append(entry)
            elif len(admitted_classes) < max_categories_to_return:
                admitted_classes.add(label)
                kept.append(entry)

        return kept

    except Exception as e:
        self.logger.error(f"Error calculating prominent objects: {str(e)}")
        return []
104
+
105
def set_region_analyzer(self, region_analyzer: Any) -> None:
    """
    Store the analyzer used to produce standardized spatial descriptions.

    Args:
        region_analyzer: The analyzer instance to keep on this generator.
    """
    try:
        setattr(self, "region_analyzer", region_analyzer)
        self.logger.info("RegionAnalyzer instance set for ObjectDescriptionGenerator")
    except Exception as e:
        self.logger.warning(f"Error setting RegionAnalyzer: {str(e)}")
117
+
118
+ def _get_standardized_spatial_description(self, obj: Dict) -> str:
119
+ """
120
+ 使用RegionAnalyzer生成標準化空間描述的內部方法
121
+
122
+ Args:
123
+ obj: 物件字典
124
+
125
+ Returns:
126
+ str: 標準化空間描述,失敗時���回空字串
127
+ """
128
+ try:
129
+ if hasattr(self, 'region_analyzer') and self.region_analyzer:
130
+ region = obj.get("region", "")
131
+ object_type = obj.get("class_name", "")
132
+
133
+ if hasattr(self.region_analyzer, 'get_contextual_spatial_description'):
134
+ return self.region_analyzer.get_contextual_spatial_description(region, object_type)
135
+ elif hasattr(self.region_analyzer, 'get_spatial_description_phrase'):
136
+ return self.region_analyzer.get_spatial_description_phrase(region)
137
+
138
+ return ""
139
+
140
+ except Exception as e:
141
+ self.logger.warning(f"Error getting standardized spatial description: {str(e)}")
142
+ if object_type:
143
+ return f"visible in the scene"
144
+ return "present in the view"
145
+
146
+ def _calculate_prominence_score(self, obj: Dict) -> float:
147
+ """
148
+ 計算物件的重要性評分
149
+
150
+ Args:
151
+ obj: 物件字典,包含檢測信息
152
+
153
+ Returns:
154
+ float: 重要性評分 (0.0-1.0)
155
+ """
156
+ try:
157
+ # 基礎置信度評分 (權重: 40%)
158
+ confidence = obj.get("confidence", 0.5)
159
+ confidence_score = confidence * 0.4
160
+
161
+ # 大小評分 (權重: 30%)
162
+ normalized_area = obj.get("normalized_area", 0.1)
163
+ # 使用對數縮放避免過大物件主導評分
164
+ size_score = min(np.log(normalized_area * 10 + 1) / np.log(11), 1.0) * 0.3
165
+
166
+ # 位置評分 (權重: 20%)
167
+ # 中心區域的物件通常更重要
168
+ center_x, center_y = obj.get("normalized_center", [0.5, 0.5])
169
+ distance_from_center = np.sqrt((center_x - 0.5)**2 + (center_y - 0.5)**2)
170
+ position_score = (1 - min(distance_from_center * 2, 1.0)) * 0.2
171
+
172
+ # 類別重要性評分 (權重: 10%)
173
+ class_importance = self._get_class_importance(obj.get("class_name", "unknown"))
174
+ class_score = class_importance * 0.1
175
+
176
+ total_score = confidence_score + size_score + position_score + class_score
177
+
178
+ # 確保評分在有效範圍內
179
+ return max(0.0, min(1.0, total_score))
180
+
181
+ except Exception as e:
182
+ self.logger.warning(f"Error calculating prominence score for object: {str(e)}")
183
+ return 0.5 # 返回中等評分作為備用
184
+
185
+ def _get_class_importance(self, class_name: str) -> float:
186
+ """
187
+ 根據物件類別返回重要性係數
188
+
189
+ Args:
190
+ class_name: 物件類別名稱
191
+
192
+ Returns:
193
+ float: 類別重要性係數 (0.0-1.0)
194
+ """
195
+ # 高重要性物件(人、車輛、建築)
196
+ high_importance = ["person", "car", "truck", "bus", "motorcycle", "bicycle", "building"]
197
+
198
+ # 中等重要性物件(家具、電器)
199
+ medium_importance = ["chair", "couch", "tv", "laptop", "refrigerator", "dining table", "bed"]
200
+
201
+ # 低重要性物件(小物品、配件)
202
+ low_importance = ["handbag", "backpack", "umbrella", "cell phone", "remote", "mouse"]
203
+
204
+ class_name_lower = class_name.lower()
205
+
206
+ if any(item in class_name_lower for item in high_importance):
207
+ return 1.0
208
+ elif any(item in class_name_lower for item in medium_importance):
209
+ return 0.7
210
+ elif any(item in class_name_lower for item in low_importance):
211
+ return 0.4
212
+ else:
213
+ return 0.6 # 預設中等重要性
214
+
215
def format_object_list_for_description(self,
                                       objects: List[Dict],
                                       use_indefinite_article_for_one: bool = False,
                                       count_threshold_for_generalization: int = -1,
                                       max_types_to_list: int = 5) -> str:
    """
    Format a list of objects as a human-readable string with counts.

    Args:
        objects: Object dictionaries; each should contain 'class_name'.
        use_indefinite_article_for_one: Use "a/an" for a single object
            instead of "one".
        count_threshold_for_generalization: Counts above this value are
            rendered as "several"/"many"; -1 means always use exact counts.
        max_types_to_list: Maximum number of distinct object types listed.

    Returns:
        str: The formatted description, "no specific objects clearly
            identified" when nothing usable was supplied, or
            "various objects" when formatting fails.
    """
    # Nouns whose plural does not follow the suffix rules below; keyed on the
    # last word of the class name so multi-word names are covered as well.
    irregular_plurals = {
        "person": "people",
        "knife": "knives",
        "mouse": "mice",
        "sheep": "sheep",
        "skis": "skis",
        "scissors": "scissors",
    }

    def pluralize(name: str) -> str:
        """Return the plural form of an object class name."""
        head, _, last_word = name.rpartition(" ")
        irregular = irregular_plurals.get(last_word.lower())
        if irregular is not None:
            return f"{head} {irregular}" if head else irregular
        if name.endswith("y") and not name.lower().endswith(("ay", "ey", "iy", "oy", "uy")):
            return name[:-1] + "ies"
        if name.endswith(("s", "sh", "ch", "x", "z")):
            return name + "es"
        return name + "s"

    try:
        if not objects:
            return "no specific objects clearly identified"

        counts: Dict[str, int] = {}
        for obj in objects:
            name = obj.get("class_name", "unknown object")
            if name == "unknown object" or not name:
                continue
            counts[name] = counts.get(name, 0) + 1

        if not counts:
            return "no specific objects clearly identified"

        descriptions = []
        # Most frequent first, ties broken alphabetically; cap the number of types.
        sorted_counts = sorted(counts.items(), key=lambda item: (-item[1], item[0]))[:max_types_to_list]

        for name, count in sorted_counts:
            if count == 1:
                if use_indefinite_article_for_one:
                    article = "an" if name[0].lower() in 'aeiou' else "a"
                    descriptions.append(f"{article} {name}")
                else:
                    descriptions.append(f"one {name}")
                continue

            plural_name = pluralize(name)

            if count_threshold_for_generalization != -1 and count > count_threshold_for_generalization:
                # Slightly above the threshold reads as "several", well above as "many".
                if count <= count_threshold_for_generalization + 3:
                    descriptions.append(f"several {plural_name}")
                else:
                    descriptions.append(f"many {plural_name}")
            else:
                descriptions.append(f"{count} {plural_name}")

        if not descriptions:
            return "no specific objects clearly identified"

        if len(descriptions) == 1:
            return descriptions[0]
        elif len(descriptions) == 2:
            return f"{descriptions[0]} and {descriptions[1]}"
        else:
            # Oxford-comma list
            return ", ".join(descriptions[:-1]) + f", and {descriptions[-1]}"

    except Exception as e:
        self.logger.warning(f"Error formatting object list: {str(e)}")
        return "various objects"
291
+
292
def get_spatial_description(self, obj: Dict, image_width: Optional[int] = None,
                            image_height: Optional[int] = None,
                            region_analyzer: Optional[Any] = None) -> str:
    """
    Generate a spatial position description for an object.

    Args:
        obj: Object dictionary; "region", "class_name" and
            "normalized_center" are read.
        image_width: Optional image width.
        image_height: Optional image height.
        region_analyzer: Optional analyzer used to produce a standardized
            description before the built-in mapping is tried.

    Returns:
        str: The spatial description. When the region is empty or "unknown",
            a generic phrase chosen by object type is returned. An empty
            string is returned when the region cannot be interpreted (and no
            usable center/dimensions are given) or when an error occurs.
    """
    try:
        region = obj.get("region") or ""
        # Resolved up front: both the missing-region fallback and the
        # analyzer path below need the class name.
        object_type = obj.get("class_name", "")

        # Missing or unknown region: fall back to a generic phrase by object type.
        if not region.strip() or region == "unknown":
            if object_type and any(vehicle in object_type.lower() for vehicle in ["car", "truck", "bus"]):
                return "positioned in the scene"
            elif object_type and "person" in object_type.lower():
                return "present in the area"
            else:
                return "located in the scene"

        # Prefer the analyzer's standardized description when one is supplied.
        if region_analyzer and hasattr(region_analyzer, 'get_spatial_description_phrase'):
            if hasattr(region_analyzer, 'get_contextual_spatial_description'):
                spatial_desc = region_analyzer.get_contextual_spatial_description(region, object_type)
            else:
                spatial_desc = region_analyzer.get_spatial_description_phrase(region)

            if spatial_desc:
                return spatial_desc

        # Fallback: built-in mapping of region names
        clean_region = region.replace('_', ' ').strip().lower()

        region_map = {
            "top left": "in the upper left area",
            "top center": "in the upper area",
            "top right": "in the upper right area",
            "middle left": "on the left side",
            "middle center": "in the center",
            "center": "in the center",
            "middle right": "on the right side",
            "bottom left": "in the lower left area",
            "bottom center": "in the lower area",
            "bottom right": "in the lower right area"
        }

        # Exact match
        if clean_region in region_map:
            return region_map[clean_region]

        # Fuzzy match on the keywords contained in the region name
        if "top" in clean_region and "left" in clean_region:
            return "in the upper left area"
        elif "top" in clean_region and "right" in clean_region:
            return "in the upper right area"
        elif "bottom" in clean_region and "left" in clean_region:
            return "in the lower left area"
        elif "bottom" in clean_region and "right" in clean_region:
            return "in the lower right area"
        elif "top" in clean_region:
            return "in the upper area"
        elif "bottom" in clean_region:
            return "in the lower area"
        elif "left" in clean_region:
            return "on the left side"
        elif "right" in clean_region:
            return "on the right side"
        elif "center" in clean_region or "middle" in clean_region:
            return "in the center"

        # Last resort: derive the position from the normalized center
        # (only attempted when both image dimensions were supplied).
        norm_center = obj.get("normalized_center")
        if norm_center and image_width and image_height:
            x_norm, y_norm = norm_center
            h_pos = "left" if x_norm < 0.4 else "right" if x_norm > 0.6 else "center"
            v_pos = "upper" if y_norm < 0.4 else "lower" if y_norm > 0.6 else "center"

            if h_pos == "center" and v_pos == "center":
                return "in the center"
            return f"in the {v_pos} {h_pos} area"

        # Nothing usable: return an empty string
        return ""

    except Exception as e:
        self.logger.warning(f"Error generating spatial description: {str(e)}")
        return ""
388
+
389
def optimize_object_description(self, description: str) -> str:
    """
    Tidy an object description so the same object is not listed repeatedly.

    Args:
        description: The original description text.

    Returns:
        str: The description with "a bed in the room" shortened and with
            repeated items in "with ..." lists merged into counted items.
            If an error occurs, the text as it stood at that point is returned.
    """
    try:
        import re

        # Avoid the redundant "bed in the room" wording.
        if "bed in the room" in description:
            description = description.replace("a bed in the room", "a bed")

        # Each "with ..." list runs up to the next period or the word "and".
        for listed in re.findall(r'with ([^\.]+?)(?:\.|\band\b)', description):
            # Count how often every item occurs in this list.
            tally = {}
            for raw_item in re.findall(r'([a-zA-Z\s]+)(?:,|\band\b|$)', listed):
                token = raw_item.strip()
                if token and token not in ["and", "with"]:
                    tally[token] = tally.get(token, 0) + 1

            if not tally:
                continue

            # Repeated items become "<count> <item>s"; single items stay as they are.
            rendered = [f"{n} {token}s" if n > 1 else token for token, n in tally.items()]

            if len(rendered) == 1:
                merged = rendered[0]
            elif len(rendered) == 2:
                merged = f"{rendered[0]} and {rendered[1]}"
            else:
                merged = ", ".join(rendered[:-1]) + f", and {rendered[-1]}"

            # Swap the original list text for the merged one.
            description = description.replace(listed, merged)

        return description

    except Exception as e:
        self.logger.warning(f"Error optimizing object description: {str(e)}")
        return description
446
+
447
+ def generate_dynamic_everyday_description(self,
448
+ detected_objects: List[Dict],
449
+ lighting_info: Optional[Dict] = None,
450
+ viewpoint: str = "eye_level",
451
+ spatial_analysis: Optional[Dict] = None,
452
+ image_dimensions: Optional[Tuple[int, int]] = None,
453
+ places365_info: Optional[Dict] = None,
454
+ object_statistics: Optional[Dict] = None) -> str:
455
+ """
456
+ 為日常場景動態生成描述,基於所有相關的檢測物件、計數和上下文
457
+
458
+ Args:
459
+ detected_objects: 檢測到的物件列表
460
+ lighting_info: 照明信息
461
+ viewpoint: 視角類型
462
+ spatial_analysis: 空間分析結果
463
+ image_dimensions: 圖像尺寸
464
+ places365_info: Places365場景分類信息
465
+ object_statistics: 物件統計信息
466
+
467
+ Returns:
468
+ str: 動態生成的場景描述
469
+ """
470
+ try:
471
+ description_segments = []
472
+ image_width, image_height = image_dimensions if image_dimensions else (None, None)
473
+
474
+ self.logger.debug(f"Generating dynamic description for {len(detected_objects)} objects, "
475
+ f"viewpoint: {viewpoint}, lighting: {lighting_info is not None}")
476
+
477
+ # 1. 整體氛圍(照明和視角)
478
+ ambiance_parts = []
479
+ if lighting_info:
480
+ time_of_day = lighting_info.get("time_of_day", "unknown lighting")
481
+ is_indoor = lighting_info.get("is_indoor")
482
+ ambiance_statement = "This is"
483
+ if is_indoor is True:
484
+ ambiance_statement += " an indoor scene"
485
+ elif is_indoor is False:
486
+ ambiance_statement += " an outdoor scene"
487
+ else:
488
+ ambiance_statement += " a scene"
489
+
490
+ # remove underline
491
+ readable_lighting = f"with {time_of_day.replace('_', ' ')} lighting conditions"
492
+ ambiance_statement += f", likely {readable_lighting}."
493
+ ambiance_parts.append(ambiance_statement)
494
+
495
+ if viewpoint and viewpoint != "eye_level":
496
+ if not ambiance_parts:
497
+ ambiance_parts.append(f"From {viewpoint.replace('_', ' ')}, the general layout of the scene is observed.")
498
+ else:
499
+ ambiance_parts[-1] = ambiance_parts[-1].rstrip('.') + f", viewed from {viewpoint.replace('_', ' ')}."
500
+
501
+ if ambiance_parts:
502
+ description_segments.append(" ".join(ambiance_parts))
503
+
504
+ # 2. 描述所有檢測到的物件,按類別分組,使用準確計數和位置
505
+ if not detected_objects:
506
+ if not description_segments:
507
+ description_segments.append("A general scene is visible, but no specific objects were clearly identified.")
508
+ else:
509
+ description_segments.append("Within this setting, no specific objects were clearly identified.")
510
+ else:
511
+ objects_by_class: Dict[str, List[Dict]] = {}
512
+
513
+ # 使用置信度過濾
514
+ confident_objects = [obj for obj in detected_objects
515
+ if obj.get("confidence", 0) >= self.confidence_threshold_for_description]
516
+
517
+ if not confident_objects:
518
+ no_confident_obj_msg = "While some elements might be present, no objects were identified with sufficient confidence for a detailed description."
519
+ if not description_segments:
520
+ description_segments.append(no_confident_obj_msg)
521
+ else:
522
+ description_segments.append(no_confident_obj_msg.lower().capitalize())
523
+ else:
524
+ if object_statistics:
525
+ # 使用預計算的統計信息,採用動態的信心度
526
+ for class_name, stats in object_statistics.items():
527
+ count = stats.get("count", 0)
528
+ avg_confidence = stats.get("avg_confidence", 0)
529
+
530
+ # 動態調整置信度閾值
531
+ dynamic_threshold = self.confidence_threshold_for_description
532
+ if class_name in ["potted plant", "vase", "clock", "book"]:
533
+ dynamic_threshold = max(0.15, self.confidence_threshold_for_description * 0.6)
534
+ elif count >= 3:
535
+ dynamic_threshold = max(0.2, self.confidence_threshold_for_description * 0.8)
536
+
537
+ if count > 0 and avg_confidence >= dynamic_threshold:
538
+ matching_objects = [obj for obj in confident_objects if obj.get("class_name") == class_name]
539
+ if not matching_objects:
540
+ matching_objects = [obj for obj in detected_objects
541
+ if obj.get("class_name") == class_name and obj.get("confidence", 0) >= dynamic_threshold]
542
+
543
+ if matching_objects:
544
+ actual_count = min(stats["count"], len(matching_objects))
545
+ objects_by_class[class_name] = matching_objects[:actual_count]
546
+ else:
547
+ # 備用邏輯,同樣使用動態閾值
548
+ for obj in confident_objects:
549
+ name = obj.get("class_name", "unknown object")
550
+ if name == "unknown object" or not name:
551
+ continue
552
+ if name not in objects_by_class:
553
+ objects_by_class[name] = []
554
+ objects_by_class[name].append(obj)
555
+
556
+ if not objects_by_class:
557
+ description_segments.append("No common objects were confidently identified for detailed description.")
558
+ else:
559
+ # 物件組排序函數
560
+ def sort_key_object_groups(item_tuple: Tuple[str, List[Dict]]):
561
+ class_name_key, obj_group_list = item_tuple
562
+ priority = 3
563
+ count = len(obj_group_list)
564
+
565
+ # 確保類別名稱已標準化
566
+ normalized_class_name = self._normalize_object_class_name(class_name_key)
567
+
568
+ # 動態優先級
569
+ if normalized_class_name == "person":
570
+ priority = 0
571
+ elif normalized_class_name in ["dining table", "chair", "sofa", "bed"]:
572
+ priority = 1
573
+ elif normalized_class_name in ["car", "bus", "truck", "traffic light"]:
574
+ priority = 2
575
+ elif count >= 3:
576
+ priority = max(1, priority - 1)
577
+ elif normalized_class_name in ["potted plant", "vase", "clock", "book"] and count >= 2:
578
+ priority = 2
579
+
580
+ avg_area = sum(o.get("normalized_area", 0.0) for o in obj_group_list) / len(obj_group_list) if obj_group_list else 0
581
+ quantity_bonus = min(count / 5.0, 1.0)
582
+
583
+ return (priority, -len(obj_group_list), -avg_area, -quantity_bonus)
584
+
585
+ # remove duplicate
586
+ deduplicated_objects_by_class = {}
587
+ processed_positions = []
588
+
589
+ for class_name, group_of_objects in objects_by_class.items():
590
+ unique_objects = []
591
+
592
+ for obj in group_of_objects:
593
+ obj_position = obj.get("normalized_center", [0.5, 0.5])
594
+ is_duplicate = False
595
+
596
+ for processed_pos in processed_positions:
597
+ position_distance = abs(obj_position[0] - processed_pos[0]) + abs(obj_position[1] - processed_pos[1])
598
+ if position_distance < 0.15:
599
+ is_duplicate = True
600
+ break
601
+
602
+ if not is_duplicate:
603
+ unique_objects.append(obj)
604
+ processed_positions.append(obj_position)
605
+
606
+ if unique_objects:
607
+ deduplicated_objects_by_class[class_name] = unique_objects
608
+
609
+ objects_by_class = deduplicated_objects_by_class
610
+ sorted_object_groups = sorted(objects_by_class.items(), key=sort_key_object_groups)
611
+
612
+ object_clauses = []
613
+
614
+ for class_name, group_of_objects in sorted_object_groups:
615
+ count = len(group_of_objects)
616
+ if count == 0:
617
+ continue
618
+
619
+ # 標準化class name
620
+ normalized_class_name = self._normalize_object_class_name(class_name)
621
+
622
+ # 使用統計信息確保準確的數量描述
623
+ if object_statistics and class_name in object_statistics:
624
+ actual_count = object_statistics[class_name]["count"]
625
+ formatted_name_with_exact_count = self._format_object_count_description(
626
+ normalized_class_name, actual_count
627
+ )
628
+ else:
629
+ formatted_name_with_exact_count = self._format_object_count_description(
630
+ normalized_class_name, count
631
+ )
632
+
633
+ if formatted_name_with_exact_count == "no specific objects clearly identified" or not formatted_name_with_exact_count:
634
+ continue
635
+
636
+ # 確定群組的集體位置
637
+ location_description_suffix = ""
638
+ if count == 1:
639
+ spatial_desc = self.get_spatial_description(group_of_objects[0], image_width, image_height, self.region_analyzer)
640
+ if spatial_desc:
641
+ location_description_suffix = f"is {spatial_desc}"
642
+ else:
643
+ distinct_regions = sorted(list(set(obj.get("region", "") for obj in group_of_objects if obj.get("region"))))
644
+ valid_regions = [r for r in distinct_regions if r and r != "unknown" and r.strip()]
645
+ if not valid_regions:
646
+ location_description_suffix = "is positioned in the scene"
647
+ elif len(valid_regions) == 1:
648
+ spatial_desc = self.get_spatial_description_phrase(valid_regions[0])
649
+ location_description_suffix = f"is primarily {spatial_desc}" if spatial_desc else "is positioned in the scene"
650
+ elif len(valid_regions) == 2:
651
+ clean_region1 = valid_regions[0].replace('_', ' ')
652
+ clean_region2 = valid_regions[1].replace('_', ' ')
653
+ location_description_suffix = f"is mainly across the {clean_region1} and {clean_region2} areas"
654
+ else:
655
+ location_description_suffix = "is distributed in various parts of the scene"
656
+ else:
657
+ distinct_regions = sorted(list(set(obj.get("region", "") for obj in group_of_objects if obj.get("region"))))
658
+ valid_regions = [r for r in distinct_regions if r and r != "unknown" and r.strip()]
659
+ if not valid_regions:
660
+ location_description_suffix = "are visible in the scene"
661
+ elif len(valid_regions) == 1:
662
+ clean_region = valid_regions[0].replace('_', ' ')
663
+ location_description_suffix = f"are primarily in the {clean_region} area"
664
+ elif len(valid_regions) == 2:
665
+ clean_region1 = valid_regions[0].replace('_', ' ')
666
+ clean_region2 = valid_regions[1].replace('_', ' ')
667
+ location_description_suffix = f"are mainly across the {clean_region1} and {clean_region2} areas"
668
+ else:
669
+ location_description_suffix = "are distributed in various parts of the scene"
670
+
671
+ # 首字母大寫
672
+ formatted_name_capitalized = formatted_name_with_exact_count[0].upper() + formatted_name_with_exact_count[1:]
673
+ object_clauses.append(f"{formatted_name_capitalized} {location_description_suffix}")
674
+
675
+ if object_clauses:
676
+ if not description_segments:
677
+ if object_clauses:
678
+ first_clause = object_clauses.pop(0)
679
+ description_segments.append(first_clause + ".")
680
+ else:
681
+ if object_clauses:
682
+ description_segments.append("The scene features:")
683
+
684
+ if object_clauses:
685
+ joined_object_clauses = ". ".join(object_clauses)
686
+ if joined_object_clauses and not joined_object_clauses.endswith("."):
687
+ joined_object_clauses += "."
688
+ description_segments.append(joined_object_clauses)
689
+
690
+ elif not description_segments:
691
+ return "The image depicts a scene, but specific objects could not be described with confidence or detail."
692
+
693
+ # 最終組裝和格式化
694
+ raw_description = ""
695
+ for i, segment in enumerate(filter(None, description_segments)):
696
+ segment = segment.strip()
697
+ if not segment:
698
+ continue
699
+
700
+ if not raw_description:
701
+ raw_description = segment
702
+ else:
703
+ if not raw_description.endswith(('.', '!', '?')):
704
+ raw_description += "."
705
+ raw_description += " " + (segment[0].upper() + segment[1:] if len(segment) > 1 else segment.upper())
706
+
707
+ if raw_description and not raw_description.endswith(('.', '!', '?')):
708
+ raw_description += "."
709
+
710
+ if not raw_description or len(raw_description.strip()) < 20:
711
+ if 'confident_objects' in locals() and confident_objects:
712
+ return "The scene contains several detected objects, but a detailed textual description could not be fully constructed."
713
+ else:
714
+ return "A general scene is depicted with no objects identified with high confidence."
715
+
716
+ return raw_description
717
+
718
+ except Exception as e:
719
+ error_msg = f"Error generating dynamic everyday description: {str(e)}"
720
+ self.logger.error(f"{error_msg}\n{traceback.format_exc()}")
721
+ raise ObjectDescriptionError(error_msg) from e
722
+
723
+ def _format_object_count_description(self, class_name: str, count: int) -> str:
724
+ """
725
+ 格式化物件數量描述,提供多樣化的表達方式
726
+
727
+ Args:
728
+ class_name: 標準化後的類別名稱
729
+ count: 物件數量
730
+
731
+ Returns:
732
+ str: 格式化的數量描述
733
+ """
734
+ try:
735
+ if count <= 0:
736
+ return ""
737
+
738
+ # 單數情況
739
+ if count == 1:
740
+ article = "an" if class_name[0].lower() in 'aeiou' else "a"
741
+ return f"{article} {class_name}"
742
+
743
+ # 複數情況
744
+ plural_form = self._get_plural_form(class_name)
745
+
746
+ # 根據數量選擇不同的表達方式
747
+ if count == 2:
748
+ return f"two {plural_form}"
749
+ elif count == 3:
750
+ return f"three {plural_form}"
751
+ elif count <= 5:
752
+ return f"{count} {plural_form}"
753
+ elif count <= 10:
754
+ return f"several {plural_form}"
755
+ else:
756
+ return f"numerous {plural_form}"
757
+
758
+ except Exception as e:
759
+ self.logger.warning(f"Error formatting object count for '{class_name}': {str(e)}")
760
+ return f"{count} {class_name}s" if count > 1 else class_name
761
+
762
+ def _get_plural_form(self, word: str) -> str:
763
+ """
764
+ 獲取詞彙的複數形式
765
+
766
+ Args:
767
+ word: 單數詞彙
768
+
769
+ Returns:
770
+ str: 複數形式
771
+ """
772
+ try:
773
+ # 特殊複數形式
774
+ irregular_plurals = {
775
+ 'person': 'people',
776
+ 'child': 'children',
777
+ 'foot': 'feet',
778
+ 'tooth': 'teeth',
779
+ 'mouse': 'mice',
780
+ 'man': 'men',
781
+ 'woman': 'women'
782
+ }
783
+
784
+ if word.lower() in irregular_plurals:
785
+ return irregular_plurals[word.lower()]
786
+
787
+ # 規則複數形式
788
+ if word.endswith(('s', 'sh', 'ch', 'x', 'z')):
789
+ return word + 'es'
790
+ elif word.endswith('y') and word[-2] not in 'aeiou':
791
+ return word[:-1] + 'ies'
792
+ elif word.endswith('f'):
793
+ return word[:-1] + 'ves'
794
+ elif word.endswith('fe'):
795
+ return word[:-2] + 'ves'
796
+ else:
797
+ return word + 's'
798
+
799
+ except Exception as e:
800
+ self.logger.warning(f"Error getting plural form for '{word}': {str(e)}")
801
+ return word + 's'
802
+
803
+ def _normalize_object_class_name(self, class_name: str) -> str:
804
+ """
805
+ 標準化物件類別名稱,確保輸出自然語言格式
806
+
807
+ Args:
808
+ class_name: 原始類別名稱
809
+
810
+ Returns:
811
+ str: 標準化後的類別名稱
812
+ """
813
+ try:
814
+ if not class_name or not isinstance(class_name, str):
815
+ return "object"
816
+
817
+ # 移除可能的技術性前綴或後綴
818
+ import re
819
+ normalized = re.sub(r'^(class_|id_|type_)', '', class_name.lower())
820
+ normalized = re.sub(r'(_class|_id|_type)$', '', normalized)
821
+
822
+ # 將下劃線和連字符替換為空格
823
+ normalized = normalized.replace('_', ' ').replace('-', ' ')
824
+
825
+ # 移除多餘空格
826
+ normalized = ' '.join(normalized.split())
827
+
828
+ # 特殊類別名稱的標準化映射
829
+ class_name_mapping = {
830
+ 'traffic light': 'traffic light',
831
+ 'stop sign': 'stop sign',
832
+ 'fire hydrant': 'fire hydrant',
833
+ 'dining table': 'dining table',
834
+ 'potted plant': 'potted plant',
835
+ 'tv monitor': 'television',
836
+ 'cell phone': 'mobile phone',
837
+ 'wine glass': 'wine glass',
838
+ 'hot dog': 'hot dog',
839
+ 'teddy bear': 'teddy bear',
840
+ 'hair drier': 'hair dryer',
841
+ 'toothbrush': 'toothbrush'
842
+ }
843
+
844
+ return class_name_mapping.get(normalized, normalized)
845
+
846
+ except Exception as e:
847
+ self.logger.warning(f"Error normalizing class name '{class_name}': {str(e)}")
848
+ return class_name if isinstance(class_name, str) else "object"
849
+
850
def generate_basic_details(self, scene_type: str, detected_objects: List[Dict]) -> str:
    """
    Produce basic scene details for use when no template is available.

    A handful of scene types get custom text built from the detected
    objects' ``class_id``, ``class_name``, ``region`` and
    ``normalized_center`` fields; every other case gets a generic sentence.
    The "dining_area"/"kitchen" and "city_street" branches return a
    fragment beginning with " with ..." (or just ".").

    Args:
        scene_type: Identified scene type.
        detected_objects: Detected object dictionaries.

    Returns:
        str: The detail text, or the generic sentence if generation fails.
    """
    default_details = "The scene contains various elements characteristic of this environment."

    try:
        if scene_type == "living_room":
            tv_objs = [obj for obj in detected_objects if obj.get("class_id") == 62]  # TV
            sofa_objs = [obj for obj in detected_objects if obj.get("class_id") == 57]  # Sofa

            if tv_objs and sofa_objs:
                tv_region = tv_objs[0].get("region", "center")
                sofa_region = sofa_objs[0].get("region", "center")

                arrangement = f"The TV is in the {tv_region.replace('_', ' ')} of the image, "
                arrangement += f"while the sofa is in the {sofa_region.replace('_', ' ')}. "

                return f"{arrangement}This appears to be a space designed for relaxation and entertainment."

        elif scene_type == "bedroom":
            bed_objs = [obj for obj in detected_objects if obj.get("class_id") == 59]  # Bed

            if bed_objs:
                bed_region = bed_objs[0].get("region", "center")

                # Mention each kind of extra item once, in order of first
                # appearance, so three books do not read "a book and a book
                # and a book".
                extra_items = []
                for obj in detected_objects:
                    if obj.get("class_id") == 74:  # Clock
                        label = "clock"
                    elif obj.get("class_id") == 73:  # Book
                        label = "book"
                    else:
                        continue
                    if label not in extra_items:
                        extra_items.append(label)

                extras = ""
                if extra_items:
                    extras = f" There is also a {' and a '.join(extra_items)} visible."

                return f"The bed is located in the {bed_region.replace('_', ' ')} of the image.{extras}"

        elif scene_type in ["dining_area", "kitchen"]:
            # Collect food- and dining-related items.
            food_items = []
            for obj in detected_objects:
                if obj.get("class_id") in [39, 41, 42, 43, 44, 45]:  # kitchen items
                    food_items.append(obj.get("class_name", "kitchen item"))

            food_str = ""
            if food_items:
                # dict.fromkeys de-duplicates while keeping detection order,
                # so the text is the same on every run.
                unique_items = list(dict.fromkeys(food_items))
                if len(unique_items) <= 3:
                    food_str = f" with {', '.join(unique_items)}"
                else:
                    food_str = f" with {', '.join(unique_items[:3])} and other items"

            return f"{food_str}."

        elif scene_type == "city_street":
            # Count people and vehicles.
            people_count = len([obj for obj in detected_objects if obj.get("class_id") == 0])
            vehicle_count = len([obj for obj in detected_objects
                                 if obj.get("class_id") in [1, 2, 3, 5, 7]])  # Bicycle, car, motorbike, bus, truck

            traffic_desc = ""
            if people_count > 0 and vehicle_count > 0:
                traffic_desc = f" with {people_count} {'people' if people_count > 1 else 'person'} and "
                traffic_desc += f"{vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"
            elif people_count > 0:
                traffic_desc = f" with {people_count} {'people' if people_count > 1 else 'person'}"
            elif vehicle_count > 0:
                traffic_desc = f" with {vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"

            return f"{traffic_desc}."

        elif scene_type == "asian_commercial_street":
            # Look for the key urban elements.
            people_count = len([obj for obj in detected_objects if obj.get("class_id") == 0])
            vehicle_count = len([obj for obj in detected_objects if obj.get("class_id") in [1, 2, 3]])

            # Collect pedestrian positions.
            people_positions = []
            for obj in detected_objects:
                if obj.get("class_id") == 0:  # Person
                    people_positions.append(obj.get("normalized_center", (0.5, 0.5)))

            # Simplified check for people lined up along a path: three or
            # more people whose y coordinates have low variance.
            structured_path = False
            if len(people_positions) >= 3:
                y_coords = [pos[1] for pos in people_positions]
                y_mean = sum(y_coords) / len(y_coords)
                y_variance = sum((y - y_mean)**2 for y in y_coords) / len(y_coords)
                if y_variance < 0.05:
                    structured_path = True

            street_desc = "A commercial street with "
            if people_count > 0:
                street_desc += f"{people_count} {'pedestrians' if people_count > 1 else 'pedestrian'}"
                if vehicle_count > 0:
                    street_desc += f" and {vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"
            elif vehicle_count > 0:
                street_desc += f"{vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"
            else:
                street_desc += "various commercial elements"

            if structured_path:
                street_desc += ". The pedestrians appear to be following a defined walking path"

            # Closing cultural note.
            street_desc += ". The signage and architectural elements suggest an Asian urban setting."

            return street_desc

        # Generic description for everything else.
        return default_details

    except Exception as e:
        self.logger.warning(f"Error generating basic details for scene_type '{scene_type}': {str(e)}")
        return default_details
974
+
975
def generate_placeholder_content(self, placeholder: str, detected_objects: List[Dict], scene_type: str) -> str:
    """
    Generate the text that replaces a template placeholder.

    Supported placeholders are "furniture", "electronics", "people_count"
    and "seating"; any other placeholder yields an empty string. Names are
    listed once each, in detection order.

    Args:
        placeholder: Placeholder name from the template.
        detected_objects: Detected object dictionaries.
        scene_type: Scene type (not read by this method).

    Returns:
        str: The generated content, or "" if generation fails.
    """
    try:
        if placeholder == "furniture":
            furniture_ids = [56, 57, 58, 59, 60, 61]  # example furniture class IDs
            furniture_objects = [obj for obj in detected_objects if obj.get("class_id") in furniture_ids]

            if furniture_objects:
                furniture_names = []
                for obj in furniture_objects[:3]:
                    raw_name = obj.get("class_name", "furniture")
                    furniture_names.append(self._normalize_object_class_name(raw_name))

                # Order-preserving de-duplication keeps the output stable
                # from run to run (a set would not).
                unique_names = list(dict.fromkeys(furniture_names))
                if len(unique_names) == 1:
                    return unique_names[0]
                elif len(unique_names) == 2:
                    return f"{unique_names[0]} and {unique_names[1]}"
                else:
                    return ", ".join(unique_names[:-1]) + f", and {unique_names[-1]}"
            return "various furniture items"

        elif placeholder == "electronics":
            electronics_ids = [62, 63, 64, 65, 66, 67, 68, 69, 70]  # example electronics class IDs
            electronics_objects = [obj for obj in detected_objects if obj.get("class_id") in electronics_ids]

            if electronics_objects:
                electronics_names = [obj.get("class_name", "electronic device") for obj in electronics_objects[:3]]
                return ", ".join(dict.fromkeys(electronics_names))
            return "electronic devices"

        elif placeholder == "people_count":
            people_count = len([obj for obj in detected_objects if obj.get("class_id") == 0])

            if people_count == 0:
                return "no people"
            elif people_count == 1:
                return "one person"
            elif people_count < 5:
                return f"{people_count} people"
            else:
                return "several people"

        elif placeholder == "seating":
            seating_ids = [56, 57]  # chair, sofa
            seating_objects = [obj for obj in detected_objects if obj.get("class_id") in seating_ids]

            if seating_objects:
                seating_names = [obj.get("class_name", "seating") for obj in seating_objects[:2]]
                return ", ".join(dict.fromkeys(seating_names))
            return "seating arrangements"

        # Unknown placeholder: empty string.
        return ""

    except Exception as e:
        self.logger.warning(f"Error generating placeholder content for '{placeholder}': {str(e)}")
        return ""
1049
+
1050
def describe_functional_zones(self, functional_zones: Dict) -> str:
    """
    Describe the scene's functional zones in one or two sentences.

    Accepts either a dict mapping zone keys to zone-info dicts or a list of
    zones. Zone keys are normalized with ``_normalize_zone_name``; keys that
    normalize to the same name are kept apart with a numeric suffix so no
    zone is silently overwritten. At most one pedestrian-related zone (the
    one with the most "person" entries) and the two other zones with the
    most objects are described. The finished text is passed through
    ``optimize_object_description``.

    Args:
        functional_zones: Identified functional zones (dict or list).

    Returns:
        str: The zone description, or "" when there is nothing to describe
        or generation fails.
    """
    try:
        if not functional_zones:
            return ""

        def unique_key(key, taken):
            """Return ``key``, or ``key (n)`` if ``key`` is already used."""
            if key not in taken:
                return key
            suffix = 2
            while f"{key} ({suffix})" in taken:
                suffix += 1
            return f"{key} ({suffix})"

        def as_zone_info(zone):
            """Wrap a non-dict zone value as a description-only zone dict."""
            return zone if isinstance(zone, dict) else {"description": str(zone)}

        # Accept a list of zones by converting it to the dict form.
        if isinstance(functional_zones, list):
            zones_dict = {}
            for i, zone in enumerate(functional_zones):
                if isinstance(zone, dict) and 'name' in zone:
                    zone_name = self._normalize_zone_name(zone['name'])
                else:
                    zone_name = f"functional area {i+1}"
                zones_dict[unique_key(zone_name, zones_dict)] = as_zone_info(zone)
            functional_zones = zones_dict
        elif not isinstance(functional_zones, dict):
            return ""

        # Normalize every key (drops internal identifier formatting).
        # Several raw keys can normalize to the same name, so each one is
        # made unique instead of overwriting the previous zone.
        normalized_zones = {}
        for zone_key, zone_data in functional_zones.items():
            normalized_key = unique_key(self._normalize_zone_name(zone_key), normalized_zones)
            normalized_zones[normalized_key] = as_zone_info(zone_data)
        functional_zones = normalized_zones

        # Count people per zone and in total.
        total_people_count = 0
        people_by_zone = {}
        for zone_name, zone_info in functional_zones.items():
            if "objects" in zone_info:
                zone_people_count = zone_info["objects"].count("person")
                people_by_zone[zone_name] = zone_people_count
                total_people_count += zone_people_count

        # Split zones into pedestrian-related and everything else.
        pedestrian_zones = []
        other_zones = []
        for zone_name, zone_info in functional_zones.items():
            if any(keyword in zone_name.lower() for keyword in ["pedestrian", "crossing", "people"]):
                pedestrian_zones.append((zone_name, zone_info))
            else:
                other_zones.append((zone_name, zone_info))

        # Keep at most one pedestrian zone and two other zones.
        main_pedestrian_zones = sorted(pedestrian_zones,
                                       key=lambda z: people_by_zone.get(z[0], 0),
                                       reverse=True)[:1]
        top_other_zones = sorted(other_zones,
                                 key=lambda z: len(z[1].get("objects", [])),
                                 reverse=True)[:2]
        top_zones = main_pedestrian_zones + top_other_zones

        if not top_zones:
            return ""

        summary = ""
        max_mentioned_people = 0  # largest people count already stated

        # Lead with the overall head count when it is significant.
        if total_people_count > 5:
            summary = f"The scene contains a significant number of pedestrians ({total_people_count} people). "
            max_mentioned_people = total_people_count

        # Drop a zone's own, smaller people count when the summary already
        # states a larger one, so the text does not contradict itself.
        zone_descriptions = []
        for zone_name, zone_info in top_zones:
            zone_desc = zone_info.get("description", "a functional zone")
            zone_people_count = people_by_zone.get(zone_name, 0)

            contains_people_info = "with" in zone_desc and ("person" in zone_desc.lower() or "people" in zone_desc.lower())

            if contains_people_info and zone_people_count < max_mentioned_people:
                parts = zone_desc.split("with")
                if len(parts) > 1:
                    zone_desc = parts[0].strip() + " area"

            zone_descriptions.append(zone_desc)

        # Assemble the sentence according to how many zones remain.
        if len(zone_descriptions) == 1:
            final_desc = summary + f"The scene includes {zone_descriptions[0]}."
        elif len(zone_descriptions) == 2:
            final_desc = summary + f"The scene is divided into two main areas: {zone_descriptions[0]} and {zone_descriptions[1]}."
        else:
            formatted_desc = ", ".join(zone_descriptions[:-1]) + f", and {zone_descriptions[-1]}"
            final_desc = summary + f"The scene contains multiple functional areas including {formatted_desc}."

        return self.optimize_object_description(final_desc)

    except Exception as e:
        self.logger.warning(f"Error describing functional zones: {str(e)}")
        return ""
1180
+
1181
+ def _normalize_zone_name(self, zone_name: str) -> str:
1182
+ """
1183
+ 將內部區域鍵名標準化為自然語言描述
1184
+
1185
+ Args:
1186
+ zone_name: 原始區域名稱
1187
+
1188
+ Returns:
1189
+ str: 標準化後的區域名稱
1190
+ """
1191
+ try:
1192
+ if not zone_name or not isinstance(zone_name, str):
1193
+ return "functional area"
1194
+
1195
+ # 移除數字後綴(如 crossing_zone_1 -> crossing_zone)
1196
+ import re
1197
+ base_name = re.sub(r'_\d+$', '', zone_name)
1198
+
1199
+ # 將下劃線替換為空格
1200
+ normalized = base_name.replace('_', ' ')
1201
+
1202
+ # 標準化常見的區域類型名稱
1203
+ zone_type_mapping = {
1204
+ 'crossing zone': 'pedestrian crossing area',
1205
+ 'vehicle zone': 'vehicle movement area',
1206
+ 'pedestrian zone': 'pedestrian activity area',
1207
+ 'traffic zone': 'traffic flow area',
1208
+ 'waiting zone': 'waiting area',
1209
+ 'seating zone': 'seating area',
1210
+ 'dining zone': 'dining area',
1211
+ 'furniture zone': 'furniture arrangement area',
1212
+ 'electronics zone': 'electronics area',
1213
+ 'people zone': 'social activity area',
1214
+ 'functional area': 'activity area'
1215
+ }
1216
+
1217
+ # 檢查是否有對應的標準化名稱
1218
+ for pattern, replacement in zone_type_mapping.items():
1219
+ if pattern in normalized.lower():
1220
+ return replacement
1221
+
1222
+ # 如果沒有特定映射,使用通用格式
1223
+ if 'zone' in normalized.lower():
1224
+ normalized = normalized.replace('zone', 'area')
1225
+ elif not any(keyword in normalized.lower() for keyword in ['area', 'space', 'region']):
1226
+ normalized += ' area'
1227
+
1228
+ return normalized.strip()
1229
+
1230
+ except Exception as e:
1231
+ self.logger.warning(f"Error normalizing zone name '{zone_name}': {str(e)}")
1232
+ return "activity area"
1233
+
1234
def get_configuration(self) -> Dict[str, Any]:
    """
    Return the current configuration parameters.

    Returns:
        Dict[str, Any]: The values of ``min_prominence_score``,
        ``max_categories_to_return``, ``max_total_objects`` and
        ``confidence_threshold_for_description``, keyed by those names.
    """
    exposed_settings = (
        "min_prominence_score",
        "max_categories_to_return",
        "max_total_objects",
        "confidence_threshold_for_description",
    )
    return {setting: getattr(self, setting) for setting in exposed_settings}
1247
+
1248
def update_configuration(self, **kwargs):
    """
    Update configuration parameters in place.

    Each keyword that names an existing attribute is assigned and the change
    is logged; any other keyword is logged as unknown and ignored.

    Args:
        **kwargs: Parameter names and their new values.

    Raises:
        ObjectDescriptionError: If an update fails.
    """
    try:
        for key, value in kwargs.items():
            if not hasattr(self, key):
                self.logger.warning(f"Unknown configuration parameter: {key}")
                continue

            old_value = getattr(self, key)
            setattr(self, key, value)
            self.logger.info(f"Updated {key}: {old_value} -> {value}")

    except Exception as e:
        self.logger.error(f"Error updating configuration: {str(e)}")
        raise ObjectDescriptionError(f"Failed to update configuration: {str(e)}") from e
object_extractor.py ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import logging
3
+ import traceback
4
+ from typing import Dict, List, Any, Optional
5
+
6
+ # 設置日誌記錄器
7
+ logger = logging.getLogger(__name__)
8
+
9
+ class ObjectExtractor:
10
+ """
11
+ 專門處理物件檢測結果的提取和預處理
12
+ 負責從YOLO檢測結果提取物件資訊、物件分類和核心物件的辨識
13
+ """
14
+
15
def __init__(self, class_names: Dict[int, str] = None, object_categories: Dict[str, List[int]] = None):
    """
    Initialize the object extractor.

    Args:
        class_names: Mapping from class ID to class name; empty if omitted.
        object_categories: Mapping from category name to the class IDs it
            groups; empty if omitted.
    """
    try:
        # Base confidence threshold; each per-class factor below scales it.
        self.base_conf_threshold = 0.25

        # Per-class factors keyed by lower-case class name. The effective
        # threshold is base_conf_threshold * factor; a class without an
        # entry uses the base threshold unchanged.
        self.dynamic_conf_map = {
            "traffic light": 0.6,   # 0.25 * 0.6 = 0.15
            "car": 0.8,             # 0.25 * 0.8 = 0.20
            "person": 0.7,          # 0.25 * 0.7 = 0.175
        }

        self.class_names = class_names if class_names else {}
        self.object_categories = object_categories if object_categories else {}

        logger.info(f"ObjectExtractor initialized with {len(self.class_names)} class names and {len(self.object_categories)} object categories")

    except Exception as e:
        logger.error(f"Failed to initialize ObjectExtractor: {str(e)}")
        logger.error(traceback.format_exc())
        raise
46
+
47
+ def _get_dynamic_threshold(self, class_name: str) -> float:
48
+ """
49
+ 根據 class_name 從 dynamic_conf_map 拿到 factor,計算最終的信心度門檻:
50
+ threshold = base_conf_threshold * factor
51
+
52
+ 如果 class_name 不在映射表裡,就回傳 base_conf_threshold。
53
+ """
54
+ # 使用小寫做匹配,確保在 dynamic_conf_map 裡的 key 也都用小寫
55
+ key = class_name.lower()
56
+ factor = self.dynamic_conf_map.get(key, 1.0)
57
+ return self.base_conf_threshold * factor
58
+
59
def extract_detected_objects(
    self,
    detection_result: Any,
    confidence_threshold: float = 0.25,
    region_analyzer=None
) -> List[Dict]:
    """
    Extract per-object information, including position, from a detection result.

    Args:
        detection_result: Detection result exposing ``boxes.xyxy``,
            ``boxes.cls``, ``boxes.conf`` and ``orig_shape``.
        confidence_threshold: Not read by this method; filtering uses the
            per-class threshold from ``_get_dynamic_threshold``.
        region_analyzer: Optional analyzer whose ``determine_region`` maps a
            normalized center to a region name; without it the region is
            "unknown".

    Returns:
        List of dictionaries, one per retained detection. Empty when the
        result is None, has no ``boxes`` attribute, or extraction fails.
    """
    try:
        # Debug aid: record the current state of the class mapping.
        logger.info("ObjectExtractor.extract_detected_objects called")
        logger.info(f"Current class_names keys: {list(self.class_names.keys()) if self.class_names else 'None'}")

        if detection_result is None:
            logger.warning("Detection result is None")
            return []

        if not hasattr(detection_result, 'boxes'):
            logger.error("Detection result does not have boxes attribute")
            return []

        boxes = detection_result.boxes.xyxy.cpu().numpy()
        classes = detection_result.boxes.cls.cpu().numpy().astype(int)
        confidences = detection_result.boxes.conf.cpu().numpy()

        # Image size, used to normalize positions and areas.
        img_height, img_width = detection_result.orig_shape[:2]

        detected_objects = []

        for box, class_id, confidence in zip(boxes, classes, confidences):
            try:
                class_name = self.class_names.get(int(class_id), f"unknown_class_{class_id}")

                # Skip detections below this class's own threshold.
                if confidence < self._get_dynamic_threshold(class_name):
                    continue

                x1, y1, x2, y2 = box
                width, height = x2 - x1, y2 - y1
                center_x, center_y = (x1 + x2) / 2, (y1 + y2) / 2

                # Positions and sizes normalized to the 0-1 range.
                norm_x = center_x / img_width
                norm_y = center_y / img_height
                norm_width = width / img_width
                norm_height = height / img_height

                area = width * height
                norm_area = area / (img_width * img_height)

                object_region = "unknown"
                if region_analyzer:
                    object_region = region_analyzer.determine_region(norm_x, norm_y)

                # Debug aid: record how the class ID was mapped.
                if class_name.startswith("unknown_class_"):
                    logger.warning(
                        f"Class ID {class_id} not found in class_names. "
                        f"Available keys: {list(self.class_names.keys())}"
                    )
                else:
                    logger.debug(f"Successfully mapped class ID {class_id} to '{class_name}'")

                detected_objects.append({
                    "class_id": int(class_id),
                    "class_name": class_name,
                    "confidence": float(confidence),
                    "box": [float(x1), float(y1), float(x2), float(y2)],
                    "center": [float(center_x), float(center_y)],
                    "normalized_center": [float(norm_x), float(norm_y)],
                    "size": [float(width), float(height)],
                    "normalized_size": [float(norm_width), float(norm_height)],
                    "area": float(area),
                    "normalized_area": float(norm_area),
                    "region": object_region
                })

            except Exception as e:
                # One bad detection must not discard the rest.
                logger.error(f"Error processing object with class_id {class_id}: {str(e)}")
                continue

        logger.info(f"Extracted {len(detected_objects)} objects from detection result")
        return detected_objects

    except Exception as e:
        logger.error(f"Error extracting detected objects: {str(e)}")
        logger.error(traceback.format_exc())
        return []
166
+
167
def update_class_names(self, class_names: Dict[int, str]):
    """
    Replace the class-ID-to-name mapping at runtime.

    Args:
        class_names: New mapping; a falsy value resets it to an empty dict.
    """
    try:
        self.class_names = class_names if class_names else {}
        logger.info(f"Class names updated: {len(self.class_names)} classes")
        logger.debug(f"Updated class names: {self.class_names}")
    except Exception as e:
        logger.error(f"Failed to update class names: {str(e)}")
180
+
181
def categorize_object(self, obj: Dict) -> str:
    """
    Assign a detected object to a functional category for zone identification.

    The configured ``object_categories`` mapping (category -> class IDs) is
    consulted first. Otherwise the lower-cased class name is matched against
    built-in lists: first as an exact name, then, for names that are not
    listed verbatim, as a substring match in list order.

    Args:
        obj: Object dictionary with optional "class_id" and "class_name".

    Returns:
        Category name; "misc" when nothing matches or categorization fails.
    """
    try:
        class_id = obj.get("class_id", -1)
        class_name = obj.get("class_name", "").lower()

        # Prefer the configured ID-based mapping when there is one.
        if self.object_categories:
            for category, ids in self.object_categories.items():
                if class_id in ids:
                    return category

        # Name-based fallback, in priority order.
        fallback_categories = [
            ("furniture", ["chair", "couch", "bed", "dining table", "toilet"]),
            ("plant", ["potted plant"]),
            ("electronics", ["tv", "laptop", "mouse", "remote", "keyboard", "cell phone"]),
            ("vehicle", ["bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat"]),
            ("person", ["person"]),
            ("kitchen_items", ["bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
                               "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog",
                               "pizza", "donut", "cake", "refrigerator", "oven", "toaster", "sink", "microwave"]),
            ("sports", ["frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
                        "baseball glove", "skateboard", "surfboard", "tennis racket"]),
            ("personal_items", ["handbag", "tie", "suitcase", "umbrella", "backpack"]),
        ]

        # Exact names win, so that "carrot" is not captured by the
        # substring "car" in the vehicle list.
        for category, items in fallback_categories:
            if class_name in items:
                return category

        # Substring match for variants such as "tv monitor".
        for category, items in fallback_categories:
            if any(item in class_name for item in items):
                return category

        return "misc"

    except Exception as e:
        logger.error(f"Error categorizing object: {str(e)}")
        logger.error(traceback.format_exc())
        return "misc"
237
+
238
def get_object_categories(self, detected_objects: List[Dict]) -> set:
    """
    Collect the distinct functional categories present among the detections.

    Args:
        detected_objects: List of detection dictionaries.

    Returns:
        Set of category names produced by ``categorize_object``; falsy
        categories are dropped. An empty set is returned on any error.
    """
    try:
        labels = (self.categorize_object(item) for item in detected_objects)
        unique_categories = {label for label in labels if label}

        logger.info(f"Found {len(unique_categories)} unique object categories")
        return unique_categories

    except Exception as e:
        logger.error(f"Error getting object categories: {str(e)}")
        logger.error(traceback.format_exc())
        return set()
262
+
263
def identify_core_objects_for_scene(self, detected_objects: List[Dict], scene_type: str,
                                    min_confidence: float = 0.4) -> List[Dict]:
    """
    Pick the detections that define a given scene type.

    Class ids follow the 0-indexed COCO-80 numbering that the kitchen,
    bedroom and office entries already used. The living-room and
    dining-area entries previously held ids that did not match their own
    comments (58 is potted plant, 46/47 are banana/apple); they now point
    at chair (56) and fork/knife (42/43).

    Args:
        detected_objects: List of detection dictionaries; reads the
            "class_id" and "confidence" keys.
        scene_type: Scene type key, e.g. "kitchen" or "living_room".
        min_confidence: Minimum confidence for an object to count as core.
            Defaults to 0.4, the previously hard-coded threshold.

    Returns:
        The core objects in input order. Empty for unknown scene types or
        on any error.
    """
    try:
        # Scene type -> class ids of the objects that characterise it.
        scene_core_mapping = {
            "bedroom": [59],  # bed
            "kitchen": [68, 69, 71, 72],  # microwave, oven, sink, refrigerator
            "living_room": [57, 56, 62],  # couch (sofa), chair, tv
            "dining_area": [60, 42, 43],  # dining table, fork, knife
            "office_workspace": [63, 64, 66, 73]  # laptop, mouse, keyboard, book
        }

        core_class_ids = scene_core_mapping.get(scene_type, [])
        core_objects = [
            obj for obj in detected_objects
            if obj.get("class_id") in core_class_ids
            and obj.get("confidence", 0) >= min_confidence
        ] if core_class_ids else []

        logger.info(f"Identified {len(core_objects)} core objects for scene type '{scene_type}'")
        return core_objects

    except Exception as e:
        logger.error(f"Error identifying core objects for scene '{scene_type}': {str(e)}")
        logger.error(traceback.format_exc())
        return []
299
+
300
def group_objects_by_category_and_region(self, detected_objects: List[Dict]) -> Dict:
    """
    Bucket detections first by functional category, then by image region.

    Args:
        detected_objects: List of detection dictionaries; the "region" key
            is read and defaults to "center" when absent.

    Returns:
        Nested dict ``{category: {region: [objects...]}}``. Objects whose
        category is falsy are skipped. An empty dict is returned on error.
    """
    try:
        grouped = {}

        for item in detected_objects:
            label = self.categorize_object(item)
            if label:
                per_region = grouped.setdefault(label, {})
                per_region.setdefault(item.get("region", "center"), []).append(item)

        logger.info(f"Grouped objects into {len(grouped)} categories across regions")
        return grouped

    except Exception as e:
        logger.error(f"Error grouping objects by category and region: {str(e)}")
        logger.error(traceback.format_exc())
        return {}
334
+
335
def filter_objects_by_confidence(self, detected_objects: List[Dict], min_confidence: float) -> List[Dict]:
    """
    Keep only the detections whose confidence reaches a threshold.

    Args:
        detected_objects: List of detection dictionaries; a missing
            "confidence" key is treated as 0.
        min_confidence: Inclusive lower bound on confidence.

    Returns:
        A new list with the qualifying objects in input order. If anything
        goes wrong the original list is returned unchanged.
    """
    try:
        kept = []
        for candidate in detected_objects:
            if candidate.get("confidence", 0) >= min_confidence:
                kept.append(candidate)

        logger.info(f"Filtered {len(detected_objects)} objects to {len(kept)} objects with confidence >= {min_confidence}")
        return kept

    except Exception as e:
        logger.error(f"Error filtering objects by confidence: {str(e)}")
        logger.error(traceback.format_exc())
        return detected_objects  # fall back to the unfiltered input on error
prompt_template_manager.py ADDED
@@ -0,0 +1,547 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import traceback
3
+ from typing import Dict, List, Any, Optional
4
+
5
class PromptTemplateError(Exception):
    """Raised when a prompt template cannot be initialized or formatted."""
8
+
9
+
10
class PromptTemplateManager:
    """
    Manage and format the LLM prompt templates.

    Three templates are provided: scene-description enhancement,
    detection verification, and a fallback for images where object
    detection found nothing.
    """

    # Closing tag of the user section in the enhancement template. Extra
    # rules (landmark / fallback guidance) are inserted right before it,
    # i.e. directly after the last numbered CRITICAL RULE.
    _USER_SECTION_END = "\n</|user|>"

    def __init__(self):
        """Configure the logger and build all templates."""
        self.logger = logging.getLogger(self.__class__.__name__)
        if not self.logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)
            self.logger.setLevel(logging.INFO)

        self._initialize_templates()
        self.logger.info("PromptTemplateManager initialized successfully")

    def _initialize_templates(self):
        """
        Build every prompt template.

        Raises:
            PromptTemplateError: If any template setup step fails.
        """
        try:
            self._setup_enhancement_template()
            self._setup_verification_template()
            self._setup_no_detection_template()
            self.logger.info("All prompt templates initialized")
        except Exception as e:
            self.logger.error(f"Failed to initialize templates: {str(e)}")
            self.logger.error(traceback.format_exc())
            raise PromptTemplateError(f"Template initialization failed: {str(e)}") from e

    def format_enhancement_prompt_with_landmark(self, scene_data: Dict[str, Any], object_list: str, original_description: str) -> str:
        """
        Format the enhancement prompt, adding landmark or fallback guidance.

        The description is cleaned of underscore formatting first. When
        ``scene_data`` carries "landmark_location_info" (a dict with "name"
        and "location"), rules 16-19 about naming the landmark location are
        appended after rule 15; otherwise, when "is_fallback" is true, a
        fallback-mode instruction is appended instead.

        Args:
            scene_data: Scene metadata; reads "scene_type",
                "landmark_location_info" and "is_fallback".
            object_list: Text listing the detected objects.
            original_description: Description to be rewritten by the LLM.

        Returns:
            The fully formatted prompt string.

        Raises:
            PromptTemplateError: If formatting fails for any reason.
        """
        try:
            scene_type = scene_data.get("scene_type", "unknown scene")
            cleaned_scene_type = self._clean_scene_type(scene_type)

            # Successive cleanup passes on the description text.
            cleaned_description = self._clean_text_formatting(original_description)
            cleaned_description = self._clean_scene_type_underscores(cleaned_description)
            cleaned_description = self._enhance_input_cleaning(cleaned_description)

            # Swap any remaining raw scene type for its cleaned form.
            if scene_type != cleaned_scene_type:
                cleaned_description = cleaned_description.replace(scene_type, cleaned_scene_type)

            landmark_info = scene_data.get("landmark_location_info")
            is_fallback = scene_data.get("is_fallback", False)

            additional_guidance = ""
            if landmark_info:
                landmark_name = landmark_info.get("name", "")
                landmark_location = landmark_info.get("location", "")
                additional_guidance = (
                    f'\nLANDMARK LOCATION REQUIREMENT: This scene features {landmark_name} located in {landmark_location}.'
                    f'\n16. MANDATORY: Include the specific location "{landmark_location}" when first mentioning {landmark_name}. Use natural phrasing such as "Located in {landmark_location}, the {landmark_name}..." or "The {landmark_name} in {landmark_location}..." or "Standing majestically in {landmark_location}, {landmark_name}...".'
                    '\n17. Avoid mechanical openings like "The tourist landmark is centered around" or "The scene is centered around". Instead, begin with the landmark itself as the subject.'
                    '\n18. NEVER use terms with underscores like "tourist_landmark" or "historical_site" in your response. Use natural language: "tourist landmark", "historical site", "cultural attraction" etc.'
                    '\n19. The geographical reference must appear naturally in the opening sentence, integrated as essential context rather than supplementary information.'
                )
            elif is_fallback:
                additional_guidance = (
                    '\nFALLBACK MODE: The previous enhancement was insufficient. Provide a more detailed description focusing on key visual elements, human activities, atmospheric details, and architectural features.'
                )

            enhanced_template = self.enhance_description_template
            if additional_guidance:
                # The guidance becomes part of a str.format template, so any
                # literal braces coming from landmark names must be escaped.
                safe_guidance = additional_guidance.replace("{", "{{").replace("}", "}}")
                marker = self._USER_SECTION_END
                if marker in enhanced_template:
                    # Insert after the last CRITICAL RULE. Anchoring on the
                    # section end (not on the wording of rule 15) keeps this
                    # working when the rule text is edited.
                    head, _, tail = enhanced_template.partition(marker)
                    enhanced_template = f"{head}{safe_guidance}{marker}{tail}"
                else:
                    self.logger.warning(
                        "Enhancement template has no user-section end marker; "
                        "additional guidance was not inserted"
                    )

            return enhanced_template.format(
                original_description=cleaned_description,
                object_list=object_list
            )

        except Exception as e:
            self.logger.error(f"Failed to format enhancement prompt: {str(e)}")
            self.logger.error(traceback.format_exc())
            raise PromptTemplateError(f"Prompt formatting failed: {e}") from e

    def _clean_text_formatting(self, text: str) -> str:
        """
        Replace underscores with spaces and normalise whitespace.

        Args:
            text: Raw text to clean.

        Returns:
            str: Text with every run of underscores turned into a space, all
            whitespace runs collapsed to one space, and the ends stripped.
            Falsy input is returned unchanged; on error the input is returned.
        """
        if not text:
            return text

        try:
            import re

            # One pass covers the known technical terms ("tourist_landmark",
            # ...) and any other snake_case token alike.
            cleaned = re.sub(r'_+', ' ', text)
            cleaned = re.sub(r'\s+', ' ', cleaned)
            return cleaned.strip()

        except Exception as e:
            self.logger.warning(f"Error in text formatting cleanup: {str(e)}")
            return text

    def _clean_scene_type_underscores(self, text: str) -> str:
        """
        Remove underscore formatting from scene-type terms in the text.

        Args:
            text: Text to clean.

        Returns:
            str: Cleaned text. Falsy input is returned unchanged.
        """
        if not text:
            return text

        try:
            import re

            # Known scene-type identifiers that may leak into descriptions.
            scene_type_patterns = [
                'urban_intersection', 'city_street', 'downtown_area', 'business_district',
                'residential_area', 'commercial_zone', 'industrial_area', 'shopping_center',
                'traffic_intersection', 'pedestrian_crossing', 'public_square'
            ]

            for pattern in scene_type_patterns:
                text = text.replace(pattern, pattern.replace('_', ' '))

            # Any other two-word snake_case term directly followed by a
            # scene-like verb or noun.
            text = re.sub(r'\b([a-z]+)_([a-z]+)(?=\s+(?:features|shows|displays|contains|is|area|zone|scene))',
                          r'\1 \2', text, flags=re.IGNORECASE)

            return text

        except Exception as e:
            self.logger.warning(f"Error in scene type underscore cleanup: {str(e)}")
            return text

    def _enhance_input_cleaning(self, description: str) -> str:
        """
        Apply extra cleanup to a description before it is used in a prompt.

        Args:
            description: Description to clean.

        Returns:
            str: Cleaned description. Falsy input is returned unchanged.
        """
        if not description:
            return description

        try:
            import re

            # Defensive underscore cleanup.
            description = re.sub(r'\b(\w+)_(\w+)\b', r'\1 \2', description)

            # Patterns that tend to produce grammar problems downstream.
            problematic_patterns = [
                (r'\s+,\s+', ', '),             # stray space before a comma
                (r'\bIn\s*,', 'In the area,'),  # dangling "In ,"
                (r'\s+\.', '.'),                # space before a full stop
            ]

            for pattern, replacement in problematic_patterns:
                description = re.sub(pattern, replacement, description)

            return description.strip()

        except Exception as e:
            self.logger.warning(f"Error in enhanced input cleaning: {str(e)}")
            return description

    def _setup_enhancement_template(self):
        """Define the scene-description enhancement template."""
        self.enhance_description_template = """
<|system|>
You are an expert visual analyst. Your task is to improve the readability and fluency of scene descriptions using STRICT factual accuracy.
Your **top priority is to avoid hallucination** or fabrication. You are working in a computer vision pipeline using object detection (YOLO) and image embeddings. You MUST treat the input object list as a whitelist. Do not speculate beyond this list.
</|system|>
<|user|>
Rewrite the following scene description to be fluent and clear. DO NOT add any objects, events, or spatial relationships that are not explicitly present in the original or object list.
ORIGINAL:
{original_description}
CRITICAL RULES:
1. CRITICAL ADHERENCE TO INPUT: Strictly adhere to the information explicitly provided in the ORIGINAL description and the {object_list}.
a. NEVER assume or infer room types, object functions, scene purposes, or abstract conceptual zones (e.g., 'personal items zone', 'activity area') unless such concepts, along with their specific constituent objects and locations, are explicitly detailed in the ORIGINAL description or clearly supported by multiple items in the {object_list}.
b. Your role is to rephrase and enhance the provided factual data, not to introduce new conceptual layers or interpretations not directly supported by the input.
2. OBJECT WHITELIST & DETAIL ACCURACY:
a. The provided {object_list} is an exhaustive list of objects confirmed by the vision system. Mention ONLY objects from this list or objects explicitly detailed in the ORIGINAL description.
b. DO NOT invent additional objects or infer the presence of 'various scattered objects' if only a single specific item (e.g., one 'handbag') is mentioned in relation to a category or area. Describe only what is explicitly listed.
3. NEVER speculate on object quantity. If the description says "10 people" , DO NOT say "dozens" or "many". Maintain the original quantity unless specified.
4. SPATIAL ACCURACY - STRICTLY FROM ORIGINAL:
a. Base ALL descriptions of object locations (e.g., 'foreground', 'background', 'middle center') and spatial relationships STRICTLY on the information explicitly provided in the ORIGINAL description.
b. If the ORIGINAL description states an object is 'in the background,' use that exact term. If it specifies 'in the foreground,' use that. If it describes an object as being 'carried by a person', reflect this precise relationship.
c. If the ORIGINAL description is less specific about an object's location (e.g., 'a car is present'), then use general, non-committal terms like 'visible in the scene' or 'present in the image.'
d. DO NOT re-interpret object positions from any perceived understanding of the raw image; your sole source for spatial information is the ORIGINAL description. Do not relocate objects (e.g., moving a carried handbag from the person to 'the background').
5. You MAY describe confirmed materials, colors, and composition style if visually obvious and non-speculative, AND if such details are hinted at or present in the ORIGINAL description or {object_list}.
6. Write 2–4 complete, well-structured sentences with punctuation.
7. Final output MUST be a single fluent paragraph of 60–200 words (not longer). Within this concise format, every sentence should aim to introduce new information or build upon previous statements without significant overlap.
8. Begin your response directly with the scene description. Do NOT include any introductory phrases, explanations, or formatting indicators.
9. Ensure grammatical completeness in all sentences. Each sentence must have a complete subject and predicate structure.
a. NEVER use underscore formatting (e.g., tourist_landmark, urban_intersection). Always use natural spacing (tourist landmark, urban intersection).
b. NEVER begin sentences with incomplete phrases like "In ," or "Overall," without proper subjects. Always ensure complete sentence structure.
c. AVOID redundant or circular phrasing such as "with lights turned illuminating" or "atmosphere of is one of."
d. If you encounter incomplete spatial descriptions like "visible in ," or "positioned in the middle of.", complete them naturally by adding appropriate context such as "visible in the scene" or "positioned in the middle of the frame", ensuring these completions are consistent with the ORIGINAL description. Always ensure spatial descriptions have complete prepositional phrases.
e. GRAMMAR AND FLUENCY CHECK: Ensure all sentences are grammatically flawless and flow naturally. Avoid awkward phrasing or dangling prepositions (e.g., 'glow over ,'). Mentally re-read your generated description to catch and correct such minor errors before finalizing.
10. Vary sentence structures naturally while maintaining grammatical accuracy.
11. CRITICAL: Avoid repeating the mention of specific objects, groups of objects, or their spatial arrangements. Once an object or layout aspect is described, only refer to it again if providing genuinely NEW and DISTINCT information or a significantly different perspective that adds substantial value. Strive for conciseness and information density.
12. Create natural spatial flow by connecting object descriptions organically rather than listing positions mechanically.
13. Use transitional phrases to connect ideas smoothly, varying expression patterns throughout the description.
14. For the concluding sentence, focus on the overall atmosphere, style, perceived activity, or overarching impression of the scene. DO NOT simply restate the primary objects or their layout as a summary or 'backdrop' if they have already been clearly described earlier in the paragraph. The conclusion should offer a higher-level takeaway.
15. When describing quantities or arrangements, use only information explicitly confirmed by the object detection system or ORIGINAL description.
</|user|>
<|assistant|>
"""

    def _setup_verification_template(self):
        """Define the detection-verification template."""
        self.verify_detection_template = """
Task: You are an advanced vision system that verifies computer vision detections for accuracy.
Analyze the following detection results and identify any potential errors or inconsistencies:
SCENE TYPE: {scene_type}
SCENE NAME: {scene_name}
CONFIDENCE: {confidence:.2f}
DETECTED OBJECTS: {detected_objects}
CLIP ANALYSIS RESULTS:
{clip_analysis}
Possible Errors to Check:
1. Objects misidentified (e.g., architectural elements labeled as vehicles)
2. Cultural elements misunderstood (e.g., Asian temple structures labeled as boats)
3. Objects that seem out of place for this type of scene
4. Inconsistencies between different detection systems
If you find potential errors, list them clearly with explanations. If the detections seem reasonable, state that they appear accurate.
Verification Results:
"""

    def _setup_no_detection_template(self):
        """Define the template used when object detection found nothing."""
        self.no_detection_template = """
Task: You are an advanced scene understanding system analyzing an image where standard object detection failed to identify specific objects.
Based on advanced image embeddings (CLIP analysis), we have the following information:
MOST LIKELY SCENE: {top_scene} (confidence: {top_confidence:.2f})
VIEWPOINT: {viewpoint}
LIGHTING: {lighting_condition}
CULTURAL ANALYSIS: {cultural_analysis}
Create a detailed description of what might be in this scene, considering:
1. The most likely type of location or setting
2. Possible architectural or natural elements present
3. The lighting and atmosphere
4. Potential cultural or regional characteristics
Your description should be natural, flowing, and offer insights into what the image likely contains despite the lack of specific object detection.
Scene Description:
"""

    def format_enhancement_prompt(self, scene_data: Dict[str, Any], object_list: str, original_description: str) -> str:
        """
        Format the plain enhancement prompt (no landmark or fallback guidance).

        Args:
            scene_data: Scene metadata; only "scene_type" is read.
            object_list: Text listing the detected objects.
            original_description: Description to be rewritten by the LLM.

        Returns:
            The formatted prompt string.

        Raises:
            PromptTemplateError: If formatting fails.
        """
        try:
            scene_type = scene_data.get("scene_type", "unknown scene")
            cleaned_scene_type = self._clean_scene_type(scene_type)

            # Replace the raw scene type inside the description.
            if scene_type != cleaned_scene_type:
                original_description = original_description.replace(scene_type, cleaned_scene_type)

            return self.enhance_description_template.format(
                original_description=original_description,
                object_list=object_list
            )

        except Exception as e:
            self.logger.error(f"Failed to format enhancement prompt: {str(e)}")
            self.logger.error(traceback.format_exc())
            raise PromptTemplateError(f"Prompt formatting failed: {e}") from e

    def format_verification_prompt(self,
                                   detected_objects: List[Dict],
                                   clip_analysis: Dict[str, Any],
                                   scene_type: str,
                                   scene_name: str,
                                   confidence: float) -> str:
        """
        Format the detection-verification prompt.

        Args:
            detected_objects: List of detected objects.
            clip_analysis: CLIP analysis results.
            scene_type: Scene type.
            scene_name: Scene name.
            confidence: Scene classification confidence.

        Returns:
            str: The formatted verification prompt.

        Raises:
            PromptTemplateError: If template formatting fails.
        """
        try:
            self.logger.debug("Formatting verification prompt")

            objects_str = self._format_objects_for_prompt(detected_objects)
            clip_str = self._format_clip_results(clip_analysis)

            formatted_prompt = self.verify_detection_template.format(
                scene_type=scene_type,
                scene_name=scene_name,
                confidence=confidence,
                detected_objects=objects_str,
                clip_analysis=clip_str
            )

            self.logger.debug(f"Verification prompt formatted successfully (length: {len(formatted_prompt)})")
            return formatted_prompt

        except Exception as e:
            error_msg = f"Failed to format verification prompt: {str(e)}"
            self.logger.error(error_msg)
            self.logger.error(traceback.format_exc())
            raise PromptTemplateError(error_msg) from e

    def format_no_detection_prompt(self, clip_analysis: Dict[str, Any]) -> str:
        """
        Format the prompt used when no objects were detected.

        Args:
            clip_analysis: CLIP analysis results; "top_scene", "viewpoint"
                and "lighting_condition" are read as (label, score) pairs.

        Returns:
            str: The formatted no-detection prompt.

        Raises:
            PromptTemplateError: If template formatting fails.
        """
        try:
            self.logger.debug("Formatting no-detection prompt")

            top_scene, top_confidence = clip_analysis.get("top_scene", ("unknown", 0))
            viewpoint = clip_analysis.get("viewpoint", ("standard", 0))[0]
            lighting = clip_analysis.get("lighting_condition", ("unknown", 0))[0]

            cultural_str = self._format_cultural_analysis(clip_analysis.get("cultural_analysis", {}))

            formatted_prompt = self.no_detection_template.format(
                top_scene=top_scene,
                top_confidence=top_confidence,
                viewpoint=viewpoint,
                lighting_condition=lighting,
                cultural_analysis=cultural_str
            )

            self.logger.debug(f"No-detection prompt formatted successfully (length: {len(formatted_prompt)})")
            return formatted_prompt

        except Exception as e:
            error_msg = f"Failed to format no-detection prompt: {str(e)}"
            self.logger.error(error_msg)
            self.logger.error(traceback.format_exc())
            raise PromptTemplateError(error_msg) from e

    def _clean_scene_type(self, scene_type: str) -> str:
        """
        Turn a scene-type identifier into prompt-friendly text.

        Args:
            scene_type: Raw scene type, e.g. "city_street".

        Returns:
            str: "scene" for falsy input; a title-cased, space-separated
            form when the input contains underscores; otherwise the input.
        """
        if not scene_type:
            return "scene"

        if '_' in scene_type:
            return ' '.join(word.capitalize() for word in scene_type.split('_'))

        return scene_type

    def _format_objects_for_prompt(self, objects: List[Dict]) -> str:
        """
        Render the detected objects as a bullet list for a prompt.

        Args:
            objects: List of detected objects.

        Returns:
            str: Bullet list text, "No objects detected" for empty input,
            or "Object formatting error" if rendering fails.
        """
        if not objects:
            return "No objects detected"

        try:
            formatted = []
            for obj in objects:
                class_name = obj.get("class_name", "unknown")
                confidence = obj.get("confidence", 0)
                formatted.append(f"{class_name} (confidence: {confidence:.2f})")

            return "\n- " + "\n- ".join(formatted)

        except Exception as e:
            self.logger.warning(f"Error formatting objects: {str(e)}")
            return "Object formatting error"

    def _format_clip_results(self, clip_analysis: Dict) -> str:
        """
        Render CLIP analysis results as text for a prompt.

        Args:
            clip_analysis: CLIP analysis result dictionary.

        Returns:
            str: Multi-line summary, "No CLIP analysis available" when the
            input is empty or contains an "error" key, or
            "CLIP analysis formatting error" if rendering fails.
        """
        if not clip_analysis or "error" in clip_analysis:
            return "No CLIP analysis available"

        try:
            parts = ["CLIP Analysis Results:"]

            top_scene, confidence = clip_analysis.get("top_scene", ("unknown", 0))
            parts.append(f"- Most likely scene: {top_scene} (confidence: {confidence:.2f})")

            viewpoint, vp_conf = clip_analysis.get("viewpoint", ("standard", 0))
            parts.append(f"- Camera viewpoint: {viewpoint} (confidence: {vp_conf:.2f})")

            # Only the first three object combinations are reported.
            if "object_combinations" in clip_analysis:
                combos = [
                    f"{combo} ({score:.2f})"
                    for combo, score in clip_analysis["object_combinations"][:3]
                ]
                parts.append(f"- Object combinations: {', '.join(combos)}")

            if "cultural_analysis" in clip_analysis:
                parts.append("- Cultural analysis:")
                for culture_type, data in clip_analysis["cultural_analysis"].items():
                    best_desc = data.get("best_description", "")
                    desc_conf = data.get("confidence", 0)
                    parts.append(f" * {culture_type}: {best_desc} ({desc_conf:.2f})")

            return "\n".join(parts)

        except Exception as e:
            self.logger.warning(f"Error formatting CLIP results: {str(e)}")
            return "CLIP analysis formatting error"

    def _format_cultural_analysis(self, cultural_analysis: Dict) -> str:
        """
        Render the cultural analysis results as text.

        Args:
            cultural_analysis: Mapping of culture type to a dict with
                "best_description" and "confidence".

        Returns:
            str: One line per culture type,
            "No specific cultural elements detected" for empty input, or
            "Cultural analysis formatting error" if rendering fails.
        """
        if not cultural_analysis:
            return "No specific cultural elements detected"

        try:
            parts = []
            for culture_type, data in cultural_analysis.items():
                best_desc = data.get("best_description", "")
                desc_conf = data.get("confidence", 0)
                parts.append(f"{culture_type}: {best_desc} (confidence: {desc_conf:.2f})")

            return "\n".join(parts)

        except Exception as e:
            self.logger.warning(f"Error formatting cultural analysis: {str(e)}")
            return "Cultural analysis formatting error"

    def get_template_info(self) -> Dict[str, Any]:
        """
        Describe the templates managed by this instance.

        Returns:
            Dict[str, Any]: Template count, template attribute names and a
            fixed "success" initialization status.
        """
        return {
            "templates_count": 3,
            "available_templates": [
                "enhance_description_template",
                "verify_detection_template",
                "no_detection_template"
            ],
            "initialization_status": "success"
        }
region_analyzer.py ADDED
@@ -0,0 +1,487 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import logging
3
+ import traceback
4
+ from typing import Dict, List, Any
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+ class RegionAnalyzer:
9
+ """
10
+ 負責處理圖像區域劃分和基礎空間分析功能
11
+ 專注於3x3網格的區域劃分、物件分布分析和空間多樣性計算
12
+ """
13
+
14
def __init__(self):
    """Set up the analyzer with the cells of a 3x3 grid over the image.

    Each entry of ``self.regions`` maps a cell name such as "top_left" to
    its ``(x1, y1, x2, y2)`` bounds in normalized image coordinates.
    """
    try:
        # Shared cut points for both axes.
        cuts = (0, 1/3, 2/3, 1)
        row_labels = ("top", "middle", "bottom")
        col_labels = ("left", "center", "right")

        self.regions = {
            f"{row}_{col}": (cuts[c], cuts[r], cuts[c + 1], cuts[r + 1])
            for r, row in enumerate(row_labels)
            for c, col in enumerate(col_labels)
        }
        logger.info("RegionAnalyzer initialized successfully with 3x3 grid regions")
    except Exception as e:
        logger.error(f"Failed to initialize RegionAnalyzer: {str(e)}")
        logger.error(traceback.format_exc())
        raise
34
+
35
def determine_region(self, x: float, y: float) -> str:
    """
    Find the grid cell that contains a point.

    Cells are half-open (``low <= v < high``) so neighbouring cells never
    overlap. The outermost cells additionally own the image edge: a
    coordinate of exactly 1.0 is inside the documented 0-1 range and used
    to fall through to "unknown".

    Args:
        x: Normalized x coordinate (0-1).
        y: Normalized y coordinate (0-1).

    Returns:
        The cell name, or "unknown" when the point lies outside every cell
        or an error occurs.
    """
    def _within(value, low, high):
        # Closed upper bound only for the last cell on an axis (high == 1).
        return low <= value < high or (high >= 1 and value == high)

    try:
        for region_name, (x1, y1, x2, y2) in self.regions.items():
            if _within(x, x1, x2) and _within(y, y1, y2):
                return region_name

        logger.warning(f"Point ({x}, {y}) does not fall into any defined region")
        return "unknown"

    except Exception as e:
        logger.error(f"Error determining region for point ({x}, {y}): {str(e)}")
        logger.error(traceback.format_exc())
        return "unknown"
58
+
59
def get_spatial_description_phrase(self, region: str) -> str:
    """
    Turn a region id into a complete prepositional phrase.

    Args:
        region: Region identifier such as "middle_center" or "top_left".

    Returns:
        str: A phrase like "in the upper left area". Empty, blank or
        "unknown" input yields "within the visible area"; an unexpected
        error yields an empty string.
    """
    # Phrases for the canonical region names (underscores already removed).
    exact_phrases = {
        "top left": "in the upper left area",
        "top center": "in the upper area",
        "top right": "in the upper right area",
        "middle left": "on the left side",
        "middle center": "in the center",
        "center": "in the center",
        "middle right": "on the right side",
        "bottom left": "in the lower left area",
        "bottom center": "in the lower area",
        "bottom right": "in the lower right area"
    }
    # Keyword rules for non-canonical names, checked in order: corners
    # first, then single directions.
    keyword_rules = (
        (("top", "left"), "in the upper left area"),
        (("top", "right"), "in the upper right area"),
        (("bottom", "left"), "in the lower left area"),
        (("bottom", "right"), "in the lower right area"),
        (("top",), "in the upper area"),
        (("bottom",), "in the lower area"),
        (("left",), "on the left side"),
        (("right",), "on the right side"),
    )

    try:
        if not region or region.strip() == "" or region == "unknown":
            return "within the visible area"

        normalized = region.replace('_', ' ').strip().lower()

        phrase = exact_phrases.get(normalized)
        if phrase is not None:
            return phrase

        for keywords, candidate in keyword_rules:
            if all(keyword in normalized for keyword in keywords):
                return candidate

        if "center" in normalized or "middle" in normalized:
            return "in the center"

        # Unrecognised region: fall back to a generic description.
        return f"in the {normalized} area"

    except Exception as e:
        logger.warning(f"Error generating spatial description for region '{region}': {str(e)}")
        return ""
121
+
122
+ def get_contextual_spatial_description(self, region: str, object_type: str = "") -> str:
123
+ """
124
+ 根據物件類型提供更具情境的空間描述
125
+
126
+ Args:
127
+ region: 區域標識符
128
+ object_type: 物件類型,用於優化描述語境
129
+
130
+ Returns:
131
+ str: 情境化的空間描述短語
132
+ """
133
+ try:
134
+ # 獲取基礎空間描述
135
+ base_description = self.get_spatial_description_phrase(region)
136
+
137
+ if not base_description:
138
+ return ""
139
+
140
+ # 根據物件類型調整描述語境
141
+ if object_type:
142
+ object_type_lower = object_type.lower()
143
+
144
+ # 對於辨識到人相關,用更自然的位置描述
145
+ if "person" in object_type_lower or "people" in object_type_lower:
146
+ if "center" in base_description:
147
+ return "in the central area"
148
+ elif "upper" in base_description:
149
+ return "in the background"
150
+ elif "lower" in base_description:
151
+ return "in the foreground"
152
+
153
+ # 對於車輛,強調道路位置
154
+ elif any(vehicle in object_type_lower for vehicle in ["car", "vehicle", "truck", "bus"]):
155
+ if "left" in base_description:
156
+ return "on the left side of the scene"
157
+ elif "right" in base_description:
158
+ return "on the right side of the scene"
159
+ elif "center" in base_description:
160
+ return "in the central area"
161
+
162
+ # 對於交通設施,使用更具體的位置描述
163
+ elif "traffic" in object_type_lower:
164
+ if "upper" in base_description:
165
+ return "positioned in the upper portion"
166
+ elif "center" in base_description:
167
+ return "centrally positioned"
168
+ else:
169
+ return base_description.replace("in the", "positioned in the")
170
+
171
+ return base_description
172
+
173
+ except Exception as e:
174
+ logger.warning(f"Error generating contextual spatial description: {str(e)}")
175
+ return self.get_spatial_description_phrase(region)
176
+
177
+
178
+ def validate_region_input(self, region: str) -> bool:
179
+ """
180
+ 驗證region輸入是否有效
181
+
182
+ Args:
183
+ region: 待驗證的區域標識符
184
+
185
+ Returns:
186
+ bool: 是否為有效的region
187
+ """
188
+ try:
189
+ if not region or region.strip() == "":
190
+ return False
191
+
192
+ # 清理並檢查是否為已知區域
193
+ clean_region = region.replace('_', ' ').strip().lower()
194
+
195
+ known_regions = [
196
+ "top left", "top center", "top right",
197
+ "middle left", "middle center", "middle right",
198
+ "bottom left", "bottom center", "bottom right",
199
+ "center", "unknown"
200
+ ]
201
+
202
+ # 直接匹配或包含關鍵詞匹配
203
+ if clean_region in known_regions:
204
+ return True
205
+
206
+ # 檢查是否包含有效的位置關鍵詞組合
207
+ position_keywords = ["top", "bottom", "left", "right", "center", "middle"]
208
+ has_valid_keyword = any(keyword in clean_region for keyword in position_keywords)
209
+
210
+ return has_valid_keyword
211
+
212
+ except Exception as e:
213
+ logger.warning(f"Error validating region input '{region}': {str(e)}")
214
+ return False
215
+
216
+ def get_enhanced_directional_description(self, region: str) -> str:
217
+ """
218
+ 增強版的方位描述生成,提供更豐富的方位資訊
219
+ 擴展原有的get_directional_description方法功能
220
+
221
+ Args:
222
+ region: 區域名稱
223
+
224
+ Returns:
225
+ str: 增強的方位描述字串
226
+ """
227
+ try:
228
+ if not self.validate_region_input(region):
229
+ return "central"
230
+
231
+ region_lower = region.replace('_', ' ').strip().lower()
232
+
233
+ # 用比較準確的方位映射
234
+ direction_mappings = {
235
+ "top left": "northwest",
236
+ "top center": "north",
237
+ "top right": "northeast",
238
+ "middle left": "west",
239
+ "middle center": "central",
240
+ "center": "central",
241
+ "middle right": "east",
242
+ "bottom left": "southwest",
243
+ "bottom center": "south",
244
+ "bottom right": "southeast"
245
+ }
246
+
247
+ if region_lower in direction_mappings:
248
+ return direction_mappings[region_lower]
249
+
250
+ # 模糊匹配邏輯保持與原方法相同
251
+ if "top" in region_lower and "left" in region_lower:
252
+ return "northwest"
253
+ elif "top" in region_lower and "right" in region_lower:
254
+ return "northeast"
255
+ elif "bottom" in region_lower and "left" in region_lower:
256
+ return "southwest"
257
+ elif "bottom" in region_lower and "right" in region_lower:
258
+ return "southeast"
259
+ elif "top" in region_lower:
260
+ return "north"
261
+ elif "bottom" in region_lower:
262
+ return "south"
263
+ elif "left" in region_lower:
264
+ return "west"
265
+ elif "right" in region_lower:
266
+ return "east"
267
+ else:
268
+ return "central"
269
+
270
+ except Exception as e:
271
+ logger.error(f"Error getting enhanced directional description for region '{region}': {str(e)}")
272
+ return "central"
273
+
274
+ def analyze_regions(self, detected_objects: List[Dict]) -> Dict:
275
+ """
276
+ 分析物件在各區域的分布情況
277
+
278
+ Args:
279
+ detected_objects: 包含位置資訊的檢測物件列表
280
+
281
+ Returns:
282
+ 包含區域分析結果的字典
283
+ """
284
+ try:
285
+ if not detected_objects:
286
+ logger.warning("No detected objects provided for region analysis")
287
+ return {
288
+ "counts": {region: 0 for region in self.regions.keys()},
289
+ "main_focus": [],
290
+ "objects_by_region": {region: [] for region in self.regions.keys()}
291
+ }
292
+
293
+ # 計算每個區域的物件數量
294
+ region_counts = {region: 0 for region in self.regions.keys()}
295
+ region_objects = {region: [] for region in self.regions.keys()}
296
+
297
+ for obj in detected_objects:
298
+ try:
299
+ region = obj.get("region", "unknown")
300
+ if region in region_counts:
301
+ region_counts[region] += 1
302
+ region_objects[region].append({
303
+ "class_id": obj.get("class_id"),
304
+ "class_name": obj.get("class_name")
305
+ })
306
+ else:
307
+ logger.warning(f"Unknown region '{region}' found in object")
308
+
309
+ except Exception as e:
310
+ logger.error(f"Error processing object in region analysis: {str(e)}")
311
+ continue
312
+
313
+ # 確定主要焦點區域(按物件數量排序的前1-2個區域)
314
+ sorted_regions = sorted(region_counts.items(), key=lambda x: x[1], reverse=True)
315
+ main_regions = [region for region, count in sorted_regions if count > 0][:2]
316
+
317
+ result = {
318
+ "counts": region_counts,
319
+ "main_focus": main_regions,
320
+ "objects_by_region": region_objects
321
+ }
322
+
323
+ logger.info(f"Region analysis completed. Main focus areas: {main_regions}")
324
+ return result
325
+
326
+ except Exception as e:
327
+ logger.error(f"Error in region analysis: {str(e)}")
328
+ logger.error(traceback.format_exc())
329
+ # 返回空的結果結構而不是拋出異常
330
+ return {
331
+ "counts": {region: 0 for region in self.regions.keys()},
332
+ "main_focus": [],
333
+ "objects_by_region": {region: [] for region in self.regions.keys()}
334
+ }
335
+
336
+ def create_distribution_map(self, detected_objects: List[Dict]) -> Dict:
337
+ """
338
+ 創建物件在各區域分布的詳細地圖,用於空間分析
339
+
340
+ Args:
341
+ detected_objects: 檢測到的物件列表
342
+
343
+ Returns:
344
+ 包含各區域分布詳情的字典
345
+ """
346
+ try:
347
+ if not detected_objects:
348
+ logger.warning("No detected objects provided for distribution map creation")
349
+ return self._get_empty_distribution_map()
350
+
351
+ distribution = {}
352
+
353
+ # 初始化所有區域
354
+ for region in self.regions.keys():
355
+ distribution[region] = {
356
+ "total": 0,
357
+ "objects": {},
358
+ "density": 0
359
+ }
360
+
361
+ # 填充分布資料
362
+ for obj in detected_objects:
363
+ try:
364
+ region = obj.get("region", "unknown")
365
+ class_id = obj.get("class_id")
366
+ class_name = obj.get("class_name", "unknown")
367
+
368
+ if region not in distribution:
369
+ logger.warning(f"Unknown region '{region}' found, skipping object")
370
+ continue
371
+
372
+ distribution[region]["total"] += 1
373
+
374
+ if class_id not in distribution[region]["objects"]:
375
+ distribution[region]["objects"][class_id] = {
376
+ "name": class_name,
377
+ "count": 0,
378
+ "positions": []
379
+ }
380
+
381
+ distribution[region]["objects"][class_id]["count"] += 1
382
+
383
+ # 儲存位置資訊用於空間關係分析
384
+ normalized_center = obj.get("normalized_center")
385
+ if normalized_center:
386
+ distribution[region]["objects"][class_id]["positions"].append(normalized_center)
387
+
388
+ except Exception as e:
389
+ logger.error(f"Error processing object in distribution map: {str(e)}")
390
+ continue
391
+
392
+ # 計算每個區域的物件密度
393
+ for region, data in distribution.items():
394
+ # 假設所有區域在網格中大小相等
395
+ data["density"] = data["total"] / 1
396
+
397
+ logger.info("Distribution map created successfully")
398
+ return distribution
399
+
400
+ except Exception as e:
401
+ logger.error(f"Error creating distribution map: {str(e)}")
402
+ logger.error(traceback.format_exc())
403
+ return self._get_empty_distribution_map()
404
+
405
+ def calculate_spatial_diversity(self, detected_objects: List[Dict]) -> float:
406
+ """
407
+ 計算物件空間分布的多樣性
408
+ 評估物件是否分散在不同區域,避免所有物件集中在單一區域
409
+
410
+ Args:
411
+ detected_objects: 檢測到的物件列表
412
+
413
+ Returns:
414
+ 空間多樣性評分 (0.0-1.0)
415
+ """
416
+ try:
417
+ if not detected_objects:
418
+ logger.warning("No detected objects provided for spatial diversity calculation")
419
+ return 0.0
420
+
421
+ regions = set()
422
+ for obj in detected_objects:
423
+ region = obj.get("region", "center")
424
+ regions.add(region)
425
+
426
+ unique_regions = len(regions)
427
+ diversity_score = min(unique_regions / 2.0, 1.0)
428
+
429
+ logger.info(f"Spatial diversity calculated: {diversity_score:.3f} (regions: {unique_regions})")
430
+ return diversity_score
431
+
432
+ except Exception as e:
433
+ logger.error(f"Error calculating spatial diversity: {str(e)}")
434
+ logger.error(traceback.format_exc())
435
+ return 0.0
436
+
437
+ def get_directional_description(self, region: str) -> str:
438
+ """
439
+ 將區域名稱轉換為方位描述(東西南北)
440
+
441
+ Args:
442
+ region: 區域名稱
443
+
444
+ Returns:
445
+ 方位描述字串
446
+ """
447
+ try:
448
+ region_lower = region.lower()
449
+
450
+ if "top" in region_lower and "left" in region_lower:
451
+ return "northwest"
452
+ elif "top" in region_lower and "right" in region_lower:
453
+ return "northeast"
454
+ elif "bottom" in region_lower and "left" in region_lower:
455
+ return "southwest"
456
+ elif "bottom" in region_lower and "right" in region_lower:
457
+ return "southeast"
458
+ elif "top" in region_lower:
459
+ return "north"
460
+ elif "bottom" in region_lower:
461
+ return "south"
462
+ elif "left" in region_lower:
463
+ return "west"
464
+ elif "right" in region_lower:
465
+ return "east"
466
+ else:
467
+ return "central"
468
+
469
+ except Exception as e:
470
+ logger.error(f"Error getting directional description for region '{region}': {str(e)}")
471
+ return "central"
472
+
473
+ def _get_empty_distribution_map(self) -> Dict:
474
+ """
475
+ 返回空的分布地圖結構
476
+
477
+ Returns:
478
+ 空的分布地圖字典
479
+ """
480
+ distribution = {}
481
+ for region in self.regions.keys():
482
+ distribution[region] = {
483
+ "total": 0,
484
+ "objects": {},
485
+ "density": 0
486
+ }
487
+ return distribution
requirements.txt CHANGED
@@ -14,5 +14,4 @@ accelerate
14
  bitsandbytes
15
  sentencepiece
16
  huggingface_hub>=0.19.0
17
- scikit-image
18
  urllib3>=1.26.0
 
14
  bitsandbytes
15
  sentencepiece
16
  huggingface_hub>=0.19.0
 
17
  urllib3>=1.26.0
response_processor.py ADDED
@@ -0,0 +1,1049 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import logging
3
+ import traceback
4
+ from typing import Dict, List, Any, Optional, Set
5
+
6
+
7
class ResponseProcessingError(Exception):
    """Custom exception for errors raised while processing an LLM response."""
10
+
11
+
12
+ class ResponseProcessor:
13
+ """
14
+ 負責處理和清理LLM模型輸出的回應。
15
+ 包含格式清理、重複內容檢測、語法完整性確保等功能。
16
+ """
17
+
18
def __init__(self):
    """Set up the class-named logger, then build the cleaning rule tables."""
    log = logging.getLogger(self.__class__.__name__)
    self.logger = log

    # Attach a stream handler only once per logger name so that creating
    # several processors does not duplicate every log line.
    if not log.handlers:
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        )
        log.addHandler(stream_handler)
        log.setLevel(logging.INFO)

    # Build the cleaning rules and replacement dictionaries.
    self._initialize_cleaning_rules()
    log.info("ResponseProcessor initialized successfully")
32
+
33
+
34
def _initialize_cleaning_rules(self):
    """
    Build the rule tables and replacement dictionaries used while cleaning.

    Sets replacement_alternatives, prefixes_to_remove, suffixes_to_remove,
    repetitive_patterns, slash_replacements and underscore_replacements.

    Raises:
        ResponseProcessingError: If any table cannot be built.
    """
    try:
        # Alternatives for words the model tends to repeat.
        self.replacement_alternatives = dict(
            visible=['present', 'evident', 'apparent', 'observable'],
            positioned=['arranged', 'placed', 'set', 'organized'],
            located=['found', 'placed', 'situated', 'established'],
            situated=['placed', 'positioned', 'arranged', 'set'],
            appears=['seems', 'looks', 'presents', 'exhibits'],
            features=['includes', 'contains', 'displays', 'showcases'],
            shows=['reveals', 'presents', 'exhibits', 'demonstrates'],
            displays=['presents', 'exhibits', 'shows', 'reveals'],
        )

        # Introductory phrases to strip from the start of a response.
        self.prefixes_to_remove = [
            "Here's the enhanced description:",
            "Enhanced description:",
            "Here is the enhanced scene description:",
            "I've enhanced the description while preserving all factual details:",
            "Enhanced Description:",
            "Scene Description:",
            "Description:",
            "Here is the enhanced description:",
            "Here's the enhanced description:",
            "Here is a rewritten scene description that adheres to the provided critical rules:",
            "Here is the rewritten scene description:",
            "Here's a rewritten scene description:",
            "The rewritten scene description is as follows:",
        ]

        # Closing phrases to strip from the end of a response.
        self.suffixes_to_remove = [
            "I've maintained all the key factual elements",
            "I've preserved all the factual details",
            "All factual elements have been maintained",
        ]

        # Repetition detectors: one pattern per overused word, plus one for
        # repeated "This <noun> ..." sentence openings.
        overused_words = ('visible', 'positioned', 'located', 'situated', 'appears', 'features')
        self.repetitive_patterns = [
            (rf'\b({word})\b.*?\b({word})\b', f'Multiple uses of "{word}" detected')
            for word in overused_words
        ]
        self.repetitive_patterns.append(
            (r'\bThis\s+(\w+)\s+.*?\bThis\s+\1\b', 'Repetitive sentence structure detected')
        )

        # Slash-joined word pairs and the phrase that replaces them. Each pair
        # is registered in both orders, forward order first.
        slash_pairs = (
            ('sunrise', 'sunset', 'warm lighting'),
            ('day', 'night', 'ambient lighting'),
            ('morning', 'evening', 'soft lighting'),
            ('dawn', 'dusk', 'gentle lighting'),
            ('sunny', 'cloudy', 'natural lighting'),
            ('bright', 'dark', 'varied lighting'),
            ('light', 'shadow', 'contrasting illumination'),
            ('indoor', 'outdoor', 'mixed environment'),
            ('inside', 'outside', 'transitional space'),
            ('urban', 'rural', 'diverse landscape'),
            ('modern', 'traditional', 'architectural blend'),
            ('old', 'new', 'varied architecture'),
            ('busy', 'quiet', 'dynamic atmosphere'),
            ('crowded', 'empty', 'varying occupancy'),
            ('hot', 'cold', 'comfortable temperature'),
            ('wet', 'dry', 'mixed conditions'),
            ('summer', 'winter', 'seasonal atmosphere'),
            ('spring', 'autumn', 'transitional season'),
            ('left', 'right', 'balanced composition'),
            ('near', 'far', 'layered perspective'),
            ('high', 'low', 'varied elevation'),
            ('big', 'small', 'diverse scale'),
            ('wide', 'narrow', 'varied width'),
            ('open', 'closed', 'flexible space'),
            ('public', 'private', 'community space'),
            ('formal', 'informal', 'relaxed setting'),
            ('commercial', 'residential', 'mixed-use area'),
        )
        self.slash_replacements = {}
        for first, second, phrase in slash_pairs:
            self.slash_replacements[f'{first}/{second}'] = phrase
            self.slash_replacements[f'{second}/{first}'] = phrase

        # Known underscore identifiers; each maps to itself with the
        # underscores turned into spaces.
        underscore_terms = (
            'urban_intersection', 'tourist_landmark', 'historical_site',
            'religious_building', 'natural_landmark', 'commercial_area',
            'residential_area', 'public_space', 'outdoor_scene',
            'indoor_scene', 'street_scene', 'city_center',
            'shopping_district', 'business_district', 'traffic_light',
            'street_lamp', 'parking_meter', 'fire_hydrant',
            'bus_stop', 'train_station', 'police_car',
            'fire_truck', 'school_bus', 'time_of_day',
            'weather_condition', 'lighting_condition', 'atmospheric_condition',
            'human_activity', 'pedestrian_traffic', 'vehicle_traffic',
            'social_gathering', 'object_detection', 'scene_analysis',
            'image_classification', 'computer_vision',
        )
        self.underscore_replacements = {
            term: term.replace('_', ' ') for term in underscore_terms
        }

        self.logger.info("Cleaning rules initialized successfully")

    except Exception as e:
        error_msg = f"Failed to initialize cleaning rules: {str(e)}"
        self.logger.error(error_msg)
        self.logger.error(traceback.format_exc())
        raise ResponseProcessingError(error_msg) from e
188
+
189
def clean_response(self, response: str, model_type: str = "general") -> str:
    """
    Clean a raw LLM response.

    Args:
        response: Raw LLM response.
        model_type: Model type; any value containing "llama" selects the
            Llama-specific cleaning path.

    Returns:
        str: The cleaned response.

    Raises:
        ResponseProcessingError: If the response is empty or cleaning fails.
    """
    if not response:
        raise ResponseProcessingError("Empty response provided for cleaning")

    try:
        self.logger.debug(f"Starting response cleaning (original length: {len(response)})")

        # Pick the cleaning strategy from the model type.
        if "llama" in model_type.lower():
            cleaner = self._clean_llama_response
        else:
            cleaner = self._clean_general_response
        cleaned = cleaner(response)

        # If cleaning removed too much, fall back to recovering content from
        # the untouched original response.
        if len(cleaned.strip()) < 40:
            self.logger.warning("Cleaned response too short, attempting recovery")
            cleaned = self._recover_from_overcleaning(response)

        # Final validation of the result.
        self._validate_cleaned_response(cleaned)

        self.logger.debug(f"Response cleaning completed (final length: {len(cleaned)})")
        return cleaned

    except Exception as e:
        error_msg = f"Response cleaning failed: {str(e)}"
        self.logger.error(error_msg)
        self.logger.error(traceback.format_exc())
        raise ResponseProcessingError(error_msg) from e
234
+
235
def _clean_llama_response(self, response: str) -> str:
    """
    Clean a response produced by a Llama model.

    Runs the general cleaning pipeline first, then strips the introductory
    and closing phrases Llama models tend to add.

    Args:
        response: Raw Llama response.

    Returns:
        str: The cleaned response.
    """
    # Apply the general cleaning first.
    response = self._clean_general_response(response)

    # Llama-specific introductory phrases.
    llama_prefixes = [
        "Here's the enhanced description:",
        "Enhanced description:",
        "Here is the enhanced scene description:",
        "I've enhanced the description while preserving all factual details:"
    ]

    for prefix in llama_prefixes:
        if response.lower().startswith(prefix.lower()):
            response = response[len(prefix):].strip()

    # Llama-specific closing phrases.
    llama_suffixes = [
        "I've maintained all the key factual elements",
        "I've preserved all the factual details",
        "All factual elements have been maintained"
    ]

    for suffix in llama_suffixes:
        if response.lower().endswith(suffix.lower()):
            # Cut by length: the check above ignores case, so a case-sensitive
            # rfind() may return -1 and would then drop only the last character.
            response = response[:len(response) - len(suffix)].strip()

    return response
272
+
273
def _clean_general_response(self, response: str) -> str:
    """
    Run the general-purpose cleaning pipeline on a response.

    Args:
        response: Raw response.

    Returns:
        str: The response after every cleaning step has been applied in order.
    """
    # Each step takes the text and returns the updated text; the order matters.
    pipeline = (
        self._critical_format_preprocess,      # fix the most common format problems first
        self._remove_system_markers,           # 1. system markers
        self._remove_introduction_prefixes,    # 2. introductory prefixes
        self._remove_format_markers,           # 3. format markers and context tags
        self._clean_scene_type_references,     # 4. scene type references
        self._normalize_punctuation,           # 5. punctuation
        self._remove_duplicate_sentences,      # 6. duplicate sentences
        self._handle_repetitive_vocabulary,    # 7. repeated vocabulary
        self._ensure_grammatical_completeness, # 8. grammatical completeness
        self._control_word_length,             # 9. length control
        self._final_formatting,                # 10. final formatting
    )

    for step in pipeline:
        response = step(response)

    return response
316
+
317
+
318
def _critical_format_preprocess(self, response: str) -> str:
    """
    Fix the most common formatting problems before the main cleaning steps.

    Handles, in order: slash-joined word pairs, underscore identifiers,
    known incomplete-sentence fragments and "N persons" grammar.

    Args:
        response: Raw response.

    Returns:
        str: The preprocessed response. On any error the text is returned as
        it stood when the error occurred, and a warning is logged.
    """
    if not response:
        return response

    try:
        # Priority 1: slash-joined words.
        # Known pairs are replaced with a descriptive phrase.
        for slash_combo, replacement in self.slash_replacements.items():
            if slash_combo.lower() in response.lower():
                # Mirror the casing style found in the text.
                if slash_combo.upper() in response:
                    replacement_formatted = replacement.upper()
                elif slash_combo.title() in response:
                    replacement_formatted = replacement.title()
                else:
                    replacement_formatted = replacement

                # Replace every occurrence regardless of case.
                response = re.sub(re.escape(slash_combo), replacement_formatted, response, flags=re.IGNORECASE)
                self.logger.debug(f"Replaced slash pattern '{slash_combo}' with '{replacement_formatted}'")

        # Any other word/word pair collapses to the shorter word (the first
        # word wins a tie).
        def _pick_shorter(match):
            word1, word2 = match.groups()
            replacement = word1 if len(word1) <= len(word2) else word2
            self.logger.debug(f"Replaced general slash pattern '{match.group(0)}' with '{replacement}'")
            return replacement

        response = re.sub(r'\b([a-zA-Z]+)/([a-zA-Z]+)\b', _pick_shorter, response)

        # Priority 2: underscore identifiers.
        for underscore_combo, replacement in self.underscore_replacements.items():
            if underscore_combo in response:
                response = response.replace(underscore_combo, replacement)
                self.logger.debug(f"Replaced underscore pattern '{underscore_combo}' with '{replacement}'")

        # Three lowercase words joined by underscores: word_word_word.
        response = re.sub(r'\b([a-z]+)_([a-z]+)_([a-z]+)\b', r'\1 \2 \3', response)

        # Any remaining two-word underscore pattern: word_word.
        response = re.sub(r'\b([a-zA-Z]+)_([a-zA-Z]+)\b', r'\1 \2', response)

        # Priority 3: repair known incomplete-sentence fragments.
        incomplete_sentence_fixes = [
            (r'\bIn\s*,\s*', 'Throughout the area, '),
            (r'\bOverall,\s+exudes\b', 'Overall, the scene exudes'),
            # Keep the verb: dropping "is" would leave the sentence broken.
            (r'\bThe overall atmosphere of\s+is\b', 'The overall atmosphere is'),
            (r'\bwith its lights turned illuminating\b', 'with its lights illuminating'),
        ]

        for pattern, replacement in incomplete_sentence_fixes:
            response = re.sub(pattern, replacement, response, flags=re.IGNORECASE)

        # Priority 4: "<quantity> persons" -> "<quantity> people" (or
        # "one person"). The matched quantity is reused so that its original
        # capitalization survives the case-insensitive match.
        def _fix_persons(match):
            quantity = " ".join(match.group(1).split())
            noun = "person" if quantity.lower() == "one" else "people"
            return f"{quantity} {noun}"

        response = re.sub(
            r'\b(\d+|one|two|three|four|five|six|seven|eight|nine|ten|'
            r'multiple|several|many|a\s+few|some)\s+persons\b',
            _fix_persons,
            response,
            flags=re.IGNORECASE,
        )

        return response

    except Exception as e:
        self.logger.warning(f"Error in critical format preprocessing: {str(e)}")
        return response
417
+
418
def _remove_system_markers(self, response: str) -> str:
    """
    Strip chat-template tokens, output delimiters and section labels.

    Args:
        response: Text to clean.

    Returns:
        str: The text with system-style markers removed.
    """
    start_tag, end_tag = "[OUTPUT_START]", "[OUTPUT_END]"

    # Drop chat-template tokens such as <|assistant|>.
    text = re.sub(r'<\|.*?\|>', '', response)

    # When both output delimiters are present in the right order, keep only
    # the text between them.
    start_at = text.find(start_tag)
    end_at = text.find(end_tag)
    if -1 not in (start_at, end_at) and start_at < end_at:
        text = text[start_at + len(start_tag):end_at].strip()

    # Remaining labels and remarks, removed one pattern at a time.
    leftover_markers = (
        r'\[.*?\]',
        r'OUTPUT_START\s*:|OUTPUT_END\s*:',
        r'ENHANCED DESCRIPTION\s*:',
        r'Scene Type\s*:.*?(?=\n|$)',
        r'Original Description\s*:.*?(?=\n|$)',
        r'GOOD\s*:|BAD\s*:',
        r'PROBLEM\s*:.*?(?=\n|$)',
        r'</?\|(?:assistant|system|user)\|>',
        r'\(Note:.*?\)',
        r'\(.*?I\'ve.*?\)',
        r'\(.*?as per your request.*?\)',
    )
    for pattern in leftover_markers:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE)

    return text
448
+
449
def _remove_introduction_prefixes(self, response: str) -> str:
    """
    Strip introductory lead-ins from the start of a response.

    Args:
        response: Text to clean.

    Returns:
        str: The text without "Here is ..." style lead-ins and without any of
        the fixed prefixes listed in self.prefixes_to_remove.
    """
    # Flexible lead-ins of the "Here is ... description ...:" family.
    lead_in_patterns = (
        r'^Here\s+is\s+(?:a\s+|the\s+)?(?:rewritten\s+|enhanced\s+)?scene\s+description.*?:\s*',
        r'^The\s+(?:rewritten\s+|enhanced\s+)?(?:scene\s+)?description\s+is.*?:\s*',
        r'^Here\'s\s+(?:a\s+|the\s+)?(?:rewritten\s+|enhanced\s+)?description.*?:\s*',
    )
    text = response
    for lead_in in lead_in_patterns:
        text = re.sub(lead_in, '', text, flags=re.IGNORECASE)

    # Fixed prefixes, compared case-insensitively.
    for fixed_prefix in self.prefixes_to_remove:
        if text.lower().startswith(fixed_prefix.lower()):
            text = text[len(fixed_prefix):].strip()

    return text
467
+
468
def _remove_format_markers(self, response: str) -> str:
    """
    Remove context tags, note lines and Markdown markers.

    Parenthesized text is deliberately left alone so that geographic and
    other detail information inside parentheses is preserved.

    Args:
        response: Text to clean.

    Returns:
        str: The text without the format markers.
    """
    # (pattern, flags) pairs applied in order; every match is deleted.
    cleanup_steps = (
        (r'<\s*Context:.*?>', 0),                 # <Context: ...> tags
        (r'Context:.*?(?=\n|$)', 0),              # "Context:" to end of line
        (r'Note:.*?(?=\n|$)', re.IGNORECASE),     # "Note:" to end of line
        (r'\*\*|\*|__|\|', 0),                    # Markdown emphasis and pipes
        (r'</?\|.*?\|>', 0),                      # leftover special tokens
    )
    for pattern, flags in cleanup_steps:
        response = re.sub(pattern, '', response, flags=flags)

    return response
484
+
485
+
486
def _clean_scene_type_references(self, response: str) -> str:
    """
    Replace raw scene-type identifiers used as sentence subjects.

    A phrase such as "This urban_intersection features" becomes
    "This scene features". Each occurrence is judged on its own: only
    subjects containing an underscore are rewritten, and every rewritten
    occurrence keeps its own verb.

    Args:
        response: Text to clean.

    Returns:
        str: The text with identifier-style scene references neutralized.
    """
    def _neutralize(match):
        # Only identifier-like subjects (with an underscore) are replaced.
        if '_' in match.group(1):
            return f"This scene {match.group(2)}"
        return match.group(0)

    scene_type_pattern = r'This ([a-zA-Z_]+) (features|shows|displays|contains)'
    return re.sub(scene_type_pattern, _neutralize, response)
495
+
496
+ def _normalize_punctuation(self, response: str) -> str:
497
+ """標準化標點符號"""
498
+ # 減少破折號使用
499
+ response = re.sub(r'—', ', ', response)
500
+ response = re.sub(r' - ', ', ', response)
501
+
502
+ # 處理連續標點符號
503
+ response = re.sub(r'([.,;:!?])\1+', r'\1', response)
504
+
505
+ # 修復不完整句子的標點
506
+ response = re.sub(r',\s*$', '.', response)
507
+
508
+ # 修復句號後缺少空格的問題
509
+ response = re.sub(r'([.!?])([A-Z])', r'\1 \2', response)
510
+
511
+ # 清理多餘空格和換行
512
+ response = response.replace('\r', ' ')
513
+ response = re.sub(r'\n+', ' ', response)
514
+ response = re.sub(r'\s{2,}', ' ', response)
515
+
516
+ return response
517
+
518
+
519
+ def _remove_duplicate_sentences(self, response: str, similarity_threshold: float = 0.85) -> str:
520
+ """
521
+ 移除重複或高度相似的句子,使用 Jaccard 相似度進行比較。
522
+ Args:
523
+ response: 原始回應文本。
524
+ similarity_threshold: 認定句子重複的相似度閾值 (0.0 到 1.0)。
525
+ 較高的閾值表示句子需要非常相似才會被移除。
526
+ Returns:
527
+ str: 移除重複句子後的文本。
528
+ """
529
+ try:
530
+ if not response or not response.strip():
531
+ return ""
532
+
533
+ # (?<=[.!?]) 會保留分隔符在句尾, \s+ 會消耗句尾的空格
534
+ # 這樣用 ' ' join 回去時, 標點和下個句子間剛好一個空格
535
+ sentences = re.split(r'(?<=[.!?])\s+', response.strip())
536
+
537
+ unique_sentences_data = [] # Store tuples of (original_sentence, simplified_word_set)
538
+
539
+ min_sentence_len_for_check = 8 # 簡化後詞彙數少於此值,除非完全相同否則不輕易判斷為重複
540
+
541
+ for sentence in sentences:
542
+ sentence = sentence.strip()
543
+ if not sentence:
544
+ continue
545
+
546
+ # 創建簡化版本用於比較 (小寫,移除標點,分割為詞彙集合)
547
+ # 保留數字,因為數字可能是關鍵資訊
548
+ simplified_text = re.sub(r'[^\w\s\d]', '', sentence.lower())
549
+ current_sentence_words = set(simplified_text.split())
550
+
551
+ if not current_sentence_words: # 如果處理後是空集合,跳過
552
+ continue
553
+
554
+ is_duplicate = False
555
+ # 與已保留的唯一句子比較
556
+ for i, (kept_sentence_text, kept_sentence_words) in enumerate(unique_sentences_data):
557
+ # Jaccard Index
558
+ intersection_len = len(current_sentence_words.intersection(kept_sentence_words))
559
+ union_len = len(current_sentence_words.union(kept_sentence_words))
560
+
561
+ if union_len == 0: # 兩個都是空集合,代表相同句子
562
+ jaccard_similarity = 1.0
563
+ else:
564
+ jaccard_similarity = intersection_len / union_len
565
+
566
+ # 用Jaccard 相似度超過閾值,不是兩個都非常短的句子 (避免 "Yes." 和 "No." 被錯誤合併)
567
+ # 新句子完全被舊句子包含 (且舊句子更長)
568
+ # 舊句子完全被新句子包含 (且新句子更長) -> 這種情況就需要替換
569
+ if jaccard_similarity >= similarity_threshold:
570
+ # 如果當前句子比已保留的句子短,且高度相似,則認為是重複
571
+ if len(current_sentence_words) < len(kept_sentence_words):
572
+ is_duplicate = True
573
+ self.logger.debug(f"Sentence \"{sentence[:30]}...\" marked duplicate (shorter, similar to \"{kept_sentence_text[:30]}...\") Jaccard: {jaccard_similarity:.2f}")
574
+ break
575
+ # 如果當前句子比已保留的句子長,且高度相似,則替換掉已保留的
576
+ elif len(current_sentence_words) > len(kept_sentence_words):
577
+ self.logger.debug(f"Sentence \"{kept_sentence_text[:30]}...\" replaced by longer similar sentence \"{sentence[:30]}...\" Jaccard: {jaccard_similarity:.2f}")
578
+ unique_sentences_data.pop(i) # 移除舊的、較短的句子
579
+
580
+ # 如果長度差不多,但相似度高,保留第一個出現的
581
+ elif current_sentence_words != kept_sentence_words : # 避免完全相同的句子被錯誤地跳過替換邏輯
582
+ is_duplicate = True # 保留先出現的
583
+ self.logger.debug(f"Sentence \"{sentence[:30]}...\" marked duplicate (similar length, similar to \"{kept_sentence_text[:30]}...\") Jaccard: {jaccard_similarity:.2f}")
584
+ break
585
+
586
+ if not is_duplicate:
587
+ unique_sentences_data.append((sentence, current_sentence_words))
588
+
589
+ # 重組唯一句子
590
+ final_sentences = [s_data[0] for s_data in unique_sentences_data]
591
+
592
+ # 確保每個句子以標點結尾 (因為 split 可能會產生沒有標點的最後一個片段)
593
+ reconstructed_response = ""
594
+ for i, s in enumerate(final_sentences):
595
+ s = s.strip()
596
+ if not s: continue
597
+ if not s[-1] in ".!?":
598
+ s += "."
599
+ reconstructed_response += s
600
+ if i < len(final_sentences) - 1:
601
+ reconstructed_response += " " # 在句子間添加空格
602
+
603
+ return reconstructed_response.strip()
604
+
605
+ except Exception as e:
606
+ self.logger.error(f"Error in _remove_duplicate_sentences: {str(e)}")
607
+ self.logger.error(traceback.format_exc())
608
+ return response # 發生錯誤時返回原始回應
609
+
610
+ def _handle_repetitive_vocabulary(self, response: str) -> str:
611
+ """處理重複詞彙,使用 re.sub 和可呼叫的替換函數以提高效率和準確性。"""
612
+ try:
613
+ # 檢測重複模式 (僅警告)
614
+ if hasattr(self, 'repetitive_patterns'):
615
+ for pattern, issue in self.repetitive_patterns:
616
+ if re.search(pattern, response, re.IGNORECASE | re.DOTALL):
617
+ self.logger.warning(f"Text quality issue detected: {issue} in response: \"{response[:100]}...\"")
618
+
619
+ if not hasattr(self, 'replacement_alternatives') or not self.replacement_alternatives:
620
+ return response
621
+
622
+ processed_response = response
623
+
624
+ for word_to_replace, alternatives in self.replacement_alternatives.items():
625
+ if not alternatives: # 如果沒有可用的替代詞,則跳過
626
+ continue
627
+
628
+ # 為每個詞創建一個獨立的計數器和替代索引
629
+ # 使用閉包或一個小類來封裝狀態
630
+ class WordReplacer:
631
+ def __init__(self, alternatives_list):
632
+ self.count = 0
633
+ self.alternative_idx = 0
634
+ self.alternatives_list = alternatives_list
635
+
636
+ def __call__(self, match_obj):
637
+ self.count += 1
638
+ original_word = match_obj.group(0)
639
+ if self.count > 1: # 從第二次出現開始替換
640
+ replacement = self.alternatives_list[self.alternative_idx % len(self.alternatives_list)]
641
+ self.alternative_idx += 1
642
+ # 保持原始大小寫格式
643
+ if original_word.isupper():
644
+ return replacement.upper()
645
+ elif original_word.istitle():
646
+ return replacement.capitalize()
647
+ return replacement
648
+ return original_word # 因為第一次出現, 就不用替換
649
+
650
+ replacer_instance = WordReplacer(alternatives)
651
+ # 使用 \b 確保匹配的是整個單詞
652
+ pattern = re.compile(r'\b' + re.escape(word_to_replace) + r'\b', re.IGNORECASE)
653
+ processed_response = pattern.sub(replacer_instance, processed_response)
654
+
655
+ return processed_response
656
+
657
+ except Exception as e:
658
+ self.logger.error(f"Error in _handle_repetitive_vocabulary: {str(e)}")
659
+ self.logger.error(traceback.format_exc())
660
+ return response # 發生錯誤時返回原始回應
661
+
662
+ def _ensure_grammatical_completeness(self, response: str) -> str:
663
+ """
664
+ 確保語法完整性,處理不��整句子和格式問題
665
+
666
+ Args:
667
+ response: 待檢查的回應文本
668
+
669
+ Returns:
670
+ str: 語法完整的回應文本
671
+ """
672
+ try:
673
+ if not response or not response.strip():
674
+ return response
675
+
676
+ # 第一階段:檢查並修正不完整的句子模式
677
+ incomplete_patterns = [
678
+ # 介詞後直接結束的問題(針對 "over ." 等情況)
679
+ (r'\b(over|under|through|across|along|beneath|beyond|throughout)\s*\.', 'incomplete_preposition'),
680
+ (r'\b(with|without|against|towards|beside|between|among)\s*\.', 'incomplete_preposition'),
681
+ (r'\b(into|onto|upon|within|behind|below|above)\s*\.', 'incomplete_preposition'),
682
+
683
+ # 處理 "In ," 這類缺失詞彙的問題
684
+ (r'\bIn\s*,', 'incomplete_location'),
685
+ (r'\bAt\s*,', 'incomplete_location'),
686
+ (r'\bOn\s*,', 'incomplete_location'),
687
+ (r'\bWith\s*,', 'incomplete_context'),
688
+
689
+ # 不完整的描述模式
690
+ (r'\b(fine|the)\s+(the\s+)?(?:urban|area|scene)\b(?!\s+\w)', 'incomplete_description'),
691
+
692
+ # 連詞或介詞後直接標點的問題
693
+ (r'\b(and|or|but|with|from|in|at|on|by|for|to)\s*[.!?]', 'incomplete_conjunction'),
694
+
695
+ # 重複詞彙
696
+ (r'\b(\w+)\s+\1\b', 'word_repetition'),
697
+
698
+ # 不完整的場景類型引用(如 "urban_intersection" 格式問題)
699
+ (r'\b(\w+)_(\w+)\b', 'underscore_format'),
700
+
701
+ # 地標場景特有問題
702
+ (r'\btourist_landmark\b', 'underscore_format'),
703
+ (r'\burban_intersection\b', 'underscore_format'),
704
+ (r'\bIn\s*,\s*(?=\w)', 'incomplete_prepositional'),
705
+ (r'\bOverall,\s+(?=exudes|shows|displays)(?!\s+(?:the|this|it))', 'missing_subject'),
706
+ (r'\batmosphere of\s+is one of\b', 'redundant_structure'),
707
+ (r'\bwith.*?turned\s+illuminating\b', 'redundant_participle')
708
+ ]
709
+
710
+ for pattern, issue_type in incomplete_patterns:
711
+ try:
712
+ matches = list(re.finditer(pattern, response, re.IGNORECASE))
713
+
714
+ for match in matches:
715
+ if issue_type == 'incomplete_preposition':
716
+ # 處理介詞後直接結束的情況
717
+ response = self._fix_incomplete_preposition(response, match)
718
+
719
+ elif issue_type == 'underscore_format':
720
+ # 將下劃線格式轉換為空格分隔
721
+ original = match.group(0)
722
+ replacement = original.replace('_', ' ')
723
+ response = response.replace(original, replacement)
724
+
725
+ elif issue_type == 'word_repetition':
726
+ # 移除重複的詞彙
727
+ repeated_word = match.group(1)
728
+ response = response.replace(f"{repeated_word} {repeated_word}", repeated_word)
729
+
730
+ elif issue_type == 'incomplete_location' or issue_type == 'incomplete_context':
731
+ # 移除不完整的位置或上下文引用
732
+ response = response.replace(match.group(0), '')
733
+
734
+ elif issue_type == 'incomplete_prepositional':
735
+ # 處理不完整的介詞短語
736
+ response = re.sub(r'\bIn\s*,\s*', 'Throughout the scene, ', response)
737
+
738
+ elif issue_type == 'missing_subject':
739
+ # 為Overall句子添加主語
740
+ response = re.sub(r'\bOverall,\s+(?=exudes)', 'Overall, the scene ', response)
741
+
742
+ elif issue_type == 'redundant_structure':
743
+ # 簡化冗餘結構
744
+ response = re.sub(r'\batmosphere of\s+is one of\b', 'atmosphere is one of', response)
745
+
746
+ elif issue_type == 'redundant_participle':
747
+ # 清理冗餘分詞
748
+ response = re.sub(r'turned\s+illuminating', 'illuminating', response)
749
+
750
+ else:
751
+ # 對於其他不完整模式,直接移除
752
+ response = response.replace(match.group(0), '')
753
+
754
+ # 清理多餘空格
755
+ response = re.sub(r'\s{2,}', ' ', response).strip()
756
+
757
+ except re.error as e:
758
+ self.logger.warning(f"Regular expression pattern error for {issue_type}: {pattern} - {str(e)}")
759
+ continue
760
+
761
+ # 第二階段:處理物件類別格式問題
762
+ response = self._clean_object_class_references(response)
763
+
764
+ # 第三階段:確保句子正確結束
765
+ response = self._ensure_proper_sentence_ending(response)
766
+
767
+ # 第四階段:最終語法檢查
768
+ response = self._final_grammar_check(response)
769
+
770
+ return response.strip()
771
+
772
+ except Exception as e:
773
+ self.logger.error(f"Error in _ensure_grammatical_completeness: {str(e)}")
774
+ return response
775
+
776
+ def _fix_incomplete_preposition(self, response: str, match) -> str:
777
+ """
778
+ 修正不完整的介詞短語
779
+
780
+ Args:
781
+ response: 回應文本
782
+ match: 正則匹配對象
783
+
784
+ Returns:
785
+ str: 修正後的回應
786
+ """
787
+ preposition = match.group(1)
788
+ match_start = match.start()
789
+
790
+ # 找到句子的開始位置
791
+ sentence_start = response.rfind('.', 0, match_start)
792
+ sentence_start = sentence_start + 1 if sentence_start != -1 else 0
793
+
794
+ # 提取句子片段
795
+ sentence_fragment = response[sentence_start:match_start].strip()
796
+
797
+ # 如果句子片段有意義,嘗試移除不完整的介詞部分
798
+ if len(sentence_fragment) > 10:
799
+ # 移除介詞及其後的內容,添加適當的句號
800
+ response = response[:match_start].rstrip() + '.'
801
+ else:
802
+ # 如果句子片段太短,移除整個不完整的句子
803
+ response = response[:sentence_start] + response[match.end():]
804
+
805
+ return response
806
+
807
+ def _clean_object_class_references(self, response: str) -> str:
808
+ """
809
+ 清理物件類別引用中的格式問題
810
+
811
+ Args:
812
+ response: 回應文本
813
+
814
+ Returns:
815
+ str: 清理後的回應
816
+ """
817
+ # 移除類別ID引用(如 "unknown-class 2", "Class 0" 等)
818
+ class_id_patterns = [
819
+ r'\bunknown[- ]?class\s*\d+\s*objects?',
820
+ r'\bclass[- ]?\d+\s*objects?',
821
+ r'\b[Cc]lass\s*\d+\s*objects?',
822
+ r'\bunknown[- ][Cc]lass\s*\d+\s*objects?'
823
+ ]
824
+
825
+ for pattern in class_id_patterns:
826
+ try:
827
+ # 替換為更自然的描述
828
+ response = re.sub(pattern, 'objects', response, flags=re.IGNORECASE)
829
+ except re.error as e:
830
+ self.logger.warning(f"Error cleaning class reference pattern {pattern}: {str(e)}")
831
+ continue
832
+
833
+ # 處理數量描述中的問題
834
+ response = re.sub(r'\b(\w+)\s+unknown[- ]?\w*\s*objects?', r'\1 objects', response, flags=re.IGNORECASE)
835
+
836
+ return response
837
+
838
+ def _ensure_proper_sentence_ending(self, response: str) -> str:
839
+ """
840
+ 確保句子有適當的結尾
841
+
842
+ Args:
843
+ response: 回應文本
844
+
845
+ Returns:
846
+ str: 具有適當結尾的回應
847
+ """
848
+ if not response or not response.strip():
849
+ return response
850
+
851
+ response = response.strip()
852
+
853
+ # 檢查是否以標點符號結尾
854
+ if response and response[-1] not in ['.', '!', '?']:
855
+
856
+ # 常見介詞和連詞列表
857
+ problematic_endings = [
858
+ "into", "onto", "about", "above", "across", "after", "along", "around",
859
+ "at", "before", "behind", "below", "beneath", "beside", "between",
860
+ "beyond", "by", "down", "during", "except", "for", "from", "in",
861
+ "inside", "near", "of", "off", "on", "over", "through", "to",
862
+ "toward", "under", "up", "upon", "with", "within", "and", "or", "but"
863
+ ]
864
+
865
+ words = response.split()
866
+ if words:
867
+ last_word = words[-1].lower().rstrip('.,!?')
868
+
869
+ if last_word in problematic_endings:
870
+ # 找到最後完整的句子
871
+ last_period_pos = max(
872
+ response.rfind('.'),
873
+ response.rfind('!'),
874
+ response.rfind('?')
875
+ )
876
+
877
+ if last_period_pos > len(response) // 2: # 如果有較近的完整句子
878
+ response = response[:last_period_pos + 1]
879
+ else:
880
+ # 移除問題詞彙並添加句號
881
+ if len(words) > 1:
882
+ response = " ".join(words[:-1]) + "."
883
+ else:
884
+ response = "The scene displays various elements."
885
+ else:
886
+ # 正常情況下添加句號
887
+ response += "."
888
+
889
+ return response
890
+
891
+ def _final_grammar_check(self, response: str) -> str:
892
+ """
893
+ 最終語法檢查和清理
894
+
895
+ Args:
896
+ response: 回應文本
897
+
898
+ Returns:
899
+ str: 最終清理後的回應
900
+ """
901
+ if not response:
902
+ return response
903
+
904
+ # 修正連續標點符號
905
+ response = re.sub(r'([.!?]){2,}', r'\1', response)
906
+
907
+ # 修正句號前的空格
908
+ response = re.sub(r'\s+([.!?])', r'\1', response)
909
+
910
+ # 修正句號後缺少空格的問題
911
+ response = re.sub(r'([.!?])([A-Z])', r'\1 \2', response)
912
+
913
+ # 確保首字母大寫
914
+ if response and response[0].islower():
915
+ response = response[0].upper() + response[1:]
916
+
917
+ # 移除多餘的空格
918
+ response = re.sub(r'\s{2,}', ' ', response)
919
+
920
+ # 處理空句子或過短的回應
921
+ if len(response.strip()) < 20:
922
+ return "The scene contains various visual elements."
923
+
924
+ return response.strip()
925
+
926
+ def _control_word_length(self, response: str) -> str:
927
+ """控制文字長度在合理範圍內"""
928
+ words = response.split()
929
+ if len(words) > 200:
930
+ # 找到接近字數限制的句子結束處
931
+ truncated = ' '.join(words[:200])
932
+ last_period = max(truncated.rfind('.'), truncated.rfind('!'), truncated.rfind('?'))
933
+
934
+ if last_period > 0:
935
+ response = truncated[:last_period+1]
936
+ else:
937
+ response = truncated + "."
938
+
939
+ return response
940
+
941
+ def _final_formatting(self, response: str) -> str:
942
+ """最終格式化處理"""
943
+ # 確保首字母大寫
944
+ if response and response[0].islower():
945
+ response = response[0].upper() + response[1:]
946
+
947
+ # 統一格式為單一段落
948
+ response = re.sub(r'\s*\n\s*', ' ', response)
949
+ response = ' '.join(response.split())
950
+
951
+ return response.strip()
952
+
953
+ def _recover_from_overcleaning(self, original_response: str) -> str:
954
+ """從過度清理中恢復內容"""
955
+ try:
956
+ # 嘗試從原始回應中找到最佳段落
957
+ paragraphs = [p for p in original_response.split('\n\n') if p.strip()]
958
+ if paragraphs:
959
+ # 選擇最長的段落作為主要描述
960
+ best_para = max(paragraphs, key=len)
961
+ # 使用基本清理規則
962
+ best_para = re.sub(r'\[.*?\]', '', best_para)
963
+ best_para = re.sub(r'\s{2,}', ' ', best_para).strip()
964
+
965
+ if len(best_para) >= 40:
966
+ return best_para
967
+
968
+ return "Unable to generate a valid enhanced description."
969
+
970
+ except Exception as e:
971
+ self.logger.error(f"Recovery from overcleaning failed: {str(e)}")
972
+ return "Description generation error."
973
+
974
+ def _validate_cleaned_response(self, response: str):
975
+ """驗證清理後的回應"""
976
+ if not response:
977
+ raise ResponseProcessingError("Response is empty after cleaning")
978
+
979
+ if len(response.strip()) < 20:
980
+ raise ResponseProcessingError("Response is too short after cleaning")
981
+
982
+ # 檢查是否包含基本的句子結構
983
+ if not re.search(r'[.!?]', response):
984
+ raise ResponseProcessingError("Response lacks proper sentence structure")
985
+
986
def remove_explanatory_notes(self, response: str) -> str:
    """Remove explanatory notes and self-commentary from a response.

    A note is a line starting with "Note:", "I have followed/adhered
    to/ensured", "This description follows/adheres to/maintains" or "The
    enhanced description maintains/preserves" (case-insensitive).

    * When the response is a single paragraph, the note lines are removed
      and the surrounding lines are kept on separate lines.
    * When it has several blank-line-separated paragraphs, every paragraph
      that contains a note line, or that starts with "note:", "please
      note:" or "remember:", is dropped as a whole.

    Args:
        response: Response that may contain notes.

    Returns:
        The response without notes, stripped; the unmodified input if an
        unexpected error occurs.
    """
    try:
        # Each pattern matches a note line together with the newline that
        # precedes it. The newline that follows is only looked at, not
        # consumed, so neighbouring lines stay separated and consecutive
        # note lines are all matched.
        note_patterns = [
            r'(?:^|\n)Note:.*?(?=\n|$)',
            r'(?:^|\n)I have (?:followed|adhered to|ensured).*?(?=\n|$)',
            r'(?:^|\n)This description (?:follows|adheres to|maintains).*?(?=\n|$)',
            r'(?:^|\n)The enhanced description (?:maintains|preserves).*?(?=\n|$)'
        ]

        paragraphs = [p.strip() for p in response.split('\n\n') if p.strip()]

        # Single paragraph: remove the note lines inside it.
        if len(paragraphs) == 1:
            cleaned = paragraphs[0]
            for pattern in note_patterns:
                cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)
            return cleaned.strip()

        # Several paragraphs: drop the ones that are notes.
        note_openers = ('note:', 'please note:', 'remember:')
        content_paragraphs = [
            paragraph
            for paragraph in paragraphs
            if not paragraph.lower().startswith(note_openers)
            and not any(re.search(pattern, paragraph, flags=re.IGNORECASE) for pattern in note_patterns)
        ]

        return '\n\n'.join(content_paragraphs).strip()

    except Exception as e:
        self.logger.error(f"Failed to remove explanatory notes: {str(e)}")
        return response
1035
+
1036
def get_processor_info(self) -> Dict[str, Any]:
    """Summarise the processor configuration.

    Returns:
        Dict[str, Any]: The number of entries in each configured collection
        (replacement alternatives, prefixes, suffixes, repetitive patterns)
        plus a constant ``"initialization_status"`` of ``"success"``.
    """
    info: Dict[str, Any] = {}
    info["replacement_alternatives_count"] = len(self.replacement_alternatives)
    info["prefixes_to_remove_count"] = len(self.prefixes_to_remove)
    info["suffixes_to_remove_count"] = len(self.suffixes_to_remove)
    info["repetitive_patterns_count"] = len(self.repetitive_patterns)
    info["initialization_status"] = "success"
    return info
result_cache_manager.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import logging
3
+ import traceback
4
+ from typing import Dict, Any, Tuple, Optional, Union
5
+ from PIL import Image
6
+ import numpy as np
7
+
8
class ResultCacheManager:
    """Bounded in-memory cache for analysis results.

    Results are stored in a plain ``dict`` keyed by tuples built from an
    image hash and extra parameters. Because dicts keep insertion order,
    eviction is first-in-first-out: whenever the cache holds more than
    ``cache_max_size`` entries, the oldest inserted entries are removed
    until it fits again.
    """

    def __init__(self, cache_max_size: int = 100):
        """Create an empty cache.

        Args:
            cache_max_size: Maximum number of cached entries.
        """
        self.logger = logging.getLogger(__name__)

        self.results_cache = {}  # cache key -> cached result
        self.cache_max_size = cache_max_size

    def generate_cache_key(self, image_hash: int, additional_params: Tuple) -> Tuple:
        """Build a cache key.

        Args:
            image_hash: Hash value identifying the image.
            additional_params: Tuple of extra parameters that distinguish
                analyses of the same image.

        Returns:
            Tuple: ``(image_hash, additional_params)``.
        """
        return (image_hash, additional_params)

    def get_region_cache_key(self, image_hash: int, box: Tuple[float, ...],
                             detection_type: str) -> Tuple:
        """Build the cache key for a region analysis.

        Args:
            image_hash: Hash value identifying the image.
            box: Bounding box of the region; converted to a tuple.
            detection_type: Kind of detection performed on the region.

        Returns:
            Tuple: The region cache key.

        Raises:
            TypeError: If ``box`` is not iterable.
        """
        return self.generate_cache_key(image_hash, (tuple(box), detection_type))

    def get_image_cache_key(self, image_hash: int, analysis_type: str,
                            detailed_analysis: bool = False) -> Tuple:
        """Build the cache key for a whole-image analysis.

        Args:
            image_hash: Hash value identifying the image.
            analysis_type: Kind of analysis.
            detailed_analysis: Whether the analysis is the detailed variant.

        Returns:
            Tuple: The image cache key.
        """
        return self.generate_cache_key(image_hash, (analysis_type, detailed_analysis))

    def get_cached_result(self, cache_key: Tuple) -> Optional[Dict[str, Any]]:
        """Look up a cached result.

        Args:
            cache_key: Key to look up.

        Returns:
            Optional[Dict[str, Any]]: The cached result, or ``None`` when the
            key is absent or cannot be used as a dict key.
        """
        try:
            return self.results_cache.get(cache_key)
        except Exception as e:
            # e.g. an unhashable key; treat it as a cache miss.
            self.logger.exception(f"Error getting cached result: {e}")
            return None

    def set_cached_result(self, cache_key: Tuple, result: Dict[str, Any]):
        """Store a result and enforce the size limit.

        Args:
            cache_key: Key to store the result under.
            result: Result to cache.
        """
        try:
            self.results_cache[cache_key] = result
            self.manage_cache_size()
        except Exception as e:
            self.logger.exception(f"Error setting cached result: {e}")

    def manage_cache_size(self):
        """Evict the oldest entries until the cache fits its limit.

        Eviction loops rather than removing a single entry, so lowering the
        limit with ``set_max_cache_size`` really shrinks the cache.
        """
        try:
            while self.results_cache and len(self.results_cache) > self.cache_max_size:
                # Dicts iterate in insertion order: the first key is oldest.
                oldest_key = next(iter(self.results_cache))
                del self.results_cache[oldest_key]
        except Exception as e:
            self.logger.exception(f"Error managing cache size: {e}")

    def clear_cache(self):
        """Remove every cached entry."""
        try:
            self.results_cache.clear()
            self.logger.info("Cache cleared successfully")
        except Exception as e:
            self.logger.exception(f"Error clearing cache: {e}")

    def get_cache_stats(self) -> Dict[str, Any]:
        """Report cache usage.

        Returns:
            Dict[str, Any]: ``cache_size``, ``max_cache_size`` and
            ``cache_usage_ratio`` (0 when the limit is not positive).
        """
        try:
            size = len(self.results_cache)
            return {
                "cache_size": size,
                "max_cache_size": self.cache_max_size,
                "cache_usage_ratio": size / self.cache_max_size if self.cache_max_size > 0 else 0
            }
        except Exception as e:
            self.logger.exception(f"Error getting cache stats: {e}")
            return {
                "cache_size": 0,
                "max_cache_size": self.cache_max_size,
                "cache_usage_ratio": 0
            }

    def set_max_cache_size(self, max_size: int):
        """Change the size limit and evict entries that no longer fit.

        Args:
            max_size: New maximum number of entries; values below 1 are
                raised to 1.
        """
        try:
            self.cache_max_size = max(1, max_size)
            self.manage_cache_size()
            self.logger.info(f"Max cache size set to {self.cache_max_size}")
        except Exception as e:
            self.logger.exception(f"Error setting max cache size: {e}")

    def remove_cached_result(self, cache_key: Tuple) -> bool:
        """Remove one cached result.

        Args:
            cache_key: Key of the entry to remove.

        Returns:
            bool: ``True`` if an entry was removed, ``False`` otherwise.
        """
        try:
            if cache_key in self.results_cache:
                del self.results_cache[cache_key]
                return True
            return False
        except Exception as e:
            self.logger.exception(f"Error removing cached result: {e}")
            return False

    def is_cache_enabled(self) -> bool:
        """Tell whether caching is enabled.

        Returns:
            bool: ``True`` when the size limit is greater than zero.
        """
        return self.cache_max_size > 0

    def get_cache_keys(self) -> list:
        """List all cache keys.

        Returns:
            list: The keys, oldest first.
        """
        try:
            return list(self.results_cache.keys())
        except Exception as e:
            self.logger.exception(f"Error getting cache keys: {e}")
            return []

    def has_cached_result(self, cache_key: Tuple) -> bool:
        """Check whether a result is cached under ``cache_key``.

        Args:
            cache_key: Key to check.

        Returns:
            bool: ``True`` if the key is present, ``False`` if it is absent
            or cannot be used as a dict key.
        """
        try:
            return cache_key in self.results_cache
        except Exception as e:
            self.logger.exception(f"Error checking cached result: {e}")
            return False
scene_analysis_coordinator.py ADDED
@@ -0,0 +1,973 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import traceback
3
+ import numpy as np
4
+ from typing import Dict, List, Tuple, Optional, Any
5
+ from PIL import Image
6
+
7
+ class SceneAnalysisCoordinator:
8
+ """
9
+ 負責整個場景分析流程的協調和控制邏輯,包含主要的分析流程、
10
+ 處理無檢測結果的回退邏輯,以及多源分析結果的整合。
11
+ """
12
+
13
+ def __init__(self, component_initializer, scene_scoring_engine, landmark_processing_manager,
14
+ scene_confidence_threshold: float = 0.6):
15
+ """
16
+ 初始化場景分析協調器。
17
+
18
+ Args:
19
+ component_initializer: 組件初始化器實例
20
+ scene_scoring_engine: 場景評分引擎實例
21
+ landmark_processing_manager: 地標處理管理器實例
22
+ scene_confidence_threshold: 場景置信度閾值
23
+ """
24
+ self.logger = logging.getLogger(__name__)
25
+ self.component_initializer = component_initializer
26
+ self.scene_scoring_engine = scene_scoring_engine
27
+ self.landmark_processing_manager = landmark_processing_manager
28
+ self.scene_confidence_threshold = scene_confidence_threshold
29
+
30
+ # 獲取必要的組件和數據
31
+ self.spatial_analyzer = component_initializer.get_component('spatial_analyzer')
32
+ self.descriptor = component_initializer.get_component('descriptor')
33
+ self.scene_describer = component_initializer.get_component('scene_describer')
34
+ self.clip_analyzer = component_initializer.get_component('clip_analyzer')
35
+ self.llm_enhancer = component_initializer.get_component('llm_enhancer')
36
+
37
+ self.scene_types = component_initializer.get_data_structure('SCENE_TYPES')
38
+
39
+ # 從組件初始化器獲取功能開關狀態
40
+ self.use_clip = component_initializer.use_clip
41
+ self.use_llm = component_initializer.use_llm
42
+ self.enable_landmark = component_initializer.enable_landmark
43
+
44
def analyze(self, detection_result: Any, lighting_info: Optional[Dict] = None,
            class_confidence_threshold: float = 0.25, scene_confidence_threshold: float = 0.6,
            enable_landmark: bool = True, places365_info: Optional[Dict] = None) -> Dict:
    """Analyse a detection result to determine and describe the scene.

    The landmark switch for this run is pushed to the sub-components, the
    original image is extracted from the detection result, and the work is
    then delegated to the no-detection fallback or to the main analysis
    flow, depending on whether any detections are present.

    Args:
        detection_result: Detection result from YOLOv8 or a similar system.
        lighting_info: Optional lighting-condition analysis result.
        class_confidence_threshold: Minimum confidence for an object to be
            considered.
        scene_confidence_threshold: Minimum confidence for deciding on a
            scene.
        enable_landmark: Whether landmark detection and recognition is
            enabled for this run.
        places365_info: Optional Places365 scene classification result.

    Returns:
        Dictionary holding the scene analysis result.
    """
    landmark_enabled = enable_landmark
    lighting_source = (
        lighting_info.get('source', 'unknown') if isinstance(lighting_info, dict) else 'not_dict'
    )

    self.logger.info(f"DIAGNOSTIC (SceneAnalyzer.analyze): Called with current_run_enable_landmark={landmark_enabled}")
    self.logger.debug(f"SceneAnalyzer received lighting_info type: {type(lighting_info)}")
    self.logger.debug(f"SceneAnalyzer lighting_info source: {lighting_source}")

    # Record the Places365 information when supplied.
    if places365_info:
        self.logger.info(f"DIAGNOSTIC: Places365 info received - scene: {places365_info.get('scene_label', 'unknown')}, "
                         f"mapped: {places365_info.get('mapped_scene_type', 'unknown')}, "
                         f"confidence: {places365_info.get('confidence', 0.0):.3f}")

    # Push this run's landmark switch to the sub-components.
    self._sync_landmark_status_to_components(landmark_enabled)

    # Extract the original image and its dimensions.
    original_image_pil, image_dims_val = self._extract_image_info(detection_result)

    # Without YOLO detections, use the fallback path.
    if self._check_no_yolo_detections(detection_result):
        return self._handle_no_yolo_detections(
            original_image_pil, image_dims_val, landmark_enabled,
            lighting_info, places365_info
        )

    # Main flow: YOLO detections are available.
    return self._handle_main_analysis_flow(
        detection_result, original_image_pil, image_dims_val,
        class_confidence_threshold, scene_confidence_threshold,
        landmark_enabled, lighting_info, places365_info
    )
93
+
94
+ def _sync_landmark_status_to_components(self, current_run_enable_landmark: bool):
95
+ """同步地標狀態到所有相關組件。"""
96
+ # 更新場景評分引擎
97
+ self.scene_scoring_engine.update_enable_landmark_status(current_run_enable_landmark)
98
+
99
+ # 更新地標處理管理器
100
+ self.landmark_processing_manager.update_enable_landmark_status(current_run_enable_landmark)
101
+
102
+ # 更新其他組件的地標狀態
103
+ for component_name in ['scene_describer', 'clip_analyzer', 'landmark_classifier']:
104
+ component = self.component_initializer.get_component(component_name)
105
+ if component and hasattr(component, 'enable_landmark'):
106
+ component.enable_landmark = current_run_enable_landmark
107
+
108
+ # 更新實例狀態
109
+ self.enable_landmark = current_run_enable_landmark
110
+
111
def _extract_image_info(self, detection_result) -> Tuple[Optional[Image.Image], Optional[Tuple[int, int]]]:
    """Extract the original image and its size from a detection result.

    ``detection_result.orig_img`` may be a NumPy array or a PIL-like image
    (an object with ``size`` and a callable ``convert``). A warning is
    logged and ``None`` values are returned when no usable image exists.

    NOTE(review): no channel reordering is performed for NumPy input, and
    the ``mode == 'BGR'`` check is presumably never true for images built
    by ``Image.fromarray``. If the producer supplies BGR arrays (e.g. from
    OpenCV) the colours stay swapped; confirm against the caller.

    Args:
        detection_result: Detection result expected to expose ``orig_img``.

    Returns:
        Tuple of the PIL image (or ``None``) and its ``(width, height)``
        (or ``None``).
    """
    pil_image = None
    dimensions = None  # will be (width, height)

    source = None
    if detection_result is not None and hasattr(detection_result, 'orig_img'):
        source = detection_result.orig_img

    if source is None:
        self.logger.warning("detection_result.orig_img not available. Image-based analysis will be limited.")
        return pil_image, dimensions

    if isinstance(source, np.ndarray):
        try:
            pixels = source
            if pixels.ndim == 3 and pixels.shape[2] == 4:  # RGBA
                pixels = pixels[:, :, :3]  # drop the alpha channel
            if pixels.ndim == 2:  # greyscale
                pil_image = Image.fromarray(pixels).convert("RGB")
            else:  # assumed to be a 3-channel colour image
                pil_image = Image.fromarray(pixels)

            if hasattr(pil_image, 'mode') and pil_image.mode == 'BGR':
                pil_image = pil_image.convert('RGB')

            dimensions = (pil_image.width, pil_image.height)
        except Exception as e:
            self.logger.warning(f"Error converting NumPy orig_img to PIL: {e}")
    elif hasattr(source, 'size') and callable(getattr(source, 'convert', None)):
        # PIL-like image: work on an RGB copy.
        pil_image = source.copy().convert("RGB")
        dimensions = pil_image.size
    else:
        self.logger.warning(f"detection_result.orig_img (type: {type(source)}) is not a recognized NumPy array or PIL Image.")

    return pil_image, dimensions
143
+
144
+ def _check_no_yolo_detections(self, detection_result) -> bool:
145
+ """檢查是否沒有 YOLO 檢測結果。"""
146
+ return (detection_result is None or
147
+ not hasattr(detection_result, 'boxes') or
148
+ not hasattr(detection_result.boxes, 'xyxy') or
149
+ len(detection_result.boxes.xyxy) == 0)
150
+
151
+ def _handle_no_yolo_detections(self, original_image_pil, image_dims_val,
152
+ current_run_enable_landmark, lighting_info, places365_info) -> Dict:
153
+ """處理無 YOLO 檢測結果的情況。"""
154
+ tried_landmark_detection = False
155
+ landmark_detection_result = None
156
+
157
+ # 嘗試地標檢測
158
+ if original_image_pil and self.use_clip and current_run_enable_landmark:
159
+ landmark_detection_result = self._attempt_landmark_detection_no_yolo(
160
+ original_image_pil, image_dims_val, lighting_info
161
+ )
162
+ tried_landmark_detection = True
163
+
164
+ if landmark_detection_result:
165
+ return landmark_detection_result
166
+
167
+ # 如果地標檢測失敗或未嘗試,使用 CLIP 進行一般場景分析
168
+ if not landmark_detection_result and self.use_clip and original_image_pil:
169
+ clip_fallback_result = self._attempt_clip_fallback_analysis(
170
+ original_image_pil, image_dims_val, current_run_enable_landmark, lighting_info
171
+ )
172
+ if clip_fallback_result:
173
+ return clip_fallback_result
174
+
175
+ # 最終回退邏輯
176
+ return self._get_final_fallback_result(places365_info, lighting_info)
177
+
178
+ def _attempt_landmark_detection_no_yolo(self, original_image_pil, image_dims_val, lighting_info) -> Optional[Dict]:
179
+ """在無 YOLO 檢測的情況下嘗試地標檢測。"""
180
+ try:
181
+ # 初始化地標分類器(如果需要)
182
+ landmark_classifier = self.component_initializer.get_component('landmark_classifier')
183
+ if not landmark_classifier and self.clip_analyzer:
184
+ if hasattr(self.clip_analyzer, 'get_clip_instance'):
185
+ try:
186
+ model, preprocess, device = self.clip_analyzer.get_clip_instance()
187
+ landmark_classifier = CLIPZeroShotClassifier(device=device)
188
+ self.landmark_processing_manager.set_landmark_classifier(landmark_classifier)
189
+ self.logger.info("Initialized landmark classifier with shared CLIP model")
190
+ except Exception as e:
191
+ self.logger.warning(f"Could not initialize landmark classifier: {e}")
192
+ return None
193
+
194
+ if landmark_classifier:
195
+ self.logger.info("Attempting landmark detection with no YOLO boxes")
196
+ landmark_results_no_yolo = landmark_classifier.intelligent_landmark_search(
197
+ original_image_pil, yolo_boxes=None, base_threshold=0.2 # 略微降低閾值,提高靈敏度
198
+ )
199
+
200
+ # 確保在無地標場景時返回有效結果
201
+ if landmark_results_no_yolo is None:
202
+ landmark_results_no_yolo = {"is_landmark_scene": False, "detected_landmarks": []}
203
+
204
+ if (landmark_results_no_yolo and landmark_results_no_yolo.get("is_landmark_scene", False)):
205
+ return self._process_landmark_detection_result(
206
+ landmark_results_no_yolo, image_dims_val, lighting_info
207
+ )
208
+ except Exception as e:
209
+ self.logger.error(f"Error in landmark-only detection path (analyze method): {e}")
210
+ traceback.print_exc()
211
+
212
+ return None
213
+
214
+ def _process_landmark_detection_result(self, landmark_results, image_dims_val, lighting_info) -> Dict:
215
+ """處理地標檢測結果並生成最終輸出。"""
216
+ primary_landmark = landmark_results.get("primary_landmark")
217
+
218
+ # 放寬閾值條件,以便捕獲更多潛在地標
219
+ if not primary_landmark or primary_landmark.get("confidence", 0) <= 0.25:
220
+ return None
221
+
222
+ detected_objects_from_landmarks_list = []
223
+ w_img, h_img = image_dims_val if image_dims_val else (1, 1)
224
+
225
+ for lm_info_item in landmark_results.get("detected_landmarks", []):
226
+ if lm_info_item.get("confidence", 0) > 0.25: # 降低閾值與上面保持一致
227
+ # 安全獲取 box 值,避免索引錯誤
228
+ box = lm_info_item.get("box", [0, 0, w_img, h_img])
229
+ if len(box) < 4:
230
+ box = [0, 0, w_img, h_img]
231
+
232
+ # 計算中心點和標準化坐標
233
+ center_x, center_y = (box[0] + box[2]) / 2, (box[1] + box[3]) / 2
234
+ norm_cx = center_x / w_img if w_img > 0 else 0.5
235
+ norm_cy = center_y / h_img if h_img > 0 else 0.5
236
+
237
+ # 決定地標類型
238
+ landmark_type = "architectural" # 預設類型
239
+ landmark_id = lm_info_item.get("landmark_id", "")
240
+
241
+ landmark_classifier = self.component_initializer.get_component('landmark_classifier')
242
+ if (landmark_classifier and hasattr(landmark_classifier, '_determine_landmark_type') and landmark_id):
243
+ try:
244
+ landmark_type = landmark_classifier._determine_landmark_type(landmark_id)
245
+ except Exception as e:
246
+ self.logger.error(f"Error determining landmark type: {e}")
247
+ else:
248
+ # 使用簡單的基於 ID 的啟發式方法推斷類型
249
+ landmark_id_lower = landmark_id.lower() if isinstance(landmark_id, str) else ""
250
+ if "natural" in landmark_id_lower or any(term in landmark_id_lower for term in ["mountain", "waterfall", "canyon", "lake"]):
251
+ landmark_type = "natural"
252
+ elif "monument" in landmark_id_lower or "memorial" in landmark_id_lower or "historical" in landmark_id_lower:
253
+ landmark_type = "monument"
254
+
255
+ # 決定區域位置
256
+ region = "center" # 預設值
257
+ if self.spatial_analyzer and hasattr(self.spatial_analyzer, '_determine_region'):
258
+ try:
259
+ region = self.spatial_analyzer._determine_region(norm_cx, norm_cy)
260
+ except Exception as e:
261
+ self.logger.error(f"Error determining region: {e}")
262
+
263
+ # 取得並補 location
264
+ loc_lm = lm_info_item.get("location", "")
265
+ if not loc_lm and landmark_id in ALL_LANDMARKS:
266
+ loc_lm = ALL_LANDMARKS[landmark_id].get("location", "")
267
+
268
+ # 創建地標物體
269
+ landmark_obj = {
270
+ "class_id": lm_info_item.get("landmark_id", f"LM_{lm_info_item.get('landmark_name','unk')}")[:15],
271
+ "class_name": lm_info_item.get("landmark_name", "Unknown Landmark"),
272
+ "confidence": lm_info_item.get("confidence", 0.0),
273
+ "box": box,
274
+ "center": (center_x, center_y),
275
+ "normalized_center": (norm_cx, norm_cy),
276
+ "size": (box[2] - box[0], box[3] - box[1]),
277
+ "normalized_size": (
278
+ (box[2] - box[0])/(w_img if w_img>0 else 1),
279
+ (box[3] - box[1])/(h_img if h_img>0 else 1)
280
+ ),
281
+ "area": (box[2] - box[0]) * (box[3] - box[1]),
282
+ "normalized_area": (
283
+ (box[2] - box[0]) * (box[3] - box[1])
284
+ ) / ((w_img*h_img) if w_img*h_img >0 else 1),
285
+ "is_landmark": True,
286
+ "landmark_id": landmark_id,
287
+ "location": loc_lm or "Unknown Location",
288
+ "region": region,
289
+ "year_built": lm_info_item.get("year_built", ""),
290
+ "architectural_style": lm_info_item.get("architectural_style", ""),
291
+ "significance": lm_info_item.get("significance", ""),
292
+ "landmark_type": landmark_type
293
+ }
294
+ detected_objects_from_landmarks_list.append(landmark_obj)
295
+
296
+ if not detected_objects_from_landmarks_list:
297
+ return None
298
+
299
+ # 設定場景類型
300
+ best_scene_val = "tourist_landmark" # 預設
301
+ if primary_landmark:
302
+ try:
303
+ lm_type = primary_landmark.get("landmark_type", "architectural")
304
+ if lm_type and "natural" in lm_type.lower():
305
+ best_scene_val = "natural_landmark"
306
+ elif lm_type and ("historical" in lm_type.lower() or "monument" in lm_type.lower()):
307
+ best_scene_val = "historical_monument"
308
+ except Exception as e:
309
+ self.logger.error(f"Error determining scene type from landmark type: {e}")
310
+
311
+ # 確保場景類型有效
312
+ if best_scene_val not in self.scene_types:
313
+ best_scene_val = "tourist_landmark" # 預設場景類型
314
+
315
+ # 設定置信度
316
+ scene_confidence = primary_landmark.get("confidence", 0.0) if primary_landmark else 0.0
317
+
318
+ # 生成其他必要的分析結果
319
+ region_analysis = self._generate_region_analysis(detected_objects_from_landmarks_list)
320
+
321
+ functional_zones = self._generate_functional_zones(
322
+ detected_objects_from_landmarks_list,
323
+ best_scene_val
324
+ )
325
+
326
+ scene_description = self._generate_scene_description(
327
+ best_scene_val, detected_objects_from_landmarks_list, scene_confidence,
328
+ lighting_info, functional_zones, image_dims_val
329
+ )
330
+
331
+ enhanced_description = self._enhance_description_with_llm(
332
+ scene_description, best_scene_val, detected_objects_from_landmarks_list,
333
+ scene_confidence, lighting_info, functional_zones, landmark_results, image_dims_val
334
+ )
335
+ possible_activities = self._extract_possible_activities(detected_objects_from_landmarks_list, landmark_results)
336
+
337
+ # 準備最終結果
338
+ return {
339
+ "scene_type": best_scene_val,
340
+ "scene_name": self.scene_types.get(best_scene_val, {}).get("name", "Landmark"),
341
+ "confidence": round(float(scene_confidence), 4),
342
+ "description": scene_description,
343
+ "enhanced_description": enhanced_description,
344
+ "objects_present": detected_objects_from_landmarks_list,
345
+ "object_count": len(detected_objects_from_landmarks_list),
346
+ "regions": region_analysis,
347
+ "possible_activities": possible_activities,
348
+ "functional_zones": functional_zones,
349
+ "detected_landmarks": [lm for lm in detected_objects_from_landmarks_list if lm.get("is_landmark", False)],
350
+ "primary_landmark": primary_landmark,
351
+ "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0.0}
352
+ }
353
+
354
+
355
+ def _attempt_clip_fallback_analysis(self, original_image_pil, image_dims_val,
356
+ current_run_enable_landmark, lighting_info) -> Optional[Dict]:
357
+ """嘗試使用 CLIP 進行一般場景分析。"""
358
+ try:
359
+ clip_analysis_val = None
360
+ if self.clip_analyzer and hasattr(self.clip_analyzer, 'analyze_image'):
361
+ try:
362
+ clip_analysis_val = self.clip_analyzer.analyze_image(
363
+ original_image_pil,
364
+ enable_landmark=current_run_enable_landmark
365
+ )
366
+ except Exception as e:
367
+ self.logger.error(f"Error in CLIP analysis: {e}")
368
+
369
+ scene_type_llm = "llm_inferred_no_yolo"
370
+ confidence_llm = 0.0
371
+
372
+ if clip_analysis_val and isinstance(clip_analysis_val, dict):
373
+ top_scene = clip_analysis_val.get("top_scene")
374
+ if top_scene and isinstance(top_scene, tuple) and len(top_scene) >= 2:
375
+ confidence_llm = top_scene[1]
376
+ if isinstance(top_scene[0], str):
377
+ scene_type_llm = top_scene[0]
378
+
379
+ desc_llm = "Primary object detection did not yield results. This description is based on overall image context."
380
+
381
+ w_llm, h_llm = image_dims_val if image_dims_val else (1, 1)
382
+ enhanced_desc_llm = self._enhance_no_detection_description(
383
+ desc_llm, scene_type_llm, confidence_llm, lighting_info,
384
+ clip_analysis_val, current_run_enable_landmark, w_llm, h_llm
385
+ )
386
+
387
+ # 安全類型轉換
388
+ try:
389
+ confidence_float = float(confidence_llm)
390
+ except (ValueError, TypeError):
391
+ confidence_float = 0.0
392
+
393
+ # 確保增強描述不為空
394
+ if not enhanced_desc_llm or not isinstance(enhanced_desc_llm, str):
395
+ enhanced_desc_llm = desc_llm
396
+
397
+ # 返回結果
398
+ return {
399
+ "scene_type": scene_type_llm,
400
+ "confidence": round(confidence_float, 4),
401
+ "description": desc_llm,
402
+ "enhanced_description": enhanced_desc_llm,
403
+ "objects_present": [],
404
+ "object_count": 0,
405
+ "regions": {},
406
+ "possible_activities": [],
407
+ "safety_concerns": [],
408
+ "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0.0}
409
+ }
410
+ except Exception as e:
411
+ self.logger.error(f"Error in CLIP no-detection fallback (analyze method): {e}")
412
+ traceback.print_exc()
413
+ return None
414
+
415
+ def _get_final_fallback_result(self, places365_info, lighting_info) -> Dict:
416
+ """獲取最終的回退結果。"""
417
+ # 檢查 Places365 是否提供有用的場景信息(即使沒有 YOLO 檢測)
418
+ fallback_scene_type = "unknown"
419
+ fallback_confidence = 0.0
420
+ fallback_description = "No objects were detected in the image, and contextual analysis could not be performed or failed."
421
+
422
+ if places365_info and places365_info.get('confidence', 0) > 0.3:
423
+ fallback_scene_type = places365_info.get('mapped_scene_type', 'unknown')
424
+ fallback_confidence = places365_info.get('confidence', 0.0)
425
+ fallback_description = f"Scene appears to be {places365_info.get('scene_label', 'an unidentified location')} based on overall visual context."
426
+
427
+ return {
428
+ "scene_type": fallback_scene_type,
429
+ "confidence": fallback_confidence,
430
+ "description": fallback_description,
431
+ "enhanced_description": "The image analysis system could not detect any recognizable objects or landmarks in this image.",
432
+ "objects_present": [],
433
+ "object_count": 0,
434
+ "regions": {},
435
+ "possible_activities": [],
436
+ "safety_concerns": [],
437
+ "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0.0}
438
+ }
439
+
440
+ def _handle_main_analysis_flow(self, detection_result, original_image_pil, image_dims_val,
441
+ class_confidence_threshold, scene_confidence_threshold,
442
+ current_run_enable_landmark, lighting_info, places365_info) -> Dict:
443
+ """處理主要的分析流程(有 YOLO 檢測結果)。"""
444
+ # 更新類別名稱映射
445
+ if hasattr(detection_result, 'names'):
446
+ if hasattr(self.spatial_analyzer, 'class_names'):
447
+ self.spatial_analyzer.class_names = detection_result.names
448
+
449
+ # 提取檢測到的物體
450
+ detected_objects_main = self.spatial_analyzer._extract_detected_objects(
451
+ detection_result,
452
+ confidence_threshold=class_confidence_threshold
453
+ )
454
+
455
+ if not detected_objects_main:
456
+ return {
457
+ "scene_type": "unknown", "confidence": 0.0,
458
+ "description": "No objects detected with sufficient confidence by the primary vision system.",
459
+ "objects_present": [], "object_count": 0, "regions": {}, "possible_activities": [],
460
+ "safety_concerns": [], "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0.0}
461
+ }
462
+
463
+ # 空間分析
464
+ region_analysis_val = self.spatial_analyzer._analyze_regions(detected_objects_main)
465
+
466
+ # 地標處理和整合
467
+ landmark_objects_identified = []
468
+ landmark_specific_activities = []
469
+ final_landmark_info = {}
470
+
471
+ if self.use_clip and current_run_enable_landmark:
472
+ detected_objects_main, landmark_objects_identified = self.landmark_processing_manager.process_unknown_objects(
473
+ detection_result, detected_objects_main, self.clip_analyzer
474
+ )
475
+
476
+ if landmark_objects_identified:
477
+ landmark_specific_activities = self.landmark_processing_manager.extract_landmark_specific_activities(
478
+ landmark_objects_identified
479
+ )
480
+ final_landmark_info = {
481
+ "detected_landmarks": landmark_objects_identified,
482
+ "primary_landmark": max(landmark_objects_identified, key=lambda x: x.get("confidence", 0.0), default=None),
483
+ "detailed_landmarks": landmark_objects_identified
484
+ }
485
+
486
+ # 如果當前運行禁用地標檢測��清理地標物體
487
+ if not current_run_enable_landmark:
488
+ detected_objects_main = [obj for obj in detected_objects_main if not obj.get("is_landmark", False)]
489
+ final_landmark_info = {}
490
+
491
+ # 計算場景分數並進行融合
492
+ yolo_scene_scores = self.scene_scoring_engine.compute_scene_scores(
493
+ detected_objects_main, spatial_analysis_results=region_analysis_val
494
+ )
495
+
496
+ clip_scene_scores = {}
497
+ clip_analysis_results = None
498
+ if self.use_clip and original_image_pil is not None:
499
+ clip_analysis_results, clip_scene_scores = self._perform_clip_analysis(
500
+ original_image_pil, current_run_enable_landmark, lighting_info
501
+ )
502
+
503
+ # 融合場景分數
504
+ yolo_only_objects = [obj for obj in detected_objects_main if not obj.get("is_landmark")]
505
+ num_yolo_detections = len(yolo_only_objects)
506
+ avg_yolo_confidence = (sum(obj.get('confidence', 0.0) for obj in yolo_only_objects) / num_yolo_detections
507
+ if num_yolo_detections > 0 else 0.0)
508
+
509
+ scene_scores_fused = self.scene_scoring_engine.fuse_scene_scores(
510
+ yolo_scene_scores, clip_scene_scores,
511
+ num_yolo_detections=num_yolo_detections,
512
+ avg_yolo_confidence=avg_yolo_confidence,
513
+ lighting_info=lighting_info,
514
+ places365_info=places365_info
515
+ )
516
+
517
+ # 確定最終場景類型
518
+ final_best_scene, final_scene_confidence = self.scene_scoring_engine.determine_scene_type(scene_scores_fused)
519
+
520
+ # 處理禁用地標檢測時的替代場景類型
521
+ if (not current_run_enable_landmark and
522
+ final_best_scene in ["tourist_landmark", "natural_landmark", "historical_monument"]):
523
+ alt_scene_type = self.landmark_processing_manager.get_alternative_scene_type(
524
+ final_best_scene, detected_objects_main, scene_scores_fused
525
+ )
526
+ final_best_scene = alt_scene_type
527
+ final_scene_confidence = scene_scores_fused.get(alt_scene_type, 0.6)
528
+
529
+ # 生成最終的描述性內容
530
+ final_result = self._generate_final_result(
531
+ final_best_scene, final_scene_confidence, detected_objects_main,
532
+ landmark_specific_activities, landmark_objects_identified, final_landmark_info,
533
+ region_analysis_val, lighting_info, scene_scores_fused, current_run_enable_landmark,
534
+ clip_analysis_results, image_dims_val, scene_confidence_threshold
535
+ )
536
+
537
+ return final_result
538
+
539
+ def _perform_clip_analysis(self, original_image_pil, current_run_enable_landmark, lighting_info) -> Tuple[Optional[Dict], Dict]:
540
+ """執行 CLIP 分析。"""
541
+ clip_analysis_results = None
542
+ clip_scene_scores = {}
543
+
544
+ try:
545
+ clip_analysis_results = self.clip_analyzer.analyze_image(
546
+ original_image_pil,
547
+ enable_landmark=current_run_enable_landmark,
548
+ exclude_categories=["landmark", "tourist", "monument", "tower", "attraction", "scenic", "historical", "famous"] if not current_run_enable_landmark else None
549
+ )
550
+
551
+ if isinstance(clip_analysis_results, dict):
552
+ clip_scene_scores = clip_analysis_results.get("scene_scores", {})
553
+
554
+ # 如果禁用地標檢測,再次過濾
555
+ if not current_run_enable_landmark:
556
+ clip_scene_scores = {k: v for k, v in clip_scene_scores.items()
557
+ if not any(kw in k.lower() for kw in ["landmark", "monument", "tourist"])}
558
+ if "cultural_analysis" in clip_analysis_results:
559
+ del clip_analysis_results["cultural_analysis"]
560
+ if ("top_scene" in clip_analysis_results and
561
+ any(term in clip_analysis_results.get("top_scene", ["unknown", 0.0])[0].lower()
562
+ for term in ["landmark", "monument", "tourist"])):
563
+ non_lm_cs = sorted([item for item in clip_scene_scores.items() if item[1] > 0],
564
+ key=lambda x: x[1], reverse=True)
565
+ clip_analysis_results["top_scene"] = non_lm_cs[0] if non_lm_cs else ("unknown", 0.0)
566
+
567
+ # 處理照明信息回退
568
+ if (not lighting_info and "lighting_condition" in clip_analysis_results):
569
+ lt, lc = clip_analysis_results.get("lighting_condition", ("unknown", 0.0))
570
+ lighting_info = {"time_of_day": lt, "confidence": lc, "source": "CLIP_fallback"}
571
+ except Exception as e:
572
+ self.logger.error(f"Error in main CLIP analysis for YOLO path (analyze method): {e}")
573
+
574
+ return clip_analysis_results, clip_scene_scores
575
+
576
+ def _generate_final_result(self, final_best_scene, final_scene_confidence, detected_objects_main,
577
+ landmark_specific_activities, landmark_objects_identified, final_landmark_info,
578
+ region_analysis_val, lighting_info, scene_scores_fused, current_run_enable_landmark,
579
+ clip_analysis_results, image_dims_val, scene_confidence_threshold) -> Dict:
580
+ """生成最終的分析結果。"""
581
+ # 生成最終的描述性內容(活動、安全、區域)
582
+ final_activities = []
583
+
584
+ # 通用活動推斷
585
+ generic_activities = []
586
+ if self.descriptor and hasattr(self.descriptor, '_infer_possible_activities'):
587
+ generic_activities = self.descriptor._infer_possible_activities(
588
+ final_best_scene, detected_objects_main,
589
+ enable_landmark=current_run_enable_landmark, scene_scores=scene_scores_fused
590
+ )
591
+
592
+ # 優先處理策略:使用特定地標活動,不足時才從通用活動補充
593
+ if landmark_specific_activities:
594
+ # 如果有特定活動,優先保留,去除與特定活動重複的通用活動
595
+ unique_generic_activities = [act for act in generic_activities if act not in landmark_specific_activities]
596
+
597
+ # 如果特定活動少於3個,從通用活動中補充
598
+ if len(landmark_specific_activities) < 3:
599
+ # 補充通用活動但總數不超過7個
600
+ supplement_count = min(3 - len(landmark_specific_activities), len(unique_generic_activities))
601
+ if supplement_count > 0:
602
+ final_activities.extend(unique_generic_activities[:supplement_count])
603
+ else:
604
+ # 若無特定活動,則使用所有通用活動
605
+ final_activities.extend(generic_activities)
606
+
607
+ # 去重並排序,但確保特定地標活動保持在前面
608
+ final_activities_set = set(final_activities)
609
+ final_activities = []
610
+
611
+ # 先加入特定地標活動(按原順序)
612
+ for activity in landmark_specific_activities:
613
+ if activity in final_activities_set:
614
+ final_activities.append(activity)
615
+ final_activities_set.remove(activity)
616
+
617
+ # 再加入通用活動(按字母排序)
618
+ final_activities.extend(sorted(list(final_activities_set)))
619
+
620
+ # 安全問題識別
621
+ final_safety_concerns = []
622
+ if self.descriptor and hasattr(self.descriptor, '_identify_safety_concerns'):
623
+ final_safety_concerns = self.descriptor._identify_safety_concerns(detected_objects_main, final_best_scene)
624
+
625
+ # 功能區域識別
626
+ final_functional_zones = {}
627
+ if self.spatial_analyzer and hasattr(self.spatial_analyzer, '_identify_functional_zones'):
628
+ general_zones = self.spatial_analyzer._identify_functional_zones(detected_objects_main, final_best_scene)
629
+ final_functional_zones.update(general_zones)
630
+
631
+ # 地標相關的功能區域
632
+ if landmark_objects_identified and self.spatial_analyzer and hasattr(self.spatial_analyzer, '_identify_landmark_zones'):
633
+ landmark_zones = self.spatial_analyzer._identify_landmark_zones(landmark_objects_identified)
634
+ final_functional_zones.update(landmark_zones)
635
+
636
+ # 如果當前運行禁用地標檢測,過濾相關內容
637
+ if not current_run_enable_landmark:
638
+ final_functional_zones = {
639
+ str(k): v
640
+ for k, v in final_functional_zones.items()
641
+ if (not str(k).isdigit())
642
+ and (not any(kw in str(k).lower() for kw in ["landmark", "monument", "viewing", "tourist"]))
643
+ }
644
+
645
+
646
+ current_activities_temp = [act for act in final_activities
647
+ if not any(kw in act.lower() for kw in ["sightsee", "photograph", "tour", "histor", "landmark", "monument", "cultur"])]
648
+ final_activities = current_activities_temp
649
+ if not final_activities and self.descriptor and hasattr(self.descriptor, '_infer_possible_activities'):
650
+ final_activities = self.descriptor._infer_possible_activities("generic_street_view", detected_objects_main, enable_landmark=False)
651
+
652
+ # 創建淨化的光線資訊,避免不合理的時間描述
653
+ lighting_info_clean = None
654
+ if lighting_info:
655
+ lighting_info_clean = {
656
+ "is_indoor": lighting_info.get("is_indoor"),
657
+ "confidence": lighting_info.get("confidence", 0.0),
658
+ "time_of_day": lighting_info.get("time_of_day", "unknown")
659
+ }
660
+
661
+ # 生成場景描述
662
+ base_scene_description = self._generate_scene_description(
663
+ final_best_scene, detected_objects_main, final_scene_confidence,
664
+ lighting_info_clean, final_functional_zones, image_dims_val
665
+ )
666
+
667
+ # 清理地標引用(如果禁用地標檢測)
668
+ if not current_run_enable_landmark:
669
+ base_scene_description = self.landmark_processing_manager.remove_landmark_references(base_scene_description)
670
+
671
+ # LLM 增強
672
+ enhanced_final_description = self._enhance_final_description(
673
+ base_scene_description, final_best_scene, final_scene_confidence, detected_objects_main,
674
+ final_functional_zones, final_activities, final_safety_concerns, lighting_info,
675
+ clip_analysis_results, current_run_enable_landmark, image_dims_val, final_landmark_info
676
+ )
677
+
678
+ # 清理增強描述的地標引用
679
+ if not current_run_enable_landmark:
680
+ enhanced_final_description = self.landmark_processing_manager.remove_landmark_references(enhanced_final_description)
681
+
682
+ # 構建最終輸出字典
683
+ output_result = {
684
+ "scene_type": final_best_scene if final_scene_confidence >= scene_confidence_threshold else "unknown",
685
+ "scene_name": (self.scene_types.get(final_best_scene, {}).get("name", "Unknown Scene")
686
+ if final_scene_confidence >= scene_confidence_threshold else "Unknown Scene"),
687
+ "confidence": round(float(final_scene_confidence), 4),
688
+ "description": base_scene_description,
689
+ "enhanced_description": enhanced_final_description,
690
+ "objects_present": [{"class_id": obj.get("class_id", -1),
691
+ "class_name": obj.get("class_name", "unknown"),
692
+ "confidence": round(float(obj.get("confidence", 0.0)), 4)}
693
+ for obj in detected_objects_main],
694
+ "object_count": len(detected_objects_main),
695
+ "regions": region_analysis_val,
696
+ "possible_activities": final_activities,
697
+ "safety_concerns": final_safety_concerns,
698
+ "functional_zones": final_functional_zones,
699
+ "lighting_conditions": lighting_info if lighting_info else {"time_of_day": "unknown", "confidence": 0.0, "source": "default"}
700
+ }
701
+
702
+ # 添加替代場景
703
+ if self.descriptor and hasattr(self.descriptor, '_get_alternative_scenes'):
704
+ output_result["alternative_scenes"] = self.descriptor._get_alternative_scenes(
705
+ scene_scores_fused, scene_confidence_threshold, top_k=2
706
+ )
707
+
708
+ # 添加地標相關信息
709
+ if current_run_enable_landmark and final_landmark_info and final_landmark_info.get("detected_landmarks"):
710
+ output_result.update(final_landmark_info)
711
+ if final_best_scene in ["tourist_landmark", "natural_landmark", "historical_monument"]:
712
+ output_result["scene_source"] = "landmark_detection"
713
+ elif not current_run_enable_landmark:
714
+ for key_rm in ["detected_landmarks", "primary_landmark", "detailed_landmarks", "scene_source"]:
715
+ if key_rm in output_result:
716
+ del output_result[key_rm]
717
+
718
+ # 添加 CLIP 分析結果
719
+ if clip_analysis_results and isinstance(clip_analysis_results, dict) and "error" not in clip_analysis_results:
720
+ top_scene_clip = clip_analysis_results.get("top_scene", ("unknown", 0.0))
721
+ output_result["clip_analysis"] = {
722
+ "top_scene": (top_scene_clip[0], round(float(top_scene_clip[1]), 4)),
723
+ "cultural_analysis": clip_analysis_results.get("cultural_analysis", {}) if current_run_enable_landmark else {}
724
+ }
725
+
726
+ return output_result
727
+
728
+ # Helper methods
729
+ def _generate_region_analysis(self, detected_objects):
730
+ """生成區域分析結果。"""
731
+ if self.spatial_analyzer and hasattr(self.spatial_analyzer, '_analyze_regions'):
732
+ try:
733
+ return self.spatial_analyzer._analyze_regions(detected_objects)
734
+ except Exception as e:
735
+ self.logger.error(f"Error analyzing regions: {e}")
736
+ return {}
737
+
738
+ def _generate_functional_zones(self, detected_objects, scene_type):
739
+ """
740
+ 生成功能區域。
741
+ 由於原本直接呼叫 _identify_landmark_zones,導致非地標場景必定回 {}。
742
+ 這裡改為呼叫 _identify_functional_zones,並帶入 scene_type。
743
+ """
744
+ try:
745
+ # 如果 spatial_analyzer 可以識別 functional zones,就調用它
746
+ if self.spatial_analyzer and hasattr(self.spatial_analyzer, '_identify_functional_zones'):
747
+ return self.spatial_analyzer._identify_functional_zones(detected_objects, scene_type)
748
+ except Exception as e:
749
+ self.logger.error(f"Error identifying functional zones: {e}")
750
+ self.logger.error(traceback.format_exc())
751
+ return {}
752
+
753
+
754
+ def _generate_scene_description(self, scene_type, detected_objects, confidence,
755
+ lighting_info, functional_zones, image_dims):
756
+ """生成場景描述。"""
757
+ if self.scene_describer and hasattr(self.scene_describer, 'generate_description'):
758
+ try:
759
+ for obj in detected_objects:
760
+ if obj.get("is_landmark"):
761
+ loc_obj = obj.get("location", "")
762
+ lm_id_obj = obj.get("landmark_id")
763
+ if (not loc_obj) and lm_id_obj and lm_id_obj in ALL_LANDMARKS:
764
+ obj["location"] = ALL_LANDMARKS[lm_id_obj].get("location", "")
765
+
766
+ return self.scene_describer.generate_description(
767
+ scene_type=scene_type,
768
+ detected_objects=detected_objects,
769
+ confidence=confidence,
770
+ lighting_info=lighting_info,
771
+ functional_zones=list(functional_zones.keys()) if functional_zones else [],
772
+ enable_landmark=self.enable_landmark,
773
+ scene_scores={scene_type: confidence},
774
+ spatial_analysis={},
775
+ image_dimensions=image_dims
776
+ )
777
+ except Exception as e:
778
+ self.logger.error(f"Error generating scene description: {e}")
779
+ return f"A {scene_type} scene."
780
+
781
+ def _enhance_description_with_llm(self, scene_description, scene_type, detected_objects,
782
+ confidence, lighting_info, functional_zones, landmark_results, image_dims):
783
+ """使用 LLM 增強描述。"""
784
+ if not self.use_llm or not self.llm_enhancer:
785
+ return scene_description
786
+
787
+ try:
788
+ prominent_objects_detail = ""
789
+ if self.scene_describer and hasattr(self.scene_describer, 'format_object_list_for_description'):
790
+ try:
791
+ prominent_objects_detail = self.scene_describer.format_object_list_for_description(
792
+ detected_objects[:min(1, len(detected_objects))]
793
+ )
794
+ except Exception as e:
795
+ self.logger.error(f"Error formatting object list: {e}")
796
+
797
+ w_img, h_img = image_dims if image_dims else (1, 1)
798
+ scene_data_llm = {
799
+ "original_description": scene_description,
800
+ "scene_type": scene_type,
801
+ "scene_name": self.scene_types.get(scene_type, {}).get("name", "Landmark"),
802
+ "detected_objects": detected_objects,
803
+ "object_list": "landmark",
804
+ "confidence": confidence,
805
+ "lighting_info": lighting_info,
806
+ "functional_zones": functional_zones,
807
+ "clip_analysis": landmark_results.get("clip_analysis_on_full_image", {}),
808
+ "enable_landmark": True,
809
+ "image_width": w_img,
810
+ "image_height": h_img,
811
+ "prominent_objects_detail": prominent_objects_detail
812
+ }
813
+
814
+ return self.llm_enhancer.enhance_description(scene_data_llm)
815
+ except Exception as e:
816
+ self.logger.error(f"Error enhancing description with LLM: {e}")
817
+ traceback.print_exc()
818
+ return scene_description
819
+
820
+ def _enhance_no_detection_description(self, desc, scene_type, confidence, lighting_info,
821
+ clip_analysis, enable_landmark, width, height):
822
+ """增強無檢測結果的描述。"""
823
+ if not self.use_llm or not self.llm_enhancer:
824
+ return desc
825
+
826
+ try:
827
+ clip_analysis_safe = {}
828
+ if isinstance(clip_analysis, dict):
829
+ clip_analysis_safe = clip_analysis
830
+
831
+ scene_data_llm = {
832
+ "original_description": desc,
833
+ "scene_type": scene_type,
834
+ "scene_name": "Contextually Inferred (No Detections)",
835
+ "detected_objects": [],
836
+ "object_list": "general ambiance",
837
+ "confidence": confidence,
838
+ "lighting_info": lighting_info or {"time_of_day": "unknown", "confidence": 0.0},
839
+ "clip_analysis": clip_analysis_safe,
840
+ "enable_landmark": enable_landmark,
841
+ "image_width": width,
842
+ "image_height": height,
843
+ "prominent_objects_detail": "the overall visual context"
844
+ }
845
+
846
+ if hasattr(self.llm_enhancer, 'enhance_description'):
847
+ try:
848
+ enhanced = self.llm_enhancer.enhance_description(scene_data_llm)
849
+ if enhanced and len(enhanced.strip()) >= 20:
850
+ return enhanced
851
+ except Exception as e:
852
+ self.logger.error(f"Error in enhance_description: {e}")
853
+
854
+ if hasattr(self.llm_enhancer, 'handle_no_detection'):
855
+ try:
856
+ return self.llm_enhancer.handle_no_detection(clip_analysis_safe)
857
+ except Exception as e:
858
+ self.logger.error(f"Error in handle_no_detection: {e}")
859
+ except Exception as e:
860
+ self.logger.error(f"Error preparing data for LLM enhancement: {e}")
861
+ traceback.print_exc()
862
+
863
+ return desc
864
+
865
+ def _extract_possible_activities(self, detected_objects, landmark_results):
866
+ """提取可能的活動。"""
867
+ possible_activities = ["Sightseeing"]
868
+
869
+ # 檢查是否��主要地標活動從 CLIP 分析結果中獲取
870
+ primary_landmark_activities = landmark_results.get("primary_landmark_activities", [])
871
+
872
+ if primary_landmark_activities:
873
+ self.logger.info(f"Using {len(primary_landmark_activities)} landmark-specific activities")
874
+ possible_activities = primary_landmark_activities
875
+ else:
876
+ # 從檢測到的地標中提取特定活動
877
+ landmark_specific_activities = self.landmark_processing_manager.extract_landmark_specific_activities(detected_objects)
878
+
879
+ if landmark_specific_activities:
880
+ possible_activities = list(set(landmark_specific_activities)) # 去重
881
+ self.logger.info(f"Extracted {len(possible_activities)} activities from landmark data")
882
+ else:
883
+ # 回退到通用活動推斷
884
+ if self.descriptor and hasattr(self.descriptor, '_infer_possible_activities'):
885
+ try:
886
+ possible_activities = self.descriptor._infer_possible_activities(
887
+ "tourist_landmark",
888
+ detected_objects,
889
+ enable_landmark=True,
890
+ scene_scores={"tourist_landmark": 0.8}
891
+ )
892
+ except Exception as e:
893
+ self.logger.error(f"Error inferring possible activities: {e}")
894
+
895
+ return possible_activities
896
+
897
+ def _enhance_final_description(self, base_description, scene_type, scene_confidence, detected_objects,
898
+ functional_zones, activities, safety_concerns, lighting_info,
899
+ clip_analysis_results, enable_landmark, image_dims, landmark_info):
900
+ """增強最終描述。"""
901
+ if not self.use_llm or not self.llm_enhancer:
902
+ return base_description
903
+
904
+ try:
905
+ obj_list_for_llm = ", ".join(sorted(list(set(
906
+ obj["class_name"] for obj in detected_objects
907
+ if obj.get("confidence", 0) > 0.4 and not obj.get("is_landmark")
908
+ ))))
909
+
910
+ if not obj_list_for_llm and enable_landmark and landmark_info.get("primary_landmark"):
911
+ obj_list_for_llm = landmark_info["primary_landmark"].get("class_name", "a prominent feature")
912
+ elif not obj_list_for_llm:
913
+ obj_list_for_llm = "various visual elements"
914
+
915
+ # 生成物體統計信息
916
+ object_statistics = {}
917
+ for obj in detected_objects:
918
+ class_name = obj.get("class_name", "unknown")
919
+ if class_name not in object_statistics:
920
+ object_statistics[class_name] = {
921
+ "count": 0,
922
+ "avg_confidence": 0.0,
923
+ "max_confidence": 0.0,
924
+ "instances": []
925
+ }
926
+
927
+ stats = object_statistics[class_name]
928
+ stats["count"] += 1
929
+ stats["instances"].append(obj)
930
+ stats["max_confidence"] = max(stats["max_confidence"], obj.get("confidence", 0.0))
931
+
932
+ # 計算平均信心度
933
+ for class_name, stats in object_statistics.items():
934
+ if stats["count"] > 0:
935
+ total_conf = sum(inst.get("confidence", 0.0) for inst in stats["instances"])
936
+ stats["avg_confidence"] = total_conf / stats["count"]
937
+
938
+ llm_scene_data = {
939
+ "original_description": base_description,
940
+ "scene_type": scene_type,
941
+ "scene_name": self.scene_types.get(scene_type, {}).get("name", "Unknown Scene"),
942
+ "detected_objects": detected_objects,
943
+ "object_list": obj_list_for_llm,
944
+ "object_statistics": object_statistics,
945
+ "confidence": scene_confidence,
946
+ "lighting_info": lighting_info,
947
+ "functional_zones": functional_zones,
948
+ "activities": activities,
949
+ "safety_concerns": safety_concerns,
950
+ "clip_analysis": clip_analysis_results if isinstance(clip_analysis_results, dict) else None,
951
+ "enable_landmark": enable_landmark,
952
+ "image_width": image_dims[0] if image_dims else None,
953
+ "image_height": image_dims[1] if image_dims else None,
954
+ "prominent_objects_detail": ""
955
+ }
956
+
957
+ # 添加顯著物體詳細信息
958
+ if self.scene_describer and hasattr(self.scene_describer, 'get_prominent_objects') and hasattr(self.scene_describer, 'format_object_list_for_description'):
959
+ try:
960
+ prominent_objects = self.scene_describer.get_prominent_objects(
961
+ detected_objects, min_prominence_score=0.1, max_categories_to_return=3, max_total_objects=7
962
+ )
963
+ llm_scene_data["prominent_objects_detail"] = self.scene_describer.format_object_list_for_description(prominent_objects)
964
+ except Exception as e:
965
+ self.logger.error(f"Error getting prominent objects: {e}")
966
+
967
+ if enable_landmark and landmark_info.get("primary_landmark"):
968
+ llm_scene_data["primary_landmark_info"] = landmark_info["primary_landmark"]
969
+
970
+ return self.llm_enhancer.enhance_description(llm_scene_data)
971
+ except Exception as e:
972
+ self.logger.error(f"Error in LLM Enhancement in main flow (analyze method): {e}")
973
+ return base_description
scene_analyzer.py CHANGED
The diff for this file is too large to render. See raw diff
 
scene_scoring_engine.py ADDED
@@ -0,0 +1,491 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import traceback
3
+ from typing import Dict, List, Tuple, Optional, Any
4
+
5
+ from scene_type import SCENE_TYPES
6
+
7
class SceneScoringEngine:
    """
    Scene-scoring logic.

    Computes scene scores from YOLO detections, fuses the scores coming from
    YOLO, CLIP and Places365, and determines the final scene type.
    """

    # Everyday scene types that receive more lenient scoring.
    EVERYDAY_SCENE_TYPE_KEYS = [
        "general_indoor_space", "generic_street_view",
        "desk_area_workspace", "outdoor_gathering_spot",
        "kitchen_counter_or_utility_area"
    ]

    # Scene types whose YOLO score is zeroed when landmark detection is off.
    _LANDMARK_SCENE_TYPES = ("tourist_landmark", "natural_landmark", "historical_monument")

    # Keyword groups (matched as substrings of the lower-cased scene type
    # key) that select the base fusion weights.
    _CLIP_FAVOURED_KEYWORDS = (
        "asian", "cultural", "aerial", "landmark", "monument", "tourist",
        "natural_landmark", "historical_monument",
    )
    _INDOOR_WEIGHT_KEYWORDS = (
        "room", "kitchen", "office", "bedroom", "desk_area", "indoor_space",
        "professional_kitchen", "cafe", "library", "gym", "retail_store",
        "supermarket", "classroom", "conference_room", "medical_facility",
        "educational_setting", "dining_area",
    )
    _OUTDOOR_WEIGHT_KEYWORDS = (
        "parking_lot", "park_area", "beach", "harbor", "playground",
        "sports_field", "bus_stop", "train_station", "airport",
    )

    def __init__(self, scene_types: Dict[str, Any], enable_landmark: bool = True):
        """
        Initialize the scoring engine.

        Args:
            scene_types: Scene type definitions keyed by scene type.
            enable_landmark: Whether landmark detection is enabled.
        """
        self.logger = logging.getLogger(__name__)
        self.scene_types = scene_types
        self.enable_landmark = enable_landmark

    def compute_scene_scores(self, detected_objects: List[Dict],
                             spatial_analysis_results: Optional[Dict] = None) -> Dict[str, float]:
        """
        Score every defined scene type from the detected objects.

        The score is composed of up to 55% for required objects, 25% for
        optional objects, a richness bonus (max 0.10), a spatial cohesion
        bonus (max 0.10) and a multiple-instance bonus (max 0.075), then
        multiplied by the scene's ``priority`` (if defined) and clamped to
        [0, 1]. Everyday scene types that miss ``minimum_required`` may still
        score from their optional objects.

        Args:
            detected_objects: Detected object dicts; each must carry a
                ``class_id``.
            spatial_analysis_results: Optional spatial analysis output; its
                ``objects_by_region`` entry is used for the cohesion bonus.

        Returns:
            Mapping of scene type to score.
        """
        if not detected_objects:
            return {scene_type_key: 0.0 for scene_type_key in self.scene_types}

        # One pass over the detections: instance count per class and the
        # first detected instance of each class (used for cohesion checks).
        class_counts_all = {}
        first_instance_by_class = {}
        for obj in detected_objects:
            class_id = obj["class_id"]
            class_counts_all[class_id] = class_counts_all.get(class_id, 0) + 1
            first_instance_by_class.setdefault(class_id, obj)
        detected_classes_set_all = set(class_counts_all)

        # Score composition weights.
        required_weight = 0.55
        optional_weight = 0.25
        richness_bonus_max = 0.10

        scene_scores = {}
        for scene_type, scene_def in self.scene_types.items():
            required_ids = set(scene_def.get("required_objects", []))
            # Keep the definition order of the optional objects; it is
            # needed for the "first half" selection further below.
            optional_ids_ordered = list(dict.fromkeys(scene_def.get("optional_objects", [])))
            optional_ids = set(optional_ids_ordered)
            min_required_matches_needed = scene_def.get("minimum_required", 0)
            is_everyday = scene_type in self.EVERYDAY_SCENE_TYPE_KEYS

            required_found = [first_instance_by_class[cid] for cid in required_ids
                              if cid in first_instance_by_class]
            optional_found = [first_instance_by_class[cid] for cid in optional_ids
                              if cid in first_instance_by_class]
            optional_ratio = len(optional_found) / len(optional_ids) if optional_ids else 0.0

            objects_to_check_for_cohesion = []

            if len(required_found) >= min_required_matches_needed:
                if required_ids:
                    required_ratio = len(required_found) / len(required_ids)
                else:
                    # No required objects defined: full credit only when
                    # nothing was required.
                    required_ratio = 1.0 if min_required_matches_needed == 0 else 0.0

                current_scene_score = required_ratio * required_weight
                objects_to_check_for_cohesion.extend(required_found)

                if optional_ids:
                    current_scene_score += optional_ratio * optional_weight
                    objects_to_check_for_cohesion.extend(optional_found)
            elif is_everyday and optional_ids and optional_ratio >= 0.25:
                # Everyday scenes remain weak candidates when at least 25%
                # of their optional objects are present.
                current_scene_score = optional_ratio * (required_weight + optional_weight * 0.5)
                objects_to_check_for_cohesion.extend(optional_found)
            else:
                scene_scores[scene_type] = 0.0
                continue

            # Richness bonus: share of the scene's defined classes detected.
            relevant_ids = required_ids | optional_ids
            if relevant_ids:
                richness_ratio = len(relevant_ids & detected_classes_set_all) / len(relevant_ids)
                current_scene_score += min(richness_bonus_max, richness_ratio * 0.15)

            # Spatial cohesion bonus (max 0.1).
            if spatial_analysis_results and objects_to_check_for_cohesion:
                current_scene_score += self._get_object_spatial_cohesion_score(
                    objects_to_check_for_cohesion,
                    spatial_analysis_results
                )

            # Multiple-instance bonus for key objects. For everyday scenes
            # the first half of the optional objects, in definition order,
            # also counts as key objects.
            key_ids = set(required_ids)
            if is_everyday and optional_ids_ordered:
                key_ids.update(optional_ids_ordered[:max(1, len(optional_ids_ordered) // 2)])
            multiple_instance_bonus = sum(
                0.025 for cid in key_ids if class_counts_all.get(cid, 0) > 1
            )
            current_scene_score += min(0.075, multiple_instance_bonus)

            if "priority" in scene_def:
                current_scene_score *= scene_def["priority"]

            scene_scores[scene_type] = min(1.0, max(0.0, current_scene_score))

        # Landmark-specific scene types score zero while landmark detection
        # is disabled.
        if not self.enable_landmark:
            for lm_scene_type in self._LANDMARK_SCENE_TYPES:
                if lm_scene_type in scene_scores:
                    scene_scores[lm_scene_type] = 0.0

        return scene_scores

    def _get_object_spatial_cohesion_score(self, objects_for_scene: List[Dict],
                                           spatial_analysis_results: Optional[Dict]) -> float:
        """
        Heuristic bonus for key objects concentrated in few regions.

        Args:
            objects_for_scene: Detected object dicts relevant to the scene
                type being scored (their ``class_id`` values are used).
            spatial_analysis_results: Expected to contain
                ``objects_by_region``: region name -> list of object dicts.

        Returns:
            0.10, 0.05, 0.02 or 0.0, from strongest to no cohesion.
        """
        if not objects_for_scene or not spatial_analysis_results:
            return 0.0
        objects_by_region = spatial_analysis_results.get("objects_by_region")
        if not objects_by_region:
            return 0.0

        key_object_class_ids = {
            obj.get('class_id') for obj in objects_for_scene if obj.get('class_id') is not None
        }
        if not key_object_class_ids:
            return 0.0

        # Regions holding at least one key object, and the total number of
        # key object instances over all regions.
        regions_containing_key_objects = set()
        total_key_object_instances_found = 0
        for region_name, objects_in_region in objects_by_region.items():
            hits = sum(1 for obj in objects_in_region
                       if obj.get('class_id') in key_object_class_ids)
            if hits:
                regions_containing_key_objects.add(region_name)
                total_key_object_instances_found += hits

        if not regions_containing_key_objects:
            return 0.0

        num_regions = len(regions_containing_key_objects)
        num_passed = len(objects_for_scene)
        if num_regions == 1 and total_key_object_instances_found >= num_passed * 0.75:
            return 0.10  # strongest cohesion: a single region
        if num_regions <= 2 and total_key_object_instances_found >= num_passed * 0.60:
            return 0.05  # medium cohesion: at most two regions
        if num_regions <= 3 and total_key_object_instances_found >= num_passed * 0.50:
            return 0.02  # weak cohesion
        return 0.0

    def determine_scene_type(self, scene_scores: Dict[str, float]) -> Tuple[str, float]:
        """
        Pick the most likely scene type.

        When landmark detection is enabled and the summed scores of the three
        landmark scene types reach 0.3, ``"tourist_landmark"`` is returned
        with that sum as confidence, capped at 1.0. Otherwise the
        highest-scoring scene type is returned.

        Args:
            scene_scores: Mapping of scene type to score.

        Returns:
            ``(scene_type, confidence)``; ``("unknown", 0.0)`` for an empty
            mapping.
        """
        if not scene_scores:
            return "unknown", 0.0

        # The override is skipped while landmark detection is disabled, in
        # line with compute_scene_scores zeroing the landmark scene types.
        if self.enable_landmark:
            landmark_score = (
                scene_scores.get("tourist_landmark", 0.0) +
                scene_scores.get("historical_monument", 0.0) +
                scene_scores.get("natural_landmark", 0.0)
            )
            if landmark_score >= 0.3:
                # The sum of three scores can exceed 1.0; keep the
                # confidence within [0, 1].
                return "tourist_landmark", float(min(1.0, landmark_score))

        best_scene = max(scene_scores, key=scene_scores.get)
        return best_scene, float(scene_scores[best_scene])

    def fuse_scene_scores(self, yolo_scene_scores: Dict[str, float],
                          clip_scene_scores: Dict[str, float],
                          num_yolo_detections: int = 0,
                          avg_yolo_confidence: float = 0.0,
                          lighting_info: Optional[Dict] = None,
                          places365_info: Optional[Dict] = None) -> Dict[str, float]:
        """
        Fuse the scene scores from YOLO, CLIP and Places365.

        With no meaningful source all scores are 0.0; with exactly one
        meaningful source its scores are returned as they are. Otherwise the
        scores are combined per scene type with weights that depend on the
        scene type, the richness of the YOLO detections, the landmark switch
        and the Places365 confidence, followed by an indoor/outdoor
        consistency adjustment.

        Args:
            yolo_scene_scores: Scene scores from object detection; ``None``
                is treated as empty.
            clip_scene_scores: Scene scores from CLIP; ``None`` is treated
                as empty.
            num_yolo_detections: Number of YOLO detections considered.
            avg_yolo_confidence: Average confidence of those detections.
            lighting_info: Optional lighting analysis; ``is_indoor`` and
                ``confidence`` are read.
            places365_info: Optional Places365 result;
                ``mapped_scene_type``, ``confidence`` and ``is_indoor`` are
                read.

        Returns:
            Mapping of scene type to fused score in [0, 1].
        """
        # A missing score dict must not crash the multi-source path.
        yolo_scene_scores = yolo_scene_scores or {}
        clip_scene_scores = clip_scene_scores or {}

        # Places365 contributes one score: its mapped scene type.
        places365_scene_scores_map = {}
        if places365_info and places365_info.get('confidence', 0) > 0.1:
            mapped_scene_type = places365_info.get('mapped_scene_type', 'unknown')
            places365_confidence = places365_info.get('confidence', 0.0)
            if mapped_scene_type in self.scene_types:
                places365_scene_scores_map[mapped_scene_type] = places365_confidence
                self.logger.info(f"Places365 contributing: {mapped_scene_type} with confidence {places365_confidence:.3f}")

        def has_meaningful_scores(scores):
            return bool(scores and any(s > 1e-5 for s in scores.values()))

        meaningful_sources = [
            scores for scores in (yolo_scene_scores, clip_scene_scores, places365_scene_scores_map)
            if has_meaningful_scores(scores)
        ]
        if not meaningful_sources:
            return {st: 0.0 for st in self.scene_types}
        if len(meaningful_sources) == 1:
            only_source = meaningful_sources[0]
            return {st: only_source.get(st, 0.0) for st in self.scene_types}

        all_possible_scene_types = set(self.scene_types).union(
            yolo_scene_scores, clip_scene_scores, places365_scene_scores_map
        )

        # Indoor/outdoor evidence does not depend on the scene type, so it
        # is resolved once. Lighting analysis is the default source.
        effective_is_indoor = None
        effective_confidence = 0.0
        if lighting_info and isinstance(lighting_info, dict):
            effective_is_indoor = lighting_info.get("is_indoor")
            effective_confidence = lighting_info.get("confidence", 0.0)

        if places365_info and isinstance(places365_info, dict):
            places365_is_indoor = places365_info.get('is_indoor')
            places365_confidence_for_indoor = places365_info.get('confidence', 0.0)
            # A confident Places365 decision overrides the lighting analysis.
            if places365_confidence_for_indoor >= 0.8 and places365_is_indoor is not None:
                effective_is_indoor = places365_is_indoor
                effective_confidence = places365_confidence_for_indoor
                self.logger.debug(f"Using Places365 indoor/outdoor decision: {places365_is_indoor} (confidence: {places365_confidence_for_indoor:.3f}) over lighting analysis")

        apply_indoor_adjustment = effective_is_indoor is not None and effective_confidence >= 0.65
        adjustment = 0.0
        if apply_indoor_adjustment:
            lighting_adjustment_strength = 0.20  # maximum adjustment (20%)
            # Scale with the confidence above the 0.65 threshold (0..1).
            adjustment_scale = (effective_confidence - 0.65) / (1.0 - 0.65)
            adjustment = min(lighting_adjustment_strength,
                             max(0, lighting_adjustment_strength * adjustment_scale))

        fused_scores = {}
        for scene_type in all_possible_scene_types:
            yolo_score = yolo_scene_scores.get(scene_type, 0.0)
            clip_score = clip_scene_scores.get(scene_type, 0.0)
            places365_score = places365_scene_scores_map.get(scene_type, 0.0)

            scene_key = scene_type.lower()
            is_everyday = scene_type in self.EVERYDAY_SCENE_TYPE_KEYS
            scene_definition = self.scene_types.get(scene_type, {})

            current_yolo_weight, current_clip_weight, current_places365_weight = (
                self._base_source_weights(scene_type, num_yolo_detections, avg_yolo_confidence)
            )

            if not self.enable_landmark:
                if any(keyword in scene_key for keyword in ("landmark", "monument", "tourist")):
                    # Landmark scenes are heavily penalised while disabled.
                    yolo_score = 0.0
                    clip_score *= 0.05
                    places365_score *= 0.8 if not is_everyday else 1.0
                elif (not is_everyday and
                      not any(keyword in scene_key for keyword in ("asian", "cultural", "aerial"))):
                    # Shift weight from CLIP to YOLO and Places365.
                    weight_boost = 0.05
                    current_yolo_weight = min(0.9, current_yolo_weight + weight_boost)
                    current_places365_weight = min(0.9, current_places365_weight + weight_boost)
                    current_clip_weight = max(0.1, current_clip_weight - weight_boost * 2)

            # Boost Places365 when it is highly confident about this type.
            if places365_score > 0.0 and places365_info:
                places365_original_confidence = places365_info.get('confidence', 0.0)
                if places365_original_confidence > 0.7:
                    boost_factor = min(0.2, (places365_original_confidence - 0.7) * 0.4)
                    current_places365_weight += boost_factor
                    total_other_weight = current_yolo_weight + current_clip_weight
                    if total_other_weight > 0:
                        reduction_factor = boost_factor / total_other_weight
                        current_yolo_weight *= (1 - reduction_factor)
                        current_clip_weight *= (1 - reduction_factor)

            # Normalise the weights so that they sum to 1.
            total_weight = current_yolo_weight + current_clip_weight + current_places365_weight
            if total_weight > 0:
                current_yolo_weight /= total_weight
                current_clip_weight /= total_weight
                current_places365_weight /= total_weight
            else:
                current_yolo_weight = 1/3
                current_clip_weight = 1/3
                current_places365_weight = 1/3

            fused_score = (yolo_score * current_yolo_weight) + (clip_score * current_clip_weight) + (places365_score * current_places365_weight)

            if apply_indoor_adjustment:
                description = scene_definition.get("description", "").lower()
                is_defined_as_indoor = ("indoor" in description or
                                        any(kw in scene_key for kw in ["room", "kitchen", "office", "indoor", "library", "cafe", "gym"]))
                is_defined_as_outdoor = ("outdoor" in description or
                                         any(kw in scene_key for kw in ["street", "park", "aerial", "beach", "harbor", "intersection", "crosswalk"]))

                # Penalise contradictions, mildly reward agreement.
                if effective_is_indoor and is_defined_as_outdoor:
                    fused_score *= (1.0 - adjustment)
                elif not effective_is_indoor and is_defined_as_indoor:
                    fused_score *= (1.0 - adjustment)
                elif effective_is_indoor and is_defined_as_indoor:
                    fused_score = min(1.0, fused_score * (1.0 + adjustment * 0.5))
                elif not effective_is_indoor and is_defined_as_outdoor:
                    fused_score = min(1.0, fused_score * (1.0 + adjustment * 0.5))

            fused_scores[scene_type] = min(1.0, max(0.0, fused_score))

        return fused_scores

    def _base_source_weights(self, scene_type: str, num_yolo_detections: int,
                             avg_yolo_confidence: float) -> Tuple[float, float, float]:
        """Return the base (YOLO, CLIP, Places365) fusion weights for a scene type."""
        scene_key = scene_type.lower()

        if scene_type in self.EVERYDAY_SCENE_TYPE_KEYS:
            if num_yolo_detections >= 5 and avg_yolo_confidence >= 0.45:
                return 0.60, 0.15, 0.25  # rich YOLO evidence
            if num_yolo_detections >= 3:
                return 0.50, 0.20, 0.30  # moderate YOLO evidence
            return 0.35, 0.25, 0.40      # sparse YOLO: lean on Places365

        if any(keyword in scene_key for keyword in self._CLIP_FAVOURED_KEYWORDS):
            return 0.25, 0.65, 0.10
        if any(keyword in scene_key for keyword in self._INDOOR_WEIGHT_KEYWORDS):
            return 0.55, 0.20, 0.25
        if any(keyword in scene_key for keyword in self._OUTDOOR_WEIGHT_KEYWORDS):
            return 0.50, 0.25, 0.25

        return 0.5, 0.3, 0.2  # default weights

    def update_enable_landmark_status(self, enable_landmark: bool):
        """
        Update whether landmark detection is enabled.

        Args:
            enable_landmark: Whether landmark detection is enabled.
        """
        self.enable_landmark = enable_landmark
scene_viewpoint_analyzer.py ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import logging
3
+ import traceback
4
+ import numpy as np
5
+ from typing import Dict, List, Any, Optional, Tuple
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+ class SceneViewpointAnalyzer:
10
+ """
11
+ 負責場景視角檢測和模式識別
12
+ 專注於檢測場景視角(俯視、平視等)並識別特殊場景模式(如十字路口、人流方向等)
13
+ 提供詳細的場景空間分析和視角相關的場景理解功能
14
+ """
15
+
16
def __init__(self, enhanced_scene_describer=None):
    """
    Create the scene viewpoint analyzer.

    Args:
        enhanced_scene_describer: Optional enhanced scene describer; it is
            stored and later consulted for the basic viewpoint detection.

    Raises:
        Exception: Any error raised during initialization is logged with
            its traceback and re-raised.
    """
    try:
        # The describer is optional and only stored here.
        self.enhanced_scene_describer = enhanced_scene_describer
        logger.info("SceneViewpointAnalyzer initialized successfully")
    except Exception as e:
        logger.error(f"Failed to initialize SceneViewpointAnalyzer: {str(e)}")
        logger.error(traceback.format_exc())
        raise
31
+
32
def detect_viewpoint(self, detected_objects: List[Dict]) -> str:
    """
    Detect the viewpoint type of the image.

    Thin wrapper around ``detect_scene_viewpoint`` that returns only the
    viewpoint name.

    Args:
        detected_objects: List of detected object dicts.

    Returns:
        The detected viewpoint type; ``"eye_level"`` when the result holds
        no viewpoint or the detection raises.
    """
    fallback_viewpoint = "eye_level"
    try:
        scene_view = self.detect_scene_viewpoint(detected_objects)
        return scene_view.get("viewpoint", fallback_viewpoint)
    except Exception as e:
        logger.warning(f"Error detecting viewpoint: {str(e)}")
        return fallback_viewpoint
49
+
50
def get_viewpoint_confidence(self, detected_objects: List[Dict]) -> Tuple[str, float]:
    """
    Return the detected viewpoint together with a confidence value.

    The confidence starts at 0.5 and grows with every scene pattern
    reported by ``detect_scene_viewpoint``; it is capped at 1.0.

    Args:
        detected_objects: List of detected object dicts.

    Returns:
        Tuple[str, float]: ``(viewpoint, confidence)``;
        ``("eye_level", 0.5)`` when an error occurs.
    """
    # Bonus per recognised pattern, applied in this order.
    pattern_bonuses = (
        ("crosswalk_intersection", 0.3),
        ("consistent_object_size", 0.2),
        ("multi_directional_movement", 0.1),
    )
    try:
        scene_view = self.detect_scene_viewpoint(detected_objects)
        viewpoint = scene_view.get("viewpoint", "eye_level")
        found_patterns = scene_view.get("patterns", [])

        confidence = 0.5  # base confidence
        for pattern_name, bonus in pattern_bonuses:
            if pattern_name in found_patterns:
                confidence += bonus

        return viewpoint, min(confidence, 1.0)

    except Exception as e:
        logger.error(f"Error getting viewpoint confidence: {str(e)}")
        return "eye_level", 0.5
81
+
82
def detect_scene_viewpoint(self, detected_objects: List[Dict]) -> Dict:
    """
    Detect the scene viewpoint and recognise special scene patterns.

    Patterns that may be reported:

    * ``"crosswalk_intersection"`` - pedestrians arranged in a cross shape.
    * ``"multi_directional_movement"`` - pedestrians spread along at least
      two directions.
    * ``"consistent_object_size"`` - at least five objects whose normalized
      areas vary little.

    The base viewpoint comes from the enhanced scene describer when it
    provides ``_detect_viewpoint``; a detected crosswalk intersection forces
    the viewpoint to ``"aerial"``.

    Args:
        detected_objects: List of detected object dicts.

    Returns:
        Dict with ``"viewpoint"`` (str) and ``"patterns"`` (list of str);
        ``{"viewpoint": "eye_level", "patterns": []}`` for empty input or
        on error.
    """
    try:
        if not detected_objects:
            logger.warning("No detected objects provided for viewpoint detection")
            return {"viewpoint": "eye_level", "patterns": []}

        patterns = []

        # Normalized centers of the pedestrians (class_id 0). Detections
        # without a center are skipped so that one malformed entry does not
        # discard the whole analysis through the broad except below.
        pedestrian_positions = [
            obj["normalized_center"] for obj in detected_objects
            if obj.get("class_id") == 0 and obj.get("normalized_center") is not None
        ]

        # At least four pedestrians are needed for the pattern analysis.
        if len(pedestrian_positions) >= 4:
            # A cross-shaped distribution is typical for crosswalk
            # intersections seen from above.
            if self._detect_cross_pattern(pedestrian_positions):
                patterns.append("crosswalk_intersection")

            # Pedestrians spread along several directions.
            directions = self._analyze_movement_directions(pedestrian_positions)
            if len(directions) >= 2:
                patterns.append("multi_directional_movement")

        # Overhead views tend to show objects of similar size because the
        # distance to the camera is fairly uniform.
        if len(detected_objects) >= 5:
            sizes = [obj.get("normalized_area", 0) for obj in detected_objects]
            mean_size = np.mean(sizes)
            # Variance normalized by the squared mean, so that the measure
            # does not depend on the absolute object size.
            size_variance = np.var(sizes) / (mean_size ** 2) if mean_size > 0 else 0

            if size_variance < 0.3:
                patterns.append("consistent_object_size")

        # Base viewpoint from the enhanced scene describer, if available.
        viewpoint = "eye_level"
        if self.enhanced_scene_describer and hasattr(self.enhanced_scene_describer, '_detect_viewpoint'):
            viewpoint = self.enhanced_scene_describer._detect_viewpoint(detected_objects)

        # The crosswalk pattern takes precedence over the base viewpoint.
        if "crosswalk_intersection" in patterns and viewpoint != "aerial":
            viewpoint = "aerial"

        result = {
            "viewpoint": viewpoint,
            "patterns": patterns
        }

        logger.info(f"Viewpoint detection completed: {viewpoint}, patterns: {patterns}")
        return result

    except Exception as e:
        logger.error(f"Error in scene viewpoint detection: {str(e)}")
        logger.error(traceback.format_exc())
        return {"viewpoint": "eye_level", "patterns": []}
150
+
151
def _detect_cross_pattern(self, positions: List[List[float]]) -> bool:
    """
    Report whether a set of points is arranged roughly like a cross.

    Points are counted when they lie within 0.1 of the mean x coordinate and,
    separately, within 0.1 of the mean y coordinate. The pattern is reported
    when at least three points fall in each of those two bands.

    Args:
        positions: Points as [[x1, y1], [x2, y2], ...].

    Returns:
        True when a cross-like arrangement is found; False when fewer than
        eight points are supplied, when the bands are too sparse, or when any
        error occurs while analysing the points.
    """
    try:
        # Too few points cannot form a meaningful cross.
        if len(positions) < 8:
            return False

        xs = []
        ys = []
        for point in positions:
            xs.append(point[0])
            ys.append(point[1])

        # The mean of each axis stands in for the centre lines of the cross.
        center_x = np.mean(xs)
        center_y = np.mean(ys)

        tolerance = 0.1
        vertical_hits = len([x for x in xs if abs(x - center_x) < tolerance])
        horizontal_hits = len([y for y in ys if abs(y - center_y) < tolerance])

        # Both bands must be populated for the layout to count as a cross.
        if vertical_hits < 3 or horizontal_hits < 3:
            return False

        logger.info(f"Cross pattern detected with {vertical_hits} points near vertical center and {horizontal_hits} points near horizontal center")
        return True

    except Exception as exc:
        logger.error(f"Error detecting cross pattern: {exc!s}")
        logger.error(traceback.format_exc())
        return False
191
+
192
def _analyze_movement_directions(self, positions: List[List[float]]) -> List[str]:
    """
    Infer the dominant spread directions of a set of points.

    The spread along each axis is measured as max - min of the coordinates.
    An axis is reported when its spread exceeds 0.4.

    Args:
        positions: Points as [[x1, y1], [x2, y2], ...].

    Returns:
        A list containing "horizontal" and/or "vertical" (in that order).
        Empty when fewer than six points are supplied, when neither axis
        spreads far enough, or when an error occurs.
    """
    try:
        # Direction analysis is only attempted with enough points.
        if len(positions) < 6:
            return []

        x_coords = [pos[0] for pos in positions]
        y_coords = [pos[1] for pos in positions]

        # Only the coordinate range drives the decision; the standard
        # deviations the previous version computed were never read.
        x_range = max(x_coords) - min(x_coords)
        y_range = max(y_coords) - min(y_coords)

        directions = []

        if x_range > 0.4:
            directions.append("horizontal")
            logger.debug(f"Horizontal movement detected with range: {x_range:.3f}")

        if y_range > 0.4:
            directions.append("vertical")
            logger.debug(f"Vertical movement detected with range: {y_range:.3f}")

        logger.info(f"Movement directions analyzed: {directions}")
        return directions

    except Exception as e:
        logger.error(f"Error analyzing movement directions: {str(e)}")
        logger.error(traceback.format_exc())
        return []
240
+
241
def detect_aerial_view_indicators(self, detected_objects: List[Dict]) -> Dict:
    """
    Compute simple indicators suggesting that a scene is viewed from above.

    Three indicators are derived from the detected objects:
      * consistent_sizing: normalized variance of "normalized_area" below 0.3
        (evaluated only when at least 3 objects are present);
      * grid_like_distribution: at least 3 distinct x and 3 distinct y
        positions after rounding "normalized_center" to one decimal
        (evaluated only when at least 6 objects are present);
      * high_object_density: at least 8 detected objects.
    They are combined into "aerial_score" with weights 0.4 / 0.4 / 0.2.

    Args:
        detected_objects: Detected object dicts.

    Returns:
        Dict with the three boolean indicators (always plain Python bools)
        and the float "aerial_score". All-False / 0.0 on empty input or error.
    """
    def _blank_indicators() -> Dict:
        # Single source for the default / fallback result.
        return {
            "consistent_sizing": False,
            "grid_like_distribution": False,
            "high_object_density": False,
            "aerial_score": 0.0
        }

    try:
        indicators = _blank_indicators()

        if not detected_objects:
            return indicators

        # Size consistency: variance normalized by the squared mean so the
        # measure does not depend on the absolute object size.
        sizes = [obj.get("normalized_area", 0) for obj in detected_objects]
        if len(sizes) >= 3:
            mean_size = np.mean(sizes)
            size_variance = np.var(sizes) / (mean_size ** 2) if mean_size > 0 else 1
            # bool() converts numpy.bool_ so the result stays JSON-serialisable
            # and behaves like a normal Python boolean for callers.
            indicators["consistent_sizing"] = bool(size_variance < 0.3)

        # Simplified grid check: count distinct coarse positions per axis.
        positions = [obj.get("normalized_center", [0.5, 0.5]) for obj in detected_objects]
        if len(positions) >= 6:
            x_unique = len({round(pos[0], 1) for pos in positions})
            y_unique = len({round(pos[1], 1) for pos in positions})
            indicators["grid_like_distribution"] = x_unique >= 3 and y_unique >= 3

        indicators["high_object_density"] = len(detected_objects) >= 8

        # Weighted sum of the indicators; kept as a float in every case.
        score = 0.0
        if indicators["consistent_sizing"]:
            score += 0.4
        if indicators["grid_like_distribution"]:
            score += 0.4
        if indicators["high_object_density"]:
            score += 0.2

        indicators["aerial_score"] = score

        logger.info(f"Aerial view indicators: score={score:.2f}, consistent_sizing={indicators['consistent_sizing']}, grid_distribution={indicators['grid_like_distribution']}, high_density={indicators['high_object_density']}")
        return indicators

    except Exception as e:
        logger.error(f"Error detecting aerial view indicators: {str(e)}")
        logger.error(traceback.format_exc())
        return _blank_indicators()
scene_zone_identifier.py ADDED
@@ -0,0 +1,1728 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import logging
3
+ import traceback
4
+ import numpy as np
5
+ from typing import Dict, List, Any, Optional
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+ class SceneZoneIdentifier:
10
+ """
11
+ 負責不同場景類型的區域識別邏輯
12
+ 專注於根據場景類型執行相應的功能區域識別策略
13
+ """
14
+
15
def __init__(self):
    """
    Create the scene zone identifier.

    No state is configured here; the constructor only emits a log record.
    Any exception raised while doing so is logged and re-raised.
    """
    try:
        logger.info("SceneZoneIdentifier initialized successfully")
    except Exception as init_error:
        logger.error(f"Failed to initialize SceneZoneIdentifier: {init_error!s}")
        logger.error(traceback.format_exc())
        raise
24
+
25
def identify_indoor_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
    """
    Identify functional zones of an indoor scene under descriptive key names.

    A primary zone is looked up first. A secondary zone is only searched for
    when a primary zone exists and at least six objects were detected.

    Args:
        category_regions: Objects grouped by category and region (not read here).
        detected_objects: Detected object dicts.
        scene_type: Scene type label; only used in the log message.

    Returns:
        Dict mapping descriptive zone keys to zone dicts; empty on error or
        when no primary zone is found.
    """
    try:
        found = {}

        main_area = self._identify_primary_functional_area(detected_objects)
        if main_area:
            main_key = self._generate_descriptive_zone_key_from_data(main_area, "primary")
            found[main_key] = main_area

            # A secondary zone needs both an existing primary zone and
            # enough detections to be credible.
            if len(detected_objects) >= 6:
                extra_area = self._identify_secondary_functional_area(detected_objects, found)
                if extra_area:
                    extra_key = self._generate_descriptive_zone_key_from_data(extra_area, "secondary")
                    found[extra_key] = extra_area

        logger.info(f"Identified {len(found)} indoor zones for scene type '{scene_type}'")
        return found

    except Exception as exc:
        logger.error(f"Error identifying indoor zones: {exc!s}")
        logger.error(traceback.format_exc())
        return {}
63
+
64
def _generate_descriptive_zone_key_from_data(self, zone_data: Dict, priority_level: str) -> str:
    """
    Build a descriptive key name for a zone from its contents.

    The zone's object names are matched against keyword rules in a fixed
    order; if none match, the zone description is matched instead, and
    "functional area" is the final fallback. Secondary zones with a known
    region get a spatial prefix.

    Args:
        zone_data: Zone dict; the "objects", "region" and "description" keys are read.
        priority_level: "primary" or "secondary".

    Returns:
        str: The descriptive key, or "activity area" if an error occurs.
    """
    # (keywords, resulting name); the first rule with a hit wins.
    object_rules = (
        (("dining", "table"), "dining area"),
        (("chair", "sofa"), "seating area"),
        (("bed",), "sleeping area"),
        (("laptop", "keyboard"), "workspace area"),
        (("plant", "vase"), "decorative area"),
        (("refrigerator", "microwave"), "kitchen area"),
    )
    description_rules = (
        (("dining",), "dining area"),
        (("seating", "relaxation"), "seating area"),
        (("work",), "workspace area"),
        (("decorative",), "decorative area"),
    )

    try:
        members = zone_data.get("objects", [])
        where = zone_data.get("region", "")
        summary = zone_data.get("description", "")

        # First pass: decide the function from the object names.
        label = None
        for keywords, candidate in object_rules:
            if any(word in name.lower() for name in members for word in keywords):
                label = candidate
                break

        # Second pass: fall back to the wording of the description.
        if label is None:
            lowered = summary.lower()
            label = "functional area"
            for keywords, candidate in description_rules:
                if any(word in lowered for word in keywords):
                    label = candidate
                    break

        # Secondary zones carry a position prefix to tell them apart.
        if priority_level == "secondary" and where:
            prefix = self._get_spatial_context_description(where)
            if prefix:
                return f"{prefix} {label}"

        return label

    except Exception as exc:
        logger.warning(f"Error generating descriptive zone key: {exc!s}")
        return "activity area"
117
+
118
def _get_spatial_context_description(self, region: str) -> str:
    """
    Translate a region identifier into a short spatial phrase.

    Args:
        region: Region identifier such as "top_left" or "middle_center".

    Returns:
        str: The matching phrase, or "" for an unknown region or on error.
    """
    try:
        labels = dict((
            ("top_left", "upper left"),
            ("top_center", "upper"),
            ("top_right", "upper right"),
            ("middle_left", "left side"),
            ("middle_center", "central"),
            ("middle_right", "right side"),
            ("bottom_left", "lower left"),
            ("bottom_center", "lower"),
            ("bottom_right", "lower right"),
        ))

        if region in labels:
            return labels[region]
        return ""

    except Exception as exc:
        logger.warning(f"Error getting spatial context for region '{region}': {exc!s}")
        return ""
146
+
147
def identify_outdoor_general_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
    """
    Identify functional zones of a general outdoor scene.

    Pedestrians (class_id 0) are grouped by region and the two most populated
    regions become pedestrian areas. Vehicles (class_id 1, 2, 3, 5, 6, 7) are
    grouped by region and the most populated region becomes "vehicle_zone".
    Park and parking-lot scenes add their own zones through helper methods.

    Args:
        category_regions: Objects grouped by category and region (not read here).
        detected_objects: Detected object dicts; "class_id", "region" and
            (for vehicles) "class_name" are read.
        scene_type: Specific outdoor scene type.

    Returns:
        Dict of outdoor functional zones; empty dict on error.
    """
    try:
        zones = {}

        # --- Pedestrian areas -------------------------------------------
        people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
        people_regions = {}
        for obj in people_objs:
            people_regions.setdefault(obj["region"], []).append(obj)

        # Keep the two regions holding the most pedestrians.
        main_people_regions = sorted(people_regions.items(),
                                     key=lambda x: len(x[1]),
                                     reverse=True)[:2]

        for idx, (region, objs) in enumerate(main_people_regions):
            # Prefer a position-based key; otherwise rank the areas.
            spatial_desc = self._get_directional_description(region)
            if spatial_desc and spatial_desc != "central":
                zone_key = f"{spatial_desc} pedestrian area"
            else:
                zone_key = "main pedestrian area" if idx == 0 else "secondary pedestrian area"

            zones[zone_key] = {
                "region": region,
                "objects": ["person"] * len(objs),
                "description": f"Pedestrian area with {len(objs)} {'people' if len(objs) > 1 else 'person'}"
            }

        # --- Vehicle area (streets and parking lots) --------------------
        vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [1, 2, 3, 5, 6, 7]]
        vehicle_regions = {}
        for obj in vehicle_objs:
            vehicle_regions.setdefault(obj["region"], []).append(obj)

        if vehicle_regions:
            main_region, main_vehicles = max(vehicle_regions.items(),
                                             key=lambda x: len(x[1]))

            if main_region is not None:
                vehicle_types = [obj["class_name"] for obj in main_vehicles]
                # De-duplicate in first-seen order. A set would make the
                # description depend on string hashing and therefore vary
                # between interpreter runs.
                distinct_types = list(dict.fromkeys(vehicle_types))
                zones["vehicle_zone"] = {
                    "region": main_region,
                    "objects": vehicle_types,
                    "description": f"Traffic area with {', '.join(distinct_types[:3])}"
                }

        # --- Scene-specific additions -----------------------------------
        if scene_type == "park_area":
            zones.update(self._identify_park_recreational_zones(detected_objects))

        if scene_type == "parking_lot":
            zones.update(self._identify_parking_zones(detected_objects))

        logger.info(f"Identified {len(zones)} outdoor zones for scene type '{scene_type}'")
        return zones

    except Exception as e:
        logger.error(f"Error identifying outdoor general zones: {str(e)}")
        logger.error(traceback.format_exc())
        return {}
231
+
232
def identify_intersection_zones(self, category_regions: Dict, detected_objects: List[Dict], viewpoint: str) -> Dict:
    """
    Identify functional zones of an urban intersection.

    Every region containing a traffic light (class_id 9) always yields a
    traffic control area, whether or not pedestrians are present. When
    pedestrians (class_id 0) are present, crossing zones returned by
    `_analyze_crossing_patterns` are added. When vehicles (class_id 1, 2, 3,
    5, 7) are present, zones returned by `_analyze_traffic_zones` are added.
    A key that already exists is never overwritten; the newcomer is stored
    under "<key> (1)", "<key> (2)", ... instead.

    Args:
        category_regions: Objects grouped by category and region (not read here).
        detected_objects: All detected object dicts; "class_id" and "region" are read.
        viewpoint: Detected viewpoint string (not read here).

    Returns:
        Dict of intersection functional zones; empty dict on error.
    """
    try:
        zones = {}

        def _store(zone_key: str, zone_info: Dict) -> None:
            # Insert without clobbering an existing zone: append the first
            # free numeric suffix when the key is already taken.
            if zone_key in zones:
                suffix = 1
                while f"{zone_key} ({suffix})" in zones:
                    suffix += 1
                zone_key = f"{zone_key} ({suffix})"
            zones[zone_key] = zone_info

        # 1. Split detections into pedestrians, vehicles and traffic lights.
        pedestrian_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
        vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [1, 2, 3, 5, 7]]
        traffic_light_objs = [obj for obj in detected_objects if obj["class_id"] == 9]

        # 2. Traffic control areas: one per region that holds traffic lights.
        signal_regions_all = {}
        for light in traffic_light_objs:
            signal_regions_all.setdefault(light["region"], []).append(light)

        for idx, (region, signals) in enumerate(signal_regions_all.items()):
            # Name by direction when available, else primary / auxiliary.
            direction = self._get_directional_description(region)
            if direction and direction != "central":
                zone_key = f"{direction} traffic control area"
            else:
                zone_key = "primary traffic control area" if idx == 0 else "auxiliary traffic control area"

            noun = "traffic light" if len(signals) == 1 else "traffic lights"
            _store(zone_key, {
                "region": region,
                "objects": ["traffic light"] * len(signals),
                "description": f"Traffic control area with {len(signals)} {noun} in {region}"
            })

        # 3. Crossing zones, only when pedestrians were detected.
        if pedestrian_objs:
            crossing_zones = self._analyze_crossing_patterns(pedestrian_objs, traffic_light_objs)
            for zone_key, zone_info in crossing_zones.items():
                # Only region / objects / description are carried over.
                _store(zone_key, {
                    "region": zone_info.get("region", ""),
                    "objects": zone_info.get("objects", []),
                    "description": zone_info.get("description", "")
                })

        # 4. Vehicle traffic zones.
        if vehicle_objs:
            for zone_key, zone_info in self._analyze_traffic_zones(vehicle_objs).items():
                _store(zone_key, zone_info)

        logger.info(f"Identified {len(zones)} intersection zones")
        return zones

    except Exception as e:
        logger.error(f"Error in identify_intersection_zones: {str(e)}")
        logger.error(traceback.format_exc())
        return {}
345
+
346
def identify_aerial_view_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
    """
    Identify functional zones for a scene viewed from above.

    The focus is on patterns and flow rather than specific areas. With three
    or more pedestrians (class_id 0), a standard deviation below 0.1 along
    either axis of their "normalized_center" is reported as a linear
    movement pattern; otherwise they are reported as distributed. Vehicle,
    intersection and plaza specifics are delegated to helper methods.

    Args:
        category_regions: Objects grouped by category and region (not read here).
        detected_objects: Detected object dicts.
        scene_type: Specific scene type; checked for the substrings
            "intersection" and "plaza".

    Returns:
        Dict of aerial-view functional zones; empty dict on error.
    """
    try:
        zones = {}

        # Pedestrian pattern.
        people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
        if people_objs:
            positions = np.array([obj["normalized_center"] for obj in people_objs])

            if len(positions) >= 3:
                # Only the spread per axis matters here.
                x_std = np.std(positions[:, 0])
                y_std = np.std(positions[:, 1])

                if x_std < 0.1 or y_std < 0.1:
                    # Little spread in x means the line runs vertically.
                    pattern_direction = "vertical" if x_std < y_std else "horizontal"

                    zones["pedestrian_pattern"] = {
                        "region": "central",
                        "objects": ["person"] * len(people_objs),
                        "description": f"Aerial view shows a {pattern_direction} pedestrian movement pattern"
                    }
                else:
                    zones["pedestrian_distribution"] = {
                        "region": "wide",
                        "objects": ["person"] * len(people_objs),
                        "description": "Aerial view shows pedestrians distributed across the area"
                    }

        # Vehicle patterns for traffic analysis.
        vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [1, 2, 3, 5, 6, 7]]
        if vehicle_objs:
            zones.update(self._analyze_aerial_traffic_patterns(vehicle_objs))

        # Intersection-specific aerial features.
        if "intersection" in scene_type:
            zones.update(self._identify_aerial_intersection_features(detected_objects))

        # Plaza-specific aerial features.
        if "plaza" in scene_type:
            zones.update(self._identify_aerial_plaza_features(people_objs))

        logger.info(f"Identified {len(zones)} aerial view zones")
        return zones

    except Exception as e:
        logger.error(f"Error identifying aerial view zones: {str(e)}")
        logger.error(traceback.format_exc())
        return {}
416
+
417
def identify_asian_cultural_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
    """
    Identify functional zones for scenes with an Asian cultural context.

    Storefronts are not detected directly, so the (up to) two regions holding
    the most people (class_id 0) are treated as commercial areas. Pedestrian
    pathway and vendor zones come from helper methods, and night markets get
    an additional inferred food stall zone.

    Args:
        category_regions: Objects grouped by category and region (not read here).
        detected_objects: Detected object dicts.
        scene_type: Specific scene type.

    Returns:
        Dict of functional zones; empty dict on error.
    """
    try:
        result = {}

        # Group people by the region they were detected in.
        crowd_by_region = {}
        for item in detected_objects:
            if item["class_id"] != 0:
                continue
            crowd_by_region.setdefault(item["region"], []).append(item)

        if crowd_by_region:
            # Busiest regions first; ties keep their first-seen order.
            ranked = list(crowd_by_region.items())
            ranked.sort(key=lambda pair: len(pair[1]), reverse=True)

            for rank, (area, members) in enumerate(ranked[:2]):
                heading = self._get_directional_description(area)
                if heading and heading != "central":
                    key = f"{heading} commercial area"
                elif rank == 0:
                    key = "main commercial area"
                else:
                    key = "secondary commercial area"

                result[key] = {
                    "region": area,
                    "objects": [member["class_name"] for member in members],
                    "description": "Asian commercial storefront with pedestrian activity"
                }

        # Pedestrian pathway, then vendor zones (both inferred by helpers).
        result.update(self._identify_asian_pedestrian_pathway(detected_objects))
        result.update(self._identify_vendor_zones(detected_objects))

        # Night markets always receive an inferred food stall zone.
        if scene_type == "asian_night_market":
            result["food_stall_zone"] = {
                "region": "middle_center",
                "objects": ["inferred food stalls"],
                "description": "Food stall area typical of Asian night markets"
            }

        logger.info(f"Identified {len(result)} Asian cultural zones")
        return result

    except Exception as exc:
        logger.error(f"Error identifying Asian cultural zones: {exc!s}")
        logger.error(traceback.format_exc())
        return {}
484
+
485
def identify_upscale_dining_zones(self, category_regions: Dict, detected_objects: List[Dict]) -> Dict:
    """
    Identify functional zones of an upscale dining setting.

    Dining items (class_id 40-45 and 60: wine glass, cup, fork, knife, spoon,
    bowl, table) are grouped by region and the most populated region becomes
    "formal_dining_zone". Decorative, seating and serving zones are added by
    helper methods.

    Args:
        category_regions: Objects grouped by category and region (not read here).
        detected_objects: Detected object dicts; "class_id", "region" and
            "class_name" are read for dining items.

    Returns:
        Dict of dining functional zones; empty dict on error.
    """
    try:
        zones = {}

        # Collect dining items, both as names and grouped by region.
        dining_items = []
        dining_regions = {}

        for obj in detected_objects:
            if obj["class_id"] in [40, 41, 42, 43, 44, 45, 60]:
                dining_regions.setdefault(obj["region"], []).append(obj)
                dining_items.append(obj["class_name"])

        if dining_items:
            main_region, _ = max(dining_regions.items(), key=lambda x: len(x[1]))

            if main_region is not None:
                # First-seen order keeps "objects" and the description stable
                # across runs; set iteration order depends on string hashing.
                distinct_items = list(dict.fromkeys(dining_items))
                zones["formal_dining_zone"] = {
                    "region": main_region,
                    "objects": distinct_items,
                    "description": f"Formal dining area with {', '.join(distinct_items[:3])}"
                }

        # Decorative zones.
        zones.update(self._identify_upscale_decorative_zones(detected_objects))

        # Seating arrangement zones.
        zones.update(self._identify_dining_seating_zones(detected_objects))

        # Serving zones; the helper receives the zones found so far.
        zones.update(self._identify_serving_zones(detected_objects, zones))

        logger.info(f"Identified {len(zones)} upscale dining zones")
        return zones

    except Exception as e:
        logger.error(f"Error identifying upscale dining zones: {str(e)}")
        logger.error(traceback.format_exc())
        return {}
539
+
540
def identify_financial_district_zones(self, category_regions: Dict, detected_objects: List[Dict]) -> Dict:
    """
    Identify functional zones of a financial district scene.

    Vehicles and traffic lights (class_id 1, 2, 3, 5, 6, 7, 9) are grouped by
    region and the most populated region becomes "traffic_zone". Building and
    pedestrian zones are added by helper methods.

    Args:
        category_regions: Objects grouped by category and region (not read here).
        detected_objects: Detected object dicts; "class_id", "region" and
            "class_name" are read for traffic items.

    Returns:
        Dict of financial district functional zones; empty dict on error.
    """
    try:
        zones = {}

        # Collect traffic items, both as names and grouped by region.
        traffic_items = []
        traffic_regions = {}

        for obj in detected_objects:
            if obj["class_id"] in [1, 2, 3, 5, 6, 7, 9]:
                traffic_regions.setdefault(obj["region"], []).append(obj)
                traffic_items.append(obj["class_name"])

        if traffic_items:
            main_region, _ = max(traffic_regions.items(), key=lambda x: len(x[1]))

            if main_region is not None:
                # First-seen order keeps "objects" and the description stable
                # across runs; set iteration order depends on string hashing.
                distinct_items = list(dict.fromkeys(traffic_items))
                zones["traffic_zone"] = {
                    "region": main_region,
                    "objects": distinct_items,
                    "description": f"Urban traffic area with {', '.join(distinct_items[:3])}"
                }

        # Side building zones (inferred from scene context by the helper).
        zones.update(self._identify_building_zones(detected_objects))

        # Pedestrian zones.
        zones.update(self._identify_financial_pedestrian_zones(detected_objects))

        logger.info(f"Identified {len(zones)} financial district zones")
        return zones

    except Exception as e:
        logger.error(f"Error identifying financial district zones: {str(e)}")
        logger.error(traceback.format_exc())
        return {}
591
+
592
def identify_landmark_zones(self, landmark_objects: List[Dict]) -> Dict:
    """
    Identify functional zones related to a detected landmark.

    Only the first entry of `landmark_objects` is treated as the landmark. A
    viewing-area zone is built for it, with wording that depends on its
    "landmark_type" ("natural", "monument", anything else is treated as
    architectural). Auxiliary zones from `_create_landmark_auxiliary_zones`
    are merged in.

    Args:
        landmark_objects: Objects recognised as landmarks. Entries that are
            not dicts are ignored when collecting related objects.

    Returns:
        Dict of landmark-related zones. Empty when no landmark is given, when
        the first entry is not a dict, or when an error occurs.
    """
    try:
        landmark_zones = {}

        # Nothing to do without any landmark.
        if not landmark_objects:
            logger.warning("No landmark objects provided to identify_landmark_zones")
            return landmark_zones

        # Only the first landmark is used; it must be a dict.
        landmark = landmark_objects[0]
        if not isinstance(landmark, dict):
            logger.warning("First landmark object is not a dict")
            return landmark_zones

        # A missing, None or empty name falls back to the generic label so
        # the zone id and zone name are always well-formed.
        landmark_name = landmark.get("class_name") or "Landmark"
        landmark_type = landmark.get("landmark_type", "architectural")
        landmark_region = landmark.get("region", "middle_center")
        location = landmark.get("location") or "this area"

        # Main viewing area for the landmark.
        zone_id = f"{landmark_name.lower().replace(' ', '_')}_viewing_area"
        zone_name = f"{landmark_name} Viewing Area"

        # Wording depends on the landmark type and always names the location.
        if landmark_type == "natural":
            zone_description = (
                f"Scenic viewpoint for observing {landmark_name}, "
                f"a notable natural landmark in {location}."
            )
            primary_function = "Nature observation and photography"
        elif landmark_type == "monument":
            zone_description = (
                f"Viewing area around {landmark_name}, "
                f"a significant monument in {location}."
            )
            primary_function = "Historical appreciation and cultural tourism"
        else:  # architectural
            zone_description = (
                f"Area centered around {landmark_name}, "
                f"where visitors can observe and appreciate this iconic structure in {location}."
            )
            primary_function = "Architectural tourism and photography"

        # Objects typically found around a landmark, if present in the input.
        # Malformed entries are skipped instead of discarding every zone.
        related_objects = []
        for o in landmark_objects:
            if not isinstance(o, dict):
                continue
            cn = str(o.get("class_name") or "").lower()
            if cn in ["person", "camera", "cell phone", "backpack"]:
                related_objects.append(cn)

        landmark_zones[zone_id] = {
            "name": zone_name,
            "description": zone_description,
            "objects": ["landmark"] + related_objects,
            "region": landmark_region,
            "primary_function": primary_function
        }

        # Auxiliary zones supplied by the helper (e.g. photography areas).
        auxiliary_zones = self._create_landmark_auxiliary_zones(landmark, 0)
        if auxiliary_zones:
            landmark_zones.update(auxiliary_zones)

        logger.info(f"Identified {len(landmark_zones)} landmark zones")
        return landmark_zones

    except Exception as e:
        logger.error(f"Error in identify_landmark_zones: {str(e)}")
        logger.error(traceback.format_exc())
        return {}
680
+
681
+
682
def _identify_primary_functional_area(self, detected_objects: List[Dict]) -> Dict:
    """
    Find the primary functional area from object combinations.

    Three candidate combinations are tried in order (dining, seating /
    relaxation, workspace) through `_detect_functional_combination`; the
    first one that yields a result wins.

    Args:
        detected_objects: Detected object dicts.

    Returns:
        The zone dict of the first matching combination, or None when nothing
        matches or an error occurs.
    """
    try:
        # Candidates in priority order; class ids as labelled in this file.
        candidates = (
            {
                "primary_objects": [60],  # dining table
                "supporting_objects": [56, 40, 41, 42, 43],  # chair, wine glass, cup, fork, knife
                "min_supporting": 2,
                "description_template": "Dining area with table and seating arrangement",
            },
            {
                "primary_objects": [57, 59],  # sofa, bed
                "supporting_objects": [62, 58, 56],  # tv, potted plant, chair
                "min_supporting": 1,
                "description_template": "Seating and relaxation area",
            },
            {
                "primary_objects": [63, 66],  # laptop, keyboard
                "supporting_objects": [60, 56, 64],  # dining table, chair, mouse
                "min_supporting": 2,
                "description_template": "Workspace area with electronics and furniture",
            },
        )

        for spec in candidates:
            match = self._detect_functional_combination(detected_objects, **spec)
            if match:
                return match

        return None

    except Exception as exc:
        logger.error(f"Error identifying primary functional area: {exc!s}")
        logger.error(traceback.format_exc())
        return None
733
+
734
+ def _identify_secondary_functional_area(self, detected_objects: List[Dict], existing_zones: Dict) -> Dict:
735
+ """
736
+ 識別次要功能區域,避免與主要區域重疊
737
+
738
+ Args:
739
+ detected_objects: 檢測到的物件列表
740
+ existing_zones: 已存在的功能區域
741
+
742
+ Returns:
743
+ 次要功能區域字典或None
744
+ """
745
+ try:
746
+ # 獲取已使用的區域
747
+ used_regions = set(zone.get("region") for zone in existing_zones.values())
748
+
749
+ # 裝飾區域檢測(植物集中區域)
750
+ decorative_area = self._detect_functional_combination(
751
+ detected_objects,
752
+ primary_objects=[58], # potted plant
753
+ supporting_objects=[75], # vase
754
+ min_supporting=0,
755
+ min_primary=3, # 至少需要3個植物
756
+ description_template="Decorative area with plants and ornamental items",
757
+ exclude_regions=used_regions
758
+ )
759
+ if decorative_area:
760
+ return decorative_area
761
+
762
+ # 儲存區域檢測(廚房電器組合)
763
+ storage_area = self._detect_functional_combination(
764
+ detected_objects,
765
+ primary_objects=[72, 68, 69], # refrigerator, microwave, oven
766
+ supporting_objects=[71], # sink
767
+ min_supporting=0,
768
+ min_primary=2,
769
+ description_template="Kitchen appliance and storage area",
770
+ exclude_regions=used_regions
771
+ )
772
+ if storage_area:
773
+ return storage_area
774
+
775
+ return None
776
+
777
+ except Exception as e:
778
+ logger.error(f"Error identifying secondary functional area: {str(e)}")
779
+ logger.error(traceback.format_exc())
780
+ return None
781
+
782
+ def _detect_functional_combination(self, detected_objects: List[Dict], primary_objects: List[int],
783
+ supporting_objects: List[int], min_supporting: int,
784
+ description_template: str, min_primary: int = 1,
785
+ exclude_regions: set = None) -> Dict:
786
+ """
787
+ 通用的功能組合檢測方法
788
+ 基於主要物件和支持物件的組合判斷功能區域
789
+
790
+ Args:
791
+ detected_objects: 檢測到的物件列表
792
+ primary_objects: 主要物件的class_id列表
793
+ supporting_objects: 支持物件的class_id列表
794
+ min_supporting: 最少需要的支持物件數量
795
+ description_template: 描述模板
796
+ min_primary: 最少需要的主要物件數量
797
+ exclude_regions: 需要排除的區域集合
798
+
799
+ Returns:
800
+ 功能區域資訊字典,如果不符合條件則返回None
801
+ """
802
+ try:
803
+ if exclude_regions is None:
804
+ exclude_regions = set()
805
+
806
+ # 收集主要物件
807
+ primary_objs = [obj for obj in detected_objects
808
+ if obj.get("class_id") in primary_objects and obj.get("confidence", 0) >= 0.4]
809
+
810
+ # 收集支持物件
811
+ supporting_objs = [obj for obj in detected_objects
812
+ if obj.get("class_id") in supporting_objects and obj.get("confidence", 0) >= 0.4]
813
+
814
+ # 檢查是否滿足最少數量要求
815
+ if len(primary_objs) < min_primary or len(supporting_objs) < min_supporting:
816
+ return None
817
+
818
+ # 按區域組織物件
819
+ region_combinations = {}
820
+ all_relevant_objs = primary_objs + supporting_objs
821
+
822
+ for obj in all_relevant_objs:
823
+ region = obj.get("region")
824
+
825
+ # 排除指定區域
826
+ if region in exclude_regions:
827
+ continue
828
+
829
+ if region not in region_combinations:
830
+ region_combinations[region] = {"primary": [], "supporting": [], "all": []}
831
+
832
+ region_combinations[region]["all"].append(obj)
833
+
834
+ if obj.get("class_id") in primary_objects:
835
+ region_combinations[region]["primary"].append(obj)
836
+ else:
837
+ region_combinations[region]["supporting"].append(obj)
838
+
839
+ # 找到最佳區域組合
840
+ best_region = None
841
+ best_score = 0
842
+
843
+ for region, objs in region_combinations.items():
844
+ # 計算該區域的評分
845
+ primary_count = len(objs["primary"])
846
+ supporting_count = len(objs["supporting"])
847
+
848
+ # 必須滿足最低要求
849
+ if primary_count < min_primary or supporting_count < min_supporting:
850
+ continue
851
+
852
+ # 計算組合評分(主要物件權重較高)
853
+ score = primary_count * 2 + supporting_count
854
+
855
+ if score > best_score:
856
+ best_score = score
857
+ best_region = region
858
+
859
+ if best_region is None:
860
+ return None
861
+
862
+ best_combination = region_combinations[best_region]
863
+ all_objects = [obj["class_name"] for obj in best_combination["all"]]
864
+
865
+ return {
866
+ "region": best_region,
867
+ "objects": all_objects,
868
+ "description": description_template
869
+ }
870
+
871
+ except Exception as e:
872
+ logger.error(f"Error detecting functional combination: {str(e)}")
873
+ logger.error(traceback.format_exc())
874
+ return None
875
+
876
+ def _analyze_crossing_patterns(self, pedestrians: List[Dict], traffic_lights: List[Dict]) -> Dict:
877
+ """
878
+ Analyze pedestrian crossing patterns to identify crossing zones.
879
+ 若同一 region 中同時有行人與紅綠燈,則將兩者都放入該區域的 objects。
880
+
881
+ Args:
882
+ pedestrians: 行人物件列表(每個 obj 應包含 'class_id', 'region', 'confidence' 等)
883
+ traffic_lights: 紅綠燈物件列表(每個 obj 應包含 'class_id', 'region', 'confidence' 等)
884
+
885
+ Returns:
886
+ crossing_zones: 字典,key 為 zone 名稱,value 包含 'region', 'objects', 'description'
887
+ """
888
+ try:
889
+ crossing_zones = {}
890
+
891
+ # 如果沒有任何行人,就不辨識任何 crossing zone
892
+ if not pedestrians:
893
+ return crossing_zones
894
+
895
+ # (1) 按照 region 分組行人
896
+ pedestrian_regions = {}
897
+ for p in pedestrians:
898
+ region = p["region"]
899
+ pedestrian_regions.setdefault(region, []).append(p)
900
+
901
+ # (2) 針對每個 region,看是否同時有紅綠燈
902
+ # 建立一個 mapping: region -> { "pedestrians": [...], "traffic_lights": [...] }
903
+ combined_regions = {}
904
+ for region, peds in pedestrian_regions.items():
905
+ # 取得該 region 下所有紅綠燈
906
+ tls_in_region = [t for t in traffic_lights if t["region"] == region]
907
+ combined_regions[region] = {
908
+ "pedestrians": peds,
909
+ "traffic_lights": tls_in_region
910
+ }
911
+
912
+ # (3) 按照行人數量排序,找出前兩個需要建立 crossing zone 的 region
913
+ sorted_regions = sorted(
914
+ combined_regions.items(),
915
+ key=lambda x: len(x[1]["pedestrians"]),
916
+ reverse=True
917
+ )
918
+
919
+ # (4) 將前兩個 region 建立 Crossing Zone,objects 同時包含行人與紅綠燈
920
+ for idx, (region, group) in enumerate(sorted_regions[:2]):
921
+ peds = group["pedestrians"]
922
+ tls = group["traffic_lights"]
923
+ has_nearby_signals = len(tls) > 0
924
+
925
+ # 生成 zone_name(基於 region 方向 + idx 決定主/次 crossing)
926
+ direction = self._get_directional_description(region)
927
+ if direction and direction != "central":
928
+ zone_name = f"{direction} crossing area"
929
+ else:
930
+ zone_name = "main crossing area" if idx == 0 else "secondary crossing area"
931
+
932
+ # 組合 description
933
+ description = f"Pedestrian crossing area with {len(peds)} "
934
+ description += "person" if len(peds) == 1 else "people"
935
+ if direction:
936
+ description += f" in {direction} direction"
937
+ if has_nearby_signals:
938
+ description += " near traffic signals"
939
+
940
+ # ======= 將行人 + 同區紅綠燈一併放入 objects =======
941
+ obj_list = ["pedestrian"] * len(peds)
942
+ if has_nearby_signals:
943
+ obj_list += ["traffic light"] * len(tls)
944
+
945
+ crossing_zones[zone_name] = {
946
+ "region": region,
947
+ "objects": obj_list,
948
+ "description": description
949
+ }
950
+
951
+ return crossing_zones
952
+
953
+ except Exception as e:
954
+ logger.error(f"Error in _analyze_crossing_patterns: {str(e)}")
955
+ logger.error(traceback.format_exc())
956
+ return {}
957
+
958
+
959
+ def _analyze_traffic_zones(self, vehicles: List[Dict]) -> Dict:
960
+ """
961
+ 分析車輛分布以識別具有方向感知的交通區域
962
+
963
+ Args:
964
+ vehicles: 車輛物件列表
965
+
966
+ Returns:
967
+ 識別出的交通區域字典
968
+ """
969
+ try:
970
+ traffic_zones = {}
971
+
972
+ if not vehicles:
973
+ return traffic_zones
974
+
975
+ # 按區域分組車輛
976
+ vehicle_regions = {}
977
+ for v in vehicles:
978
+ region = v["region"]
979
+ if region not in vehicle_regions:
980
+ vehicle_regions[region] = []
981
+ vehicle_regions[region].append(v)
982
+
983
+ # 為有車輛的區域創建交通區域
984
+ main_traffic_region = max(vehicle_regions.items(), key=lambda x: len(x[1]), default=(None, []))
985
+
986
+ if main_traffic_region[0] is not None:
987
+ region = main_traffic_region[0]
988
+ vehicles_in_region = main_traffic_region[1]
989
+
990
+ # 獲取車輛類型列表用於描述
991
+ vehicle_types = [v["class_name"] for v in vehicles_in_region]
992
+ unique_types = list(set(vehicle_types))
993
+
994
+ # 獲取方向描述
995
+ direction = self._get_directional_description(region)
996
+
997
+ # 創建描述性區域
998
+ traffic_zones["vehicle_zone"] = {
999
+ "region": region,
1000
+ "objects": vehicle_types,
1001
+ "description": f"Vehicle traffic area with {', '.join(unique_types[:3])}" +
1002
+ (f" in {direction} area" if direction else "")
1003
+ }
1004
+
1005
+ # 如果車輛分布在多個區域,創建次要區域
1006
+ if len(vehicle_regions) > 1:
1007
+ # 獲取第二大車輛聚集區域
1008
+ sorted_regions = sorted(vehicle_regions.items(), key=lambda x: len(x[1]), reverse=True)
1009
+ if len(sorted_regions) > 1:
1010
+ second_region, second_vehicles = sorted_regions[1]
1011
+ direction = self._get_directional_description(second_region)
1012
+ vehicle_types = [v["class_name"] for v in second_vehicles]
1013
+ unique_types = list(set(vehicle_types))
1014
+
1015
+ traffic_zones["secondary_vehicle_zone"] = {
1016
+ "region": second_region,
1017
+ "objects": vehicle_types,
1018
+ "description": f"Secondary traffic area with {', '.join(unique_types[:2])}" +
1019
+ (f" in {direction} direction" if direction else "")
1020
+ }
1021
+
1022
+ return traffic_zones
1023
+
1024
+ except Exception as e:
1025
+ logger.error(f"Error analyzing traffic zones: {str(e)}")
1026
+ logger.error(traceback.format_exc())
1027
+ return {}
1028
+
1029
+ def _get_directional_description(self, region: str) -> str:
1030
+ """
1031
+ 將區域名稱轉換為方位描述(東西南北)
1032
+
1033
+ Args:
1034
+ region: 區域名稱
1035
+
1036
+ Returns:
1037
+ 方位描述字串
1038
+ """
1039
+ try:
1040
+ region_lower = region.lower()
1041
+
1042
+ if "top" in region_lower and "left" in region_lower:
1043
+ return "northwest"
1044
+ elif "top" in region_lower and "right" in region_lower:
1045
+ return "northeast"
1046
+ elif "bottom" in region_lower and "left" in region_lower:
1047
+ return "southwest"
1048
+ elif "bottom" in region_lower and "right" in region_lower:
1049
+ return "southeast"
1050
+ elif "top" in region_lower:
1051
+ return "north"
1052
+ elif "bottom" in region_lower:
1053
+ return "south"
1054
+ elif "left" in region_lower:
1055
+ return "west"
1056
+ elif "right" in region_lower:
1057
+ return "east"
1058
+ else:
1059
+ return "central"
1060
+
1061
+ except Exception as e:
1062
+ logger.error(f"Error getting directional description for region '{region}': {str(e)}")
1063
+ return "central"
1064
+
1065
+ def _identify_park_recreational_zones(self, detected_objects: List[Dict]) -> Dict:
1066
+ """
1067
+ 識別公園的休閒活動區域
1068
+
1069
+ Args:
1070
+ detected_objects: 檢測到的物件列表
1071
+
1072
+ Returns:
1073
+ 休閒區域字典
1074
+ """
1075
+ try:
1076
+ zones = {}
1077
+
1078
+ # 尋找休閒物件(運動球、風箏等)
1079
+ rec_items = []
1080
+ rec_regions = {}
1081
+
1082
+ for obj in detected_objects:
1083
+ if obj["class_id"] in [32, 33, 34, 35, 38]: # sports ball, kite, baseball bat, glove, tennis racket
1084
+ region = obj["region"]
1085
+ if region not in rec_regions:
1086
+ rec_regions[region] = []
1087
+ rec_regions[region].append(obj)
1088
+ rec_items.append(obj["class_name"])
1089
+
1090
+ if rec_items:
1091
+ main_rec_region = max(rec_regions.items(),
1092
+ key=lambda x: len(x[1]),
1093
+ default=(None, []))
1094
+
1095
+ if main_rec_region[0] is not None:
1096
+ zones["recreational_zone"] = {
1097
+ "region": main_rec_region[0],
1098
+ "objects": list(set(rec_items)),
1099
+ "description": f"Recreational area with {', '.join(list(set(rec_items)))}"
1100
+ }
1101
+
1102
+ return zones
1103
+
1104
+ except Exception as e:
1105
+ logger.error(f"Error identifying park recreational zones: {str(e)}")
1106
+ logger.error(traceback.format_exc())
1107
+ return {}
1108
+
1109
+ def _identify_parking_zones(self, detected_objects: List[Dict]) -> Dict:
1110
+ """
1111
+ 停車場的停車區域
1112
+
1113
+ Args:
1114
+ detected_objects: 檢測到的物件列表
1115
+
1116
+ Returns:
1117
+ 停車區域字典
1118
+ """
1119
+ try:
1120
+ zones = {}
1121
+
1122
+ # 尋找停放的汽車
1123
+ car_objs = [obj for obj in detected_objects if obj["class_id"] == 2] # cars
1124
+
1125
+ if len(car_objs) >= 3:
1126
+ # 檢查汽車是否按模式排列(簡化)
1127
+ car_positions = [obj["normalized_center"] for obj in car_objs]
1128
+
1129
+ # 通過分析垂直位置檢查行模式
1130
+ y_coords = [pos[1] for pos in car_positions]
1131
+ y_clusters = {}
1132
+
1133
+ # 簡化聚類 - 按相似y坐標分組汽車
1134
+ for i, y in enumerate(y_coords):
1135
+ assigned = False
1136
+ for cluster_y in y_clusters.keys():
1137
+ if abs(y - cluster_y) < 0.1: # 圖像高度的10%內
1138
+ y_clusters[cluster_y].append(i)
1139
+ assigned = True
1140
+ break
1141
+
1142
+ if not assigned:
1143
+ y_clusters[y] = [i]
1144
+
1145
+ # 如果有行模式
1146
+ if max(len(indices) for indices in y_clusters.values()) >= 2:
1147
+ zones["parking_row"] = {
1148
+ "region": "central",
1149
+ "objects": ["car"] * len(car_objs),
1150
+ "description": f"Organized parking area with vehicles arranged in rows"
1151
+ }
1152
+ else:
1153
+ zones["parking_area"] = {
1154
+ "region": "wide",
1155
+ "objects": ["car"] * len(car_objs),
1156
+ "description": f"Parking area with {len(car_objs)} vehicles"
1157
+ }
1158
+
1159
+ return zones
1160
+
1161
+ except Exception as e:
1162
+ logger.error(f"Error identifying parking zones: {str(e)}")
1163
+ logger.error(traceback.format_exc())
1164
+ return {}
1165
+
1166
+ def _analyze_aerial_traffic_patterns(self, vehicle_objs: List[Dict]) -> Dict:
1167
+ """
1168
+ 分析空中視角的車輛交通模式
1169
+
1170
+ Args:
1171
+ vehicle_objs: 車輛物件列表
1172
+
1173
+ Returns:
1174
+ 交通模式區域字典
1175
+ """
1176
+ try:
1177
+ zones = {}
1178
+
1179
+ if not vehicle_objs:
1180
+ return zones
1181
+
1182
+ # 將位置轉換為數組進行模式分析
1183
+ positions = np.array([obj["normalized_center"] for obj in vehicle_objs])
1184
+
1185
+ if len(positions) >= 2:
1186
+ # 計算分布指標
1187
+ x_coords = positions[:, 0]
1188
+ y_coords = positions[:, 1]
1189
+
1190
+ x_mean = np.mean(x_coords)
1191
+ y_mean = np.mean(y_coords)
1192
+ x_std = np.std(x_coords)
1193
+ y_std = np.std(y_coords)
1194
+
1195
+ # 判斷車輛是否組織成車道
1196
+ if x_std < y_std * 0.5:
1197
+ # 車輛垂直對齊 - 表示南北交通
1198
+ zones["vertical_traffic_flow"] = {
1199
+ "region": "central_vertical",
1200
+ "objects": [obj["class_name"] for obj in vehicle_objs[:5]],
1201
+ "description": "North-south traffic flow visible from aerial view"
1202
+ }
1203
+ elif y_std < x_std * 0.5:
1204
+ # 車輛水平對齊 - 表示東西交通
1205
+ zones["horizontal_traffic_flow"] = {
1206
+ "region": "central_horizontal",
1207
+ "objects": [obj["class_name"] for obj in vehicle_objs[:5]],
1208
+ "description": "East-west traffic flow visible from aerial view"
1209
+ }
1210
+ else:
1211
+ # 車輛多方向 - 表示十字路口
1212
+ zones["intersection_traffic"] = {
1213
+ "region": "central",
1214
+ "objects": [obj["class_name"] for obj in vehicle_objs[:5]],
1215
+ "description": "Multi-directional traffic at intersection visible from aerial view"
1216
+ }
1217
+
1218
+ return zones
1219
+
1220
+ except Exception as e:
1221
+ logger.error(f"Error analyzing aerial traffic patterns: {str(e)}")
1222
+ logger.error(traceback.format_exc())
1223
+ return {}
1224
+
1225
+ def _identify_aerial_intersection_features(self, detected_objects: List[Dict]) -> Dict:
1226
+ """
1227
+ 空中視角十字路口特徵
1228
+
1229
+ Args:
1230
+ detected_objects: 檢測到的物件列表
1231
+
1232
+ Returns:
1233
+ 十字路口特徵區域字典
1234
+ """
1235
+ try:
1236
+ zones = {}
1237
+
1238
+ # 檢查交通信號
1239
+ traffic_light_objs = [obj for obj in detected_objects if obj["class_id"] == 9]
1240
+ if traffic_light_objs:
1241
+ zones["traffic_control_pattern"] = {
1242
+ "region": "intersection",
1243
+ "objects": ["traffic light"] * len(traffic_light_objs),
1244
+ "description": f"Intersection traffic control with {len(traffic_light_objs)} signals visible from above"
1245
+ }
1246
+
1247
+ # 人行道從空中視角的情境推斷
1248
+ zones["crossing_pattern"] = {
1249
+ "region": "central",
1250
+ "objects": ["inferred crosswalk"],
1251
+ "description": "Crossing pattern visible from aerial perspective"
1252
+ }
1253
+
1254
+ return zones
1255
+
1256
+ except Exception as e:
1257
+ logger.error(f"Error identifying aerial intersection features: {str(e)}")
1258
+ logger.error(traceback.format_exc())
1259
+ return {}
1260
+
1261
+ def _identify_aerial_plaza_features(self, people_objs: List[Dict]) -> Dict:
1262
+ """
1263
+ 識別空中視角廣場特徵
1264
+
1265
+ Args:
1266
+ people_objs: 行人物件列表
1267
+
1268
+ Returns:
1269
+ 廣場特徵區域字典
1270
+ """
1271
+ try:
1272
+ zones = {}
1273
+
1274
+ if people_objs:
1275
+ # 檢查人群是否聚集在中央區域
1276
+ central_people = [obj for obj in people_objs
1277
+ if "middle" in obj["region"]]
1278
+
1279
+ if central_people:
1280
+ zones["central_gathering"] = {
1281
+ "region": "middle_center",
1282
+ "objects": ["person"] * len(central_people),
1283
+ "description": f"Central plaza gathering area with {len(central_people)} people viewed from above"
1284
+ }
1285
+
1286
+ return zones
1287
+
1288
+ except Exception as e:
1289
+ logger.error(f"Error identifying aerial plaza features: {str(e)}")
1290
+ logger.error(traceback.format_exc())
1291
+ return {}
1292
+
1293
+ def _identify_asian_pedestrian_pathway(self, detected_objects: List[Dict]) -> Dict:
1294
+ """
1295
+ 亞洲文化場景中的行人通道
1296
+
1297
+ Args:
1298
+ detected_objects: 檢測到的物件列表
1299
+
1300
+ Returns:
1301
+ 行人通道區域字典
1302
+ """
1303
+ try:
1304
+ zones = {}
1305
+
1306
+ pathway_items = []
1307
+ pathway_regions = {}
1308
+
1309
+ # 提取人群用於通道分析
1310
+ people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
1311
+
1312
+ # 分析人群是否形成線形(商業街的特徵)
1313
+ people_positions = [obj["normalized_center"] for obj in people_objs]
1314
+
1315
+ structured_path = False
1316
+ path_direction = "meandering"
1317
+
1318
+ if len(people_positions) >= 3:
1319
+ # 檢查人群是否沿相似y坐標排列(水平路徑)
1320
+ y_coords = [pos[1] for pos in people_positions]
1321
+ y_mean = sum(y_coords) / len(y_coords)
1322
+ y_variance = sum((y - y_mean)**2 for y in y_coords) / len(y_coords)
1323
+
1324
+ horizontal_path = y_variance < 0.05 # 低變異表示水平對齊
1325
+
1326
+ # 檢查人群是否沿相似x坐標排列(垂直路徑)
1327
+ x_coords = [pos[0] for pos in people_positions]
1328
+ x_mean = sum(x_coords) / len(x_coords)
1329
+ x_variance = sum((x - x_mean)**2 for x in x_coords) / len(x_coords)
1330
+
1331
+ vertical_path = x_variance < 0.05 # 低變異表示垂直對齊
1332
+
1333
+ structured_path = horizontal_path or vertical_path
1334
+ path_direction = "horizontal" if horizontal_path else "vertical" if vertical_path else "meandering"
1335
+
1336
+ # 收集通道物件(人、自行車、摩托車在中間區域)
1337
+ for obj in detected_objects:
1338
+ if obj["class_id"] in [0, 1, 3]: # Person, bicycle, motorcycle
1339
+ y_pos = obj["normalized_center"][1]
1340
+ # 按垂直位置分組(圖像中間可能是通道)
1341
+ if 0.25 <= y_pos <= 0.75:
1342
+ region = obj["region"]
1343
+ if region not in pathway_regions:
1344
+ pathway_regions[region] = []
1345
+ pathway_regions[region].append(obj)
1346
+ pathway_items.append(obj["class_name"])
1347
+
1348
+ if pathway_items:
1349
+ path_desc = "Pedestrian walkway with people moving through the commercial area"
1350
+ if structured_path:
1351
+ path_desc = f"{path_direction.capitalize()} pedestrian walkway with organized foot traffic"
1352
+
1353
+ zones["pedestrian_pathway"] = {
1354
+ "region": "middle_center", # 假設:通道通常在中間
1355
+ "objects": list(set(pathway_items)),
1356
+ "description": path_desc
1357
+ }
1358
+
1359
+ return zones
1360
+
1361
+ except Exception as e:
1362
+ logger.error(f"Error identifying Asian pedestrian pathway: {str(e)}")
1363
+ logger.error(traceback.format_exc())
1364
+ return {}
1365
+
1366
+ def _identify_vendor_zones(self, detected_objects: List[Dict]) -> Dict:
1367
+ """
1368
+ 識別攤販區域
1369
+
1370
+ Args:
1371
+ detected_objects: 檢測到的物件列表
1372
+
1373
+ Returns:
1374
+ 攤販區域字典
1375
+ """
1376
+ try:
1377
+ zones = {}
1378
+
1379
+ # 識別攤販區域(小攤/商店 - 從情境推斷)
1380
+ has_small_objects = any(obj["class_id"] in [24, 26, 39, 41] for obj in detected_objects) # bags, bottles, cups
1381
+ has_people = any(obj["class_id"] == 0 for obj in detected_objects)
1382
+
1383
+ if has_small_objects and has_people:
1384
+ # 可能的攤販區域是人群和小物件聚集的地方
1385
+ small_obj_regions = {}
1386
+
1387
+ for obj in detected_objects:
1388
+ if obj["class_id"] in [24, 26, 39, 41, 67]: # bags, bottles, cups, phones
1389
+ region = obj["region"]
1390
+ if region not in small_obj_regions:
1391
+ small_obj_regions[region] = []
1392
+ small_obj_regions[region].append(obj)
1393
+
1394
+ if small_obj_regions:
1395
+ main_vendor_region = max(small_obj_regions.items(),
1396
+ key=lambda x: len(x[1]),
1397
+ default=(None, []))
1398
+
1399
+ if main_vendor_region[0] is not None:
1400
+ vendor_items = [obj["class_name"] for obj in main_vendor_region[1]]
1401
+ zones["vendor_zone"] = {
1402
+ "region": main_vendor_region[0],
1403
+ "objects": list(set(vendor_items)),
1404
+ "description": "Vendor or market stall area with small merchandise"
1405
+ }
1406
+
1407
+ return zones
1408
+
1409
+ except Exception as e:
1410
+ logger.error(f"Error identifying vendor zones: {str(e)}")
1411
+ logger.error(traceback.format_exc())
1412
+ return {}
1413
+
1414
+ def _identify_upscale_decorative_zones(self, detected_objects: List[Dict]) -> Dict:
1415
+ """
1416
+ 識別高級餐飲的裝飾區域
1417
+
1418
+ Args:
1419
+ detected_objects: 檢測到的物件列表
1420
+
1421
+ Returns:
1422
+ 裝飾區域字典
1423
+ """
1424
+ try:
1425
+ zones = {}
1426
+
1427
+ decor_items = []
1428
+ decor_regions = {}
1429
+
1430
+ # 尋找裝飾元素(花瓶、酒杯、未使用的餐具)
1431
+ for obj in detected_objects:
1432
+ if obj["class_id"] in [75, 40]: # Vase, wine glass
1433
+ region = obj["region"]
1434
+ if region not in decor_regions:
1435
+ decor_regions[region] = []
1436
+ decor_regions[region].append(obj)
1437
+ decor_items.append(obj["class_name"])
1438
+
1439
+ if decor_items:
1440
+ main_decor_region = max(decor_regions.items(),
1441
+ key=lambda x: len(x[1]),
1442
+ default=(None, []))
1443
+
1444
+ if main_decor_region[0] is not None:
1445
+ zones["decorative_zone"] = {
1446
+ "region": main_decor_region[0],
1447
+ "objects": list(set(decor_items)),
1448
+ "description": f"Decorative area with {', '.join(list(set(decor_items)))}"
1449
+ }
1450
+
1451
+ return zones
1452
+
1453
+ except Exception as e:
1454
+ logger.error(f"Error identifying upscale decorative zones: {str(e)}")
1455
+ logger.error(traceback.format_exc())
1456
+ return {}
1457
+
1458
+ def _identify_dining_seating_zones(self, detected_objects: List[Dict]) -> Dict:
1459
+ """
1460
+ 識別餐廳座位安排區域
1461
+
1462
+ Args:
1463
+ detected_objects: 檢測到的物件���表
1464
+
1465
+ Returns:
1466
+ 座位區域字典
1467
+ """
1468
+ try:
1469
+ zones = {}
1470
+
1471
+ # 識別座位安排區域
1472
+ chairs = [obj for obj in detected_objects if obj["class_id"] == 56] # chairs
1473
+ if len(chairs) >= 2:
1474
+ chair_regions = {}
1475
+ for obj in chairs:
1476
+ region = obj["region"]
1477
+ if region not in chair_regions:
1478
+ chair_regions[region] = []
1479
+ chair_regions[region].append(obj)
1480
+
1481
+ if chair_regions:
1482
+ main_seating_region = max(chair_regions.items(),
1483
+ key=lambda x: len(x[1]),
1484
+ default=(None, []))
1485
+
1486
+ if main_seating_region[0] is not None:
1487
+ zones["dining_seating_zone"] = {
1488
+ "region": main_seating_region[0],
1489
+ "objects": ["chair"] * len(main_seating_region[1]),
1490
+ "description": f"Formal dining seating arrangement with {len(main_seating_region[1])} chairs"
1491
+ }
1492
+
1493
+ return zones
1494
+
1495
+ except Exception as e:
1496
+ logger.error(f"Error identifying dining seating zones: {str(e)}")
1497
+ logger.error(traceback.format_exc())
1498
+ return {}
1499
+
1500
+ def _identify_serving_zones(self, detected_objects: List[Dict], existing_zones: Dict) -> Dict:
1501
+ """
1502
+ 識別服務區域
1503
+
1504
+ Args:
1505
+ detected_objects: 檢測到的物件列表
1506
+ existing_zones: 已存在的功能區域
1507
+
1508
+ Returns:
1509
+ 服務區域字典
1510
+ """
1511
+ try:
1512
+ zones = {}
1513
+
1514
+ serving_items = []
1515
+ serving_regions = {}
1516
+
1517
+ # 服務區域可能有瓶子、碗、容器
1518
+ for obj in detected_objects:
1519
+ if obj["class_id"] in [39, 45]: # Bottle, bowl
1520
+ # 檢查是否在與主餐桌不同的區域
1521
+ if "formal_dining_zone" in existing_zones and obj["region"] != existing_zones["formal_dining_zone"]["region"]:
1522
+ region = obj["region"]
1523
+ if region not in serving_regions:
1524
+ serving_regions[region] = []
1525
+ serving_regions[region].append(obj)
1526
+ serving_items.append(obj["class_name"])
1527
+
1528
+ if serving_items:
1529
+ main_serving_region = max(serving_regions.items(),
1530
+ key=lambda x: len(x[1]),
1531
+ default=(None, []))
1532
+
1533
+ if main_serving_region[0] is not None:
1534
+ zones["serving_zone"] = {
1535
+ "region": main_serving_region[0],
1536
+ "objects": list(set(serving_items)),
1537
+ "description": f"Serving or sideboard area with {', '.join(list(set(serving_items)))}"
1538
+ }
1539
+
1540
+ return zones
1541
+
1542
+ except Exception as e:
1543
+ logger.error(f"Error identifying serving zones: {str(e)}")
1544
+ logger.error(traceback.format_exc())
1545
+ return {}
1546
+
1547
+ def _identify_building_zones(self, detected_objects: List[Dict]) -> Dict:
1548
+ """
1549
+ 識別建築區域(從場景情境推斷)
1550
+
1551
+ Args:
1552
+ detected_objects: 檢測到的物件列表
1553
+
1554
+ Returns:
1555
+ 建築區域字典
1556
+ """
1557
+ try:
1558
+ zones = {}
1559
+
1560
+ # 側邊建築區域(從場景情境推斷)
1561
+ # 檢查是否有實際可能包含建築物的區域
1562
+ left_side_regions = ["top_left", "middle_left", "bottom_left"]
1563
+ right_side_regions = ["top_right", "middle_right", "bottom_right"]
1564
+
1565
+ # 檢查左側
1566
+ left_building_evidence = True
1567
+ for region in left_side_regions:
1568
+ # 如果此區域有很多車輛或人群,不太可能是建築物
1569
+ vehicle_in_region = any(obj["region"] == region and obj["class_id"] in [1, 2, 3, 5, 7]
1570
+ for obj in detected_objects)
1571
+ people_in_region = any(obj["region"] == region and obj["class_id"] == 0
1572
+ for obj in detected_objects)
1573
+
1574
+ if vehicle_in_region or people_in_region:
1575
+ left_building_evidence = False
1576
+ break
1577
+
1578
+ # 檢查右側
1579
+ right_building_evidence = True
1580
+ for region in right_side_regions:
1581
+ # 如果此區域有很多車輛或人群,不太可能是建築物
1582
+ vehicle_in_region = any(obj["region"] == region and obj["class_id"] in [1, 2, 3, 5, 7]
1583
+ for obj in detected_objects)
1584
+ people_in_region = any(obj["region"] == region and obj["class_id"] == 0
1585
+ for obj in detected_objects)
1586
+
1587
+ if vehicle_in_region or people_in_region:
1588
+ right_building_evidence = False
1589
+ break
1590
+
1591
+ # 如果證據支持,添加建築區域
1592
+ if left_building_evidence:
1593
+ zones["building_zone_left"] = {
1594
+ "region": "middle_left",
1595
+ "objects": ["building"], # 推斷
1596
+ "description": "Tall buildings line the left side of the street"
1597
+ }
1598
+
1599
+ if right_building_evidence:
1600
+ zones["building_zone_right"] = {
1601
+ "region": "middle_right",
1602
+ "objects": ["building"], # 推斷
1603
+ "description": "Tall buildings line the right side of the street"
1604
+ }
1605
+
1606
+ return zones
1607
+
1608
+ except Exception as e:
1609
+ logger.error(f"Error identifying building zones: {str(e)}")
1610
+ logger.error(traceback.format_exc())
1611
+ return {}
1612
+
1613
+ def _identify_financial_pedestrian_zones(self, detected_objects: List[Dict]) -> Dict:
1614
+ """
1615
+ 識別金融區的行人區域
1616
+
1617
+ Args:
1618
+ detected_objects: 檢測到的物件列表
1619
+
1620
+ Returns:
1621
+ 行人區域字典
1622
+ """
1623
+ try:
1624
+ zones = {}
1625
+
1626
+ # 識別行人區域(如果有人群)
1627
+ people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
1628
+ if people_objs:
1629
+ people_regions = {}
1630
+ for obj in people_objs:
1631
+ region = obj["region"]
1632
+ if region not in people_regions:
1633
+ people_regions[region] = []
1634
+ people_regions[region].append(obj)
1635
+
1636
+ if people_regions:
1637
+ main_pedestrian_region = max(people_regions.items(),
1638
+ key=lambda x: len(x[1]),
1639
+ default=(None, []))
1640
+
1641
+ if main_pedestrian_region[0] is not None:
1642
+ zones["pedestrian_zone"] = {
1643
+ "region": main_pedestrian_region[0],
1644
+ "objects": ["person"] * len(main_pedestrian_region[1]),
1645
+ "description": f"Pedestrian area with {len(main_pedestrian_region[1])} people navigating the financial district"
1646
+ }
1647
+
1648
+ return zones
1649
+
1650
+ except Exception as e:
1651
+ logger.error(f"Error identifying financial pedestrian zones: {str(e)}")
1652
+ logger.error(traceback.format_exc())
1653
+ return {}
1654
+
1655
+ def _create_landmark_auxiliary_zones(self, landmark: Dict, index: int) -> Dict:
1656
+ """
1657
+ 創建地標相關的輔助區域(攝影區、紀念品區等)
1658
+
1659
+ Args:
1660
+ landmark: 地標物件字典
1661
+ index: 地標索引
1662
+
1663
+ Returns:
1664
+ 輔助區域字典
1665
+ """
1666
+ try:
1667
+ auxiliary_zones = {}
1668
+ landmark_region = landmark.get("region", "middle_center")
1669
+ landmark_name = landmark.get("class_name", "Landmark")
1670
+
1671
+ # 創建攝影區
1672
+ # 根據地標位置調整攝影區位置(地標前方通常是攝影區)
1673
+ region_mapping = {
1674
+ "top_left": "bottom_right",
1675
+ "top_center": "bottom_center",
1676
+ "top_right": "bottom_left",
1677
+ "middle_left": "middle_right",
1678
+ "middle_center": "bottom_center",
1679
+ "middle_right": "middle_left",
1680
+ "bottom_left": "top_right",
1681
+ "bottom_center": "top_center",
1682
+ "bottom_right": "top_left"
1683
+ }
1684
+
1685
+ photo_region = region_mapping.get(landmark_region, landmark_region)
1686
+
1687
+ photo_key = f"{landmark_name.lower().replace(' ', '_')}_photography_spot"
1688
+ auxiliary_zones[photo_key] = {
1689
+ "name": f"{landmark_name} Photography Spot",
1690
+ "description": f"Popular position for photographing {landmark_name} with optimal viewing angle.",
1691
+ "objects": ["camera", "person", "cell phone"],
1692
+ "region": photo_region,
1693
+ "primary_function": "Tourist photography"
1694
+ }
1695
+
1696
+ # 如果是著名地標,可能有紀念品販售區
1697
+ if landmark.get("confidence", 0) > 0.7: # 高置信度地標更可能有紀念品區
1698
+ # 根據地標位置找到適合的紀念品區位置(通常在地標附近但不直接在地標上)
1699
+ adjacent_regions = {
1700
+ "top_left": ["top_center", "middle_left"],
1701
+ "top_center": ["top_left", "top_right"],
1702
+ "top_right": ["top_center", "middle_right"],
1703
+ "middle_left": ["top_left", "bottom_left"],
1704
+ "middle_center": ["middle_left", "middle_right"],
1705
+ "middle_right": ["top_right", "bottom_right"],
1706
+ "bottom_left": ["middle_left", "bottom_center"],
1707
+ "bottom_center": ["bottom_left", "bottom_right"],
1708
+ "bottom_right": ["bottom_center", "middle_right"]
1709
+ }
1710
+
1711
+ if landmark_region in adjacent_regions:
1712
+ souvenir_region = adjacent_regions[landmark_region][0] # 選擇第一個相鄰區域
1713
+
1714
+ souvenir_key = f"{landmark_name.lower().replace(' ', '_')}_souvenir_area"
1715
+ auxiliary_zones[souvenir_key] = {
1716
+ "name": f"{landmark_name} Souvenir Area",
1717
+ "description": f"Area where visitors can purchase souvenirs and memorabilia related to {landmark_name}.",
1718
+ "objects": ["person", "handbag", "backpack"],
1719
+ "region": souvenir_region,
1720
+ "primary_function": "Tourism commerce"
1721
+ }
1722
+
1723
+ return auxiliary_zones
1724
+
1725
+ except Exception as e:
1726
+ logger.error(f"Error creating landmark auxiliary zones: {str(e)}")
1727
+ logger.error(traceback.format_exc())
1728
+ return {}
spatial_analyzer.py CHANGED
@@ -1,1895 +1,443 @@
1
 
2
  import os
3
  import numpy as np
 
 
4
  from typing import Dict, List, Tuple, Any, Optional
5
 
6
- from scene_type import SCENE_TYPES
7
- from enhance_scene_describer import EnhancedSceneDescriber
 
 
 
 
 
 
8
 
9
  class SpatialAnalyzer:
10
  """
11
- Analyzes spatial relationships between objects in an image.
12
- Handles region assignment, object positioning, and functional zone identification.
 
13
  """
14
 
15
def __init__(self, class_names: Dict[int, str] = None, object_categories=None):
    """
    Initialize the spatial analyzer with its image regions and lookup tables.

    Args:
        class_names: Mapping from detector class id to class name; stored as given.
        object_categories: Mapping from category name to class ids; an empty
            dict is used when this is omitted or falsy.
    """
    # The image is split into a 3x3 grid. Each entry is a normalized
    # (x_min, y_min, x_max, y_max) rectangle.
    one_third, two_thirds = 1/3, 2/3
    self.regions = {
        "top_left": (0, 0, one_third, one_third),
        "top_center": (one_third, 0, two_thirds, one_third),
        "top_right": (two_thirds, 0, 1, one_third),
        "middle_left": (0, one_third, one_third, two_thirds),
        "middle_center": (one_third, one_third, two_thirds, two_thirds),
        "middle_right": (two_thirds, one_third, 1, two_thirds),
        "bottom_left": (0, two_thirds, one_third, 1),
        "bottom_center": (one_third, two_thirds, two_thirds, 1),
        "bottom_right": (two_thirds, two_thirds, 1, 1),
    }

    self.class_names = class_names
    self.OBJECT_CATEGORIES = object_categories or {}

    # Scene describer; its viewpoint detection is reused by this class.
    self.enhance_descriptor = EnhancedSceneDescriber(scene_types=SCENE_TYPES)

    # Distance threshold for proximity analysis (normalized coordinates)
    self.proximity_threshold = 0.2
36
-
37
-
38
- def _determine_region(self, x: float, y: float) -> str:
39
  """
40
- Determine which region a point falls into.
41
 
42
  Args:
43
- x: Normalized x-coordinate (0-1)
44
- y: Normalized y-coordinate (0-1)
45
-
46
- Returns:
47
- Region name
48
  """
49
- for region_name, (x1, y1, x2, y2) in self.regions.items():
50
- if x1 <= x < x2 and y1 <= y < y2:
51
- return region_name
52
-
53
- return "unknown"
54
 
55
- def _analyze_regions(self, detected_objects: List[Dict]) -> Dict:
56
- """
57
- Analyze object distribution across image regions.
58
 
59
- Args:
60
- detected_objects: List of detected objects with position information
 
 
 
 
 
61
 
62
- Returns:
63
- Dictionary with region analysis
64
- """
65
- # Count objects in each region
66
- region_counts = {region: 0 for region in self.regions.keys()}
67
- region_objects = {region: [] for region in self.regions.keys()}
68
-
69
- for obj in detected_objects:
70
- region = obj["region"]
71
- if region in region_counts:
72
- region_counts[region] += 1
73
- region_objects[region].append({
74
- "class_id": obj["class_id"],
75
- "class_name": obj["class_name"]
76
- })
77
-
78
- # Determine main focus regions (top 1-2 regions by object count)
79
- sorted_regions = sorted(region_counts.items(), key=lambda x: x[1], reverse=True)
80
- main_regions = [region for region, count in sorted_regions if count > 0][:2]
81
-
82
- return {
83
- "counts": region_counts,
84
- "main_focus": main_regions,
85
- "objects_by_region": region_objects
86
- }
87
 
88
- def _extract_detected_objects(self, detection_result: Any, confidence_threshold: float = 0.25) -> List[Dict]:
89
- """
90
- Extract detected objects from detection result with position information.
91
 
92
- Args:
93
- detection_result: Detection result from YOLOv8
94
- confidence_threshold: Minimum confidence threshold
95
 
96
- Returns:
97
- List of dictionaries with detected object information
98
- """
99
- boxes = detection_result.boxes.xyxy.cpu().numpy()
100
- classes = detection_result.boxes.cls.cpu().numpy().astype(int)
101
- confidences = detection_result.boxes.conf.cpu().numpy()
102
-
103
- # Image dimensions
104
- img_height, img_width = detection_result.orig_shape[:2]
105
-
106
- detected_objects = []
107
- for box, class_id, confidence in zip(boxes, classes, confidences):
108
- # Skip objects with confidence below threshold
109
- if confidence < confidence_threshold:
110
- continue
111
-
112
- x1, y1, x2, y2 = box
113
- width = x2 - x1
114
- height = y2 - y1
115
-
116
- # Center point
117
- center_x = (x1 + x2) / 2
118
- center_y = (y1 + y2) / 2
119
-
120
- # Normalized positions (0-1)
121
- norm_x = center_x / img_width
122
- norm_y = center_y / img_height
123
- norm_width = width / img_width
124
- norm_height = height / img_height
125
-
126
- # Area calculation
127
- area = width * height
128
- norm_area = area / (img_width * img_height)
129
-
130
- # Region determination
131
- object_region = self._determine_region(norm_x, norm_y)
132
-
133
- detected_objects.append({
134
- "class_id": int(class_id),
135
- "class_name": self.class_names[int(class_id)],
136
- "confidence": float(confidence),
137
- "box": [float(x1), float(y1), float(x2), float(y2)],
138
- "center": [float(center_x), float(center_y)],
139
- "normalized_center": [float(norm_x), float(norm_y)],
140
- "size": [float(width), float(height)],
141
- "normalized_size": [float(norm_width), float(norm_height)],
142
- "area": float(area),
143
- "normalized_area": float(norm_area),
144
- "region": object_region
145
- })
146
-
147
- return detected_objects
148
 
 
 
 
 
149
 
150
- def _detect_scene_viewpoint(self, detected_objects: List[Dict]) -> Dict:
151
  """
152
- 檢測場景視角並識別特殊場景模式。
153
 
154
  Args:
155
- detected_objects: 檢測到的物體列表
156
-
157
- Returns:
158
- Dict: 包含視角和場景模式信息的字典
159
  """
160
- if not detected_objects:
161
- return {"viewpoint": "eye_level", "patterns": []}
162
-
163
- # 從物體位置中提取信息
164
- patterns = []
165
-
166
- # 檢測行人位置模式
167
- pedestrian_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
168
-
169
- # 檢查是否有足夠的行人來識別模式
170
- if len(pedestrian_objs) >= 4:
171
- pedestrian_positions = [obj["normalized_center"] for obj in pedestrian_objs]
172
-
173
- # 檢測十字交叉模式
174
- if self._detect_cross_pattern(pedestrian_positions):
175
- patterns.append("crosswalk_intersection")
176
-
177
- # 檢測多方向行人流
178
- directions = self._analyze_movement_directions(pedestrian_positions)
179
- if len(directions) >= 2:
180
- patterns.append("multi_directional_movement")
181
-
182
- # 檢查物體的大小一致性 - 在空中俯視圖中,物體大小通常更一致
183
- if len(detected_objects) >= 5:
184
- sizes = [obj.get("normalized_area", 0) for obj in detected_objects]
185
- size_variance = np.var(sizes) / (np.mean(sizes) ** 2) # 標準化變異數,不會受到平均值影響
186
-
187
- if size_variance < 0.3: # 低變異表示大小一致
188
- patterns.append("consistent_object_size")
189
-
190
- # 基本視角檢測
191
- viewpoint = self.enhance_descriptor._detect_viewpoint(detected_objects)
192
-
193
- # 根據檢測到的模式增強視角判斷
194
- if "crosswalk_intersection" in patterns and viewpoint != "aerial":
195
- # 如果檢測到斑馬線交叉但視角判斷不是空中視角,優先採用模式判斷
196
- viewpoint = "aerial"
197
-
198
- return {
199
- "viewpoint": viewpoint,
200
- "patterns": patterns
201
- }
202
 
203
- def _detect_cross_pattern(self, positions):
204
  """
205
- 檢測位置中的十字交叉模式
206
 
207
  Args:
208
- positions: 位置列表 [[x1, y1], [x2, y2], ...]
 
209
 
210
  Returns:
211
- bool: 是否檢測到十字交叉模式
212
  """
213
- if len(positions) < 8: # 需要足夠多的點
214
- return False
215
-
216
- # 提取 x 和 y 坐標
217
- x_coords = [pos[0] for pos in positions]
218
- y_coords = [pos[1] for pos in positions]
219
-
220
- # 檢測 x 和 y 方向的聚類
221
- x_clusters = []
222
- y_clusters = []
223
-
224
- # 簡化的聚類分析
225
- x_mean = np.mean(x_coords)
226
- y_mean = np.mean(y_coords)
227
-
228
- # 計算在中心線附近的點
229
- near_x_center = sum(1 for x in x_coords if abs(x - x_mean) < 0.1)
230
- near_y_center = sum(1 for y in y_coords if abs(y - y_mean) < 0.1)
231
-
232
- # 如果有足夠的點在中心線附近,可能是十字交叉
233
- return near_x_center >= 3 and near_y_center >= 3
234
 
235
- def _analyze_movement_directions(self, positions):
236
  """
237
- 分析位置中的移動方向
238
 
239
  Args:
240
- positions: 位置列表 [[x1, y1], [x2, y2], ...]
241
 
242
  Returns:
243
- list: 檢測到的主要方向
244
  """
245
- if len(positions) < 6:
246
- return []
247
-
248
- # extract x 和 y 坐標
249
- x_coords = [pos[0] for pos in positions]
250
- y_coords = [pos[1] for pos in positions]
251
-
252
- directions = []
253
-
254
- # horizontal move (left --> right)
255
- x_std = np.std(x_coords)
256
- x_range = max(x_coords) - min(x_coords)
257
-
258
- # vertical move(up --> down)
259
- y_std = np.std(y_coords)
260
- y_range = max(y_coords) - min(y_coords)
261
-
262
- # 足夠大的範圍表示該方向有運動
263
- if x_range > 0.4:
264
- directions.append("horizontal")
265
- if y_range > 0.4:
266
- directions.append("vertical")
267
-
268
- return directions
269
 
270
- def _identify_functional_zones(self, detected_objects: List[Dict], scene_type: str) -> Dict:
271
  """
272
- Identify functional zones within the scene with improved detection for different viewpoints
273
- and cultural contexts.
274
 
275
  Args:
276
- detected_objects: List of detected objects
277
- scene_type: Identified scene type
278
 
279
  Returns:
280
- Dictionary of functional zones with their descriptions
281
  """
282
- # Group objects by category and region
283
- category_regions = {}
284
-
285
- if not getattr(self, 'enable_landmark', True):
286
- detected_objects = [obj for obj in detected_objects if not obj.get("is_landmark", False)]
287
-
288
- # 過濾地標相關場景類型
289
- if scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"]:
290
- scene_type = "city_street"
291
-
292
- # MODIFIED: Smart threshold evaluation instead of fixed values
293
- should_identify = self._evaluate_zone_identification_feasibility(detected_objects, scene_type)
294
-
295
- if not should_identify:
296
- return {}
297
-
298
- # MODIFIED: Build category_regions mapping (was missing in original)
299
- for obj in detected_objects:
300
- category = self._categorize_object(obj)
301
- if not category:
302
- continue
303
-
304
- if category not in category_regions:
305
- category_regions[category] = {}
306
-
307
- region = obj.get("region", "center")
308
- if region not in category_regions[category]:
309
- category_regions[category][region] = []
310
-
311
- category_regions[category][region].append(obj)
312
-
313
- # Identify zones based on object groupings
314
- zones = {}
315
-
316
- # Detect viewpoint to adjust zone identification strategy
317
- viewpoint = self._detect_scene_viewpoint(detected_objects)
318
-
319
- # Choose appropriate zone identification strategy based on scene type and viewpoint
320
- if scene_type in ["living_room", "bedroom", "dining_area", "kitchen", "office_workspace", "meeting_room"]:
321
- # Indoor scenes
322
- zones.update(self._identify_indoor_zones(category_regions, detected_objects, scene_type))
323
- elif scene_type in ["city_street", "parking_lot", "park_area"]:
324
- # Outdoor general scenes
325
- zones.update(self._identify_outdoor_general_zones(category_regions, detected_objects, scene_type))
326
- elif "aerial" in scene_type or viewpoint == "aerial":
327
- # Aerial viewpoint scenes
328
- zones.update(self._identify_aerial_view_zones(category_regions, detected_objects, scene_type))
329
- elif "asian" in scene_type:
330
- # Asian cultural context scenes
331
- zones.update(self._identify_asian_cultural_zones(category_regions, detected_objects, scene_type))
332
- elif scene_type == "urban_intersection":
333
- # Specific urban intersection logic
334
- zones.update(self._identify_intersection_zones(category_regions, detected_objects, viewpoint))
335
- elif scene_type == "financial_district":
336
- # Financial district specific logic
337
- zones.update(self._identify_financial_district_zones(category_regions, detected_objects))
338
- elif scene_type == "upscale_dining":
339
- # Upscale dining specific logic
340
- zones.update(self._identify_upscale_dining_zones(category_regions, detected_objects))
341
- elif scene_type == "tourist_landmark" or "landmark" in scene_type:
342
- # 處理地標場景類型
343
- landmark_objects = [obj for obj in detected_objects if obj.get("is_landmark", False)]
344
- if landmark_objects:
345
- landmark_zones = self._identify_landmark_zones(landmark_objects)
346
- zones.update(landmark_zones)
347
- else:
348
- # Default zone identification for other scene types
349
- zones.update(self._identify_default_zones(category_regions, detected_objects))
350
-
351
- # 檢查是否有地標物體但場景類型不是地標類型
352
- if scene_type != "tourist_landmark" and "landmark" not in scene_type:
353
- landmark_objects = [obj for obj in detected_objects if obj.get("is_landmark", False)]
354
- if landmark_objects:
355
- # 添加地標功能區,但不覆蓋已有的功能區
356
- landmark_zones = self._identify_landmark_zones(landmark_objects)
357
- # 確保地標區域不會覆蓋已識別的其他重要功能區
358
- for zone_id, zone_info in landmark_zones.items():
359
- if zone_id not in zones:
360
- zones[zone_id] = zone_info
361
-
362
- # MODIFIED: Enhanced fallback strategy - try simplified identification if no zones found
363
- if not zones:
364
- zones.update(self._identify_default_zones(category_regions, detected_objects))
365
-
366
- # Final fallback: create basic zones from high-confidence objects
367
- if not zones:
368
- zones.update(self._create_basic_zones_from_objects(detected_objects, scene_type))
369
-
370
- return zones
371
 
372
- def _identify_core_objects_for_scene(self, detected_objects: List[Dict], scene_type: str) -> List[Dict]:
373
  """
374
- Identify core objects that define a particular scene type.
375
 
376
  Args:
377
- detected_objects: List of detected objects
378
- scene_type: Scene type
379
 
380
  Returns:
381
- List of core objects for the scene
382
  """
383
- core_objects = []
384
-
385
- scene_core_mapping = {
386
- "bedroom": [59], # bed
387
- "kitchen": [68, 69, 71, 72], # microwave, oven, sink, refrigerator
388
- "living_room": [57, 58, 62], # sofa, chair, tv
389
- "dining_area": [60, 46, 47], # dining table, fork, knife
390
- "office_workspace": [63, 64, 66, 73] # laptop, mouse, keyboard, book
391
- }
392
-
393
- if scene_type in scene_core_mapping:
394
- core_class_ids = scene_core_mapping[scene_type]
395
- for obj in detected_objects:
396
- if obj["class_id"] in core_class_ids and obj.get("confidence", 0) >= 0.4:
397
- core_objects.append(obj)
398
-
399
- return core_objects
400
 
401
- def _get_object_categories(self, detected_objects: List[Dict]) -> set:
402
- """Get unique object categories from detected objects."""
403
- object_categories = set()
404
- for obj in detected_objects:
405
- category = self._categorize_object(obj)
406
- if category:
407
- object_categories.add(category)
408
- return object_categories
409
-
410
- def _create_basic_zones_from_objects(self, detected_objects: List[Dict], scene_type: str) -> Dict:
411
  """
412
- Create basic functional zones from individual high-confidence objects.
413
- This is a fallback when standard zone identification fails.
414
 
415
  Args:
416
- detected_objects: List of detected objects
417
- scene_type: Scene type
418
 
419
  Returns:
420
- Dictionary of basic zones
421
  """
422
- zones = {}
423
-
424
- # Focus on high-confidence objects
425
- high_conf_objects = [obj for obj in detected_objects if obj.get("confidence", 0) >= 0.6]
426
-
427
- if not high_conf_objects:
428
- high_conf_objects = detected_objects # Fallback to all objects
429
-
430
- # Create zones based on individual important objects
431
- for i, obj in enumerate(high_conf_objects[:3]): # Limit to top 3 objects
432
- class_name = obj["class_name"]
433
- region = obj.get("region", "center")
434
-
435
- # Create descriptive zone based on object type
436
- zone_description = self._get_basic_zone_description(class_name, scene_type)
437
-
438
- if zone_description:
439
- zones[f"functional_area_{i+1}"] = {
440
- "region": region,
441
- "objects": [class_name],
442
- "description": zone_description
443
- }
444
-
445
- return zones
446
-
447
- def _get_basic_zone_description(self, class_name: str, scene_type: str) -> str:
448
- """Generate basic zone description based on object and scene type."""
449
-
450
- # Object-specific descriptions
451
- descriptions = {
452
- "bed": "Sleeping and rest area",
453
- "sofa": "Seating and relaxation area",
454
- "chair": "Seating area",
455
- "dining table": "Dining and meal area",
456
- "tv": "Entertainment and media area",
457
- "laptop": "Work and computing area",
458
- "potted plant": "Decorative and green space area",
459
- "refrigerator": "Food storage and kitchen area",
460
- "car": "Vehicle and transportation area",
461
- "person": "Activity and social area"
462
- }
463
-
464
- return descriptions.get(class_name, f"Functional area with {class_name}")
465
 
466
  def _categorize_object(self, obj: Dict) -> str:
467
  """
468
- Categorize detected objects into functional categories for zone identification.
469
- """
470
- class_id = obj.get("class_id", -1)
471
- class_name = obj.get("class_name", "").lower()
472
-
473
- # Use existing category mapping if available
474
- if hasattr(self, 'OBJECT_CATEGORIES') and self.OBJECT_CATEGORIES:
475
- for category, ids in self.OBJECT_CATEGORIES.items():
476
- if class_id in ids:
477
- return category
478
-
479
- # Fallback categorization based on class names for common COCO classes
480
- furniture_items = ["chair", "couch", "bed", "dining table", "toilet"]
481
- plant_items = ["potted plant"]
482
- electronic_items = ["tv", "laptop", "mouse", "remote", "keyboard", "cell phone"]
483
- vehicle_items = ["bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat"]
484
- person_items = ["person"]
485
- kitchen_items = ["bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
486
- "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog",
487
- "pizza", "donut", "cake", "refrigerator", "oven", "toaster", "sink", "microwave"]
488
- sports_items = ["frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
489
- "baseball glove", "skateboard", "surfboard", "tennis racket"]
490
- personal_items = ["handbag", "tie", "suitcase", "umbrella", "backpack"]
491
-
492
- if any(item in class_name for item in furniture_items):
493
- return "furniture"
494
- elif any(item in class_name for item in plant_items):
495
- return "plant"
496
- elif any(item in class_name for item in electronic_items):
497
- return "electronics"
498
- elif any(item in class_name for item in vehicle_items):
499
- return "vehicle"
500
- elif any(item in class_name for item in person_items):
501
- return "person"
502
- elif any(item in class_name for item in kitchen_items):
503
- return "kitchen_items"
504
- elif any(item in class_name for item in sports_items):
505
- return "sports"
506
- elif any(item in class_name for item in personal_items):
507
- return "personal_items"
508
- else:
509
- return "misc"
510
-
511
- def _evaluate_zone_identification_feasibility(self, detected_objects: List[Dict], scene_type: str) -> bool:
512
- """
513
- 基於物件關聯性和分布特徵的彈性可行性評估
514
  """
515
- if len(detected_objects) < 2:
516
- return False
517
-
518
- # 計算不同置信度層級的物件分布
519
- high_conf_objects = [obj for obj in detected_objects if obj.get("confidence", 0) >= 0.6]
520
- medium_conf_objects = [obj for obj in detected_objects if obj.get("confidence", 0) >= 0.4]
521
-
522
- # 基礎條件:至少需要一定數量的可信物件
523
- if len(medium_conf_objects) < 2:
524
- return False
525
-
526
- # evalure relationships
527
- functional_relationships = self._calculate_functional_relationships(detected_objects)
528
-
529
- # 評估space的分布多樣性
530
- spatial_diversity = self._calculate_spatial_diversity(detected_objects)
531
-
532
- # 綜合評分機制
533
- feasibility_score = 0
534
-
535
- # 物件數量的貢獻值(權重30%)
536
- object_count_score = min(len(detected_objects) / 5.0, 1.0) * 0.3
537
-
538
- # 信心度質量貢獻(權重25%)
539
- confidence_score = len(high_conf_objects) / max(len(detected_objects), 1) * 0.25
540
-
541
- # 功能關聯性貢獻(權重25%)
542
- relationship_score = functional_relationships * 0.25
543
-
544
- # space多樣性貢獻(權重20%)
545
- diversity_score = spatial_diversity * 0.20
546
-
547
- feasibility_score = object_count_score + confidence_score + relationship_score + diversity_score
548
-
549
- # 動態閾值:基於場景複雜度調整
550
- complexity_threshold = self._get_complexity_threshold(scene_type)
551
-
552
- return feasibility_score >= complexity_threshold
553
-
554
- def _calculate_functional_relationships(self, detected_objects: List[Dict]) -> float:
555
- """
556
- 計算物件間的功能關聯性評分
557
- 基於常見的物件組合模式評估功能相關性
558
- """
559
- relationship_pairs = {
560
- # 家具組合關係
561
- frozenset([56, 60]): 1.0, # 椅子+桌子 (dining/work area)
562
- frozenset([57, 62]): 0.9, # 沙發+電視 (living area)
563
- frozenset([59, 58]): 0.7, # 床+植物 (bedroom decor)
564
-
565
- # 工作相關組合
566
- frozenset([63, 66]): 0.9, # 筆電+鍵盤 (workspace)
567
- frozenset([63, 64]): 0.8, # 筆電+滑鼠 (workspace)
568
- frozenset([60, 63]): 0.8, # 桌子+筆電 (workspace)
569
-
570
- # 廚房相關組合
571
- frozenset([68, 72]): 0.9, # 微波爐+冰箱 (kitchen)
572
- frozenset([69, 71]): 0.8, # 烤箱+水槽 (kitchen)
573
-
574
- # 用餐相關組合
575
- frozenset([60, 40]): 0.8, # 桌子+酒杯 (dining)
576
- frozenset([60, 41]): 0.8, # 桌子+杯子 (dining)
577
- frozenset([56, 40]): 0.7, # 椅子+酒杯 (dining)
578
-
579
- # 交通相關組合
580
- frozenset([2, 9]): 0.8, # 汽車+交通燈 (traffic)
581
- frozenset([0, 9]): 0.7, # 行人+交通燈 (crosswalk)
582
- }
583
-
584
- detected_class_ids = set(obj["class_id"] for obj in detected_objects)
585
- max_possible_score = 0
586
- actual_score = 0
587
-
588
- for pair, score in relationship_pairs.items():
589
- max_possible_score += score
590
- if pair.issubset(detected_class_ids):
591
- actual_score += score
592
-
593
- return actual_score / max_possible_score if max_possible_score > 0 else 0
594
-
595
- def _calculate_spatial_diversity(self, detected_objects: List[Dict]) -> float:
596
- """
597
- 計算物件空間分布的多樣性
598
- 評估物件是否分散在不同區域,避免所有物件集中在單一區域
599
- """
600
- regions = set(obj.get("region", "center") for obj in detected_objects)
601
- unique_regions = len(regions)
602
-
603
- return min(unique_regions / 2.0, 1.0)
604
-
605
- def _get_complexity_threshold(self, scene_type: str) -> float:
606
- """
607
- 可根據場景類型返回適當的複雜度閾值
608
- 平衡不同場景的區域劃分需求
609
- """
610
- # 較簡單場景需要較高分數才進行區域劃分
611
- simple_scenes = ["bedroom", "bathroom", "closet"]
612
- # 較複雜場景可以較低分數進行區域劃分
613
- complex_scenes = ["living_room", "kitchen", "office_workspace", "dining_area"]
614
-
615
- if scene_type in simple_scenes:
616
- return 0.65 # 較高閾值,避免過度細分
617
- elif scene_type in complex_scenes:
618
- return 0.45 # 較低閾值,允許合理劃分
619
- else:
620
- return 0.55 # 中等閾值,平衡策略
621
-
622
- def _identify_indoor_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
623
- """
624
- 平衡化的室內功能區域識別
625
- 採用通用的物件關聯性分析,避免場景特定的硬編碼
626
- """
627
- zones = {}
628
-
629
- # 辨識到主要功能區域(基於物件關聯性而非場景類型)
630
- primary_zone = self._identify_primary_functional_area(detected_objects)
631
- if primary_zone:
632
- zones["primary_area"] = primary_zone
633
-
634
- # 只有明確證據且物件數量足夠時創建次要功能區域
635
- if len(zones) >= 1 and len(detected_objects) >= 6:
636
- secondary_zone = self._identify_secondary_functional_area(detected_objects, zones)
637
- if secondary_zone:
638
- zones["secondary_area"] = secondary_zone
639
 
640
- return zones
 
 
 
641
 
642
- def _identify_primary_functional_area(self, detected_objects: List[Dict]) -> Dict:
643
- """
644
- 辨識主要功能區域,基於最強的物件關聯性組合
645
- 採用通用邏輯處理各種室內場景
646
- """
647
- # 用餐區域檢測(桌椅組合)
648
- dining_area = self._detect_functional_combination(
649
- detected_objects,
650
- primary_objects=[60], # dining table
651
- supporting_objects=[56, 40, 41, 42, 43], # chair, wine glass, cup, fork, knife
652
- min_supporting=2,
653
- description_template="Dining area with table and seating arrangement"
654
- )
655
- if dining_area:
656
- return dining_area
657
-
658
- # 休息區域檢測(沙發電視組合或床)
659
- seating_area = self._detect_functional_combination(
660
- detected_objects,
661
- primary_objects=[57, 59], # sofa, bed
662
- supporting_objects=[62, 58, 56], # tv, potted plant, chair
663
- min_supporting=1,
664
- description_template="Seating and relaxation area"
665
- )
666
- if seating_area:
667
- return seating_area
668
-
669
- # 工作區域檢測(電子設備與家具組合)
670
- work_area = self._detect_functional_combination(
671
- detected_objects,
672
- primary_objects=[63, 66], # laptop, keyboard
673
- supporting_objects=[60, 56, 64], # dining table, chair, mouse
674
- min_supporting=2,
675
- description_template="Workspace area with electronics and furniture"
676
- )
677
- if work_area:
678
- return work_area
679
-
680
- return None
681
-
682
- def _identify_secondary_functional_area(self, detected_objects: List[Dict], existing_zones: Dict) -> Dict:
683
  """
684
- 識別次要功能區域,避免與主要區域重疊
685
- """
686
- # 獲取已使用的區域
687
- used_regions = set(zone["region"] for zone in existing_zones.values())
688
-
689
- # 裝飾區域檢測(植物集中區域)
690
- decorative_area = self._detect_functional_combination(
691
- detected_objects,
692
- primary_objects=[58], # potted plant
693
- supporting_objects=[75], # vase
694
- min_supporting=0,
695
- min_primary=3, # 至少需要3個植物
696
- description_template="Decorative area with plants and ornamental items",
697
- exclude_regions=used_regions
698
- )
699
- if decorative_area:
700
- return decorative_area
701
-
702
- # 儲存區域檢測(廚房電器組合)
703
- storage_area = self._detect_functional_combination(
704
- detected_objects,
705
- primary_objects=[72, 68, 69], # refrigerator, microwave, oven
706
- supporting_objects=[71], # sink
707
- min_supporting=0,
708
- min_primary=2,
709
- description_template="Kitchen appliance and storage area",
710
- exclude_regions=used_regions
711
- )
712
- if storage_area:
713
- return storage_area
714
-
715
- return None
716
-
717
- def _detect_functional_combination(self, detected_objects: List[Dict], primary_objects: List[int],
718
- supporting_objects: List[int], min_supporting: int,
719
- description_template: str, min_primary: int = 1,
720
- exclude_regions: set = None) -> Dict:
721
- """
722
- 通用的功能組合檢測方法
723
- 基於主要物件和支持物件的組合判斷功能區域
724
 
725
  Args:
726
- detected_objects: 檢測到的物件列表
727
- primary_objects: 主要物件的class_id列表
728
- supporting_objects: 支持物件的class_id列表
729
- min_supporting: 最少需要的支持物件數量
730
- description_template: 描述模板
731
- min_primary: 最少需要的主要物件數量
732
- exclude_regions: 需要排除的區域集合
733
 
734
  Returns:
735
- Dict: 功能區域資訊,如果不符合條件則返回None
736
  """
737
- if exclude_regions is None:
738
- exclude_regions = set()
739
-
740
- # 收集主要物件
741
- primary_objs = [obj for obj in detected_objects
742
- if obj["class_id"] in primary_objects and obj.get("confidence", 0) >= 0.4]
743
-
744
- # 收集支持物件
745
- supporting_objs = [obj for obj in detected_objects
746
- if obj["class_id"] in supporting_objects and obj.get("confidence", 0) >= 0.4]
747
-
748
- # 檢查是否滿足最少數量要求
749
- if len(primary_objs) < min_primary or len(supporting_objs) < min_supporting:
750
- return None
751
-
752
- # 按區域組織物件
753
- region_combinations = {}
754
- all_relevant_objs = primary_objs + supporting_objs
755
-
756
- for obj in all_relevant_objs:
757
- region = obj["region"]
758
-
759
- # 排除指定區域
760
- if region in exclude_regions:
761
- continue
762
-
763
- if region not in region_combinations:
764
- region_combinations[region] = {"primary": [], "supporting": [], "all": []}
765
-
766
- region_combinations[region]["all"].append(obj)
767
-
768
- if obj["class_id"] in primary_objects:
769
- region_combinations[region]["primary"].append(obj)
770
- else:
771
- region_combinations[region]["supporting"].append(obj)
772
-
773
- # 找到最佳區域組合
774
- best_region = None
775
- best_score = 0
776
-
777
- for region, objs in region_combinations.items():
778
- # 計算該區域的評分
779
- primary_count = len(objs["primary"])
780
- supporting_count = len(objs["supporting"])
781
-
782
- # 必須滿足最低要求
783
- if primary_count < min_primary or supporting_count < min_supporting:
784
- continue
785
-
786
- # 計算組合評分(主要物件權重較高)
787
- score = primary_count * 2 + supporting_count
788
-
789
- if score > best_score:
790
- best_score = score
791
- best_region = region
792
 
793
- if best_region is None:
794
- return None
795
 
796
- best_combination = region_combinations[best_region]
797
- all_objects = [obj["class_name"] for obj in best_combination["all"]]
 
 
 
 
 
798
 
799
- return {
800
- "region": best_region,
801
- "objects": all_objects,
802
- "description": description_template
803
- }
804
 
805
- def _identify_intersection_zones(self, category_regions: Dict, detected_objects: List[Dict], viewpoint: str) -> Dict:
806
- """
807
- Identify functional zones for urban intersections with enhanced spatial awareness.
808
 
809
- Args:
810
- category_regions: Objects grouped by category and region
811
- detected_objects: List of detected objects
812
- viewpoint: Detected viewpoint
813
 
814
- Returns:
815
- Dict: Refined intersection functional zones
816
- """
817
- zones = {}
818
-
819
- # Get pedestrians, vehicles and traffic signals
820
- pedestrian_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
821
- vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [1, 2, 3, 5, 7]] # bicycle, car, motorcycle, bus, truck
822
- traffic_light_objs = [obj for obj in detected_objects if obj["class_id"] == 9]
823
-
824
- # Create distribution maps for better spatial understanding
825
- regions_distribution = self._create_distribution_map(detected_objects)
826
-
827
- # Analyze pedestrian crossing patterns
828
- crossing_zones = self._analyze_crossing_patterns(pedestrian_objs, traffic_light_objs, regions_distribution)
829
- zones.update(crossing_zones)
830
-
831
- # Analyze vehicle traffic zones with directional awareness
832
- traffic_zones = self._analyze_traffic_zones(vehicle_objs, regions_distribution)
833
- zones.update(traffic_zones)
834
-
835
- # Identify traffic control zones based on signal placement
836
- if traffic_light_objs:
837
- # Group traffic lights by region for better organization
838
- signal_regions = {}
839
- for obj in traffic_light_objs:
840
- region = obj["region"]
841
- if region not in signal_regions:
842
- signal_regions[region] = []
843
- signal_regions[region].append(obj)
844
-
845
- # Create traffic control zones for each region with signals
846
- for idx, (region, signals) in enumerate(signal_regions.items()):
847
- # Check if this region has a directional name
848
- direction = self._get_directional_description(region)
849
-
850
- zones[f"traffic_control_zone_{idx+1}"] = {
851
- "region": region,
852
- "objects": ["traffic light"] * len(signals),
853
- "description": f"Traffic control area with {len(signals)} traffic signals" +
854
- (f" in {direction} area" if direction else "")
855
- }
856
-
857
- return zones
858
-
859
- def _identify_landmark_zones(self, landmark_objects: List[Dict]) -> Dict:
860
  """
861
- 識別與地標相關的功能區域
862
 
863
  Args:
864
- landmark_objects: 被識別為地標的物體列表
865
 
866
  Returns:
867
- Dict: 地標相關的功能區域
868
  """
869
- landmark_zones = {}
870
-
871
- if not landmark_objects:
872
- print("Warning: No landmark objects provided to _identify_landmark_zones")
873
- return landmark_zones
874
-
875
  try:
876
- for i, landmark in enumerate(landmark_objects):
877
- if not isinstance(landmark, dict):
878
- print(f"Warning: Landmark object at index {i} is not a dictionary: {type(landmark)}")
879
- continue
880
-
881
- landmark_id = landmark.get("landmark_id")
882
- if not landmark_id:
883
- print(f"Warning: Missing landmark_id for landmark at index {i}")
884
- landmark_id = f"unknown_landmark_{i}"
885
-
886
- landmark_name = landmark.get("class_name", "Landmark")
887
- landmark_type = landmark.get("landmark_type", "architectural")
888
- landmark_region = landmark.get("region", "middle_center")
889
-
890
- # 為地標創建主要觀景區
891
- zone_id = f"landmark_zone_{i+1}"
892
- zone_name = f"{landmark_name} Viewing Area"
893
-
894
- # 根據地標類型調整描述
895
- if landmark_type == "natural":
896
- zone_description = f"Scenic viewpoint for observing {landmark_name}, a notable natural landmark in {landmark.get('location', 'this area')}."
897
- primary_function = "Nature observation and photography"
898
- elif landmark_type == "monument":
899
- zone_description = f"Viewing area around {landmark_name}, a significant monument in {landmark.get('location', 'this area')}."
900
- primary_function = "Historical appreciation and cultural tourism"
901
- else: # architectural
902
- zone_description = f"Area centered around {landmark_name}, where visitors can observe and appreciate this iconic structure in {landmark.get('location', 'this area')}."
903
- primary_function = "Architectural tourism and photography"
904
-
905
- # 確定與地標相關的物體
906
- related_objects = ["person", "camera", "cell phone", "backpack"]
907
-
908
- # 創建功能區域
909
- landmark_zones[zone_id] = {
910
- "name": zone_name,
911
- "description": zone_description,
912
- "objects": ["landmark"] + [obj for obj in related_objects if obj in [o.get("class_name") for o in landmark_objects]],
913
- "region": landmark_region,
914
- "primary_function": primary_function
915
- }
916
-
917
- # 如果有建造年份信息,加到描述中
918
- if "year_built" in landmark:
919
- landmark_zones[zone_id]["description"] += f" Built in {landmark['year_built']}."
920
-
921
- # 如果有建築風格信息,加到描述中
922
- if "architectural_style" in landmark:
923
- landmark_zones[zone_id]["description"] += f" Features {landmark['architectural_style']} architectural style."
924
-
925
- # 如果有重要性信息,加到描述中
926
- if "significance" in landmark:
927
- landmark_zones[zone_id]["description"] += f" {landmark['significance']}."
928
-
929
- try:
930
- # 創建照相區
931
- photo_region = landmark_region # 默認與地標在同一區域
932
-
933
- # 根據地標位置調整照相區位置(地標前方通常是照相區)
934
- region_mapping = {
935
- "top_left": "bottom_right",
936
- "top_center": "bottom_center",
937
- "top_right": "bottom_left",
938
- "middle_left": "middle_right",
939
- "middle_center": "bottom_center",
940
- "middle_right": "middle_left",
941
- "bottom_left": "top_right",
942
- "bottom_center": "top_center",
943
- "bottom_right": "top_left"
944
- }
945
-
946
- if landmark_region in region_mapping:
947
- photo_region = region_mapping[landmark_region]
948
-
949
- landmark_zones[f"photo_spot_{i+1}"] = {
950
- "name": f"{landmark_name} Photography Spot",
951
- "description": f"Popular position for photographing {landmark_name} with optimal viewing angle.",
952
- "objects": ["camera", "person", "cell phone"],
953
- "region": photo_region,
954
- "primary_function": "Tourist photography"
955
- }
956
- except Exception as e:
957
- print(f"Error creating photo spot zone: {e}")
958
-
959
- try:
960
- # 如果是著名地標,可能有紀念品販售區
961
- if landmark.get("confidence", 0) > 0.7: # 高置信度地標更可能有紀念品區
962
- # 根據地標位置找到適合的紀念品區位置(通常在地標附近但不直接在地標上)
963
- adjacent_regions = {
964
- "top_left": ["top_center", "middle_left"],
965
- "top_center": ["top_left", "top_right"],
966
- "top_right": ["top_center", "middle_right"],
967
- "middle_left": ["top_left", "bottom_left"],
968
- "middle_center": ["middle_left", "middle_right"],
969
- "middle_right": ["top_right", "bottom_right"],
970
- "bottom_left": ["middle_left", "bottom_center"],
971
- "bottom_center": ["bottom_left", "bottom_right"],
972
- "bottom_right": ["bottom_center", "middle_right"]
973
- }
974
-
975
- if landmark_region in adjacent_regions:
976
- souvenir_region = adjacent_regions[landmark_region][0] # 選擇第一個相鄰區域
977
-
978
- landmark_zones[f"souvenir_area_{i+1}"] = {
979
- "name": f"{landmark_name} Souvenir Area",
980
- "description": f"Area where visitors can purchase souvenirs and memorabilia related to {landmark_name}.",
981
- "objects": ["person", "handbag", "backpack"],
982
- "region": souvenir_region,
983
- "primary_function": "Tourism commerce"
984
- }
985
- except Exception as e:
986
- print(f"Error creating souvenir area zone: {e}")
987
-
988
  except Exception as e:
989
- print(f"Error in _identify_landmark_zones: {e}")
990
- import traceback
991
- traceback.print_exc()
992
 
993
- return landmark_zones
994
-
995
- def _analyze_crossing_patterns(self, pedestrians: List[Dict], traffic_lights: List[Dict],
996
- region_distribution: Dict) -> Dict:
997
  """
998
- Analyze pedestrian crossing patterns to identify crosswalk zones.
999
 
1000
  Args:
1001
- pedestrians: List of pedestrian objects
1002
- traffic_lights: List of traffic light objects
1003
- region_distribution: Distribution of objects by region
1004
 
1005
  Returns:
1006
- Dict: Identified crossing zones
1007
  """
1008
- crossing_zones = {}
1009
-
1010
- if not pedestrians:
1011
- return crossing_zones
1012
-
1013
- # Group pedestrians by region
1014
- pedestrian_regions = {}
1015
- for p in pedestrians:
1016
- region = p["region"]
1017
- if region not in pedestrian_regions:
1018
- pedestrian_regions[region] = []
1019
- pedestrian_regions[region].append(p)
1020
-
1021
- # Sort regions by pedestrian count to find main crossing areas
1022
- sorted_regions = sorted(pedestrian_regions.items(), key=lambda x: len(x[1]), reverse=True)
1023
-
1024
- # Create crossing zones for regions with pedestrians
1025
- for idx, (region, peds) in enumerate(sorted_regions[:2]): # Focus on top 2 regions
1026
- # Check if there are traffic lights nearby to indicate a crosswalk
1027
- has_nearby_signals = any(t["region"] == region for t in traffic_lights)
1028
-
1029
- # Create crossing zone with descriptive naming
1030
- zone_name = f"crossing_zone_{idx+1}"
1031
- direction = self._get_directional_description(region)
1032
-
1033
- description = f"Pedestrian crossing area with {len(peds)} "
1034
- description += "person" if len(peds) == 1 else "people"
1035
- if direction:
1036
- description += f" in {direction} direction"
1037
- if has_nearby_signals:
1038
- description += " near traffic signals"
1039
-
1040
- crossing_zones[zone_name] = {
1041
- "region": region,
1042
- "objects": ["pedestrian"] * len(peds),
1043
- "description": description
1044
- }
1045
-
1046
- return crossing_zones
1047
 
1048
- def _analyze_traffic_zones(self, vehicles: List[Dict], region_distribution: Dict) -> Dict:
1049
  """
1050
- Analyze vehicle distribution to identify traffic zones with directional awareness.
1051
 
1052
  Args:
1053
- vehicles: List of vehicle objects
1054
- region_distribution: Distribution of objects by region
1055
 
1056
  Returns:
1057
- Dict: Identified traffic zones
1058
  """
1059
- traffic_zones = {}
1060
-
1061
- if not vehicles:
1062
- return traffic_zones
1063
-
1064
- # 把運輸工具歸成一區
1065
- vehicle_regions = {}
1066
- for v in vehicles:
1067
- region = v["region"]
1068
- if region not in vehicle_regions:
1069
- vehicle_regions[region] = []
1070
- vehicle_regions[region].append(v)
1071
-
1072
- # Create traffic zones for regions with vehicles
1073
- main_traffic_region = max(vehicle_regions.items(), key=lambda x: len(x[1]), default=(None, []))
1074
-
1075
- if main_traffic_region[0] is not None:
1076
- region = main_traffic_region[0]
1077
- vehicles_in_region = main_traffic_region[1]
1078
-
1079
- # Get a list of vehicle types for description
1080
- vehicle_types = [v["class_name"] for v in vehicles_in_region]
1081
- unique_types = list(set(vehicle_types))
1082
-
1083
- # Get directional description
1084
- direction = self._get_directional_description(region)
1085
-
1086
- # Create descriptive zone
1087
- traffic_zones["vehicle_zone"] = {
1088
- "region": region,
1089
- "objects": vehicle_types,
1090
- "description": f"Vehicle traffic area with {', '.join(unique_types[:3])}" +
1091
- (f" in {direction} area" if direction else "")
1092
- }
1093
-
1094
- # If vehicles are distributed across multiple regions, create secondary zones
1095
- if len(vehicle_regions) > 1:
1096
- # Get second most populated region
1097
- sorted_regions = sorted(vehicle_regions.items(), key=lambda x: len(x[1]), reverse=True)
1098
- if len(sorted_regions) > 1:
1099
- second_region, second_vehicles = sorted_regions[1]
1100
- direction = self._get_directional_description(second_region)
1101
- vehicle_types = [v["class_name"] for v in second_vehicles]
1102
- unique_types = list(set(vehicle_types))
1103
-
1104
- traffic_zones["secondary_vehicle_zone"] = {
1105
- "region": second_region,
1106
- "objects": vehicle_types,
1107
- "description": f"Secondary traffic area with {', '.join(unique_types[:2])}" +
1108
- (f" in {direction} direction" if direction else "")
1109
- }
1110
-
1111
- return traffic_zones
1112
 
1113
- def _get_directional_description(self, region: str) -> str:
1114
  """
1115
- 把方向轉換成方位(東西南北)
1116
 
1117
  Args:
1118
- region: Region name from the grid
1119
 
1120
  Returns:
1121
- str: Directional description
1122
  """
1123
- if "top" in region and "left" in region:
1124
- return "northwest"
1125
- elif "top" in region and "right" in region:
1126
- return "northeast"
1127
- elif "bottom" in region and "left" in region:
1128
- return "southwest"
1129
- elif "bottom" in region and "right" in region:
1130
- return "southeast"
1131
- elif "top" in region:
1132
- return "north"
1133
- elif "bottom" in region:
1134
- return "south"
1135
- elif "left" in region:
1136
- return "west"
1137
- elif "right" in region:
1138
- return "east"
1139
- else:
1140
- return "central"
1141
 
1142
- def _create_distribution_map(self, detected_objects: List[Dict]) -> Dict:
1143
  """
1144
- Create a distribution map of objects across regions for spatial analysis.
1145
 
1146
  Args:
1147
- detected_objects: List of detected objects
1148
 
1149
  Returns:
1150
- Dict: Distribution map of objects by region and class
1151
  """
1152
- distribution = {}
1153
-
1154
- # Initialize all regions
1155
- for region in self.regions.keys():
1156
- distribution[region] = {
1157
- "total": 0,
1158
- "objects": {},
1159
- "density": 0
1160
- }
1161
-
1162
- # Populate the distribution
1163
- for obj in detected_objects:
1164
- region = obj["region"]
1165
- class_id = obj["class_id"]
1166
- class_name = obj["class_name"]
1167
-
1168
- distribution[region]["total"] += 1
1169
-
1170
- if class_id not in distribution[region]["objects"]:
1171
- distribution[region]["objects"][class_id] = {
1172
- "name": class_name,
1173
- "count": 0,
1174
- "positions": []
1175
- }
1176
-
1177
- distribution[region]["objects"][class_id]["count"] += 1
1178
-
1179
- # Store position for spatial relationship analysis
1180
- if "normalized_center" in obj:
1181
- distribution[region]["objects"][class_id]["positions"].append(obj["normalized_center"])
1182
-
1183
- # Calculate object density for each region
1184
- for region, data in distribution.items():
1185
- # Assuming all regions are equal size in the grid
1186
- data["density"] = data["total"] / 1
1187
-
1188
- return distribution
1189
 
1190
- def _identify_asian_cultural_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
1191
  """
1192
- Identify functional zones for scenes with Asian cultural context.
1193
 
1194
  Args:
1195
- category_regions: Objects grouped by category and region
1196
- detected_objects: List of detected objects
1197
- scene_type: Specific scene type
1198
 
1199
  Returns:
1200
- Dict: Asian cultural functional zones
1201
- """
1202
- zones = {}
1203
-
1204
- # Identify storefront zone
1205
- storefront_items = []
1206
- storefront_regions = {}
1207
-
1208
- # Since storefronts aren't directly detectable, infer from context
1209
- # For example, look for regions with signs, people, and smaller objects
1210
- sign_regions = set()
1211
- for obj in detected_objects:
1212
- if obj["class_id"] == 0: # Person
1213
- region = obj["region"]
1214
- if region not in storefront_regions:
1215
- storefront_regions[region] = []
1216
- storefront_regions[region].append(obj)
1217
-
1218
- # Add regions with people as potential storefront areas
1219
- sign_regions.add(region)
1220
-
1221
- # Use the areas with most people as storefront zones
1222
- if storefront_regions:
1223
- main_storefront_regions = sorted(storefront_regions.items(),
1224
- key=lambda x: len(x[1]),
1225
- reverse=True)[:2] # Top 2 regions
1226
-
1227
- for idx, (region, objs) in enumerate(main_storefront_regions):
1228
- zones[f"commercial_zone_{idx+1}"] = {
1229
- "region": region,
1230
- "objects": [obj["class_name"] for obj in objs],
1231
- "description": f"Asian commercial storefront with pedestrian activity"
1232
- }
1233
-
1234
- # Identify pedestrian pathway - enhanced to better detect linear pathways
1235
- pathway_items = []
1236
- pathway_regions = {}
1237
-
1238
- # Extract people for pathway analysis
1239
- people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
1240
-
1241
- # Analyze if people form a line (typical of shopping streets)
1242
- people_positions = [obj["normalized_center"] for obj in people_objs]
1243
-
1244
- structured_path = False
1245
- if len(people_positions) >= 3:
1246
- # Check if people are arranged along a similar y-coordinate (horizontal path)
1247
- y_coords = [pos[1] for pos in people_positions]
1248
- y_mean = sum(y_coords) / len(y_coords)
1249
- y_variance = sum((y - y_mean)**2 for y in y_coords) / len(y_coords)
1250
-
1251
- horizontal_path = y_variance < 0.05 # Low variance indicates horizontal alignment
1252
-
1253
- # Check if people are arranged along a similar x-coordinate (vertical path)
1254
- x_coords = [pos[0] for pos in people_positions]
1255
- x_mean = sum(x_coords) / len(x_coords)
1256
- x_variance = sum((x - x_mean)**2 for x in x_coords) / len(x_coords)
1257
-
1258
- vertical_path = x_variance < 0.05 # Low variance indicates vertical alignment
1259
-
1260
- structured_path = horizontal_path or vertical_path
1261
- path_direction = "horizontal" if horizontal_path else "vertical" if vertical_path else "meandering"
1262
-
1263
- # Collect pathway objects (people, bicycles, motorcycles in middle area)
1264
- for obj in detected_objects:
1265
- if obj["class_id"] in [0, 1, 3]: # Person, bicycle, motorcycle
1266
- y_pos = obj["normalized_center"][1]
1267
- # Group by vertical position (middle of image likely pathway)
1268
- if 0.25 <= y_pos <= 0.75:
1269
- region = obj["region"]
1270
- if region not in pathway_regions:
1271
- pathway_regions[region] = []
1272
- pathway_regions[region].append(obj)
1273
- pathway_items.append(obj["class_name"])
1274
-
1275
- if pathway_items:
1276
- path_desc = "Pedestrian walkway with people moving through the commercial area"
1277
- if structured_path:
1278
- path_desc = f"{path_direction.capitalize()} pedestrian walkway with organized foot traffic"
1279
-
1280
- zones["pedestrian_pathway"] = {
1281
- "region": "middle_center", # Assumption: pathway often in middle
1282
- "objects": list(set(pathway_items)),
1283
- "description": path_desc
1284
- }
1285
-
1286
- # Identify vendor zone (small stalls/shops - inferred from context)
1287
- has_small_objects = any(obj["class_id"] in [24, 26, 39, 41] for obj in detected_objects) # bags, bottles, cups
1288
- has_people = any(obj["class_id"] == 0 for obj in detected_objects)
1289
-
1290
- if has_small_objects and has_people:
1291
- # Likely vendor areas are where people and small objects cluster
1292
- small_obj_regions = {}
1293
-
1294
- for obj in detected_objects:
1295
- if obj["class_id"] in [24, 26, 39, 41, 67]: # bags, bottles, cups, phones
1296
- region = obj["region"]
1297
- if region not in small_obj_regions:
1298
- small_obj_regions[region] = []
1299
- small_obj_regions[region].append(obj)
1300
-
1301
- if small_obj_regions:
1302
- main_vendor_region = max(small_obj_regions.items(),
1303
- key=lambda x: len(x[1]),
1304
- default=(None, []))
1305
-
1306
- if main_vendor_region[0] is not None:
1307
- vendor_items = [obj["class_name"] for obj in main_vendor_region[1]]
1308
- zones["vendor_zone"] = {
1309
- "region": main_vendor_region[0],
1310
- "objects": list(set(vendor_items)),
1311
- "description": "Vendor or market stall area with small merchandise"
1312
- }
1313
-
1314
- # For night markets, identify illuminated zones
1315
- if scene_type == "asian_night_market":
1316
- # Night markets typically have bright spots for food stalls
1317
- # This would be enhanced with lighting analysis integration
1318
- zones["food_stall_zone"] = {
1319
- "region": "middle_center",
1320
- "objects": ["inferred food stalls"],
1321
- "description": "Food stall area typical of Asian night markets"
1322
- }
1323
-
1324
- return zones
1325
-
1326
- def _identify_upscale_dining_zones(self, category_regions: Dict, detected_objects: List[Dict]) -> Dict:
1327
  """
1328
- Identify functional zones for upscale dining settings.
1329
-
1330
- Args:
1331
- category_regions: Objects grouped by category and region
1332
- detected_objects: List of detected objects
 
1333
 
1334
- Returns:
1335
- Dict: Upscale dining functional zones
1336
- """
1337
- zones = {}
1338
-
1339
- # Identify dining table zone
1340
- dining_items = []
1341
- dining_regions = {}
1342
-
1343
- for obj in detected_objects:
1344
- if obj["class_id"] in [40, 41, 42, 43, 44, 45, 60]: # Wine glass, cup, fork, knife, spoon, bowl, table
1345
- region = obj["region"]
1346
- if region not in dining_regions:
1347
- dining_regions[region] = []
1348
- dining_regions[region].append(obj)
1349
- dining_items.append(obj["class_name"])
1350
-
1351
- if dining_items:
1352
- main_dining_region = max(dining_regions.items(),
1353
- key=lambda x: len(x[1]),
1354
- default=(None, []))
1355
-
1356
- if main_dining_region[0] is not None:
1357
- zones["formal_dining_zone"] = {
1358
- "region": main_dining_region[0],
1359
- "objects": list(set(dining_items)),
1360
- "description": f"Formal dining area with {', '.join(list(set(dining_items))[:3])}"
1361
- }
1362
-
1363
- # Identify decorative zone with enhanced detection
1364
- decor_items = []
1365
- decor_regions = {}
1366
-
1367
- # Look for decorative elements (vases, wine glasses, unused dishes)
1368
- for obj in detected_objects:
1369
- if obj["class_id"] in [75, 40]: # Vase, wine glass
1370
- region = obj["region"]
1371
- if region not in decor_regions:
1372
- decor_regions[region] = []
1373
- decor_regions[region].append(obj)
1374
- decor_items.append(obj["class_name"])
1375
-
1376
- if decor_items:
1377
- main_decor_region = max(decor_regions.items(),
1378
- key=lambda x: len(x[1]),
1379
- default=(None, []))
1380
-
1381
- if main_decor_region[0] is not None:
1382
- zones["decorative_zone"] = {
1383
- "region": main_decor_region[0],
1384
- "objects": list(set(decor_items)),
1385
- "description": f"Decorative area with {', '.join(list(set(decor_items)))}"
1386
- }
1387
-
1388
- # Identify seating arrangement zone
1389
- chairs = [obj for obj in detected_objects if obj["class_id"] == 56] # chairs
1390
- if len(chairs) >= 2:
1391
- chair_regions = {}
1392
- for obj in chairs:
1393
- region = obj["region"]
1394
- if region not in chair_regions:
1395
- chair_regions[region] = []
1396
- chair_regions[region].append(obj)
1397
-
1398
- if chair_regions:
1399
- main_seating_region = max(chair_regions.items(),
1400
- key=lambda x: len(x[1]),
1401
- default=(None, []))
1402
-
1403
- if main_seating_region[0] is not None:
1404
- zones["dining_seating_zone"] = {
1405
- "region": main_seating_region[0],
1406
- "objects": ["chair"] * len(main_seating_region[1]),
1407
- "description": f"Formal dining seating arrangement with {len(main_seating_region[1])} chairs"
1408
- }
1409
-
1410
- # Identify serving area (if different from dining area)
1411
- serving_items = []
1412
- serving_regions = {}
1413
-
1414
- # Serving areas might have bottles, bowls, containers
1415
- for obj in detected_objects:
1416
- if obj["class_id"] in [39, 45]: # Bottle, bowl
1417
- # Check if it's in a different region from the main dining table
1418
- if "formal_dining_zone" in zones and obj["region"] != zones["formal_dining_zone"]["region"]:
1419
- region = obj["region"]
1420
- if region not in serving_regions:
1421
- serving_regions[region] = []
1422
- serving_regions[region].append(obj)
1423
- serving_items.append(obj["class_name"])
1424
-
1425
- if serving_items:
1426
- main_serving_region = max(serving_regions.items(),
1427
- key=lambda x: len(x[1]),
1428
- default=(None, []))
1429
-
1430
- if main_serving_region[0] is not None:
1431
- zones["serving_zone"] = {
1432
- "region": main_serving_region[0],
1433
- "objects": list(set(serving_items)),
1434
- "description": f"Serving or sideboard area with {', '.join(list(set(serving_items)))}"
1435
- }
1436
-
1437
- return zones
1438
-
1439
- def _identify_financial_district_zones(self, category_regions: Dict, detected_objects: List[Dict]) -> Dict:
1440
  """
1441
- Identify functional zones for financial district scenes.
1442
 
1443
  Args:
1444
- category_regions: Objects grouped by category and region
1445
- detected_objects: List of detected objects
1446
 
1447
  Returns:
1448
- Dict: Financial district functional zones
1449
  """
1450
- zones = {}
1451
-
1452
- # Identify traffic zone
1453
- traffic_items = []
1454
- traffic_regions = {}
1455
-
1456
- for obj in detected_objects:
1457
- if obj["class_id"] in [1, 2, 3, 5, 6, 7, 9]: # Various vehicles and traffic lights
1458
- region = obj["region"]
1459
- if region not in traffic_regions:
1460
- traffic_regions[region] = []
1461
- traffic_regions[region].append(obj)
1462
- traffic_items.append(obj["class_name"])
1463
-
1464
- if traffic_items:
1465
- main_traffic_region = max(traffic_regions.items(),
1466
- key=lambda x: len(x[1]),
1467
- default=(None, []))
1468
-
1469
- if main_traffic_region[0] is not None:
1470
- zones["traffic_zone"] = {
1471
- "region": main_traffic_region[0],
1472
- "objects": list(set(traffic_items)),
1473
- "description": f"Urban traffic area with {', '.join(list(set(traffic_items))[:3])}"
1474
- }
1475
-
1476
- # Building zones on the sides (inferred from scene context)
1477
- # Enhanced to check if there are actual regions that might contain buildings
1478
- # Check for regions without vehicles or pedestrians - likely building areas
1479
- left_side_regions = ["top_left", "middle_left", "bottom_left"]
1480
- right_side_regions = ["top_right", "middle_right", "bottom_right"]
1481
-
1482
- # Check left side
1483
- left_building_evidence = True
1484
- for region in left_side_regions:
1485
- # If many vehicles or people in this region, less likely to be buildings
1486
- vehicle_in_region = any(obj["region"] == region and obj["class_id"] in [1, 2, 3, 5, 7]
1487
- for obj in detected_objects)
1488
- people_in_region = any(obj["region"] == region and obj["class_id"] == 0
1489
- for obj in detected_objects)
1490
-
1491
- if vehicle_in_region or people_in_region:
1492
- left_building_evidence = False
1493
- break
1494
-
1495
- # Check right side
1496
- right_building_evidence = True
1497
- for region in right_side_regions:
1498
- # If many vehicles or people in this region, less likely to be buildings
1499
- vehicle_in_region = any(obj["region"] == region and obj["class_id"] in [1, 2, 3, 5, 7]
1500
- for obj in detected_objects)
1501
- people_in_region = any(obj["region"] == region and obj["class_id"] == 0
1502
- for obj in detected_objects)
1503
-
1504
- if vehicle_in_region or people_in_region:
1505
- right_building_evidence = False
1506
- break
1507
-
1508
- # Add building zones if evidence supports them
1509
- if left_building_evidence:
1510
- zones["building_zone_left"] = {
1511
- "region": "middle_left",
1512
- "objects": ["building"], # Inferred
1513
- "description": "Tall buildings line the left side of the street"
1514
- }
1515
-
1516
- if right_building_evidence:
1517
- zones["building_zone_right"] = {
1518
- "region": "middle_right",
1519
- "objects": ["building"], # Inferred
1520
- "description": "Tall buildings line the right side of the street"
1521
- }
1522
 
1523
- # Identify pedestrian zone if people are present
1524
- people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
1525
- if people_objs:
1526
- people_regions = {}
1527
- for obj in people_objs:
1528
- region = obj["region"]
1529
- if region not in people_regions:
1530
- people_regions[region] = []
1531
- people_regions[region].append(obj)
1532
-
1533
- if people_regions:
1534
- main_pedestrian_region = max(people_regions.items(),
1535
- key=lambda x: len(x[1]),
1536
- default=(None, []))
1537
-
1538
- if main_pedestrian_region[0] is not None:
1539
- zones["pedestrian_zone"] = {
1540
- "region": main_pedestrian_region[0],
1541
- "objects": ["person"] * len(main_pedestrian_region[1]),
1542
- "description": f"Pedestrian area with {len(main_pedestrian_region[1])} people navigating the financial district"
1543
- }
1544
-
1545
- return zones
1546
-
1547
- def _identify_aerial_view_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
1548
  """
1549
- Identify functional zones for scenes viewed from an aerial perspective.
1550
 
1551
  Args:
1552
- category_regions: Objects grouped by category and region
1553
- detected_objects: List of detected objects
1554
- scene_type: Specific scene type
1555
 
1556
  Returns:
1557
- Dict: Aerial view functional zones
1558
  """
1559
- zones = {}
1560
-
1561
- # For aerial views, we focus on patterns and flows rather than specific zones
1562
-
1563
- # Identify pedestrian patterns
1564
- people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
1565
- if people_objs:
1566
- # Convert positions to arrays for pattern analysis
1567
- positions = np.array([obj["normalized_center"] for obj in people_objs])
1568
-
1569
- if len(positions) >= 3:
1570
- # Calculate distribution metrics
1571
- x_coords = positions[:, 0]
1572
- y_coords = positions[:, 1]
1573
-
1574
- x_mean = np.mean(x_coords)
1575
- y_mean = np.mean(y_coords)
1576
- x_std = np.std(x_coords)
1577
- y_std = np.std(y_coords)
1578
-
1579
- # Determine if people are organized in a linear pattern
1580
- if x_std < 0.1 or y_std < 0.1:
1581
- # Linear distribution along one axis
1582
- pattern_direction = "vertical" if x_std < y_std else "horizontal"
1583
-
1584
- zones["pedestrian_pattern"] = {
1585
- "region": "central",
1586
- "objects": ["person"] * len(people_objs),
1587
- "description": f"Aerial view shows a {pattern_direction} pedestrian movement pattern"
1588
- }
1589
- else:
1590
- # More dispersed pattern
1591
- zones["pedestrian_distribution"] = {
1592
- "region": "wide",
1593
- "objects": ["person"] * len(people_objs),
1594
- "description": f"Aerial view shows pedestrians distributed across the area"
1595
- }
1596
-
1597
- # Identify vehicle patterns for traffic analysis
1598
- vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [1, 2, 3, 5, 6, 7]]
1599
- if vehicle_objs:
1600
- # Convert positions to arrays for pattern analysis
1601
- positions = np.array([obj["normalized_center"] for obj in vehicle_objs])
1602
-
1603
- if len(positions) >= 2:
1604
- # Calculate distribution metrics
1605
- x_coords = positions[:, 0]
1606
- y_coords = positions[:, 1]
1607
-
1608
- x_mean = np.mean(x_coords)
1609
- y_mean = np.mean(y_coords)
1610
- x_std = np.std(x_coords)
1611
- y_std = np.std(y_coords)
1612
-
1613
- # Determine if vehicles are organized in lanes
1614
- if x_std < y_std * 0.5:
1615
- # Vehicles aligned vertically - indicates north-south traffic
1616
- zones["vertical_traffic_flow"] = {
1617
- "region": "central_vertical",
1618
- "objects": [obj["class_name"] for obj in vehicle_objs[:5]],
1619
- "description": "North-south traffic flow visible from aerial view"
1620
- }
1621
- elif y_std < x_std * 0.5:
1622
- # Vehicles aligned horizontally - indicates east-west traffic
1623
- zones["horizontal_traffic_flow"] = {
1624
- "region": "central_horizontal",
1625
- "objects": [obj["class_name"] for obj in vehicle_objs[:5]],
1626
- "description": "East-west traffic flow visible from aerial view"
1627
- }
1628
- else:
1629
- # Vehicles in multiple directions - indicates intersection
1630
- zones["intersection_traffic"] = {
1631
- "region": "central",
1632
- "objects": [obj["class_name"] for obj in vehicle_objs[:5]],
1633
- "description": "Multi-directional traffic at intersection visible from aerial view"
1634
- }
1635
-
1636
- # For intersection specific aerial views, identify crossing patterns
1637
- if "intersection" in scene_type:
1638
- # Check for traffic signals
1639
- traffic_light_objs = [obj for obj in detected_objects if obj["class_id"] == 9]
1640
- if traffic_light_objs:
1641
- zones["traffic_control_pattern"] = {
1642
- "region": "intersection",
1643
- "objects": ["traffic light"] * len(traffic_light_objs),
1644
- "description": f"Intersection traffic control with {len(traffic_light_objs)} signals visible from above"
1645
- }
1646
-
1647
- # Crosswalks are inferred from context in aerial views
1648
- zones["crossing_pattern"] = {
1649
- "region": "central",
1650
- "objects": ["inferred crosswalk"],
1651
- "description": "Crossing pattern visible from aerial perspective"
1652
- }
1653
-
1654
- # For plaza aerial views, identify gathering patterns
1655
- if "plaza" in scene_type:
1656
- # Plazas typically have central open area with people
1657
- if people_objs:
1658
- # Check if people are clustered in central region
1659
- central_people = [obj for obj in people_objs
1660
- if "middle" in obj["region"]]
1661
-
1662
- if central_people:
1663
- zones["central_gathering"] = {
1664
- "region": "middle_center",
1665
- "objects": ["person"] * len(central_people),
1666
- "description": f"Central plaza gathering area with {len(central_people)} people viewed from above"
1667
- }
1668
-
1669
- return zones
1670
-
1671
- def _identify_outdoor_general_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
1672
- """
1673
- Identify functional zones for general outdoor scenes.
1674
-
1675
- Args:
1676
- category_regions: Objects grouped by category and region
1677
- detected_objects: List of detected objects
1678
- scene_type: Specific outdoor scene type
1679
 
1680
- Returns:
1681
- Dict: Outdoor functional zones
1682
- """
1683
- zones = {}
1684
-
1685
- # Identify pedestrian zones
1686
- people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
1687
- if people_objs:
1688
- people_regions = {}
1689
- for obj in people_objs:
1690
- region = obj["region"]
1691
- if region not in people_regions:
1692
- people_regions[region] = []
1693
- people_regions[region].append(obj)
1694
-
1695
- if people_regions:
1696
- # Find main pedestrian areas
1697
- main_people_regions = sorted(people_regions.items(),
1698
- key=lambda x: len(x[1]),
1699
- reverse=True)[:2] # Top 2 regions
1700
-
1701
- for idx, (region, objs) in enumerate(main_people_regions):
1702
- if len(objs) > 0:
1703
- zones[f"pedestrian_zone_{idx+1}"] = {
1704
- "region": region,
1705
- "objects": ["person"] * len(objs),
1706
- "description": f"Pedestrian area with {len(objs)} {'people' if len(objs) > 1 else 'person'}"
1707
- }
1708
-
1709
- # Identify vehicle zones for streets and parking lots
1710
- vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [1, 2, 3, 5, 6, 7]]
1711
- if vehicle_objs:
1712
- vehicle_regions = {}
1713
- for obj in vehicle_objs:
1714
- region = obj["region"]
1715
- if region not in vehicle_regions:
1716
- vehicle_regions[region] = []
1717
- vehicle_regions[region].append(obj)
1718
-
1719
- if vehicle_regions:
1720
- main_vehicle_region = max(vehicle_regions.items(),
1721
- key=lambda x: len(x[1]),
1722
- default=(None, []))
1723
-
1724
- if main_vehicle_region[0] is not None:
1725
- vehicle_types = [obj["class_name"] for obj in main_vehicle_region[1]]
1726
- zones["vehicle_zone"] = {
1727
- "region": main_vehicle_region[0],
1728
- "objects": vehicle_types,
1729
- "description": f"Traffic area with {', '.join(list(set(vehicle_types))[:3])}"
1730
- }
1731
-
1732
- # For park areas, identify recreational zones
1733
- if scene_type == "park_area":
1734
- # Look for recreational objects (sports balls, kites, etc.)
1735
- rec_items = []
1736
- rec_regions = {}
1737
-
1738
- for obj in detected_objects:
1739
- if obj["class_id"] in [32, 33, 34, 35, 38]: # sports ball, kite, baseball bat, glove, tennis racket
1740
- region = obj["region"]
1741
- if region not in rec_regions:
1742
- rec_regions[region] = []
1743
- rec_regions[region].append(obj)
1744
- rec_items.append(obj["class_name"])
1745
-
1746
- if rec_items:
1747
- main_rec_region = max(rec_regions.items(),
1748
- key=lambda x: len(x[1]),
1749
- default=(None, []))
1750
-
1751
- if main_rec_region[0] is not None:
1752
- zones["recreational_zone"] = {
1753
- "region": main_rec_region[0],
1754
- "objects": list(set(rec_items)),
1755
- "description": f"Recreational area with {', '.join(list(set(rec_items)))}"
1756
- }
1757
-
1758
- # For parking lots, identify parking zones
1759
- if scene_type == "parking_lot":
1760
- # Look for parked cars with consistent spacing
1761
- car_objs = [obj for obj in detected_objects if obj["class_id"] == 2] # cars
1762
-
1763
- if len(car_objs) >= 3:
1764
- # Check if cars are arranged in patterns (simplified)
1765
- car_positions = [obj["normalized_center"] for obj in car_objs]
1766
-
1767
- # Check for row patterns by analyzing vertical positions
1768
- y_coords = [pos[1] for pos in car_positions]
1769
- y_clusters = {}
1770
-
1771
- # Simplified clustering - group cars by similar y-coordinates
1772
- for i, y in enumerate(y_coords):
1773
- assigned = False
1774
- for cluster_y in y_clusters.keys():
1775
- if abs(y - cluster_y) < 0.1: # Within 10% of image height
1776
- y_clusters[cluster_y].append(i)
1777
- assigned = True
1778
- break
1779
-
1780
- if not assigned:
1781
- y_clusters[y] = [i]
1782
-
1783
- # If we have row patterns
1784
- if max(len(indices) for indices in y_clusters.values()) >= 2:
1785
- zones["parking_row"] = {
1786
- "region": "central",
1787
- "objects": ["car"] * len(car_objs),
1788
- "description": f"Organized parking area with vehicles arranged in rows"
1789
- }
1790
- else:
1791
- zones["parking_area"] = {
1792
- "region": "wide",
1793
- "objects": ["car"] * len(car_objs),
1794
- "description": f"Parking area with {len(car_objs)} vehicles"
1795
- }
1796
-
1797
- return zones
1798
-
1799
- def _identify_default_zones(self, category_regions: Dict, detected_objects: List[Dict]) -> Dict:
1800
- """
1801
- Identify general functional zones when no specific scene type is matched.
1802
 
1803
- Args:
1804
- category_regions: Objects grouped by category and region
1805
- detected_objects: List of detected objects
 
 
 
 
1806
 
1807
- Returns:
1808
- Dict: Default functional zones
1809
- """
1810
- zones = {}
1811
-
1812
- # Group objects by category and find main concentrations
1813
- for category, regions in category_regions.items():
1814
- if not regions:
1815
- continue
1816
-
1817
- # Find region with most objects in this category
1818
- main_region = max(regions.items(),
1819
- key=lambda x: len(x[1]),
1820
- default=(None, []))
1821
-
1822
- if main_region[0] is None or len(main_region[1]) < 2:
1823
- continue
1824
-
1825
- # Create zone based on object category
1826
- zone_objects = [obj["class_name"] for obj in main_region[1]]
1827
-
1828
- # Skip if too few objects
1829
- if len(zone_objects) < 2:
1830
- continue
1831
-
1832
- # Create appropriate zone name and description based on category
1833
- if category == "furniture":
1834
- zones["furniture_zone"] = {
1835
- "region": main_region[0],
1836
- "objects": zone_objects,
1837
- "description": f"Area with furniture including {', '.join(zone_objects[:3])}"
1838
- }
1839
- elif category == "electronics":
1840
- zones["electronics_zone"] = {
1841
- "region": main_region[0],
1842
- "objects": zone_objects,
1843
- "description": f"Area with electronic devices including {', '.join(zone_objects[:3])}"
1844
- }
1845
- elif category == "kitchen_items":
1846
- zones["dining_zone"] = {
1847
- "region": main_region[0],
1848
- "objects": zone_objects,
1849
- "description": f"Dining or food area with {', '.join(zone_objects[:3])}"
1850
- }
1851
- elif category == "vehicles":
1852
- zones["vehicle_zone"] = {
1853
- "region": main_region[0],
1854
- "objects": zone_objects,
1855
- "description": f"Area with vehicles including {', '.join(zone_objects[:3])}"
1856
- }
1857
- elif category == "personal_items":
1858
- zones["personal_items_zone"] = {
1859
- "region": main_region[0],
1860
- "objects": zone_objects,
1861
- "description": f"Area with personal items including {', '.join(zone_objects[:3])}"
1862
- }
1863
-
1864
- # Check for people groups
1865
- people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
1866
- if len(people_objs) >= 2:
1867
- people_regions = {}
1868
- for obj in people_objs:
1869
- region = obj["region"]
1870
- if region not in people_regions:
1871
- people_regions[region] = []
1872
- people_regions[region].append(obj)
1873
-
1874
- if people_regions:
1875
- main_people_region = max(people_regions.items(),
1876
- key=lambda x: len(x[1]),
1877
- default=(None, []))
1878
-
1879
- if main_people_region[0] is not None:
1880
- zones["people_zone"] = {
1881
- "region": main_people_region[0],
1882
- "objects": ["person"] * len(main_people_region[1]),
1883
- "description": f"Area with {len(main_people_region[1])} people"
1884
- }
1885
-
1886
- return zones
1887
 
1888
- def _find_main_region(self, region_objects_dict: Dict) -> str:
1889
- """Find the main region with the most objects"""
1890
- if not region_objects_dict:
1891
- return "unknown"
 
 
 
1892
 
1893
- return max(region_objects_dict.items(),
1894
- key=lambda x: len(x[1]),
1895
- default=("unknown", []))[0]
 
 
1
 
2
  import os
3
  import numpy as np
4
+ import logging
5
+ import traceback
6
  from typing import Dict, List, Tuple, Any, Optional
7
 
8
+ from region_analyzer import RegionAnalyzer
9
+ from object_extractor import ObjectExtractor
10
+ from scene_viewpoint_analyzer import SceneViewpointAnalyzer
11
+ from zone_evaluator import ZoneEvaluator
12
+ from scene_zone_identifier import SceneZoneIdentifier
13
+ from functional_zone_identifier import FunctionalZoneIdentifier
14
+
15
+ logger = logging.getLogger(__name__)
16
 
17
class SpatialAnalyzer:
    """
    Main class for analysing spatial relationships between objects in an image.

    Handles region assignment, object positioning and functional zone
    identification. Acts as a facade over several sub-components so that the
    external interface stays stable; most methods simply delegate to a
    sub-component, log any exception and return a neutral fallback value.
    """

    # Built-in category table used when no custom OBJECT_CATEGORIES entry
    # matches. Order matters for the substring fallback in _categorize_object.
    _DEFAULT_CATEGORY_ITEMS = (
        ("furniture", ("chair", "couch", "bed", "dining table", "toilet")),
        ("plant", ("potted plant",)),
        ("electronics", ("tv", "laptop", "mouse", "remote", "keyboard", "cell phone")),
        ("vehicle", ("bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat")),
        ("person", ("person",)),
        ("kitchen items", (
            "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
            "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog",
            "pizza", "donut", "cake", "refrigerator", "oven", "toaster", "sink", "microwave",
        )),
        ("sports", (
            "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
            "baseball glove", "skateboard", "surfboard", "tennis racket",
        )),
        ("personal items", ("handbag", "tie", "suitcase", "umbrella", "backpack")),
    )

    def __init__(self, class_names: Dict[int, str] = None, object_categories=None):
        """
        Initialise the spatial analyzer and all of its sub-components.

        Args:
            class_names: Mapping of class ID to class name
            object_categories: Mapping of category name to grouped class IDs

        Raises:
            Exception: Any error raised while building a sub-component is
                logged and re-raised.
        """
        try:
            # Build all sub-components
            self.region_analyzer = RegionAnalyzer()
            self.object_extractor = ObjectExtractor(class_names, object_categories)

            self.scene_viewpoint_analyzer = SceneViewpointAnalyzer()

            self.zone_evaluator = ZoneEvaluator()
            self.scene_zone_identifier = SceneZoneIdentifier()
            self.functional_zone_identifier = FunctionalZoneIdentifier(
                zone_evaluator=self.zone_evaluator,
                scene_zone_identifier=self.scene_zone_identifier,
                scene_viewpoint_analyzer=self.scene_viewpoint_analyzer
            )

            self.class_names = class_names
            self.OBJECT_CATEGORIES = object_categories or {}

            self.enhance_descriptor = None

            # Distance threshold (normalized) for proximity analysis
            self.proximity_threshold = 0.2

            logger.info("SpatialAnalyzer initialized successfully with all sub-components")

        except Exception as e:
            logger.error(f"Failed to initialize SpatialAnalyzer: {str(e)}")
            logger.error(traceback.format_exc())
            raise

    def update_class_names(self, class_names: Dict[int, str]):
        """
        Update the class-name mapping and forward it to the ObjectExtractor.

        Errors are logged and swallowed.

        Args:
            class_names: New mapping of class ID to class name
        """
        try:
            self.class_names = class_names
            if hasattr(self, 'object_extractor') and self.object_extractor:
                self.object_extractor.update_class_names(class_names)
            logger.info("Updated class names in SpatialAnalyzer and ObjectExtractor")
        except Exception as e:
            logger.error(f"Failed to update class names in SpatialAnalyzer: {str(e)}")

    def _determine_region(self, x: float, y: float) -> str:
        """
        Determine which region a point falls in (delegates to RegionAnalyzer).

        Args:
            x: Normalized x coordinate (0-1)
            y: Normalized y coordinate (0-1)

        Returns:
            Region name, or "unknown" if the delegate raises.
        """
        try:
            return self.region_analyzer.determine_region(x, y)
        except Exception as e:
            logger.error(f"Error in _determine_region: {str(e)}")
            logger.error(traceback.format_exc())
            return "unknown"

    def _analyze_regions(self, detected_objects: List[Dict]) -> Dict:
        """
        Analyse how objects are distributed across regions (delegates to RegionAnalyzer).

        Args:
            detected_objects: Detected objects including position information

        Returns:
            Region analysis dictionary; on error a dictionary with empty
            "counts", "main_focus" and "objects_by_region" entries.
        """
        try:
            return self.region_analyzer.analyze_regions(detected_objects)
        except Exception as e:
            logger.error(f"Error in _analyze_regions: {str(e)}")
            logger.error(traceback.format_exc())
            return {
                "counts": {},
                "main_focus": [],
                "objects_by_region": {}
            }

    def _extract_detected_objects(self, detection_result: Any, confidence_threshold: float = 0.25) -> List[Dict]:
        """
        Extract object information, including positions, from a detection result.

        Delegates to ObjectExtractor, passing this instance's RegionAnalyzer.

        Args:
            detection_result: YOLOv8 detection result
            confidence_threshold: Minimum confidence threshold

        Returns:
            List of dictionaries describing detected objects; empty list on error.
        """
        try:
            return self.object_extractor.extract_detected_objects(
                detection_result,
                confidence_threshold,
                region_analyzer=self.region_analyzer
            )
        except Exception as e:
            logger.error(f"Error in _extract_detected_objects: {str(e)}")
            logger.error(traceback.format_exc())
            return []

    def _detect_scene_viewpoint(self, detected_objects: List[Dict]) -> Dict:
        """
        Detect the scene viewpoint and special scene patterns.

        Args:
            detected_objects: List of detected objects

        Returns:
            Dictionary with viewpoint and pattern information; on error
            {"viewpoint": "eye_level", "patterns": []}.
        """
        try:
            # Delegate to the scene viewpoint analyzer
            return self.scene_viewpoint_analyzer.detect_scene_viewpoint(detected_objects)
        except Exception as e:
            logger.error(f"Error in _detect_scene_viewpoint: {str(e)}")
            logger.error(traceback.format_exc())
            return {"viewpoint": "eye_level", "patterns": []}

    def _identify_functional_zones(self, detected_objects: List[Dict], scene_type: str) -> Dict:
        """
        Identify functional zones within the scene (delegates to FunctionalZoneIdentifier).

        Args:
            detected_objects: List of detected objects
            scene_type: Identified scene type

        Returns:
            Dictionary of functional zones and their descriptions; empty on error.
        """
        try:
            return self.functional_zone_identifier.identify_functional_zones(detected_objects, scene_type)
        except Exception as e:
            logger.error(f"Error in _identify_functional_zones: {str(e)}")
            logger.error(traceback.format_exc())
            return {}

    def _categorize_object(self, obj: Dict) -> str:
        """
        Classify a detected object into a functional category for zone identification.

        All returned names use natural-language form (no underscores).
        Resolution order:
          1. traffic light (class_id 9 or "traffic light" in the class name)
          2. custom OBJECT_CATEGORIES mapping, matched by class_id
          3. exact match of the class name against the built-in table
          4. substring match against the built-in table (covers variants such
             as "tvmonitor")

        Args:
            obj: Detected object dictionary ("class_id" and "class_name" are read)

        Returns:
            Category name, or "misc" when nothing matches or an error occurs.
        """
        try:
            class_id = obj.get("class_id", -1)
            class_name = obj.get("class_name", "").lower().strip()

            # Traffic lights take priority: class_id == 9 or a class name
            # containing "traffic light" is always "traffic light".
            if class_id == 9 or "traffic light" in class_name:
                return "traffic light"

            # A custom OBJECT_CATEGORIES mapping wins over the built-in table
            if hasattr(self, 'OBJECT_CATEGORIES') and self.OBJECT_CATEGORIES:
                for category, ids in self.OBJECT_CATEGORIES.items():
                    if class_id in ids:
                        # Make sure the category is returned in natural-language form
                        return self._clean_category_name(category)

            # Exact names first: with substring matching alone, "carrot" would
            # be caught by the vehicle entry "car" before reaching kitchen items.
            for category, items in self._DEFAULT_CATEGORY_ITEMS:
                if class_name in items:
                    return category

            # Substring fallback keeps compatibility with non-canonical names
            for category, items in self._DEFAULT_CATEGORY_ITEMS:
                if any(item in class_name for item in items):
                    return category

            return "misc"

        except Exception as e:
            logger.error(f"Error categorizing object: {str(e)}")
            logger.error(traceback.format_exc())
            return "misc"

    def _clean_category_name(self, category: str) -> str:
        """
        Clean a category name: replace underscores with spaces and trim whitespace.

        Args:
            category: Raw category name

        Returns:
            str: Cleaned category name; "misc" for empty input, for names that
            are empty after cleaning, for "misc items", or on error.
        """
        try:
            if not category:
                return "misc"

            # Replace underscores and trim before comparing, so padded
            # input is normalised the same way as unpadded input.
            cleaned = category.replace('_', ' ').strip()

            if not cleaned or cleaned == 'misc items':
                return "misc"

            return cleaned

        except Exception as e:
            logger.warning(f"Error cleaning category name '{category}': {str(e)}")
            return "misc"

    def _get_object_categories(self, detected_objects: List[Dict]) -> set:
        """
        Get the unique object categories among detected objects (delegates to ObjectExtractor).

        Args:
            detected_objects: List of detected objects

        Returns:
            Set of unique object categories; empty set on error.
        """
        try:
            return self.object_extractor.get_object_categories(detected_objects)
        except Exception as e:
            logger.error(f"Error in _get_object_categories: {str(e)}")
            logger.error(traceback.format_exc())
            return set()

    def _identify_core_objects_for_scene(self, detected_objects: List[Dict], scene_type: str) -> List[Dict]:
        """
        Identify the core objects that define a given scene type (delegates to ObjectExtractor).

        Args:
            detected_objects: List of detected objects
            scene_type: Scene type

        Returns:
            List of core objects for the scene; empty list on error.
        """
        try:
            return self.object_extractor.identify_core_objects_for_scene(detected_objects, scene_type)
        except Exception as e:
            logger.error(f"Error in _identify_core_objects_for_scene: {str(e)}")
            logger.error(traceback.format_exc())
            return []

    def _evaluate_zone_identification_feasibility(self, detected_objects: List[Dict], scene_type: str) -> bool:
        """
        Evaluate whether zone identification is feasible (delegates to ZoneEvaluator).

        Args:
            detected_objects: List of detected objects
            scene_type: Scene type

        Returns:
            Whether zone identification should be attempted; False on error.
        """
        try:
            return self.zone_evaluator.evaluate_zone_identification_feasibility(detected_objects, scene_type)
        except Exception as e:
            logger.error(f"Error in _evaluate_zone_identification_feasibility: {str(e)}")
            logger.error(traceback.format_exc())
            return False

    def _calculate_functional_relationships(self, detected_objects: List[Dict]) -> float:
        """
        Compute a functional-relationship score between objects (delegates to ZoneEvaluator).

        Args:
            detected_objects: List of detected objects

        Returns:
            Functional-relationship score (0.0-1.0); 0.0 on error.
        """
        try:
            return self.zone_evaluator.calculate_functional_relationships(detected_objects)
        except Exception as e:
            logger.error(f"Error in _calculate_functional_relationships: {str(e)}")
            logger.error(traceback.format_exc())
            return 0.0

    def _calculate_spatial_diversity(self, detected_objects: List[Dict]) -> float:
        """
        Compute the diversity of the objects' spatial distribution (delegates to ZoneEvaluator).

        Args:
            detected_objects: List of detected objects

        Returns:
            Spatial diversity score (0.0-1.0); 0.0 on error.
        """
        try:
            return self.zone_evaluator.calculate_spatial_diversity(detected_objects)
        except Exception as e:
            logger.error(f"Error in _calculate_spatial_diversity: {str(e)}")
            logger.error(traceback.format_exc())
            return 0.0

    def _get_complexity_threshold(self, scene_type: str) -> float:
        """
        Return the complexity threshold for a scene type (delegates to ZoneEvaluator).

        Args:
            scene_type: Scene type

        Returns:
            Complexity threshold (0.0-1.0); 0.55 on error.
        """
        try:
            return self.zone_evaluator.get_complexity_threshold(scene_type)
        except Exception as e:
            logger.error(f"Error in _get_complexity_threshold: {str(e)}")
            logger.error(traceback.format_exc())
            return 0.55

    def _create_distribution_map(self, detected_objects: List[Dict]) -> Dict:
        """
        Create a detailed per-region object distribution map (delegates to RegionAnalyzer).

        Args:
            detected_objects: List of detected objects

        Returns:
            Dictionary with per-region distribution details; empty on error.
        """
        try:
            return self.region_analyzer.create_distribution_map(detected_objects)
        except Exception as e:
            logger.error(f"Error in _create_distribution_map: {str(e)}")
            logger.error(traceback.format_exc())
            return {}

    def _find_main_region(self, region_objects_dict: Dict) -> str:
        """
        Find the region that holds the most objects.

        Args:
            region_objects_dict: Mapping of region name to its objects

        Returns:
            Main region name; "unknown" for empty input or on error.
        """
        try:
            if not region_objects_dict:
                return "unknown"

            return max(region_objects_dict.items(),
                       key=lambda x: len(x[1]),
                       default=("unknown", []))[0]
        except Exception as e:
            logger.error(f"Error in _find_main_region: {str(e)}")
            logger.error(traceback.format_exc())
            return "unknown"

    def _detect_cross_pattern(self, positions):
        """Detect a cross pattern in positions (delegates to SceneViewpointAnalyzer); False on error."""
        try:
            return self.scene_viewpoint_analyzer._detect_cross_pattern(positions)
        except Exception as e:
            logger.error(f"Error in _detect_cross_pattern: {str(e)}")
            return False

    def _analyze_movement_directions(self, positions):
        """Analyse movement directions in positions (delegates to SceneViewpointAnalyzer); [] on error."""
        try:
            return self.scene_viewpoint_analyzer._analyze_movement_directions(positions)
        except Exception as e:
            logger.error(f"Error in _analyze_movement_directions: {str(e)}")
            return []

    def _get_directional_description(self, region: str) -> str:
        """Convert a region name to a directional description (delegates to RegionAnalyzer); "central" on error."""
        try:
            return self.region_analyzer.get_directional_description(region)
        except Exception as e:
            logger.error(f"Error in _get_directional_description: {str(e)}")
            return "central"

    @property
    def regions(self):
        """Backward-compatible access to the region definitions held by RegionAnalyzer."""
        return self.region_analyzer.regions
template_manager.py ADDED
@@ -0,0 +1,2150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import traceback
3
+ import re
4
+ import random
5
+ from typing import Dict, List, Optional, Any
6
+ import json
7
+
8
+ from scene_detail_templates import SCENE_DETAIL_TEMPLATES
9
+ from object_template_fillers import OBJECT_TEMPLATE_FILLERS
10
+ from viewpoint_templates import VIEWPOINT_TEMPLATES
11
+ from cultural_templates import CULTURAL_TEMPLATES
12
+ from lighting_conditions import LIGHTING_CONDITIONS
13
+ from confidence_templates import CONFIDENCE_TEMPLATES
14
+
15
class TemplateLoadingError(Exception):
    """Custom exception for errors related to loading or processing templates."""
18
+
19
class TemplateFillError(Exception):
    """Custom exception type; presumably raised when filling a template fails (no raise site visible here)."""
21
+
22
+ class TemplateManager:
23
+ """
24
+ 模板管理器 - 負責描述模板的載入、管理和填充
25
+
26
+ 此class 管理所有用於場景描述生成的模板資源,提供模板填充功能,
27
+ 並根據場景類型、物體檢測結果和上下文的資訊給出適當的描述內容。
28
+ """
29
+
30
+ def __init__(self, custom_templates_db: Optional[Dict] = None):
31
+ """
32
+ 初始化模板管理器
33
+
34
+ Args:
35
+ custom_templates_db: 可選的自定義模板數據庫,如果提供則會與默認模板合併
36
+ """
37
+ self.logger = logging.getLogger(self.__class__.__name__)
38
+ self.template_registry = {}
39
+
40
+ try:
41
+ # 載入模板數據庫
42
+ self.templates = self._load_templates()
43
+
44
+ # 如果提供了自定義模板,則進行合併
45
+ if custom_templates_db:
46
+ self._merge_custom_templates(custom_templates_db)
47
+
48
+ # 驗證模板完整性
49
+ self._validate_templates()
50
+
51
+ self.logger.info("TemplateManager initialized successfully with %d template categories",
52
+ len(self.templates))
53
+
54
+ except Exception as e:
55
+ error_msg = f"Failed to initialize TemplateManager: {str(e)}"
56
+ self.logger.error(f"{error_msg}\n{traceback.format_exc()}")
57
+ # 初始化基本的空模板
58
+ self.templates = self._initialize_fallback_templates()
59
+
60
+ def _load_templates(self) -> Dict:
61
+ """
62
+ 載入所有描述模板
63
+
64
+ Returns:
65
+ Dict: 包含所有模板類別的字典
66
+ """
67
+ try:
68
+ templates = {}
69
+
70
+ # 載入場景詳細描述模板
71
+ self.logger.debug("Loading scene detail templates")
72
+ try:
73
+ templates["scene_detail_templates"] = SCENE_DETAIL_TEMPLATES
74
+ except NameError:
75
+ self.logger.warning("SCENE_DETAIL_TEMPLATES not defined, using empty dict")
76
+ templates["scene_detail_templates"] = {}
77
+
78
+ # 載入物體模板填充器
79
+ self.logger.debug("Loading object template fillers")
80
+ try:
81
+ templates["object_template_fillers"] = OBJECT_TEMPLATE_FILLERS
82
+ except NameError:
83
+ self.logger.warning("OBJECT_TEMPLATE_FILLERS not defined, using empty dict")
84
+ templates["object_template_fillers"] = {}
85
+
86
+ # 載入視角模板
87
+ self.logger.debug("Loading viewpoint templates")
88
+ try:
89
+ templates["viewpoint_templates"] = VIEWPOINT_TEMPLATES
90
+ except NameError:
91
+ self.logger.warning("VIEWPOINT_TEMPLATES not defined, using empty dict")
92
+ templates["viewpoint_templates"] = {}
93
+
94
+ # 載入文化模板
95
+ self.logger.debug("Loading cultural templates")
96
+ try:
97
+ templates["cultural_templates"] = CULTURAL_TEMPLATES
98
+ except NameError:
99
+ self.logger.warning("CULTURAL_TEMPLATES not defined, using empty dict")
100
+ templates["cultural_templates"] = {}
101
+
102
+ # 從照明條件模組載入照明模板
103
+ self.logger.debug("Loading lighting templates")
104
+ try:
105
+ templates["lighting_templates"] = self._extract_lighting_templates()
106
+ except Exception as e:
107
+ self.logger.warning(f"Failed to extract lighting templates: {str(e)}")
108
+ templates["lighting_templates"] = {}
109
+
110
+ # 載入信心度模板
111
+ self.logger.debug("Loading confidence templates")
112
+ try:
113
+ templates["confidence_templates"] = CONFIDENCE_TEMPLATES
114
+ except NameError:
115
+ self.logger.warning("CONFIDENCE_TEMPLATES not defined, using empty dict")
116
+ templates["confidence_templates"] = {}
117
+
118
+ # 初始化默認模板(當成備份)
119
+ self._initialize_default_templates(templates)
120
+
121
+ self.logger.info("Successfully loaded %d template categories", len(templates))
122
+ return templates
123
+
124
+ except Exception as e:
125
+ error_msg = f"Unexpected error during template loading: {str(e)}"
126
+ self.logger.error(f"{error_msg}\n{traceback.format_exc()}")
127
+ # 返回基本模板
128
+ return self._initialize_fallback_templates()
129
+
130
+ def _initialize_template_registry(self) -> Dict[str, Dict[str, Any]]:
131
+ """
132
+ 初始化模板,包含各種場景類型的結構化模板
133
+
134
+ Returns:
135
+ Dict[str, Dict[str, Any]]: 模板註冊表字典
136
+ """
137
+ try:
138
+ template_registry = {
139
+ "indoor_detailed": {
140
+ "scene_type": "indoor",
141
+ "complexity": "high",
142
+ "structure": [
143
+ {
144
+ "type": "opening",
145
+ "content": "This indoor scene presents a comprehensive view of a well-organized living space."
146
+ },
147
+ {
148
+ "type": "zone_analysis",
149
+ "priority": "functional_areas",
150
+ "detail_level": "detailed"
151
+ },
152
+ {
153
+ "type": "object_summary",
154
+ "grouping": "by_category",
155
+ "include_counts": True
156
+ },
157
+ {
158
+ "type": "conclusion",
159
+ "style": "analytical"
160
+ }
161
+ ]
162
+ },
163
+
164
+ "indoor_moderate": {
165
+ "scene_type": "indoor",
166
+ "complexity": "medium",
167
+ "structure": [
168
+ {
169
+ "type": "opening",
170
+ "content": "The indoor environment displays organized functional areas."
171
+ },
172
+ {
173
+ "type": "zone_analysis",
174
+ "priority": "main_areas",
175
+ "detail_level": "moderate"
176
+ },
177
+ {
178
+ "type": "object_summary",
179
+ "grouping": "by_function",
180
+ "include_counts": False
181
+ },
182
+ {
183
+ "type": "conclusion",
184
+ "style": "descriptive"
185
+ }
186
+ ]
187
+ },
188
+
189
+ "indoor_simple": {
190
+ "scene_type": "indoor",
191
+ "complexity": "low",
192
+ "structure": [
193
+ {
194
+ "type": "opening",
195
+ "content": "An indoor space with visible furniture and household items."
196
+ },
197
+ {
198
+ "type": "zone_analysis",
199
+ "priority": "basic_areas",
200
+ "detail_level": "simple"
201
+ },
202
+ {
203
+ "type": "object_summary",
204
+ "grouping": "general",
205
+ "include_counts": False
206
+ }
207
+ ]
208
+ },
209
+
210
+ "outdoor_detailed": {
211
+ "scene_type": "outdoor",
212
+ "complexity": "high",
213
+ "structure": [
214
+ {
215
+ "type": "opening",
216
+ "content": "This outdoor scene captures a dynamic urban environment with multiple activity zones."
217
+ },
218
+ {
219
+ "type": "zone_analysis",
220
+ "priority": "activity_areas",
221
+ "detail_level": "detailed"
222
+ },
223
+ {
224
+ "type": "object_summary",
225
+ "grouping": "by_location",
226
+ "include_counts": True
227
+ },
228
+ {
229
+ "type": "conclusion",
230
+ "style": "environmental"
231
+ }
232
+ ]
233
+ },
234
+
235
+ "outdoor_moderate": {
236
+ "scene_type": "outdoor",
237
+ "complexity": "medium",
238
+ "structure": [
239
+ {
240
+ "type": "opening",
241
+ "content": "The outdoor scene shows organized public spaces and pedestrian areas."
242
+ },
243
+ {
244
+ "type": "zone_analysis",
245
+ "priority": "public_areas",
246
+ "detail_level": "moderate"
247
+ },
248
+ {
249
+ "type": "object_summary",
250
+ "grouping": "by_type",
251
+ "include_counts": False
252
+ },
253
+ {
254
+ "type": "conclusion",
255
+ "style": "observational"
256
+ }
257
+ ]
258
+ },
259
+
260
+ "outdoor_simple": {
261
+ "scene_type": "outdoor",
262
+ "complexity": "low",
263
+ "structure": [
264
+ {
265
+ "type": "opening",
266
+ "content": "An outdoor area with pedestrians and urban elements."
267
+ },
268
+ {
269
+ "type": "zone_analysis",
270
+ "priority": "basic_areas",
271
+ "detail_level": "simple"
272
+ },
273
+ {
274
+ "type": "object_summary",
275
+ "grouping": "general",
276
+ "include_counts": False
277
+ }
278
+ ]
279
+ },
280
+
281
+ "commercial_detailed": {
282
+ "scene_type": "commercial",
283
+ "complexity": "high",
284
+ "structure": [
285
+ {
286
+ "type": "opening",
287
+ "content": "This commercial environment demonstrates organized retail and customer service areas."
288
+ },
289
+ {
290
+ "type": "zone_analysis",
291
+ "priority": "service_areas",
292
+ "detail_level": "detailed"
293
+ },
294
+ {
295
+ "type": "object_summary",
296
+ "grouping": "by_function",
297
+ "include_counts": True
298
+ },
299
+ {
300
+ "type": "conclusion",
301
+ "style": "business"
302
+ }
303
+ ]
304
+ },
305
+
306
+ "transportation_detailed": {
307
+ "scene_type": "transportation",
308
+ "complexity": "high",
309
+ "structure": [
310
+ {
311
+ "type": "opening",
312
+ "content": "This transportation hub features organized passenger facilities and transit infrastructure."
313
+ },
314
+ {
315
+ "type": "zone_analysis",
316
+ "priority": "transit_areas",
317
+ "detail_level": "detailed"
318
+ },
319
+ {
320
+ "type": "object_summary",
321
+ "grouping": "by_transit_function",
322
+ "include_counts": True
323
+ },
324
+ {
325
+ "type": "conclusion",
326
+ "style": "infrastructure"
327
+ }
328
+ ]
329
+ },
330
+
331
+ "default": {
332
+ "scene_type": "general",
333
+ "complexity": "medium",
334
+ "structure": [
335
+ {
336
+ "type": "opening",
337
+ "content": "The scene displays various elements organized across functional areas."
338
+ },
339
+ {
340
+ "type": "zone_analysis",
341
+ "priority": "general_areas",
342
+ "detail_level": "moderate"
343
+ },
344
+ {
345
+ "type": "object_summary",
346
+ "grouping": "general",
347
+ "include_counts": False
348
+ },
349
+ {
350
+ "type": "conclusion",
351
+ "style": "general"
352
+ }
353
+ ]
354
+ }
355
+ }
356
+
357
+ self.logger.debug(f"Initialized template registry with {len(template_registry)} templates")
358
+ return template_registry
359
+
360
+ except Exception as e:
361
+ error_msg = f"Error initializing template registry: {str(e)}"
362
+ self.logger.error(f"{error_msg}\n{traceback.format_exc()}")
363
+ # 返回最基本的註冊表
364
+ return {
365
+ "default": {
366
+ "scene_type": "general",
367
+ "complexity": "low",
368
+ "structure": [
369
+ {
370
+ "type": "opening",
371
+ "content": "Scene analysis completed with identified objects and areas."
372
+ }
373
+ ]
374
+ }
375
+ }
376
+
377
def get_template_by_scene_type(self, scene_type: str, detected_objects: List[Dict],
                               functional_zones: Dict) -> str:
    """
    Select a template for the given scene type and return it in standardized form.

    A variant key ("complex", "moderate" or "simple") is chosen from the number
    of functional zones and the ``total_objects`` statistic. When the scene type
    has no entry in ``self.templates`` a generic template is used instead.

    Args:
        scene_type: Scene type, used as the lookup key in ``self.templates``.
        detected_objects: List of detected object dictionaries.
        functional_zones: Functional zones; only the number of entries is used.

    Returns:
        str: The result of standardizing the selected template, or the fallback
        template if anything goes wrong during selection.
    """
    try:
        object_stats = self._analyze_scene_composition(detected_objects)
        zone_count = len(functional_zones) if functional_zones else 0

        if scene_type in self.templates:
            scene_templates = self.templates[scene_type]
            total_objects = object_stats.get("total_objects", 0)

            # Pick the variant that matches how busy the scene is.
            if zone_count >= 3 and total_objects >= 10:
                template_key = "complex"
            elif zone_count >= 2 or total_objects >= 5:
                template_key = "moderate"
            else:
                template_key = "simple"

            if template_key in scene_templates:
                raw_template = scene_templates[template_key]
            elif "default" in scene_templates:
                raw_template = scene_templates["default"]
            else:
                # Last resort: the first variant registered for this scene type.
                # An empty mapping raises here and is handled below.
                raw_template = next(iter(scene_templates.values()))
        else:
            # No scene-specific templates: fall back to a generic one.
            raw_template = self._get_generic_template(object_stats, zone_count)

        return self._standardize_template_format(raw_template)

    except Exception as e:
        # Use the instance logger like the rest of the class; a bare `logger`
        # name is not guaranteed to exist and would turn this fallback into a
        # NameError.
        self.logger.error(f"Error selecting template for scene type '{scene_type}': {str(e)}")
        return self._get_fallback_template()
422
+
423
+ def _analyze_scene_composition(self, detected_objects: List[Dict]) -> Dict:
424
+ """
425
+ 分析場景組成以確定模板複雜度
426
+
427
+ Args:
428
+ detected_objects: 檢測到的物件列表
429
+
430
+ Returns:
431
+ Dict: 場景組成統計信息
432
+ """
433
+ try:
434
+ total_objects = len(detected_objects)
435
+
436
+ # 統計不同類型的物件
437
+ object_categories = {}
438
+ for obj in detected_objects:
439
+ class_name = obj.get("class_name", "unknown")
440
+ object_categories[class_name] = object_categories.get(class_name, 0) + 1
441
+
442
+ # 計算場景多樣性
443
+ unique_categories = len(object_categories)
444
+
445
+ return {
446
+ "total_objects": total_objects,
447
+ "unique_categories": unique_categories,
448
+ "category_distribution": object_categories,
449
+ "complexity_score": min(total_objects * 0.3 + unique_categories * 0.7, 10)
450
+ }
451
+
452
+ except Exception as e:
453
+ logger.warning(f"Error analyzing scene composition: {str(e)}")
454
+ return {"total_objects": 0, "unique_categories": 0, "complexity_score": 0}
455
+
456
+ def _get_generic_template(self, object_stats: Dict, zone_count: int) -> str:
457
+ """
458
+ 獲取通用模板
459
+
460
+ Args:
461
+ object_stats: 物件統計信息
462
+ zone_count: 功能區域數量
463
+
464
+ Returns:
465
+ str: 通用模板字符串
466
+ """
467
+ try:
468
+ complexity_score = object_stats.get("complexity_score", 0)
469
+
470
+ if complexity_score >= 7 or zone_count >= 3:
471
+ return "This scene presents a comprehensive view featuring {functional_area} with {primary_objects}. The spatial organization demonstrates {spatial_arrangement} across multiple {activity_areas}, creating a dynamic environment with diverse elements and clear functional zones."
472
+ elif complexity_score >= 4 or zone_count >= 2:
473
+ return "The scene displays {functional_area} containing {primary_objects}. The arrangement shows {spatial_organization} with distinct areas serving different purposes within the overall space."
474
+ else:
475
+ return "A {scene_description} featuring {primary_objects} arranged in {basic_layout} within the visible area."
476
+
477
+ except Exception as e:
478
+ logger.warning(f"Error getting generic template: {str(e)}")
479
+ return self._get_fallback_template()
480
+
481
+ def _get_fallback_template(self) -> str:
482
+ """
483
+ 獲取備用模板
484
+
485
+ Returns:
486
+ str: 備用模板字符串
487
+ """
488
+ return "A scene featuring various elements and organized areas of activity within the visible space."
489
+
490
+ def _standardize_template_format(self, template: str) -> str:
491
+ """
492
+ 標準化模板格式,確保佔位符和表達方式符合自然語言要求
493
+
494
+ Args:
495
+ template: 原始模板字符串
496
+
497
+ Returns:
498
+ str: 標準化後的模板字符串
499
+ """
500
+ try:
501
+ if not template:
502
+ return self._get_fallback_template()
503
+
504
+ import re
505
+ standardized = template
506
+
507
+ # 標準化佔位符格式,移除技術性標記
508
+ placeholder_mapping = {
509
+ r'\{zone_\d+\}': '{functional_area}',
510
+ r'\{object_group_\d+\}': '{primary_objects}',
511
+ r'\{region_\d+\}': '{spatial_area}',
512
+ r'\{category_\d+\}': '{object_category}',
513
+ r'\{area_\d+\}': '{activity_area}',
514
+ r'\{section_\d+\}': '{scene_section}'
515
+ }
516
+
517
+ for pattern, replacement in placeholder_mapping.items():
518
+ standardized = re.sub(pattern, replacement, standardized)
519
+
520
+ # 標準化常見的技術性術語
521
+ term_replacements = {
522
+ 'functional_zones': 'areas of activity',
523
+ 'object_detection': 'visible elements',
524
+ 'category_regions': 'organized sections',
525
+ 'spatial_distribution': 'arrangement throughout the space',
526
+ 'viewpoint_analysis': 'perspective view'
527
+ }
528
+
529
+ for tech_term, natural_term in term_replacements.items():
530
+ standardized = standardized.replace(tech_term, natural_term)
531
+
532
+ # 確保模板語法的自然性
533
+ standardized = self._improve_template_readability(standardized)
534
+
535
+ return standardized
536
+
537
+ except Exception as e:
538
+ logger.warning(f"Error standardizing template format: {str(e)}")
539
+ return template if template else self._get_fallback_template()
540
+
541
+ def _improve_template_readability(self, template: str) -> str:
542
+ """
543
+ 改善模板的可讀性和自然性
544
+
545
+ Args:
546
+ template: 模板字符串
547
+
548
+ Returns:
549
+ str: 改善後的模板字符串
550
+ """
551
+ try:
552
+ import re
553
+
554
+ # 移除多餘的空格和換行
555
+ improved = re.sub(r'\s+', ' ', template).strip()
556
+
557
+ # 改善句子連接
558
+ improved = improved.replace(' . ', '. ')
559
+ improved = improved.replace(' , ', ', ')
560
+ improved = improved.replace(' ; ', '; ')
561
+
562
+ # 確保適當的句號結尾
563
+ if improved and not improved.endswith(('.', '!', '?')):
564
+ improved += '.'
565
+
566
+ # 改善常見的表達問題
567
+ readability_fixes = [
568
+ (r'\bthe the\b', 'the'),
569
+ (r'\ba a\b', 'a'),
570
+ (r'\ban an\b', 'an'),
571
+ (r'\bwith with\b', 'with'),
572
+ (r'\bin in\b', 'in'),
573
+ (r'\bof of\b', 'of'),
574
+ (r'\band and\b', 'and')
575
+ ]
576
+
577
+ for pattern, replacement in readability_fixes:
578
+ improved = re.sub(pattern, replacement, improved, flags=re.IGNORECASE)
579
+
580
+ return improved
581
+
582
+ except Exception as e:
583
+ logger.warning(f"Error improving template readability: {str(e)}")
584
+ return template
585
+
586
+ def _extract_lighting_templates(self) -> Dict:
587
+ """
588
+ 從照明條件模組提取照明描述模板
589
+
590
+ Returns:
591
+ Dict: 照明模板字典
592
+ """
593
+ try:
594
+ lighting_templates = {}
595
+
596
+ # 從 LIGHTING_CONDITIONS 提取時間描述
597
+ time_descriptions = LIGHTING_CONDITIONS.get("time_descriptions", {})
598
+
599
+ for time_key, time_data in time_descriptions.items():
600
+ if isinstance(time_data, dict) and "general" in time_data:
601
+ lighting_templates[time_key] = time_data["general"]
602
+ else:
603
+ # 如果數據結構不符合預期,使用備用描述
604
+ lighting_templates[time_key] = f"The scene is captured during {time_key.replace('_', ' ')}."
605
+
606
+ # 確保至少有基本的照明模板
607
+ if not lighting_templates:
608
+ self.logger.warning("No lighting templates found, using defaults")
609
+ lighting_templates = self._get_default_lighting_templates()
610
+
611
+ self.logger.debug("Extracted %d lighting templates", len(lighting_templates))
612
+ return lighting_templates
613
+
614
+ except Exception as e:
615
+ self.logger.warning(f"Error extracting lighting templates: {str(e)}, using defaults")
616
+ return self._get_default_lighting_templates()
617
+
618
+ def _get_default_lighting_templates(self) -> Dict:
619
+ """獲取默認照明模板"""
620
+ return {
621
+ "day_clear": "The scene is captured during clear daylight conditions.",
622
+ "day_overcast": "The scene is captured during overcast daylight.",
623
+ "night": "The scene is captured at night with artificial lighting.",
624
+ "dawn": "The scene is captured during dawn with soft natural lighting.",
625
+ "dusk": "The scene is captured during dusk with diminishing natural light.",
626
+ "unknown": "The lighting conditions are not clearly identifiable."
627
+ }
628
+
629
+ def _initialize_default_templates(self, templates: Dict):
630
+ """
631
+ 初始化默認模板作為備份機制
632
+
633
+ Args:
634
+ templates: 要檢查和補充的模板字典
635
+ """
636
+ try:
637
+ # 置信度模板備份
638
+ if "confidence_templates" not in templates or not templates["confidence_templates"]:
639
+ templates["confidence_templates"] = {
640
+ "high": "{description} {details}",
641
+ "medium": "This appears to be {description} {details}",
642
+ "low": "This might be {description}, but the confidence is low. {details}"
643
+ }
644
+
645
+ # 場景詳細模板備份
646
+ if "scene_detail_templates" not in templates or not templates["scene_detail_templates"]:
647
+ templates["scene_detail_templates"] = {
648
+ "default": ["A scene with various elements and objects."]
649
+ }
650
+
651
+ # 物體填充模板備份
652
+ if "object_template_fillers" not in templates or not templates["object_template_fillers"]:
653
+ templates["object_template_fillers"] = {
654
+ "default": ["various items", "different objects", "multiple elements"]
655
+ }
656
+
657
+ # 視角模板備份
658
+ if "viewpoint_templates" not in templates or not templates["viewpoint_templates"]:
659
+ templates["viewpoint_templates"] = {
660
+ "eye_level": {
661
+ "prefix": "From eye level, ",
662
+ "observation": "the scene is viewed straight ahead.",
663
+ "short_desc": "at eye level"
664
+ },
665
+ "aerial": {
666
+ "prefix": "From above, ",
667
+ "observation": "the scene is viewed from a bird's-eye perspective.",
668
+ "short_desc": "from above"
669
+ },
670
+ "low_angle": {
671
+ "prefix": "From a low angle, ",
672
+ "observation": "the scene is viewed from below looking upward.",
673
+ "short_desc": "from below"
674
+ },
675
+ "elevated": {
676
+ "prefix": "From an elevated position, ",
677
+ "observation": "the scene is viewed from a higher vantage point.",
678
+ "short_desc": "from an elevated position"
679
+ }
680
+ }
681
+
682
+ # 文化模板備份
683
+ if "cultural_templates" not in templates or not templates["cultural_templates"]:
684
+ templates["cultural_templates"] = {
685
+ "asian": {
686
+ "elements": ["traditional architectural elements", "cultural signage", "Asian design features"],
687
+ "description": "The scene displays distinctive Asian cultural characteristics with {elements}."
688
+ },
689
+ "european": {
690
+ "elements": ["classical architecture", "European design elements", "historic features"],
691
+ "description": "The scene exhibits European architectural and cultural elements including {elements}."
692
+ }
693
+ }
694
+
695
+ self.logger.debug("Default templates initialized as backup")
696
+
697
+ except Exception as e:
698
+ self.logger.error(f"Error initializing default templates: {str(e)}")
699
+
700
+ def _merge_custom_templates(self, custom_templates: Dict):
701
+ """
702
+ 合併自定義模板到現有模板庫
703
+
704
+ Args:
705
+ custom_templates: 自定義模板字典
706
+ """
707
+ try:
708
+ for template_category, custom_content in custom_templates.items():
709
+ if template_category in self.templates:
710
+ if isinstance(self.templates[template_category], dict) and isinstance(custom_content, dict):
711
+ self.templates[template_category].update(custom_content)
712
+ self.logger.debug(f"Merged custom templates for category: {template_category}")
713
+ else:
714
+ self.templates[template_category] = custom_content
715
+ self.logger.debug(f"Replaced templates for category: {template_category}")
716
+ else:
717
+ self.templates[template_category] = custom_content
718
+ self.logger.debug(f"Added new template category: {template_category}")
719
+
720
+ self.logger.info("Successfully merged custom templates")
721
+
722
+ except Exception as e:
723
+ self.logger.warning(f"Error merging custom templates: {str(e)}")
724
+
725
+ def _validate_templates(self):
726
+ """
727
+ 驗證模板完整性和有效性
728
+ """
729
+ try:
730
+ required_categories = [
731
+ "scene_detail_templates",
732
+ "object_template_fillers",
733
+ "viewpoint_templates",
734
+ "cultural_templates",
735
+ "lighting_templates",
736
+ "confidence_templates"
737
+ ]
738
+
739
+ missing_categories = []
740
+ for category in required_categories:
741
+ if category not in self.templates:
742
+ missing_categories.append(category)
743
+ elif not self.templates[category]:
744
+ self.logger.warning(f"Template category '{category}' is empty")
745
+
746
+ if missing_categories:
747
+ error_msg = f"Missing required template categories: {missing_categories}"
748
+ self.logger.warning(error_msg)
749
+ # 為缺失的類別創建空模板
750
+ for category in missing_categories:
751
+ self.templates[category] = {}
752
+
753
+ # 驗證視角模板結構
754
+ self._validate_viewpoint_templates()
755
+
756
+ # 驗證文化模板結構
757
+ self._validate_cultural_templates()
758
+
759
+ self.logger.debug("Template validation completed successfully")
760
+
761
+ except Exception as e:
762
+ error_msg = f"Template validation failed: {str(e)}"
763
+ self.logger.error(f"{error_msg}\n{traceback.format_exc()}")
764
+
765
+ def _validate_viewpoint_templates(self):
766
+ """驗證視角模板結構"""
767
+ viewpoint_templates = self.templates.get("viewpoint_templates", {})
768
+
769
+ for viewpoint, template_data in viewpoint_templates.items():
770
+ if not isinstance(template_data, dict):
771
+ self.logger.warning(f"Invalid viewpoint template structure for '{viewpoint}'")
772
+ continue
773
+
774
+ required_keys = ["prefix", "observation"]
775
+ for key in required_keys:
776
+ if key not in template_data:
777
+ self.logger.warning(f"Missing '{key}' in viewpoint template '{viewpoint}'")
778
+
779
+ def _validate_cultural_templates(self):
780
+ """驗證文化模板結構"""
781
+ cultural_templates = self.templates.get("cultural_templates", {})
782
+
783
+ for culture, template_data in cultural_templates.items():
784
+ if not isinstance(template_data, dict):
785
+ self.logger.warning(f"Invalid cultural template structure for '{culture}'")
786
+ continue
787
+
788
+ if "elements" not in template_data or "description" not in template_data:
789
+ self.logger.warning(f"Missing required keys in cultural template '{culture}'")
790
+
791
def get_template(self, category: str, key: Optional[str] = None) -> Any:
    """
    Look up a template category, or a single template inside it.

    Args:
        category: Template category name.
        key: Optional key of one template within the category.

    Returns:
        Any: The whole category when ``key`` is None, otherwise the template
        stored under ``key``. A missing category yields ``{}`` (no key given)
        or ``""`` (key given); a missing key or a non-dict category yields ``""``.
    """
    whole_category_requested = key is None

    try:
        if category not in self.templates:
            self.logger.warning(f"Template category '{category}' not found")
            return {} if whole_category_requested else ""

        category_templates = self.templates[category]
        if whole_category_requested:
            return category_templates

        if not isinstance(category_templates, dict):
            self.logger.warning(f"Template category '{category}' is not a dictionary")
            return ""

        if key in category_templates:
            return category_templates[key]

        self.logger.warning(f"Template key '{key}' not found in category '{category}'")
        return ""

    except Exception as e:
        error_msg = f"Error retrieving template {category}.{key}: {str(e)}"
        self.logger.error(error_msg)
        return {} if whole_category_requested else ""
825
+
826
def fill_template(self, template: str, detected_objects: List[Dict], scene_type: str,
                  places365_info: Optional[Dict] = None,
                  object_statistics: Optional[Dict] = None) -> str:
    """
    Replace every ``{placeholder}`` in a template, tolerating individual failures.

    Args:
        template: Template string containing placeholders.
        detected_objects: List of detected object dictionaries.
        scene_type: Scene type.
        places365_info: Places365 scene classification information.
        object_statistics: Per-class object statistics.

    Returns:
        str: The filled and post-processed template. A blank template yields
        "A scene with various elements."; if filling fails entirely, a fallback
        description built from the scene type and detections is returned.
    """
    try:
        self.logger.debug(f"Filling template for scene_type: {scene_type}")

        if not template or not template.strip():
            return "A scene with various elements."

        # Remove patterns that are known to cause trouble before filling.
        template = self._preprocess_template(template)

        # Each distinct placeholder is resolved once, in first-seen order: a
        # single str.replace below already substitutes all of its occurrences.
        placeholders = list(dict.fromkeys(re.findall(r'\{([^}]+)\}', template)))
        filled_template = template

        fillers = self.templates.get("object_template_fillers", {})

        statistics_based_replacements = self._generate_statistics_replacements(object_statistics)
        default_replacements = self._generate_default_replacements()
        places365_replacements = self._generate_places365_replacements(places365_info)

        # Later dicts win: statistics > Places365 > defaults.
        all_replacements = {**default_replacements, **places365_replacements, **statistics_based_replacements}

        for placeholder in placeholders:
            token = f"{{{placeholder}}}"
            try:
                replacement = self._get_placeholder_replacement(
                    placeholder, fillers, all_replacements, detected_objects, scene_type
                )

                # Never substitute an empty or whitespace-only value.
                if not replacement or not replacement.strip():
                    replacement = self._get_emergency_replacement(placeholder)

                filled_template = filled_template.replace(token, replacement)

            except Exception as placeholder_error:
                self.logger.warning(f"Failed to replace placeholder '{placeholder}': {str(placeholder_error)}")
                # Fall back to the emergency value for this placeholder only.
                emergency_replacement = self._get_emergency_replacement(placeholder)
                filled_template = filled_template.replace(token, emergency_replacement)

        # Repair grammar/punctuation artefacts left by the substitutions.
        filled_template = self._postprocess_filled_template(filled_template)

        self.logger.debug("Template filling completed successfully")
        return filled_template

    except Exception as e:
        error_msg = f"Error filling template: {str(e)}"
        self.logger.error(f"{error_msg}\n{traceback.format_exc()}")
        # Return safe fallback content.
        return self._generate_fallback_description(scene_type, detected_objects)
905
+
906
+ def _preprocess_template(self, template: str) -> str:
907
+ """
908
+ 預處理模板,修復常見問題
909
+
910
+ Args:
911
+ template: 原始模板字符串
912
+
913
+ Returns:
914
+ str: 預處理後的模板
915
+ """
916
+ try:
917
+ # 移除可能導致問題的模式
918
+ template = re.sub(r'\{[^}]*\}\s*,\s*\{[^}]*\}', '{combined_elements}', template)
919
+
920
+ # 確保模板不以逗號開始
921
+ template = re.sub(r'^[,\s]*', '', template)
922
+
923
+ return template.strip()
924
+
925
+ except Exception as e:
926
+ self.logger.warning(f"Error preprocessing template: {str(e)}")
927
+ return template
928
+
929
+ def _get_emergency_replacement(self, placeholder: str) -> str:
930
+ """
931
+ 獲取緊急替換值,確保不會產生語法錯誤
932
+
933
+ Args:
934
+ placeholder: 佔位符名稱
935
+
936
+ Returns:
937
+ str: 安全的替換值
938
+ """
939
+ emergency_replacements = {
940
+ "crossing_pattern": "pedestrian walkways",
941
+ "pedestrian_behavior": "people moving through the area",
942
+ "traffic_pattern": "vehicle movement",
943
+ "scene_setting": "this location",
944
+ "urban_elements": "city features",
945
+ "street_elements": "urban components"
946
+ }
947
+
948
+ if placeholder in emergency_replacements:
949
+ return emergency_replacements[placeholder]
950
+
951
+ # 基於佔位符名稱生成合理的替換
952
+ cleaned = placeholder.replace('_', ' ')
953
+ if len(cleaned.split()) > 1:
954
+ return cleaned
955
+ else:
956
+ return f"various {cleaned}"
957
+
958
+ def _postprocess_filled_template(self, filled_template: str) -> str:
959
+ """
960
+ 後處理填充完成的模板,修復語法問題
961
+
962
+ Args:
963
+ filled_template: 填充後的模板字符串
964
+
965
+ Returns:
966
+ str: 修復後的模板字符串
967
+ """
968
+ try:
969
+ # 修復 "In , " 模式
970
+ filled_template = re.sub(r'\bIn\s*,\s*', 'In this scene, ', filled_template)
971
+ filled_template = re.sub(r'\bAt\s*,\s*', 'At this location, ', filled_template)
972
+ filled_template = re.sub(r'\bWithin\s*,\s*', 'Within this area, ', filled_template)
973
+
974
+ # 修復連續逗號
975
+ filled_template = re.sub(r',\s*,', ',', filled_template)
976
+
977
+ # 修復開頭的逗號
978
+ filled_template = re.sub(r'^[,\s]*', '', filled_template)
979
+
980
+ # 確保首字母大寫
981
+ if filled_template and not filled_template[0].isupper():
982
+ filled_template = filled_template[0].upper() + filled_template[1:]
983
+
984
+ # 確保以句號結尾
985
+ if filled_template and not filled_template.endswith(('.', '!', '?')):
986
+ filled_template += '.'
987
+
988
+ return filled_template.strip()
989
+
990
+ except Exception as e:
991
+ self.logger.warning(f"Error postprocessing filled template: {str(e)}")
992
+ return filled_template
993
+
994
+ def _generate_fallback_description(self, scene_type: str, detected_objects: List[Dict]) -> str:
995
+ """
996
+ 生成備用描述,當模板填充完全失敗時使用
997
+
998
+ Args:
999
+ scene_type: 場景類型
1000
+ detected_objects: 檢測到的物體列表
1001
+
1002
+ Returns:
1003
+ str: 備用描述
1004
+ """
1005
+ try:
1006
+ object_count = len(detected_objects)
1007
+
1008
+ if object_count == 0:
1009
+ return f"A {scene_type.replace('_', ' ')} scene."
1010
+ elif object_count == 1:
1011
+ return f"A {scene_type.replace('_', ' ')} scene with one visible element."
1012
+ else:
1013
+ return f"A {scene_type.replace('_', ' ')} scene with {object_count} visible elements."
1014
+
1015
+ except Exception as e:
1016
+ self.logger.warning(f"Error generating fallback description: {str(e)}")
1017
+ return "A scene with various elements."
1018
+
1019
+ def _generate_statistics_replacements(self, object_statistics: Optional[Dict]) -> Dict[str, str]:
1020
+ """
1021
+ 基於物體統計信息生成模板替換內容
1022
+
1023
+ Args:
1024
+ object_statistics: 物體統計信息
1025
+
1026
+ Returns:
1027
+ Dict[str, str]: 統計信息基礎的替換內容
1028
+ """
1029
+ replacements = {}
1030
+
1031
+ if not object_statistics:
1032
+ return replacements
1033
+
1034
+ try:
1035
+ # 處理植物元素
1036
+ if "potted plant" in object_statistics:
1037
+ count = object_statistics["potted plant"]["count"]
1038
+ if count == 1:
1039
+ replacements["plant_elements"] = "a potted plant"
1040
+ elif count <= 3:
1041
+ replacements["plant_elements"] = f"{count} potted plants"
1042
+ else:
1043
+ replacements["plant_elements"] = f"multiple potted plants ({count} total)"
1044
+
1045
+ # 處理座位
1046
+ if "chair" in object_statistics:
1047
+ count = object_statistics["chair"]["count"]
1048
+ if count == 1:
1049
+ replacements["seating"] = "a chair"
1050
+ elif count <= 4:
1051
+ replacements["seating"] = f"{count} chairs"
1052
+ else:
1053
+ replacements["seating"] = f"numerous chairs ({count} total)"
1054
+
1055
+ # 處理人員
1056
+ if "person" in object_statistics:
1057
+ count = object_statistics["person"]["count"]
1058
+ if count == 1:
1059
+ replacements["people_and_vehicles"] = "a person"
1060
+ replacements["pedestrian_flow"] = "an individual walking"
1061
+ elif count <= 5:
1062
+ replacements["people_and_vehicles"] = f"{count} people"
1063
+ replacements["pedestrian_flow"] = f"{count} people walking"
1064
+ else:
1065
+ replacements["people_and_vehicles"] = f"many people ({count} individuals)"
1066
+ replacements["pedestrian_flow"] = f"a crowd of {count} people"
1067
+
1068
+ # 處理桌子設置
1069
+ if "dining table" in object_statistics:
1070
+ count = object_statistics["dining table"]["count"]
1071
+ if count == 1:
1072
+ replacements["table_setup"] = "a dining table"
1073
+ replacements["table_description"] = "a dining surface"
1074
+ else:
1075
+ replacements["table_setup"] = f"{count} dining tables"
1076
+ replacements["table_description"] = f"{count} dining surfaces"
1077
+
1078
+ self.logger.debug(f"Generated {len(replacements)} statistics-based replacements")
1079
+
1080
+ except Exception as e:
1081
+ self.logger.warning(f"Error generating statistics replacements: {str(e)}")
1082
+
1083
+ return replacements
1084
+
1085
+ def _generate_places365_replacements(self, places365_info: Optional[Dict]) -> Dict[str, str]:
1086
+ """
1087
+ 基於Places365信息生成模板替換內容
1088
+
1089
+ Args:
1090
+ places365_info: Places365場景分類信息
1091
+
1092
+ Returns:
1093
+ Dict[str, str]: Places365基礎的替換內容
1094
+ """
1095
+ replacements = {}
1096
+
1097
+ if not places365_info or places365_info.get('confidence', 0) <= 0.35:
1098
+ replacements["places365_context"] = ""
1099
+ replacements["places365_atmosphere"] = ""
1100
+ return replacements
1101
+
1102
+ try:
1103
+ scene_label = places365_info.get('scene_label', '').replace('_', ' ')
1104
+ attributes = places365_info.get('attributes', [])
1105
+
1106
+ # 生成場景上下文
1107
+ if scene_label:
1108
+ replacements["places365_context"] = f"characteristic of a {scene_label}"
1109
+ else:
1110
+ replacements["places365_context"] = ""
1111
+
1112
+ # 生成氛圍描述
1113
+ if 'natural_lighting' in attributes:
1114
+ replacements["places365_atmosphere"] = "with natural illumination"
1115
+ elif 'artificial_lighting' in attributes:
1116
+ replacements["places365_atmosphere"] = "under artificial lighting"
1117
+ else:
1118
+ replacements["places365_atmosphere"] = ""
1119
+
1120
+ self.logger.debug("Generated Places365-based replacements")
1121
+
1122
+ except Exception as e:
1123
+ self.logger.warning(f"Error generating Places365 replacements: {str(e)}")
1124
+ replacements["places365_context"] = ""
1125
+ replacements["places365_atmosphere"] = ""
1126
+
1127
+ return replacements
1128
+
1129
+ def _generate_default_replacements(self) -> Dict[str, str]:
1130
+ """
1131
+ 生成默認的模板替換內容
1132
+
1133
+ Returns:
1134
+ Dict[str, str]: 默認替換內容
1135
+ """
1136
+ return {
1137
+
1138
+ "scene_introduction": "this scene",
1139
+ "location_prefix": "this location",
1140
+ "setting_description": "this setting",
1141
+ "area_description": "this area",
1142
+ "environment_description": "this environment",
1143
+ "spatial_introduction": "this space",
1144
+
1145
+ # 室內相關
1146
+ "furniture": "various furniture pieces",
1147
+ "seating": "comfortable seating",
1148
+ "electronics": "entertainment devices",
1149
+ "bed_type": "a bed",
1150
+ "bed_location": "room",
1151
+ "bed_description": "sleeping arrangements",
1152
+ "extras": "personal items",
1153
+ "table_setup": "a dining table and chairs",
1154
+ "table_description": "a dining surface",
1155
+ "dining_items": "dining furniture and tableware",
1156
+ "appliances": "kitchen appliances",
1157
+ "kitchen_items": "cooking utensils and dishware",
1158
+ "cooking_equipment": "cooking equipment",
1159
+ "office_equipment": "work-related furniture and devices",
1160
+ "desk_setup": "a desk and chair",
1161
+ "computer_equipment": "electronic devices",
1162
+
1163
+ # 室外/城市相關
1164
+ "traffic_description": "vehicles and pedestrians",
1165
+ "people_and_vehicles": "people and various vehicles",
1166
+ "street_elements": "urban infrastructure",
1167
+ "park_features": "benches and greenery",
1168
+ "outdoor_elements": "natural features",
1169
+ "park_description": "outdoor amenities",
1170
+ "store_elements": "merchandise displays",
1171
+ "shopping_activity": "customers browse and shop",
1172
+ "store_items": "products for sale",
1173
+
1174
+ # 高級餐廳相關
1175
+ "design_elements": "elegant decor",
1176
+ "lighting": "stylish lighting fixtures",
1177
+
1178
+ # 亞洲商業街相
1179
+ "storefront_features": "compact shops",
1180
+ "pedestrian_flow": "people walking",
1181
+ "asian_elements": "distinctive cultural elements",
1182
+ "cultural_elements": "traditional design features",
1183
+ "signage": "colorful signs",
1184
+ "street_activities": "busy urban activity",
1185
+
1186
+ # 金融區相關
1187
+ "buildings": "tall buildings",
1188
+ "traffic_elements": "vehicles",
1189
+ "skyscrapers": "high-rise buildings",
1190
+ "road_features": "wide streets",
1191
+ "architectural_elements": "modern architecture",
1192
+ "city_landmarks": "prominent structures",
1193
+
1194
+ # 十字路口相關
1195
+ "crossing_pattern": "clearly marked pedestrian crossings",
1196
+ "pedestrian_behavior": "careful pedestrian movement",
1197
+ "pedestrian_density": "multiple groups of pedestrians",
1198
+ "traffic_pattern": "well-regulated traffic flow",
1199
+ "pedestrian_flow": "steady pedestrian movement",
1200
+ "traffic_description": "active urban traffic",
1201
+ "people_and_vehicles": "pedestrians and vehicles",
1202
+ "street_elements": "urban infrastructure elements",
1203
+
1204
+ # 交通相關
1205
+ "transit_vehicles": "public transportation vehicles",
1206
+ "passenger_activity": "commuter movement",
1207
+ "transportation_modes": "various transit options",
1208
+ "passenger_needs": "waiting areas",
1209
+ "transit_infrastructure": "transit facilities",
1210
+ "passenger_movement": "commuter flow",
1211
+
1212
+ # 購物區相關
1213
+ "retail_elements": "shops and displays",
1214
+ "store_types": "various retail establishments",
1215
+ "walkway_features": "pedestrian pathways",
1216
+ "commercial_signage": "store signs",
1217
+ "consumer_behavior": "shopping activities",
1218
+
1219
+ # 空中視角相關
1220
+ "commercial_layout": "organized retail areas",
1221
+ "pedestrian_pattern": "people movement patterns",
1222
+ "gathering_features": "public gathering spaces",
1223
+ "movement_pattern": "crowd flow patterns",
1224
+ "urban_elements": "city infrastructure",
1225
+ "public_activity": "social interaction",
1226
+
1227
+ # 文化特定元素
1228
+ "stall_elements": "vendor booths",
1229
+ "lighting_features": "decorative lights",
1230
+ "food_elements": "food offerings",
1231
+ "vendor_stalls": "market stalls",
1232
+ "nighttime_activity": "evening commerce",
1233
+ "cultural_lighting": "traditional lighting",
1234
+ "night_market_sounds": "lively market sounds",
1235
+ "evening_crowd_behavior": "nighttime social activity",
1236
+ "architectural_elements": "cultural buildings",
1237
+ "religious_structures": "sacred buildings",
1238
+ "decorative_features": "ornamental designs",
1239
+ "cultural_practices": "traditional activities",
1240
+ "temple_architecture": "religious structures",
1241
+ "sensory_elements": "atmospheric elements",
1242
+ "visitor_activities": "cultural experiences",
1243
+ "ritual_activities": "ceremonial practices",
1244
+ "cultural_symbols": "meaningful symbols",
1245
+ "architectural_style": "historical buildings",
1246
+ "historic_elements": "traditional architecture",
1247
+ "urban_design": "city planning elements",
1248
+ "social_behaviors": "public interactions",
1249
+ "european_features": "European architectural details",
1250
+ "tourist_activities": "visitor activities",
1251
+ "local_customs": "regional practices",
1252
+
1253
+ # 時間特定元素
1254
+ "lighting_effects": "artificial lighting",
1255
+ "shadow_patterns": "light and shadow",
1256
+ "urban_features": "city elements",
1257
+ "illuminated_elements": "lit structures",
1258
+ "evening_activities": "nighttime activities",
1259
+ "light_sources": "lighting points",
1260
+ "lit_areas": "illuminated spaces",
1261
+ "shadowed_zones": "darker areas",
1262
+ "illuminated_signage": "bright signs",
1263
+ "colorful_lighting": "multicolored lights",
1264
+ "neon_elements": "neon signs",
1265
+ "night_crowd_behavior": "evening social patterns",
1266
+ "light_displays": "lighting installations",
1267
+ "building_features": "architectural elements",
1268
+ "nightlife_activities": "evening entertainment",
1269
+ "lighting_modifier": "bright",
1270
+
1271
+ # 混合環境元素
1272
+ "transitional_elements": "connecting features",
1273
+ "indoor_features": "interior elements",
1274
+ "outdoor_setting": "exterior spaces",
1275
+ "interior_amenities": "inside comforts",
1276
+ "exterior_features": "outside elements",
1277
+ "inside_elements": "interior design",
1278
+ "outside_spaces": "outdoor areas",
1279
+ "dual_environment_benefits": "combined settings",
1280
+ "passenger_activities": "waiting behaviors",
1281
+ "transportation_types": "transit vehicles",
1282
+ "sheltered_elements": "covered areas",
1283
+ "exposed_areas": "open sections",
1284
+ "waiting_behaviors": "passenger activities",
1285
+ "indoor_facilities": "inside services",
1286
+ "platform_features": "transit platform elements",
1287
+ "transit_routines": "transportation procedures",
1288
+
1289
+ # 專門場所元素
1290
+ "seating_arrangement": "spectator seating",
1291
+ "playing_surface": "athletic field",
1292
+ "sporting_activities": "sports events",
1293
+ "spectator_facilities": "viewer accommodations",
1294
+ "competition_space": "sports arena",
1295
+ "sports_events": "athletic competitions",
1296
+ "viewing_areas": "audience sections",
1297
+ "field_elements": "field markings and equipment",
1298
+ "game_activities": "competitive play",
1299
+ "construction_equipment": "building machinery",
1300
+ "building_materials": "construction supplies",
1301
+ "construction_activities": "building work",
1302
+ "work_elements": "construction tools",
1303
+ "structural_components": "building structures",
1304
+ "site_equipment": "construction gear",
1305
+ "raw_materials": "building supplies",
1306
+ "construction_process": "building phases",
1307
+ "medical_elements": "healthcare equipment",
1308
+ "clinical_activities": "medical procedures",
1309
+ "facility_design": "healthcare layout",
1310
+ "healthcare_features": "medical facilities",
1311
+ "patient_interactions": "care activities",
1312
+ "equipment_types": "medical devices",
1313
+ "care_procedures": "health services",
1314
+ "treatment_spaces": "clinical areas",
1315
+ "educational_furniture": "learning furniture",
1316
+ "learning_activities": "educational practices",
1317
+ "instructional_design": "teaching layout",
1318
+ "classroom_elements": "school equipment",
1319
+ "teaching_methods": "educational approaches",
1320
+ "student_engagement": "learning participation",
1321
+ "learning_spaces": "educational areas",
1322
+ "educational_tools": "teaching resources",
1323
+ "knowledge_transfer": "learning exchanges"
1324
+ }
1325
+
1326
def _generate_objects_summary(self, detected_objects: List[Dict]) -> str:
    """
    Build a short natural-language summary of the detected objects.

    Detections are grouped by class name and ranked by an importance score
    (``count * 0.6 + average_confidence * 0.4``). The five highest-ranked
    classes are joined into an English list such as
    ``"2 traffic lights, a car, and an apple"``.

    Args:
        detected_objects: Detection dicts. ``class_name`` and ``confidence``
            are read; missing values default to ``"unknown"`` and ``0.5``.

    Returns:
        str: The summary phrase, or ``"various elements"`` when there is
        nothing to describe or an error occurs.
    """
    try:
        if not detected_objects:
            return "various elements"

        # Per-class tallies. Dict insertion order keeps equally ranked
        # classes in first-seen order after the stable sort below.
        object_counts = {}
        for obj in detected_objects:
            # A blank or missing name collapses to "unknown" so the article
            # lookup below never indexes into an empty string.
            class_name = obj.get("class_name") or "unknown"
            confidence = obj.get("confidence", 0.5)

            stats = object_counts.setdefault(class_name, {"count": 0, "total_confidence": 0})
            stats["count"] += 1
            stats["total_confidence"] += confidence

        # Importance combines how often a class appears with how confident
        # the detector was on average.
        ranked = []
        for class_name, stats in object_counts.items():
            count = stats["count"]
            avg_confidence = stats["total_confidence"] / count
            importance_score = (count * 0.6) + (avg_confidence * 0.4)
            ranked.append((class_name, count, importance_score))

        ranked.sort(key=lambda item: item[2], reverse=True)

        # Describe only the five most important classes.
        descriptions = []
        for class_name, count, _ in ranked[:5]:
            clean_name = class_name.replace('_', ' ')
            if count == 1:
                article = "an" if clean_name[0].lower() in 'aeiou' else "a"
                descriptions.append(f"{article} {clean_name}")
            else:
                descriptions.append(f"{count} {clean_name}s")

        if len(descriptions) == 1:
            return descriptions[0]
        if len(descriptions) == 2:
            return f"{descriptions[0]} and {descriptions[1]}"
        return ", ".join(descriptions[:-1]) + f", and {descriptions[-1]}"

    except Exception as e:
        self.logger.warning(f"Error generating objects summary: {str(e)}")
        return "various elements"
1397
+
1398
def _get_placeholder_replacement(self, placeholder: str, fillers: Dict,
                                 all_replacements: Dict, detected_objects: List[Dict],
                                 scene_type: str) -> str:
    """
    Resolve the text substituted for one template placeholder.

    Sources are tried in order and the first non-blank result wins:

    1. a summary generated from the detections (dynamic placeholders only),
    2. ``all_replacements``,
    3. a random pick of one to three options from ``fillers``,
    4. scene-specific content derived from the detections,
    5. the built-in fallback phrases,
    6. a default tied to the scene type,
    7. a phrase derived from the placeholder name itself.

    Args:
        placeholder: Placeholder name without braces.
        fillers: Maps placeholder names to lists of candidate phrases.
        all_replacements: Maps placeholder names to ready-made phrases.
        detected_objects: Detection dicts for the current image.
        scene_type: Scene type identifier.

    Returns:
        str: A replacement phrase; never empty for a non-empty placeholder.
    """
    try:
        # Placeholders whose content is generated from the detections.
        dynamic_placeholders = [
            'primary_objects', 'detected_objects_summary', 'main_objects',
            'functional_area', 'functional_zones_description', 'scene_elements'
        ]

        if placeholder in dynamic_placeholders:
            dynamic_content = self._generate_objects_summary(detected_objects)
            if dynamic_content and dynamic_content.strip():
                return dynamic_content.strip()

        # Predefined replacements.
        if placeholder in all_replacements:
            replacement = all_replacements[placeholder]
            if replacement and replacement.strip():
                return replacement.strip()

        # Object template fillers.
        if placeholder in fillers:
            options = fillers[placeholder]
            if options and isinstance(options, list):
                # Convert through str() so a non-string option (e.g. a number)
                # is kept rather than aborting the whole lookup.
                valid_options = [str(opt).strip() for opt in options if opt and str(opt).strip()]
                if valid_options:
                    num_items = min(len(valid_options), random.randint(1, 3))
                    selected_items = random.sample(valid_options, num_items)

                    if len(selected_items) == 1:
                        return selected_items[0]
                    if len(selected_items) == 2:
                        return f"{selected_items[0]} and {selected_items[1]}"
                    return ", ".join(selected_items[:-1]) + f", and {selected_items[-1]}"

        # Content derived from the detected objects.
        scene_specific_replacement = self._generate_scene_specific_content(
            placeholder, detected_objects, scene_type
        )
        if scene_specific_replacement and scene_specific_replacement.strip():
            return scene_specific_replacement.strip()

        # Generic fallback phrases.
        fallback_replacements = {
            # Traffic and urban
            "crossing_pattern": "pedestrian crosswalks",
            "pedestrian_behavior": "people moving carefully",
            "traffic_pattern": "vehicle movement",
            "urban_elements": "city infrastructure",
            "street_elements": "urban features",
            "intersection_features": "traffic management systems",
            "pedestrian_density": "groups of people",
            "pedestrian_flow": "pedestrian movement",
            "traffic_description": "vehicle traffic",
            "people_and_vehicles": "pedestrians and cars",

            # Scene setting
            "scene_setting": "this urban environment",
            "location_context": "the area",
            "spatial_context": "the scene",
            "environmental_context": "this location",

            # Common furniture and equipment
            "furniture": "various furniture pieces",
            "seating": "seating arrangements",
            "electronics": "electronic devices",
            "appliances": "household appliances",

            # Activities and behaviour
            "activities": "various activities",
            "interactions": "people interacting",
            "movement": "movement patterns",

            # Lighting and atmosphere
            "lighting_conditions": "ambient lighting",
            "atmosphere": "the overall atmosphere",
            "ambiance": "environmental ambiance",

            # Spatial description
            "spatial_arrangement": "spatial organization",
            "layout": "the layout",
            "composition": "visual composition",

            # Objects and elements
            "objects": "various objects",
            "elements": "scene elements",
            "features": "notable features",
            "details": "observable details"
        }

        if placeholder in fallback_replacements:
            return fallback_replacements[placeholder]

        # Default tied to the scene type.
        scene_based_defaults = self._get_scene_based_default(placeholder, scene_type)
        if scene_based_defaults:
            return scene_based_defaults

        # Last resort: turn the placeholder name into a readable phrase.
        cleaned_placeholder = placeholder.replace('_', ' ')

        if placeholder.endswith('_pattern'):
            return f"{cleaned_placeholder.replace(' pattern', '')} arrangement"
        if placeholder.endswith('_behavior'):
            return f"{cleaned_placeholder.replace(' behavior', '')} activity"
        if placeholder.endswith('_description'):
            return f"{cleaned_placeholder.replace(' description', '')} elements"
        # Names ending in '_elements' or '_features' read fine once the
        # underscores are gone; a name with no underscore carries no usable
        # wording, so it gets the generic phrase.
        return cleaned_placeholder if cleaned_placeholder != placeholder else "various elements"

    except Exception as e:
        self.logger.warning(f"Error getting replacement for placeholder '{placeholder}': {str(e)}")
        # Return something meaningful even on failure.
        return placeholder.replace('_', ' ') if placeholder else "scene elements"
1522
+
1523
def _get_scene_based_default(self, placeholder: str, scene_type: str) -> Optional[str]:
    """
    Look up a placeholder default that is specific to the scene type.

    Args:
        placeholder: Placeholder name.
        scene_type: Scene type identifier.

    Returns:
        Optional[str]: The scene-specific phrase, or None when the scene type
        or the placeholder has no entry (or the lookup fails).
    """
    try:
        # Defaults keyed first by scene type, then by placeholder.
        defaults_by_scene = {
            "urban_intersection": {
                "crossing_pattern": "marked crosswalks",
                "pedestrian_behavior": "pedestrians crossing carefully",
                "traffic_pattern": "controlled traffic flow"
            },
            "city_street": {
                "traffic_description": "urban vehicle traffic",
                "street_elements": "city infrastructure",
                "people_and_vehicles": "pedestrians and vehicles"
            },
            "living_room": {
                "furniture": "comfortable living room furniture",
                "seating": "sofas and chairs",
                "electronics": "entertainment equipment"
            },
            "kitchen": {
                "appliances": "kitchen appliances",
                "cooking_equipment": "cooking tools and equipment"
            },
            "office_workspace": {
                "office_equipment": "work furniture and devices",
                "desk_setup": "desk and office chair"
            }
        }

        per_scene = defaults_by_scene.get(scene_type)
        if per_scene is None:
            return None
        return per_scene.get(placeholder)

    except Exception as e:
        self.logger.warning(f"Error getting scene-based default for '{placeholder}' in '{scene_type}': {str(e)}")
        return None
1570
+
1571
def _generate_scene_specific_content(self, placeholder: str, detected_objects: List[Dict],
                                     scene_type: str) -> Optional[str]:
    """
    Generate placeholder content from the detections for a few known placeholders.

    Handles ``furniture``, ``electronics``, ``seating`` (class names of the
    matching detections) and ``people_count`` (a phrase for the number of
    detections with ``class_id == 0``).

    Args:
        placeholder: Placeholder name.
        detected_objects: Detection dicts; ``class_id`` and ``class_name`` are read.
        scene_type: Scene type identifier (not used by the current rules).

    Returns:
        Optional[str]: The generated phrase, or None when the placeholder has
        no specific rule or an error occurs.
    """
    try:
        # placeholder -> (class ids, max detections considered,
        #                 name used when a detection lacks class_name,
        #                 phrase used when nothing matches)
        object_groups = {
            "furniture": ((56, 57, 58, 59, 60, 61), 3, "furniture", "various furniture items"),
            "electronics": ((62, 63, 64, 65, 66, 67, 68, 69, 70), 3, "electronic device", "electronic devices"),
            "seating": ((56, 57), 2, "seating", "seating arrangements"),  # chair, sofa
        }

        if placeholder in object_groups:
            class_ids, limit, missing_name, empty_phrase = object_groups[placeholder]
            matches = [obj for obj in detected_objects if obj.get("class_id") in class_ids]
            if not matches:
                return empty_phrase

            # dict.fromkeys removes duplicates while keeping detection order,
            # so the generated phrase is the same on every run.
            unique_names = list(dict.fromkeys(
                obj.get("class_name", missing_name) for obj in matches[:limit]
            ))
            return ", ".join(unique_names) if len(unique_names) > 1 else unique_names[0]

        if placeholder == "people_count":
            people_count = len([obj for obj in detected_objects if obj.get("class_id") == 0])

            if people_count == 0:
                return "no people"
            if people_count == 1:
                return "one person"
            if people_count < 5:
                return f"{people_count} people"
            return "several people"

        # No specific rule for this placeholder.
        return None

    except Exception as e:
        self.logger.warning(f"Error generating scene-specific content for '{placeholder}': {str(e)}")
        return None
1637
+
1638
def get_confidence_template(self, confidence_level: str) -> str:
    """
    Return the description template for a confidence level.

    Args:
        confidence_level: Confidence level ('high', 'medium', 'low').

    Returns:
        str: The configured template when ``self.templates`` defines one,
        otherwise a built-in template for the level.
    """
    generic_template = "{description} {details}"
    try:
        configured = self.templates.get("confidence_templates", {})
        if confidence_level in configured:
            return configured[confidence_level]

        # Built-in templates used when nothing is configured for the level.
        builtin = {
            "high": "{description} {details}",
            "medium": "This appears to be {description} {details}",
            "low": "This might be {description}, but the confidence is low. {details}"
        }
        return builtin.get(confidence_level, generic_template)

    except Exception as e:
        self.logger.warning(f"Error getting confidence template for '{confidence_level}': {str(e)}")
        return generic_template
1666
+
1667
def get_lighting_template(self, lighting_type: str) -> str:
    """
    Return the description template for a lighting type.

    Args:
        lighting_type: Lighting type identifier.

    Returns:
        str: The configured template, or a generic sentence built from the
        lighting type name when none is configured.
    """
    try:
        configured = self.templates.get("lighting_templates", {})
        if lighting_type not in configured:
            # Nothing configured: describe the lighting by its name.
            return f"The scene is captured with {lighting_type.replace('_', ' ')} lighting conditions."
        return configured[lighting_type]

    except Exception as e:
        self.logger.warning(f"Error getting lighting template for '{lighting_type}': {str(e)}")
        return "The lighting conditions are not clearly identifiable."
1689
+
1690
def get_viewpoint_template(self, viewpoint: str) -> Dict[str, str]:
    """
    Return the template for a viewpoint.

    Args:
        viewpoint: Viewpoint type.

    Returns:
        Dict[str, str]: The configured template when one exists; otherwise a
        built-in template with ``prefix``, ``observation`` and ``short_desc``
        keys. Unknown viewpoints fall back to the eye-level template.
    """
    try:
        configured = self.templates.get("viewpoint_templates", {})
        if viewpoint in configured:
            return configured[viewpoint]

        # Built-in templates.
        builtin = {
            "eye_level": {
                "prefix": "From eye level, ",
                "observation": "the scene is viewed straight ahead.",
                "short_desc": "at eye level"
            },
            "aerial": {
                "prefix": "From above, ",
                "observation": "the scene is viewed from a bird's-eye perspective.",
                "short_desc": "from above"
            },
            "low_angle": {
                "prefix": "From a low angle, ",
                "observation": "the scene is viewed from below looking upward.",
                "short_desc": "from below"
            },
            "elevated": {
                "prefix": "From an elevated position, ",
                "observation": "the scene is viewed from a higher vantage point.",
                "short_desc": "from an elevated position"
            }
        }
        if viewpoint in builtin:
            return builtin[viewpoint]
        return builtin["eye_level"]

    except Exception as e:
        self.logger.warning(f"Error getting viewpoint template for '{viewpoint}': {str(e)}")
        return {
            "prefix": "",
            "observation": "the scene is viewed normally.",
            "short_desc": "normally"
        }
1739
+
1740
def get_cultural_template(self, cultural_context: str) -> Dict[str, Any]:
    """
    Return the template for a cultural context.

    Args:
        cultural_context: Cultural context identifier.

    Returns:
        Dict[str, Any]: The configured template, or a generic dict with
        ``elements`` and ``description`` keys when none is configured.
    """
    try:
        configured = self.templates.get("cultural_templates", {})
        if cultural_context not in configured:
            # Generic template naming the context.
            return {
                "elements": ["cultural elements"],
                "description": f"The scene displays {cultural_context} cultural characteristics."
            }
        return configured[cultural_context]

    except Exception as e:
        self.logger.warning(f"Error getting cultural template for '{cultural_context}': {str(e)}")
        return {
            "elements": ["various elements"],
            "description": "The scene displays cultural characteristics."
        }
1768
+
1769
def get_scene_detail_templates(self, scene_type: str, viewpoint: Optional[str] = None) -> List[str]:
    """
    Return the detailed description templates for a scene.

    Lookup order: the viewpoint-specific key ``"<scene_type>_<viewpoint>"``
    (only when a viewpoint is given), then the scene type, then ``"default"``.

    Args:
        scene_type: Scene type identifier.
        viewpoint: Optional viewpoint type.

    Returns:
        List[str]: The first matching template list, or a single generic
        template when nothing matches or an error occurs.
    """
    generic = "A scene with various elements and objects."
    try:
        configured = self.templates.get("scene_detail_templates", {})

        # Candidate keys, most specific first.
        lookup_keys = [scene_type, "default"]
        if viewpoint:
            lookup_keys.insert(0, f"{scene_type}_{viewpoint}")

        for candidate in lookup_keys:
            if candidate in configured:
                return configured[candidate]

        return [generic]

    except Exception as e:
        self.logger.warning(f"Error getting scene detail templates for '{scene_type}': {str(e)}")
        return [generic]
1803
+
1804
def reload_templates(self):
    """
    Ask the template manager to reload all templates.

    Failures are logged and swallowed; nothing is returned.
    """
    try:
        manager = self.template_manager
        manager.reload_templates()
        self.logger.info("Templates reloaded successfully")
    except Exception as e:
        self.logger.error(f"Error reloading templates: {str(e)}")
1813
+
1814
def get_template_categories(self) -> List[str]:
    """
    List the names of all available template categories.

    Returns:
        List[str]: The keys of ``self.templates``.
    """
    category_names = [name for name in self.templates.keys()]
    return category_names
1822
+
1823
def template_exists(self, category: str, key: Optional[str] = None) -> bool:
    """
    Check whether a template category, or a key within it, exists.

    Args:
        category: Template category name.
        key: Optional key inside the category.

    Returns:
        bool: True when the category exists and, if ``key`` is given, the
        category is a dict containing that key. False otherwise, including
        when the check raises.
    """
    try:
        if category not in self.templates:
            return False
        if key is None:
            return True

        entries = self.templates[category]
        # Only dict-shaped categories can be looked up by key.
        if not isinstance(entries, dict):
            return False
        return key in entries

    except Exception as e:
        self.logger.warning(f"Error checking template existence for {category}.{key}: {str(e)}")
        return False
1850
+
1851
def apply_template(self, template: Union[str, Dict[str, Any]], scene_data: Dict[str, Any]) -> str:
    """
    Apply a template to scene data and return the scene description.

    Three template forms are accepted:

    * a string naming an entry of ``self.template_registry`` (when that
      attribute exists and the entry is a dict) - processed as a structured
      template;
    * any other string - filled through ``fill_template``;
    * a dict - processed as a structured template.

    Args:
        template: Template string, registry name, or structured template dict.
        scene_data: Scene analysis data (``detected_objects``, ``scene_type``,
            ``places365_info``, ``object_statistics``, ``functional_zones``).

    Returns:
        str: The generated description; a fallback description is returned
        for unsupported template types and on any error.
    """
    try:
        if isinstance(template, str):
            # Registry names are strings too, so they must be resolved before
            # the plain-string path; checked after it they are never reached.
            registry = getattr(self, 'template_registry', None)
            if isinstance(registry, dict) and isinstance(registry.get(template), dict):
                return self._process_structured_template(registry[template], scene_data)

            self.logger.debug("Processing string template directly")

            detected_objects = scene_data.get("detected_objects", [])
            scene_type = scene_data.get("scene_type", "general")
            places365_info = scene_data.get("places365_info")
            object_statistics = scene_data.get("object_statistics")

            # Stash the functional zones for the duration of the fill.
            self._current_functional_zones = scene_data.get("functional_zones", {})
            try:
                return self.fill_template(
                    template,
                    detected_objects,
                    scene_type,
                    places365_info,
                    object_statistics
                )
            finally:
                # Always drop the stash, even when filling fails, so stale
                # zones never leak into a later call.
                if hasattr(self, '_current_functional_zones'):
                    delattr(self, '_current_functional_zones')

        if isinstance(template, dict):
            self.logger.debug("Processing structured template")
            return self._process_structured_template(template, scene_data)

        self.logger.warning(f"Invalid template format or template not found: {type(template)}")
        return self._generate_fallback_scene_description(scene_data)

    except Exception as e:
        self.logger.error(f"Error applying template: {str(e)}")
        return self._generate_fallback_scene_description(scene_data)
1909
+
1910
def _process_structured_template(self, template: Dict[str, Any], scene_data: Dict[str, Any]) -> str:
    """
    Build a scene description from a structured template dict.

    Each entry of ``template["structure"]`` is handled according to its
    ``type``: ``opening`` contributes its ``content`` verbatim, while
    ``zone_analysis``, ``object_summary`` and ``conclusion`` contribute
    generated text when it is non-empty. Other types are ignored.

    Args:
        template: Structured template dict.
        scene_data: Scene analysis data.

    Returns:
        str: The standardized description, or the fallback description when
        the template has no structure or processing fails.
    """
    try:
        zones = scene_data.get("functional_zones", scene_data.get("zones", {}))
        objects = scene_data.get("detected_objects", [])

        sections = template.get("structure", [])
        if not sections:
            self.logger.warning("Template has no structure defined")
            return self._generate_fallback_scene_description(scene_data)

        parts = []
        for section in sections:
            kind = section.get("type", "")

            if kind == "opening":
                parts.append(section.get("content", ""))
                continue

            if kind == "zone_analysis":
                zone_sentences = self._generate_zone_descriptions(zones, section)
                if zone_sentences:
                    parts.extend(zone_sentences)
                continue

            if kind == "object_summary":
                generated = self._generate_object_summary(objects, section)
            elif kind == "conclusion":
                generated = self._generate_conclusion(template, zones, objects)
            else:
                generated = None

            if generated:
                parts.append(generated)

        # Join the sentences and normalize the result.
        final_description = self._standardize_final_description(" ".join(parts))
        self.logger.info("Successfully applied structured template")
        return final_description

    except Exception as e:
        self.logger.error(f"Error processing structured template: {str(e)}")
        return self._generate_fallback_scene_description(scene_data)
1966
+
1967
def _generate_fallback_scene_description(self, scene_data: Dict[str, Any]) -> str:
    """
    Produce a minimal description when no template can be applied.

    Args:
        scene_data: Scene analysis data.

    Returns:
        str: A sentence reporting the number of functional zones and/or
        detected objects, or a fixed generic sentence on error.
    """
    try:
        objects = scene_data.get("detected_objects", [])
        zone_map = scene_data.get("functional_zones", scene_data.get("zones", {}))
        scene_type = scene_data.get("scene_type", "general")

        n_objects = len(objects)
        n_zones = len(zone_map)

        if n_objects <= 0:
            return f"Scene analysis completed for this {scene_type.replace('_', ' ')} environment."
        if n_zones > 0:
            return f"Scene analysis completed with {n_zones} functional areas containing {n_objects} identified objects."
        return f"Scene analysis identified {n_objects} objects in this {scene_type.replace('_', ' ')} environment."

    except Exception as e:
        self.logger.warning(f"Error generating fallback description: {str(e)}")
        return "Scene analysis completed with detected objects and functional areas."
1995
+
1996
+
1997
def _generate_zone_descriptions(self, zone_data: Dict[str, Any], section: Dict[str, Any]) -> List[str]:
    """
    Generate one sentence per functional zone that contains objects.

    Zones are visited from most to fewest objects. A zone's own
    ``description`` is used when it is non-empty and does not contain the
    words 'zone', 'area' or 'region'; otherwise a sentence is generated from
    the zone name and its first three objects.

    Args:
        zone_data: Maps zone names to dicts with ``objects`` and ``description``.
        section: The template section being processed (not used here).

    Returns:
        List[str]: The zone sentences; empty when there are no zones or an
        error occurs.
    """
    try:
        descriptions = []

        if not zone_data:
            return descriptions

        # zone_data is itself the zone dictionary.
        sorted_zones = sorted(zone_data.items(),
                              key=lambda x: len(x[1].get("objects", [])),
                              reverse=True)

        for zone_name, zone_info in sorted_zones:
            description = zone_info.get("description", "")
            objects = zone_info.get("objects", [])

            if not objects:
                continue

            # Prefer the existing description unless it reads as technical.
            if description and not any(tech in description.lower() for tech in ['zone', 'area', 'region']):
                zone_desc = description
            else:
                clean_zone_name = zone_name.replace('_', ' ').replace(' area', '').replace(' zone', '')
                object_list = ', '.join(objects[:3])

                if 'crossing' in zone_name or 'pedestrian' in zone_name:
                    zone_desc = f"In the central crossing area, there are {object_list}."
                elif 'vehicle' in zone_name or 'traffic' in zone_name:
                    zone_desc = f"The vehicle movement area includes {object_list}."
                elif 'control' in zone_name:
                    zone_desc = f"Traffic control elements include {object_list}."
                else:
                    zone_desc = f"The {clean_zone_name} contains {object_list}."

                # NOTE(review): this suffix is assumed to belong to generated
                # sentences only, since it complements the three-object
                # truncation above - confirm against the intended layout.
                if len(objects) > 3:
                    zone_desc += f" Along with {len(objects) - 3} additional elements."

            descriptions.append(zone_desc)

        return descriptions

    except Exception as e:
        # Log through the instance logger, as the rest of the class does.
        self.logger.error(f"Error generating zone descriptions: {str(e)}")
        return []
2044
+
2045
def _generate_object_summary(self, object_data: List[Dict], section: Dict[str, Any]) -> str:
    """
    Generate a sentence summarizing the detected objects.

    Detections are grouped by class name and ranked by
    ``count * 0.6 + average_confidence * 0.4``; the five highest-ranked
    classes are listed in a sentence starting with "The scene features".

    Args:
        object_data: Detection dicts. ``class_name`` and ``confidence`` are
            read; missing values default to ``"unknown"`` and ``0.5``.
        section: The template section being processed (not used here).

    Returns:
        str: The summary sentence, or an empty string when there are no
        objects or an error occurs.
    """
    try:
        if not object_data:
            return ""

        # Tally count and total confidence per class.
        object_stats = {}
        for obj in object_data:
            # A blank or missing name collapses to "unknown" so the article
            # lookup below never indexes into an empty string.
            class_name = obj.get("class_name") or "unknown"
            confidence = obj.get("confidence", 0.5)

            stats = object_stats.setdefault(class_name, {"count": 0, "total_confidence": 0})
            stats["count"] += 1
            stats["total_confidence"] += confidence

        # Rank by importance (count combined with average confidence).
        ranked = []
        for class_name, stats in object_stats.items():
            count = stats["count"]
            avg_confidence = stats["total_confidence"] / count
            ranked.append((class_name, count, count * 0.6 + avg_confidence * 0.4))

        ranked.sort(key=lambda item: item[2], reverse=True)

        descriptions = []
        for class_name, count, _ in ranked[:5]:
            clean_name = class_name.replace('_', ' ')
            if count == 1:
                article = "an" if clean_name[0].lower() in 'aeiou' else "a"
                descriptions.append(f"{article} {clean_name}")
            else:
                descriptions.append(f"{count} {clean_name}s")

        if len(descriptions) == 1:
            return f"The scene features {descriptions[0]}."
        if len(descriptions) == 2:
            return f"The scene features {descriptions[0]} and {descriptions[1]}."
        main_items = ", ".join(descriptions[:-1])
        return f"The scene features {main_items}, and {descriptions[-1]}."

    except Exception as e:
        self.logger.error(f"Error generating object summary: {str(e)}")
        return ""
2096
+
2097
def _generate_conclusion(self, template: Dict[str, Any], zone_data: Dict[str, Any],
                         object_data: List[Dict]) -> str:
    """
    Generate the closing sentence of a structured description.

    Args:
        template: Structured template; its ``scene_type`` ('indoor',
            'outdoor', anything else) selects the wording.
        zone_data: Functional zones; only their number is used.
        object_data: Detected objects; only their number is used.

    Returns:
        str: The conclusion sentence, or an empty string on error.
    """
    try:
        scene_type = template.get("scene_type", "general")
        zones_count = len(zone_data)
        objects_count = len(object_data)

        if scene_type == "indoor":
            return f"This indoor environment demonstrates clear functional organization with {zones_count} distinct areas and {objects_count} identified objects."
        if scene_type == "outdoor":
            return f"This outdoor scene shows dynamic activity patterns across {zones_count} functional zones with {objects_count} detected elements."
        return f"The scene analysis reveals {zones_count} functional areas containing {objects_count} identifiable objects."

    except Exception as e:
        # Log through the instance logger, as the rest of the class does.
        self.logger.error(f"Error generating conclusion: {str(e)}")
        return ""
2119
+
2120
def _standardize_final_description(self, description: str) -> str:
    """
    Normalize the final description text.

    Whitespace runs are collapsed to single spaces, leftover technical
    identifiers (``zone_<n>``, ``area_<n>``, ``region_<n>`` and the
    ``_zone`` / ``_area`` / ``_region`` suffixes) are removed, and the
    whitespace is collapsed again so a removed identifier leaves no gap.

    Args:
        description: Raw description text.

    Returns:
        str: The normalized text; the input is returned unchanged on error.
    """
    try:
        # Collapse whitespace runs. This also normalizes the spacing after
        # sentence-ending periods, so no separate pass is needed for that.
        description = " ".join(description.split())

        # Patterns are applied one after another on purpose: the numbered
        # identifiers must be stripped before the bare suffixes.
        technical_patterns = [
            r'zone_\d+', r'area_\d+', r'region_\d+',
            r'_zone', r'_area', r'_region'
        ]

        for pattern in technical_patterns:
            description = re.sub(pattern, '', description, flags=re.IGNORECASE)

        # Removing a whole word leaves two adjacent spaces; collapse again.
        return " ".join(description.split())

    except Exception as e:
        # Log through the instance logger, as the rest of the class does.
        self.logger.error(f"Error standardizing final description: {str(e)}")
        return description
text_formatter.py ADDED
@@ -0,0 +1,545 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import traceback
3
+ import re
4
+ from typing import Dict, List, Optional
5
+
6
+ from landmark_data import ALL_LANDMARKS
7
+
8
class TextFormattingError(Exception):
    """Custom exception raised when text formatting fails."""
11
+
12
+
13
class TextFormatter:
    """
    Text formatter - joins fragments, normalises and polishes the final output.

    Handles smart concatenation of text fragments, punctuation and
    capitalisation clean-up, optional removal of landmark references and a
    few readability optimisations.
    """

    # Sentences longer than this many characters are candidates for splitting.
    MAX_SENTENCE_LENGTH = 150

    # Words that may legitimately stay capitalised after a comma.
    _CAPITALISED_EXCEPTIONS = frozenset([
        "I", "I'm", "I've", "I'd", "I'll",
        "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
        "January", "February", "March", "April", "May", "June", "July",
        "August", "September", "October", "November", "December",
    ])

    # Capitalised function words that are lower-cased after a comma.
    _COMMON_AFTER_COMMA = frozenset(
        ["this", "that", "these", "those", "they", "their", "then", "thus"])

    # One or more capitalised words, optionally separated by commas
    # ("Paris", "Paris, France"). Matched case-sensitively on purpose.
    _PROPER_NAME = r"[A-Z][a-zA-Z]*(?:[\s,]+[A-Z][a-zA-Z]*)*"

    # Generic landmark phrasings and their neutral replacements. Keywords are
    # matched case-insensitively through scoped ``(?i:...)`` groups while the
    # proper-name parts stay case-sensitive; a global IGNORECASE would let
    # ``[A-Z]`` match any letter and swallow the surrounding sentence.
    _LANDMARK_PATTERNS = (
        (re.compile(r"\ba (?:tourist|popular|famous) landmark\b", re.IGNORECASE),
         "an urban structure"),
        (re.compile(r"(?i:\ban iconic structure in )" + _PROPER_NAME),
         "an urban structure in the area"),
        (re.compile(r"(?i:\ba famous (?:monument|tower|landmark) in )" + _PROPER_NAME),
         "an urban structure in the area"),
        (re.compile(r"(?i:\b(?:centered|built|located|positioned) around the )"
                    r"(?:[A-Z][a-zA-Z]*\s+)+(?i:tower|monument|landmark)\b"),
         "located in this area"),
        (re.compile(r"(?i:\b(sightseeing|guided tours|cultural tourism) (?:at|around|near) )"
                    r"(?:(?i:this landmark)\b|(?i:the )" + _PROPER_NAME + r")"),
         r"\1 in this area"),
        (re.compile(r"\bthis (?:famous|iconic|historic|well-known) "
                    r"(?:landmark|monument|tower|structure)\b", re.IGNORECASE),
         "this urban structure"),
        # "<Name> Tower"; a leading article is left in place.
        (re.compile(r"\b(?:(?!(?:The|An?)\s)[A-Z][a-zA-Z]*\s+)+Tower\b"),
         "tall structure"),
        (re.compile(r"(?i:\ba (tower|structure) in )" + _PROPER_NAME),
         r"a \1 in the area"),
        (re.compile(r"landmark scene", re.IGNORECASE), "urban scene"),
        (re.compile(r"tourist destination", re.IGNORECASE), "urban area"),
        (re.compile(r"tourist attraction", re.IGNORECASE), "urban area"),
    )

    def __init__(self):
        """
        Initialise the formatter and load the landmark data.

        Raises:
            TextFormattingError: If initialisation fails.
        """
        self.logger = logging.getLogger(self.__class__.__name__)

        try:
            # Landmark data is used to filter landmark references.
            self.landmark_data = self._load_landmark_data()
            self.logger.info("TextFormatter initialized successfully")

        except Exception as e:
            error_msg = f"Failed to initialize TextFormatter: {str(e)}"
            self.logger.error(f"{error_msg}\n{traceback.format_exc()}")
            raise TextFormattingError(error_msg) from e

    def _load_landmark_data(self) -> Dict:
        """
        Return the landmark data imported at module level.

        Returns:
            Dict: Landmark dictionary, or an empty dict when it is unavailable
            (landmark filtering is then effectively disabled).
        """
        try:
            return ALL_LANDMARKS
        except Exception as e:
            self.logger.warning(f"Error loading landmark data: {str(e)}")
            return {}

    def smart_append(self, current_text: str, new_fragment: str) -> str:
        """
        Append a fragment to existing text with sensible punctuation and casing.

        Args:
            current_text: Text to append to.
            new_fragment: Fragment to add.

        Returns:
            str: The combined text.
        """
        try:
            if not new_fragment:
                return current_text

            current_text = current_text.rstrip() if current_text else ""
            if not current_text:
                # Nothing to append to: just capitalise the fragment.
                return new_fragment[0].upper() + new_fragment[1:]

            ends_with_sentence = current_text.endswith(('.', '!', '?'))
            ends_with_comma = current_text.endswith(',')

            # "A xxx" followed by "A yyy": two independent descriptions.
            articles = ("A ", "An ")
            if current_text.startswith(articles) and new_fragment.startswith(articles):
                if ends_with_sentence:
                    # Already terminated; adding ". " would produce "..".
                    return current_text + " " + new_fragment
                return current_text.rstrip(", ") + ". " + new_fragment

            # A capitalised word (other than an article) suggests a proper
            # noun such as a landmark name, so the fragment's casing is kept.
            has_landmark_name = any(
                word[0].isupper()
                for word in new_fragment.split()
                if len(word) > 2 and word not in ("A", "An", "The")
            )
            keep_case = new_fragment[0].isupper() or has_landmark_name
            capitalised = new_fragment[0].upper() + new_fragment[1:]
            lowered = new_fragment[0].lower() + new_fragment[1:]

            if ends_with_sentence:
                # New sentence: capitalise.
                return current_text + " " + capitalised
            if ends_with_comma:
                # Continue the clause; lower-case unless it looks like a proper noun.
                return current_text + " " + (new_fragment if keep_case else lowered)
            if "scene is" in new_fragment.lower() or "scene includes" in new_fragment.lower():
                # Statements about the scene start a new sentence.
                return current_text + ". " + new_fragment
            if self._is_related_phrases(current_text, new_fragment):
                return current_text + ", " + (new_fragment if keep_case else lowered)
            # Unrelated phrases become separate sentences.
            return current_text + ". " + capitalised

        except Exception as e:
            self.logger.warning(f"Error in smart_append: {str(e)}")
            # Fallback: plain concatenation.
            return f"{current_text} {new_fragment}" if current_text else new_fragment

    def _is_related_phrases(self, text1: str, text2: str) -> bool:
        """
        Decide whether two phrases are related and should be joined by a comma.

        Args:
            text1: The existing text.
            text2: The fragment about to be appended.

        Returns:
            bool: True when the phrases should be joined with a comma.
        """
        try:
            # Two phrases that both start with "A"/"An" are independent descriptions.
            articles = ("A ", "An ")
            if text1.startswith(articles) and text2.startswith(articles):
                return False

            # The second phrase starts with a connecting word.
            connecting_words = ["which", "where", "who", "whom", "whose", "with", "without",
                                "this", "these", "that", "those", "and", "or", "but"]
            words = text2.split()  # empty for whitespace-only input
            first_word = words[0].lower() if words else ""
            if first_word in connecting_words:
                return True

            # The first phrase ends with something that implies continuation.
            ending_patterns = ("such as", "including", "like", "especially", "particularly",
                               "for example", "for instance", "namely", "specifically")
            if text1.lower().endswith(ending_patterns):
                return True

            # Anything else, including two separate statements about the
            # scene, is treated as unrelated.
            return False

        except Exception as e:
            self.logger.warning(f"Error checking phrase relationship: {str(e)}")
            return False

    @classmethod
    def _fix_capitalization_after_comma(cls, match) -> str:
        """Lower-case a capitalised word after a comma unless it looks like a proper noun."""
        leading_comma_space = match.group(1)
        word_after_comma = match.group(2)

        if word_after_comma in cls._CAPITALISED_EXCEPTIONS:
            return match.group(0)

        # Longer capitalised words that are not common function words are
        # assumed to be proper nouns and left untouched.
        if len(word_after_comma) > 2 and word_after_comma.lower() not in cls._COMMON_AFTER_COMMA:
            return match.group(0)

        return leading_comma_space + word_after_comma[0].lower() + word_after_comma[1:]

    def format_final_description(self, text: str) -> str:
        """
        Format the final description: punctuation, capitalisation and spacing.

        Args:
            text: Text to format.

        Returns:
            str: The formatted text ("" for empty input).
        """
        try:
            if not text or not text.strip():
                return ""

            text = text.strip()

            # 1. Split consecutive "A ..."/"An ..." segments into sentences.
            #    Case-sensitive on purpose: with IGNORECASE every lower-case
            #    "a "/"an " inside a sentence would trigger a split.
            text = re.sub(r'\b(An?\s+[^.!?]+?\w)\s+(?=An?\s)', r'\1. ', text)

            # 2. Capitalise the very first character.
            text = text[0].upper() + text[1:]

            # 3. Collapse runs of whitespace.
            text = re.sub(r'\s{2,}', ' ', text)

            # 4. Capitalise after sentence-ending punctuation.
            text = re.sub(r'([.!?]\s+)([a-z])',
                          lambda m: m.group(1) + m.group(2).upper(), text)

            # 5. Fix capitalisation after commas.
            text = re.sub(r"(,\s+)([A-Z][a-zA-Z'\-]+)",
                          self._fix_capitalization_after_comma, text)

            # 6. Exactly one space after punctuation and none before it.
            text = re.sub(r'\s*([.,;:!?])\s*', r'\1 ', text)
            text = text.replace(' .', '.').replace(' ,', ',')

            # 7. Collapse repeated punctuation.
            text = re.sub(r'[.!?]{2,}', '.', text)
            text = re.sub(r',+', ',', text)

            # 8. Make sure the text ends with sentence punctuation.
            text = text.strip()
            if text and text[-1] not in '.!?':
                text += '.'

            # 9. Repair empty placeholders and leading punctuation.
            text = re.sub(r'\bIn\s*,\s*', 'In this scene, ', text)  # "In , " artefact
            # NOTE(review): this turns every comma that is followed by a
            # capital letter into a sentence break, which also affects the
            # proper nouns preserved by step 5 - kept as the original behaviour.
            text = re.sub(r'\s*,\s*([A-Z])', r'. \1', text)
            text = re.sub(r'^[.,;:!?\s]+', '', text)

            # 10. Final check of the first character.
            if text:
                text = text[0].upper() + text[1:]

            # 11. Remove whitespace in front of the final punctuation mark.
            text = re.sub(r'\s+([.!?])$', r'\1', text)

            return text.strip()

        except Exception as e:
            self.logger.warning(f"Error formatting final description: {str(e)}")
            # Fallback: minimal formatting.
            if text:
                text = text.strip()
                if text and not text.endswith(('.', '!', '?')):
                    text += '.'
                if text:
                    text = text[0].upper() + text[1:]
                return text
            return ""

    def filter_landmark_references(self, text: str, enable_landmark: bool = True) -> str:
        """
        Remove landmark references from the text when landmarks are disabled.

        Args:
            text: Text to filter.
            enable_landmark: When True the text is returned unchanged.

        Returns:
            str: The filtered text (the current text if filtering fails).
        """
        try:
            if enable_landmark or not text:
                return text

            # Collect every known landmark name, alias and location.
            landmark_names = []
            locations = []
            for info in self.landmark_data.values():
                name = info.get("name")  # tolerate entries without a name
                if name:
                    landmark_names.append(name)
                landmark_names.extend(info.get("aliases", []))

                location = info.get("location")
                if location:
                    locations.append(location)
                    # Also register the first two comma-separated parts
                    # (typically city and country).
                    locations.extend(part.strip() for part in location.split(",")[:2])

            # Replace landmark names (very short names are skipped).
            for name in landmark_names:
                if name and len(name) > 2:
                    text = re.sub(r'\b' + re.escape(name) + r'\b', "tall structure",
                                  text, flags=re.IGNORECASE)

            # Replace location references; word boundaries prevent partial
            # hits such as "in Paris" inside "in Parisian".
            for location in locations:
                if location and len(location) > 2:
                    escaped = re.escape(location)
                    text = re.sub(r'\bin ' + escaped + r'\b', "in the urban area",
                                  text, flags=re.IGNORECASE)
                    text = re.sub(r'\bof ' + escaped + r'\b', "of the urban area",
                                  text, flags=re.IGNORECASE)
                    text = re.sub(r'\b' + escaped + r'\b', "the urban area",
                                  text, flags=re.IGNORECASE)

            # Replace generic landmark phrasings.
            for pattern, replacement in self._LANDMARK_PATTERNS:
                text = pattern.sub(replacement, text)

            return text

        except Exception as e:
            self.logger.warning(f"Error filtering landmark references: {str(e)}")
            return text

    def optimize_text_flow(self, text: str) -> str:
        """
        Improve readability: remove duplicates, tidy connectors, split long sentences.

        Args:
            text: Text to optimise.

        Returns:
            str: The optimised text.
        """
        try:
            if not text:
                return text

            text = self._remove_duplicate_phrases(text)
            text = self._optimize_connectors(text)
            text = self._balance_sentence_length(text)
            return text

        except Exception as e:
            self.logger.warning(f"Error optimizing text flow: {str(e)}")
            return text

    def _remove_duplicate_phrases(self, text: str) -> str:
        """
        Drop sentences that substantially repeat an earlier sentence.

        Each kept sentence retains its own terminator ('.', '!' or '?');
        a sentence without one receives a period.

        Args:
            text: Input text.

        Returns:
            str: Text without duplicated sentences ("" if nothing remains).
        """
        try:
            # Split while keeping the punctuation runs (odd indices).
            parts = re.split(r'([.!?]+)', text)
            unique_sentences = []
            seen_content = []

            for index in range(0, len(parts), 2):
                sentence = parts[index].strip()
                if not sentence:
                    continue
                terminator = parts[index + 1][:1] if index + 1 < len(parts) else ""

                # Normalise for comparison (case and whitespace).
                normalized = re.sub(r'\s+', ' ', sentence.lower())
                if any(self._sentences_similar(normalized, seen) for seen in seen_content):
                    continue

                unique_sentences.append(sentence + (terminator or "."))
                seen_content.append(normalized)

            return ' '.join(unique_sentences)

        except Exception as e:
            self.logger.warning(f"Error removing duplicate phrases: {str(e)}")
            return text

    def _sentences_similar(self, sent1: str, sent2: str) -> bool:
        """
        Check whether two sentences are similar.

        Similarity is the Jaccard index of the two word sets; sentences are
        similar when it exceeds 0.8.

        Args:
            sent1: First sentence.
            sent2: Second sentence.

        Returns:
            bool: True if the sentences are similar.
        """
        try:
            words1 = set(sent1.split())
            words2 = set(sent2.split())
            if not words1 or not words2:
                return False

            # The union is non-empty here, so the division is safe.
            return len(words1 & words2) / len(words1 | words2) > 0.8

        except Exception as e:
            self.logger.warning(f"Error checking sentence similarity: {str(e)}")
            return False

    def _optimize_connectors(self, text: str) -> str:
        """
        Tidy up connector usage.

        Args:
            text: Input text.

        Returns:
            str: Text with duplicated connectors removed.
        """
        try:
            # Collapse doubled connectors.
            text = re.sub(r'\band\s+and\b', 'and', text, flags=re.IGNORECASE)
            text = re.sub(r'\bwith\s+with\b', 'with', text, flags=re.IGNORECASE)

            # "a, and b, and c" -> "a, b, and c"
            text = re.sub(r'(\w+),\s+and\s+(\w+),\s+and\s+(\w+)', r'\1, \2, and \3', text)

            return text

        except Exception as e:
            self.logger.warning(f"Error optimizing connectors: {str(e)}")
            return text

    def _balance_sentence_length(self, text: str) -> str:
        """
        Split overly long sentences at a comma followed by a conjunction.

        Sentences longer than ``MAX_SENTENCE_LENGTH`` characters are split at
        the middle candidate position. Trailing text without terminal
        punctuation is preserved, and sentences are re-joined with single spaces.

        Args:
            text: Input text.

        Returns:
            str: Text with balanced sentence lengths.
        """
        try:
            # Even indices hold sentence bodies, odd indices their punctuation.
            parts = re.split(r'([.!?]+)', text)
            pieces = []

            for index in range(0, len(parts), 2):
                sentence = parts[index].strip()
                punctuation = parts[index + 1] if index + 1 < len(parts) else ""
                if not sentence and not punctuation:
                    continue

                if len(sentence) > self.MAX_SENTENCE_LENGTH:
                    split_points = [
                        m.start() for m in
                        re.finditer(r',\s+(?:and|but|or|while|when|where)', sentence)
                    ]
                    if split_points:
                        mid_point = split_points[len(split_points) // 2]
                        first_part = sentence[:mid_point].strip()
                        # Skip the comma itself; the conjunction opens the new sentence.
                        second_part = sentence[mid_point + 1:].strip()
                        if second_part and not second_part[0].isupper():
                            second_part = second_part[0].upper() + second_part[1:]
                        sentence = first_part + ". " + second_part

                pieces.append(sentence + punctuation)

            return " ".join(pieces).strip()

        except Exception as e:
            self.logger.warning(f"Error balancing sentence length: {str(e)}")
            return text

    def validate_text_quality(self, text: str) -> Dict[str, bool]:
        """
        Run a set of basic quality checks on the text.

        Args:
            text: Text to validate.

        Returns:
            Dict[str, bool]: Result of each check, or ``{"error": True}`` if
            validation itself failed.
        """
        try:
            stripped = text.strip() if text else ""
            return {
                "has_content": bool(stripped),
                "proper_capitalization": bool(text) and text[0].isupper(),
                "ends_with_punctuation": bool(stripped) and stripped[-1] in '.!?',
                # Two consecutive spaces, as the key name says.
                "no_double_spaces": "  " not in text if text else True,
                "no_leading_punctuation": not re.match(r'^[.,;:!?]', stripped),
                "reasonable_length": 20 <= len(text) <= 1000 if text else False,
            }

        except Exception as e:
            self.logger.warning(f"Error validating text quality: {str(e)}")
            return {"error": True}

    def get_text_statistics(self, text: str) -> Dict[str, int]:
        """
        Compute simple statistics for the text.

        Args:
            text: Text to analyse.

        Returns:
            Dict[str, int]: Character, word and sentence counts (sentences are
            counted as runs of '.', '!' or '?').
        """
        try:
            if not text:
                return {"characters": 0, "words": 0, "sentences": 0}

            return {
                "characters": len(text),
                "words": len(text.split()),
                "sentences": len(re.findall(r'[.!?]+', text)),
            }

        except Exception as e:
            self.logger.warning(f"Error getting text statistics: {str(e)}")
            return {"characters": 0, "words": 0, "sentences": 0}
text_quality_validator.py ADDED
@@ -0,0 +1,452 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import logging
3
+ import traceback
4
+ from typing import Dict, List, Any, Optional, Set, Tuple
5
+
6
+
7
class TextQualityValidator:
    """
    Validates the quality and factual accuracy of generated text.

    Covers fact checking, perspective consistency, scene-type consistency
    and a few completeness checks.
    """

    # Maximum number of words kept by _ensure_appropriate_length.
    MAX_WORDS = 200

    # Over-used words and the synonyms used for repeated occurrences
    # (processed in insertion order).
    _REPETITION_SYNONYMS = {
        'visible': ['present', 'evident', 'apparent', 'observable'],
        'positioned': ['arranged', 'placed', 'set', 'organized'],
        'located': ['found', 'placed', 'situated', 'established'],
        'situated': ['placed', 'positioned', 'arranged', 'set'],
        'appears': ['seems', 'looks', 'presents', 'exhibits'],
        'features': ['includes', 'contains', 'displays', 'showcases'],
    }

    def __init__(self):
        """Initialise the validator, its logger and the validation rules."""
        # Dedicated logger; a stream handler is attached only once.
        self.logger = logging.getLogger(self.__class__.__name__)
        if not self.logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)
            self.logger.setLevel(logging.INFO)

        self._initialize_validation_rules()
        self.logger.info("TextQualityValidator initialized successfully")

    def _initialize_validation_rules(self):
        """
        Initialise the vocabularies and patterns used by the checks.

        Raises:
            Exception: If the rules cannot be initialised.
        """
        try:
            # Location and cultural vocabulary.
            self.location_terms = ["plaza", "square", "market", "mall", "avenue", "boulevard"]
            self.cultural_terms = ["european", "asian", "american", "african", "western", "eastern"]

            # Perspective type -> terms that indicate it.
            self.perspective_terms = {
                "aerial": ["aerial", "bird's-eye", "overhead", "top-down", "above", "looking down"],
                "ground": ["street-level", "ground level", "eye-level", "standing"],
                "indoor": ["inside", "interior", "indoor", "within"],
                "close-up": ["close-up", "detailed view", "close shot"]
            }

            # Perspective type -> prefix used to restore a missing perspective.
            self.perspective_prefixes = {
                "aerial": "From an aerial perspective, ",
                "ground": "From street level, ",
                "indoor": "In this indoor setting, ",
                "close-up": "In this close-up view, "
            }

            # "<number> <noun>" detection patterns (only the first element of
            # each tuple, the regex, is used by this class).
            self.number_patterns = [
                (r'(\d+)\s+(people|person|pedestrians|individuals)', r'\1', r'\2'),
                (r'(\d+)\s+(cars|vehicles|automobiles)', r'\1', r'\2'),
                (r'(\d+)\s+(buildings|structures)', r'\1', r'\2'),
                (r'(\d+)\s+(plants|potted plants|flowers)', r'\1', r'\2'),
                (r'(\d+)\s+(beds|furniture|tables|chairs)', r'\1', r'\2')
            ]

            # Scene words that must not appear unless the source supports them.
            self.prohibited_scene_words = ["plaza", "square", "european", "asian", "american"]

            self.logger.info("Validation rules initialized successfully")

        except Exception as e:
            error_msg = f"Failed to initialize validation rules: {str(e)}"
            self.logger.error(error_msg)
            self.logger.error(traceback.format_exc())
            raise Exception(error_msg) from e

    def verify_factual_accuracy(self,
                                original_desc: str,
                                generated_desc: str,
                                object_list: str) -> str:
        """
        Verify the factual accuracy of a generated description.

        Args:
            original_desc: Original scene description.
            generated_desc: Generated description.
            object_list: Detected objects, as text.

        Returns:
            str: The verified (possibly corrected) description; the unmodified
            generated description if verification fails.
        """
        try:
            self.logger.debug("Starting factual accuracy verification")

            # The original description plus the object list form the
            # authorised vocabulary.
            authorized_content = original_desc.lower() + " " + object_list.lower()

            verified_desc = self._check_unauthorized_terms(generated_desc, authorized_content)
            verified_desc = self._detect_repetitive_patterns(verified_desc)

            self.logger.debug("Factual accuracy verification completed")
            return verified_desc

        except Exception as e:
            error_msg = f"Factual accuracy verification failed: {str(e)}"
            self.logger.error(error_msg)
            self.logger.error(traceback.format_exc())
            return generated_desc

    def _check_unauthorized_terms(self, generated_desc: str, authorized_content: str) -> str:
        """
        Replace location/cultural terms that the source material does not support.

        A term is authorised only when it occurs in ``authorized_content`` as
        a whole word (optionally in its plural form), so "mall" is not
        authorised by "small".
        """
        for term in self.location_terms + self.cultural_terms:
            escaped = re.escape(term)
            in_generated = re.compile(r'\b' + escaped + r'\b', re.IGNORECASE)
            in_authorized = re.compile(r'\b' + escaped + r'(?:e?s)?\b', re.IGNORECASE)

            if in_generated.search(generated_desc) and not in_authorized.search(authorized_content):
                replacement = "area" if term in self.location_terms else "scene"
                generated_desc = in_generated.sub(replacement, generated_desc)

        return generated_desc

    @staticmethod
    def _replace_repeats(text: str, word_pattern, synonyms: List[str]) -> str:
        """
        Keep the first match of ``word_pattern`` and replace the following ones.

        The i-th repetition receives the i-th synonym; repetitions beyond the
        synonym list are left unchanged. Upper-case and title-case matches
        keep their casing style.
        """
        seen = 0

        def swap(match):
            nonlocal seen
            seen += 1
            repeat_index = seen - 1  # 0 for the first occurrence
            original = match.group()
            if repeat_index == 0 or repeat_index > len(synonyms):
                return original
            replacement = synonyms[repeat_index - 1]
            if original.isupper():
                return replacement.upper()
            if original.istitle():
                return replacement.capitalize()
            return replacement

        # re.sub tracks offsets itself, so replacements of a different length
        # cannot corrupt later matches.
        return word_pattern.sub(swap, text)

    def _detect_repetitive_patterns(self, generated_desc: str) -> str:
        """
        Detect repeated wording and vary it.

        Every over-used word that occurs more than once is logged and its
        repetitions are replaced by synonyms. Repeated "This <word> ..."
        sentence openings are only logged.
        """
        for word, synonyms in self._REPETITION_SYNONYMS.items():
            word_pattern = re.compile(r'\b' + word + r'\b', re.IGNORECASE)
            if len(word_pattern.findall(generated_desc)) < 2:
                continue
            self.logger.warning(
                f'Text quality issue detected: Multiple uses of "{word}" detected')
            generated_desc = self._replace_repeats(generated_desc, word_pattern, synonyms)

        if re.search(r'\bThis\s+(\w+)\s+.*?\bThis\s+\1\b', generated_desc,
                     re.IGNORECASE | re.DOTALL):
            self.logger.warning(
                "Text quality issue detected: Repetitive sentence structure detected")

        return generated_desc

    def fact_check_description(self,
                               original_desc: str,
                               enhanced_desc: str,
                               scene_type: str,
                               detected_objects: List[str]) -> str:
        """
        Run the full fact check on an enhanced description.

        Args:
            original_desc: Original scene description.
            enhanced_desc: Enhanced description.
            scene_type: Scene type.
            detected_objects: Names of the detected objects (currently unused).

        Returns:
            str: The fact-checked description; the original description when
            the enhanced one is empty or shorter than 30 characters.
        """
        try:
            self.logger.debug("Starting comprehensive fact checking")

            if not enhanced_desc or len(enhanced_desc) < 30:
                return original_desc

            # 1. Numerical consistency
            enhanced_desc = self._check_numerical_consistency(original_desc, enhanced_desc)
            # 2. Perspective consistency
            enhanced_desc = self._check_perspective_consistency(original_desc, enhanced_desc)
            # 3. Scene-type consistency
            enhanced_desc = self._check_scene_type_consistency(enhanced_desc, scene_type)
            # 4. Length limit
            enhanced_desc = self._ensure_appropriate_length(enhanced_desc)

            self.logger.debug("Comprehensive fact checking completed")
            return enhanced_desc

        except Exception as e:
            error_msg = f"Fact checking failed: {str(e)}"
            self.logger.error(error_msg)
            self.logger.error(traceback.format_exc())
            return enhanced_desc

    @staticmethod
    def _insert_after_leading_article(text: str, insertion: str) -> Optional[str]:
        """
        Insert ``insertion`` right after a leading "This " or "The ".

        Returns:
            The modified text, or None when the text starts with neither.
        """
        for article in ("This ", "The "):
            if text.startswith(article):
                return article + insertion + " " + text[len(article):]
        return None

    def _check_numerical_consistency(self, original_desc: str, enhanced_desc: str) -> str:
        """
        Make sure every "<number> <noun>" of the original survives in the enhanced text.

        A missing count is inserted (after a leading "This "/"The ", otherwise
        as a new first sentence); a count with a different number is corrected.
        """
        for pattern, _num_group, _word_group in self.number_patterns:
            for match in re.finditer(pattern, original_desc, re.IGNORECASE):
                number = match.group(1)
                noun = match.group(2)

                # Accept the noun as written plus naive singular/plural
                # variants; longest first so the alternation prefers the
                # fullest match.
                variants = sorted({noun, noun.rstrip('s'), noun + 's'} - {""},
                                  key=len, reverse=True)
                counted_noun = re.compile(
                    r'\b(\d+)(\s+)(' + '|'.join(re.escape(v) for v in variants) + r')\b',
                    re.IGNORECASE)

                first_match = counted_noun.search(enhanced_desc)
                if first_match is None:
                    # Reuse the noun exactly as written in the original; it
                    # already agrees with the number ("3 people", "1 person").
                    with_article = self._insert_after_leading_article(
                        enhanced_desc, f"scene with {number} {noun}")
                    if with_article is not None:
                        enhanced_desc = with_article
                    else:
                        enhanced_desc = f"The scene includes {number} {noun}. " + enhanced_desc
                elif first_match.group(1) != number:
                    # Present with a different number: correct every occurrence.
                    enhanced_desc = counted_noun.sub(
                        lambda m: number + m.group(2) + m.group(3), enhanced_desc)

        return enhanced_desc

    def _check_perspective_consistency(self, original_desc: str, enhanced_desc: str) -> str:
        """Restore the original viewing perspective if the enhanced text lost it."""
        if not enhanced_desc:
            return enhanced_desc

        # Perspective of the original: the first type with a matching term.
        original_lower = original_desc.lower()
        original_perspective = None
        for persp, terms in self.perspective_terms.items():
            if any(term in original_lower for term in terms):
                original_perspective = persp
                break

        if original_perspective is None:
            return enhanced_desc

        enhanced_lower = enhanced_desc.lower()
        if any(term in enhanced_lower for term in self.perspective_terms[original_perspective]):
            return enhanced_desc

        prefix = self.perspective_prefixes.get(original_perspective, "")
        if prefix:
            # The description now continues the prefix clause, so its first
            # letter is lower-cased.
            enhanced_desc = prefix + enhanced_desc[0].lower() + enhanced_desc[1:]

        return enhanced_desc

    def _check_scene_type_consistency(self, enhanced_desc: str, scene_type: str) -> str:
        """
        Make sure the scene type is mentioned in the description.

        Underscores in the scene type are shown as spaces, as in
        ``ensure_scene_type_consistency``.
        """
        if not scene_type:
            return enhanced_desc

        readable_type = scene_type.replace('_', ' ')
        if readable_type.lower() == "unknown" or readable_type.lower() in enhanced_desc.lower():
            return enhanced_desc

        if enhanced_desc.startswith(("This ", "The ")):
            if "scene" in enhanced_desc[:15].lower():
                # Avoid "This kitchen scene ...": replace the generic word.
                return enhanced_desc.replace("scene", readable_type.lower(), 1)
            # Only the leading article is touched; later sentences stay as they are.
            return self._insert_after_leading_article(enhanced_desc, readable_type)

        return f"This {readable_type} " + enhanced_desc

    def _ensure_appropriate_length(self, enhanced_desc: str) -> str:
        """Truncate the description to at most ``MAX_WORDS`` words, preferably at a sentence end."""
        words = enhanced_desc.split()
        if len(words) > self.MAX_WORDS:
            truncated = ' '.join(words[:self.MAX_WORDS])
            # Cut at the last sentence terminator inside the limit, if any.
            last_period = max(truncated.rfind('.'), truncated.rfind('!'), truncated.rfind('?'))

            if last_period > 0:
                enhanced_desc = truncated[:last_period + 1]
            else:
                enhanced_desc = truncated + '.'

        return enhanced_desc

    def ensure_scene_type_consistency(self,
                                      description: str,
                                      scene_type: str,
                                      original_desc: str) -> str:
        """
        Make the scene type used in the description match the given scene type.

        Args:
            description: Description to check.
            scene_type: Required scene type (underscores are shown as spaces).
            original_desc: Original description, used as reference.

        Returns:
            str: Description with a consistent scene type; the current
            description if the check fails.
        """
        try:
            self.logger.debug("Ensuring scene type consistency")
            scene_type = scene_type.replace('_', ' ')

            # Replace prohibited scene words not supported by the original.
            for word in self.prohibited_scene_words:
                if (word in description.lower()
                        and word not in original_desc.lower()
                        and word not in scene_type.lower()):
                    pattern = re.compile(r'\b' + word + r'\b', re.IGNORECASE)
                    # A callable keeps the scene type from being read as a
                    # replacement template.
                    description = pattern.sub(lambda _m: scene_type, description)

            # Make sure the scene type is mentioned at all.
            if scene_type.lower() not in description.lower():
                for general_term in ["scene", "area", "place", "location"]:
                    if general_term in description.lower():
                        pattern = re.compile(r'\b' + general_term + r'\b', re.IGNORECASE)
                        description = pattern.sub(lambda _m: scene_type, description, count=1)
                        break
                else:
                    # No generic term found: add the scene type at the start.
                    if description.startswith("The "):
                        description = description.replace("The ", f"The {scene_type} ", 1)
                    elif description.startswith("This "):
                        description = description.replace("This ", f"This {scene_type} ", 1)
                    else:
                        description = f"This {scene_type} " + description

            self.logger.debug("Scene type consistency ensured")
            return description

        except Exception as e:
            error_msg = f"Scene type consistency check failed: {str(e)}"
            self.logger.error(error_msg)
            self.logger.error(traceback.format_exc())
            return description

    def extract_perspective_from_description(self, description: str) -> str:
        """
        Extract the perspective term mentioned in a description.

        Args:
            description: Original scene description.

        Returns:
            str: The first perspective term found, or "" if there is none.
        """
        try:
            lowered = description.lower()
            for terms in self.perspective_terms.values():
                for term in terms:
                    if term.lower() in lowered:
                        self.logger.debug(f"Perspective detected: {term}")
                        return term

            return ""

        except Exception as e:
            self.logger.error(f"Perspective extraction failed: {str(e)}")
            return ""

    def extract_objects_from_description(self, description: str) -> List[str]:
        """
        Extract "<number> <object>" mentions from a description.

        Args:
            description: Original scene description.

        Returns:
            List[str]: Mentions such as "3 people", grouped by pattern.
        """
        try:
            extracted_objects = [
                f"{match.group(1)} {match.group(2)}"
                for pattern in self.number_patterns
                for match in re.finditer(pattern[0], description, re.IGNORECASE)
            ]

            self.logger.debug(f"Extracted {len(extracted_objects)} objects from description")
            return extracted_objects

        except Exception as e:
            self.logger.error(f"Object extraction failed: {str(e)}")
            return []

    def validate_response_completeness(self, response: str) -> Tuple[bool, str]:
        """
        Check whether a response looks complete.

        Args:
            response: Response to validate.

        Returns:
            Tuple[bool, str]: (is complete, explanation).
        """
        try:
            if len(response) < 100:
                return False, "Response too short"

            # Short responses must contain a period near the end.
            if len(response) < 200 and "." not in response[-30:]:
                return False, "No proper sentence ending"

            # Dangling phrase at the end (trailing whitespace ignored).
            incomplete_phrases = ("in the", "with the", "and the")
            if response.rstrip().endswith(incomplete_phrases):
                return False, "Ends with incomplete phrase"

            return True, "Response is complete"

        except Exception as e:
            self.logger.error(f"Response completeness validation failed: {str(e)}")
            return False, "Validation error"

    def get_validator_info(self) -> Dict[str, Any]:
        """
        Return information about the validator configuration.

        Returns:
            Dict[str, Any]: Sizes of the rule sets and the initialisation status.
        """
        return {
            "location_terms_count": len(self.location_terms),
            "cultural_terms_count": len(self.cultural_terms),
            "perspective_types_count": len(self.perspective_terms),
            "number_patterns_count": len(self.number_patterns),
            "prohibited_words_count": len(self.prohibited_scene_words),
            "initialization_status": "success"
        }
viewpoint_detector.py ADDED
@@ -0,0 +1,437 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import traceback
3
+ from typing import Dict, List, Tuple, Optional
4
+ import numpy as np
5
+
6
class ViewpointDetectionError(Exception):
    """Raised to signal a failure during viewpoint detection."""
9
+
10
+
11
class ViewpointDetector:
    """
    Viewpoint detector: classifies the camera viewpoint of an image from the
    layout of its detected objects.

    The class inspects where detected objects sit in the frame, how much
    their sizes vary and which positional patterns they form. Crowded
    pedestrian crossings are checked first, before the generic rules run.
    """

    # class_id value that the detection dictionaries use for people.
    _PERSON_CLASS_ID = 0

    def __init__(self,
                 aerial_threshold: float = 0.7,
                 aerial_size_variance_threshold: float = 0.15,
                 low_angle_threshold: float = 0.3,
                 vertical_size_ratio_threshold: float = 1.8,
                 elevated_threshold: float = 0.6,
                 elevated_top_threshold: float = 0.3,
                 crosswalk_position_tolerance: float = 0.1,
                 crosswalk_axis_tolerance: float = 0.15,
                 min_people_for_crosswalk: int = 8,
                 min_people_for_aerial: int = 10):
        """
        Initialise the viewpoint detector.

        Args:
            aerial_threshold: Share of objects in "top" regions that must be
                exceeded for the standard aerial rule.
            aerial_size_variance_threshold: Upper bound on the relative size
                variance for the standard aerial rule.
            low_angle_threshold: Share of objects in "top" regions that must
                be exceeded for the low-angle rule.
            vertical_size_ratio_threshold: Average height/width ratio that
                must be exceeded for the low-angle rule.
            elevated_threshold: Share of objects in "bottom" regions that must
                be exceeded for the elevated rule.
            elevated_top_threshold: Upper bound on the share of objects in
                "top" regions for the elevated rule.
            crosswalk_position_tolerance: Stored in ``viewpoint_params``; not
                read by any method of this class.
            crosswalk_axis_tolerance: Maximum distance from the centroid axes
                for a pedestrian to count as lying on the cross.
            min_people_for_crosswalk: Minimum number of people required
                before crosswalk detection is attempted.
            min_people_for_aerial: Minimum number of people required before
                the pedestrian-distribution aerial check is attempted.
        """
        self.logger = logging.getLogger(self.__class__.__name__)

        # All tunable thresholds live in one dictionary.
        self.viewpoint_params = {
            "aerial_threshold": aerial_threshold,
            "aerial_size_variance_threshold": aerial_size_variance_threshold,
            "low_angle_threshold": low_angle_threshold,
            "vertical_size_ratio_threshold": vertical_size_ratio_threshold,
            "elevated_threshold": elevated_threshold,
            "elevated_top_threshold": elevated_top_threshold,
            "crosswalk_position_tolerance": crosswalk_position_tolerance,
            "crosswalk_axis_tolerance": crosswalk_axis_tolerance,
            "min_people_for_crosswalk": min_people_for_crosswalk,
            "min_people_for_aerial": min_people_for_aerial
        }

        self.logger.info("ViewpointDetector initialized with parameters: %s", self.viewpoint_params)

    def detect_viewpoint(self, detected_objects: List[Dict]) -> str:
        """
        Detect the viewpoint type of an image.

        Args:
            detected_objects: Detected objects; the checks read the keys
                "class_id", "region", "normalized_center", "normalized_area"
                and "normalized_size" when present.

        Returns:
            str: One of 'aerial', 'low_angle', 'elevated', 'eye_level'.
            'eye_level' is also the fallback for empty input and for any
            unexpected error.
        """
        try:
            if not detected_objects:
                self.logger.warning("No detected objects provided for viewpoint detection")
                return "eye_level"

            self.logger.info(f"Starting viewpoint detection with {len(detected_objects)} objects")

            # Crosswalk patterns are checked first and reported as aerial.
            if self._detect_crosswalk_pattern(detected_objects):
                self.logger.info("Crosswalk pattern detected - returning aerial viewpoint")
                return "aerial"

            # Next: aerial view inferred from how pedestrians are spread out.
            if self._detect_aerial_from_pedestrian_distribution(detected_objects):
                self.logger.info("Aerial viewpoint detected from pedestrian distribution")
                return "aerial"

            # Otherwise fall through to the generic metric-based rules.
            return self._detect_standard_viewpoint(detected_objects)

        except Exception as e:
            error_msg = f"Error during viewpoint detection: {str(e)}"
            self.logger.error(f"{error_msg}\n{traceback.format_exc()}")
            return "eye_level"  # default value

    def _people(self, detected_objects: List[Dict]) -> List[Dict]:
        """Return only the detections whose class_id marks a person."""
        return [obj for obj in detected_objects
                if obj.get("class_id") == self._PERSON_CLASS_ID]

    def _detect_crosswalk_pattern(self, detected_objects: List[Dict]) -> bool:
        """
        Detect an intersection / zebra-crossing pattern among pedestrians.

        Args:
            detected_objects: Detected objects.

        Returns:
            bool: True when the pedestrian positions form a cross or
            intersecting linear clusters.
        """
        try:
            people_objs = self._people(detected_objects)

            if len(people_objs) < self.viewpoint_params["min_people_for_crosswalk"]:
                return False

            # Only pedestrians that carry a position can take part.
            people_positions = [obj["normalized_center"] for obj in people_objs
                                if "normalized_center" in obj]

            if len(people_positions) < 4:
                return False

            # Cross-shaped distribution
            if self._detect_cross_pattern(people_positions):
                self.logger.debug("Cross pattern detected in pedestrian positions")
                return True

            # Intersecting linear clusters
            if self._detect_linear_crosswalk_clusters(people_positions):
                self.logger.debug("Linear crosswalk clusters detected")
                return True

            return False

        except Exception as e:
            self.logger.warning(f"Error in crosswalk pattern detection: {str(e)}")
            return False

    def _detect_cross_pattern(self, positions: List[Tuple[float, float]]) -> bool:
        """
        Detect a cross-shaped distribution of positions.

        Args:
            positions: Object positions [(x, y), ...].

        Returns:
            bool: True when both axes span more than 0.5, the spans are
            similar, and at least 60% of the points lie close to the
            horizontal or vertical line through the centroid.
        """
        try:
            x_coords = [pos[0] for pos in positions]
            y_coords = [pos[1] for pos in positions]

            x_range = max(x_coords) - min(x_coords)
            y_range = max(y_coords) - min(y_coords)

            # Both directions must cover a large, similar extent.
            # (y_range > 0.5 here also makes the division below safe.)
            if x_range <= 0.5 or y_range <= 0.5:
                return False

            if not (0.7 < (x_range / y_range) < 1.3):
                return False

            # Count the points lying near either axis through the centroid.
            center_x = np.mean(x_coords)
            center_y = np.mean(y_coords)
            axis_tolerance = self.viewpoint_params["crosswalk_axis_tolerance"]

            close_to_axis_count = sum(
                1 for x, y in positions
                if abs(x - center_x) < axis_tolerance or abs(y - center_y) < axis_tolerance
            )

            # Enough points near the axes means an intersection.
            axis_ratio = close_to_axis_count / len(positions)
            return axis_ratio >= 0.6

        except Exception as e:
            self.logger.warning(f"Error detecting cross pattern: {str(e)}")
            return False

    def _detect_linear_crosswalk_clusters(self, positions: List[Tuple[float, float]]) -> bool:
        """
        Detect intersecting linear clusters (crossing zebra stripes).

        Args:
            positions: Object positions [(x, y), ...].

        Returns:
            bool: True when both the x and the y coordinates form at least
            two clusters each.
        """
        try:
            x_clusters = self._detect_linear_clusters([pos[0] for pos in positions])
            y_clusters = self._detect_linear_clusters([pos[1] for pos in positions])

            # Several clusters along both axes suggest crossing crosswalks.
            return len(x_clusters) >= 2 and len(y_clusters) >= 2

        except Exception as e:
            self.logger.warning(f"Error detecting linear crosswalk clusters: {str(e)}")
            return False

    def _detect_linear_clusters(self, coords: List[float], threshold: float = 0.05) -> List[List[float]]:
        """
        Group one-dimensional coordinates into clusters.

        Args:
            coords: One-dimensional coordinates.
            threshold: Maximum gap between neighbouring sorted values that
                still belong to the same cluster.

        Returns:
            List[List[float]]: Clusters with at least two members, in
            ascending order. Empty on empty input or on error.
        """
        if not coords:
            return []

        try:
            ordered = sorted(coords)
            clusters = []
            current_cluster = [ordered[0]]

            for previous, value in zip(ordered, ordered[1:]):
                if value - previous < threshold:
                    current_cluster.append(value)
                    continue
                # Gap too large: close the running cluster (singletons are dropped).
                if len(current_cluster) >= 2:
                    clusters.append(current_cluster)
                current_cluster = [value]

            # Close the final cluster.
            if len(current_cluster) >= 2:
                clusters.append(current_cluster)

            return clusters

        except Exception as e:
            self.logger.warning(f"Error in linear cluster detection: {str(e)}")
            return []

    def _detect_aerial_from_pedestrian_distribution(self, detected_objects: List[Dict]) -> bool:
        """
        Detect an aerial view from how pedestrians are spread over regions.

        Args:
            detected_objects: Detected objects.

        Returns:
            bool: True when there are enough people, at least four regions
            hold two or more of them, and the per-region counts are even.
        """
        try:
            people_objs = self._people(detected_objects)

            if len(people_objs) < self.viewpoint_params["min_people_for_aerial"]:
                return False

            # Number of pedestrians per region.
            people_region_counts = {}
            for obj in people_objs:
                region = obj.get("region", "unknown")
                people_region_counts[region] = people_region_counts.get(region, 0) + 1

            # Pedestrians must occupy several regions with 2+ people each.
            regions_with_multiple_people = sum(
                1 for count in people_region_counts.values() if count >= 2
            )
            if regions_with_multiple_people < 4:
                return False

            region_counts = list(people_region_counts.values())
            if not region_counts:
                return False

            region_counts_mean = np.mean(region_counts)
            if region_counts_mean <= 0:
                return False

            # Evenness measure: variance divided by mean (index of
            # dispersion). The 0.5 cut-off applies to this quantity, not to
            # the std/mean coefficient of variation.
            dispersion_index = np.var(region_counts) / region_counts_mean
            return bool(dispersion_index < 0.5)

        except Exception as e:
            self.logger.warning(f"Error in aerial detection from pedestrian distribution: {str(e)}")
            return False

    def _detect_standard_viewpoint(self, detected_objects: List[Dict]) -> str:
        """
        Generic metric-based viewpoint classification.

        Args:
            detected_objects: Detected objects.

        Returns:
            str: The first matching viewpoint in the order aerial, low_angle,
            elevated; 'eye_level' when none matches or on error.
        """
        try:
            metrics = self._calculate_viewpoint_metrics(detected_objects)

            if self._is_aerial_viewpoint(metrics):
                return "aerial"
            if self._is_low_angle_viewpoint(metrics):
                return "low_angle"
            if self._is_elevated_viewpoint(metrics):
                return "elevated"
            return "eye_level"

        except Exception as e:
            self.logger.warning(f"Error in standard viewpoint detection: {str(e)}")
            return "eye_level"

    def _calculate_viewpoint_metrics(self, detected_objects: List[Dict]) -> Dict:
        """
        Compute the metrics used by the standard viewpoint rules.

        Args:
            detected_objects: Detected objects.

        Returns:
            Dict: "top_ratio", "bottom_ratio", "size_variance_coefficient",
            "avg_height_width_ratio" and "total_objects". On error the ratios
            and the coefficient are 0 and the average ratio is 1.0.
        """
        total_objects = len(detected_objects)
        top_region_count = 0
        bottom_region_count = 0
        sizes = []
        height_width_ratios = []

        try:
            for obj in detected_objects:
                # A missing, None or non-string region counts as "no region"
                # instead of raising and discarding the metrics of every
                # other object.
                region = obj.get("region")
                if not isinstance(region, str):
                    region = ""

                if "top" in region:
                    top_region_count += 1
                elif "bottom" in region:
                    bottom_region_count += 1

                # Size information
                if "normalized_area" in obj:
                    sizes.append(obj["normalized_area"])

                # Height/width ratio
                if "normalized_size" in obj:
                    width, height = obj["normalized_size"]
                    if width > 0:
                        height_width_ratios.append(height / width)

            top_ratio = top_region_count / total_objects if total_objects > 0 else 0
            bottom_ratio = bottom_region_count / total_objects if total_objects > 0 else 0

            # Relative size variance: var / mean^2 (needs at least two sizes).
            size_variance_coefficient = 0
            if len(sizes) > 1:
                mean_size = np.mean(sizes)
                if mean_size > 0:
                    size_variance_coefficient = float(np.var(sizes) / (mean_size ** 2))

            # Average height/width ratio; 1.0 when no sizes are available.
            avg_height_width_ratio = float(np.mean(height_width_ratios)) if height_width_ratios else 1.0

            metrics = {
                "top_ratio": top_ratio,
                "bottom_ratio": bottom_ratio,
                "size_variance_coefficient": size_variance_coefficient,
                "avg_height_width_ratio": avg_height_width_ratio,
                "total_objects": total_objects
            }

            self.logger.debug(f"Calculated viewpoint metrics: {metrics}")
            return metrics

        except Exception as e:
            self.logger.error(f"Error calculating viewpoint metrics: {str(e)}")
            return {
                "top_ratio": 0,
                "bottom_ratio": 0,
                "size_variance_coefficient": 0,
                "avg_height_width_ratio": 1.0,
                "total_objects": total_objects
            }

    def _is_aerial_viewpoint(self, metrics: Dict) -> bool:
        """Return True when the metrics satisfy the aerial rule."""
        return (metrics["size_variance_coefficient"] < self.viewpoint_params["aerial_size_variance_threshold"] and
                metrics["bottom_ratio"] < 0.3 and
                metrics["top_ratio"] > self.viewpoint_params["aerial_threshold"])

    def _is_low_angle_viewpoint(self, metrics: Dict) -> bool:
        """Return True when the metrics satisfy the low-angle rule."""
        return (metrics["avg_height_width_ratio"] > self.viewpoint_params["vertical_size_ratio_threshold"] and
                metrics["top_ratio"] > self.viewpoint_params["low_angle_threshold"])

    def _is_elevated_viewpoint(self, metrics: Dict) -> bool:
        """Return True when the metrics satisfy the elevated rule."""
        return (metrics["bottom_ratio"] > self.viewpoint_params["elevated_threshold"] and
                metrics["top_ratio"] < self.viewpoint_params["elevated_top_threshold"])

    def get_viewpoint_confidence(self, detected_objects: List[Dict]) -> Tuple[str, float]:
        """
        Return the detected viewpoint together with a confidence value.

        Args:
            detected_objects: Detected objects.

        Returns:
            Tuple[str, float]: (viewpoint, confidence). The confidence is a
            fixed value per outcome: 0.95 for aerial backed by a crosswalk
            pattern, 0.8 for other aerial results, 0.7 for eye_level, 0.85
            otherwise, and 0.3 for the error fallback.
        """
        try:
            viewpoint = self.detect_viewpoint(detected_objects)

            if viewpoint == "aerial" and self._detect_crosswalk_pattern(detected_objects):
                confidence = 0.95  # crosswalk pattern is the strongest signal
            elif viewpoint == "aerial":
                confidence = 0.8
            elif viewpoint == "eye_level":
                confidence = 0.7  # default viewpoint, lower confidence
            else:
                confidence = 0.85

            self.logger.info(f"Viewpoint detection result: {viewpoint} (confidence: {confidence:.2f})")
            return viewpoint, confidence

        except Exception as e:
            # Report the cause instead of discarding it.
            self.logger.warning(f"Using fallback viewpoint due to detection error: {str(e)}")
            return "eye_level", 0.3
visualization_helper.py CHANGED
@@ -16,7 +16,6 @@ class VisualizationHelper:
16
  filter_classes: Optional[List[int]] = None) -> Optional[Image.Image]:
17
  """
18
  Visualize detection results on a single image
19
-
20
  Args:
21
  image: Image path or numpy array
22
  result: Detection result object
 
16
  filter_classes: Optional[List[int]] = None) -> Optional[Image.Image]:
17
  """
18
  Visualize detection results on a single image
 
19
  Args:
20
  image: Image path or numpy array
21
  result: Detection result object
zone_evaluator.py ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import logging
3
+ import traceback
4
+ import numpy as np
5
+ from typing import Dict, List, Any, Optional
6
+
7
logger = logging.getLogger(__name__)


class ZoneEvaluator:
    """
    Evaluates whether functional-zone identification is worthwhile and
    scores how functionally related the detected objects are.
    """

    def __init__(self):
        """Initialise the zone evaluator with its relationship score table."""
        # Functional relationship scores between pairs of class ids.
        # A higher score means the two objects are more likely to belong to
        # the same functional zone.
        self.relationship_pairs = {
            # Furniture combinations
            frozenset([56, 60]): 1.0,  # chair + table (dining/work area)
            frozenset([57, 62]): 0.9,  # sofa + TV (living area)
            frozenset([59, 58]): 0.7,  # bed + plant (bedroom decor)

            # Work-related combinations
            frozenset([63, 66]): 0.9,  # laptop + keyboard (workspace)
            frozenset([63, 64]): 0.8,  # laptop + mouse (workspace)
            frozenset([60, 63]): 0.8,  # table + laptop (workspace)

            # Kitchen-related combinations
            frozenset([68, 72]): 0.9,  # microwave + refrigerator (kitchen)
            frozenset([69, 71]): 0.8,  # oven + sink (kitchen)

            # Dining-related combinations
            frozenset([60, 40]): 0.8,  # table + wine glass (dining)
            frozenset([60, 41]): 0.8,  # table + cup (dining)
            frozenset([56, 40]): 0.7,  # chair + wine glass (dining)

            # Traffic-related combinations
            frozenset([2, 9]): 0.8,  # car + traffic light (traffic)
            frozenset([0, 9]): 0.7,  # pedestrian + traffic light (crosswalk)
        }

        logger.info("ZoneEvaluator initialized with predefined relationship pairs")

    @staticmethod
    def _empty_clustering_result() -> Dict:
        """Return a fresh 'no clusters found' result dictionary."""
        return {
            "has_clusters": False,
            "cluster_count": 0,
            "cluster_regions": [],
            "clustering_score": 0.0
        }

    def evaluate_zone_identification_feasibility(self, detected_objects: List[Dict], scene_type: str) -> bool:
        """
        Decide whether functional-zone identification should be attempted.

        The decision combines object count, confidence quality, functional
        relationships and spatial diversity into one weighted score and
        compares it with a scene-dependent threshold.

        Args:
            detected_objects: Detected objects (reads "confidence",
                "class_id" and "region").
            scene_type: Scene type used to pick the threshold.

        Returns:
            True when zone identification is considered feasible; False
            otherwise, including on any error.
        """
        try:
            if len(detected_objects) < 2:
                logger.info("Insufficient objects for zone identification (minimum 2 required)")
                return False

            # Objects with confidence >= 0.6 feed the confidence score.
            high_conf_objects = [obj for obj in detected_objects if obj.get("confidence", 0) >= 0.6]
            # Objects with confidence >= 0.4 gate the whole analysis.
            medium_conf_objects = [obj for obj in detected_objects if obj.get("confidence", 0) >= 0.4]

            # At least two reasonably confident objects are required.
            if len(medium_conf_objects) < 2:
                logger.info("Insufficient medium confidence objects for zone identification")
                return False

            functional_relationships = self.calculate_functional_relationships(detected_objects)
            spatial_diversity = self.calculate_spatial_diversity(detected_objects)

            # Object count (weight 30%), saturating at five objects.
            object_count_score = min(len(detected_objects) / 5.0, 1.0) * 0.3

            # Share of high-confidence objects (weight 25%).
            confidence_score = len(high_conf_objects) / max(len(detected_objects), 1) * 0.25

            # Functional relationships (weight 25%).
            relationship_score = functional_relationships * 0.25

            # Spatial diversity (weight 20%).
            diversity_score = spatial_diversity * 0.20

            feasibility_score = object_count_score + confidence_score + relationship_score + diversity_score

            # Scene-dependent threshold.
            complexity_threshold = self.get_complexity_threshold(scene_type)

            is_feasible = feasibility_score >= complexity_threshold

            logger.info(f"Zone identification feasibility: {is_feasible} (score: {feasibility_score:.3f}, threshold: {complexity_threshold:.3f})")
            logger.debug(f"Score breakdown - objects: {object_count_score:.3f}, confidence: {confidence_score:.3f}, relationships: {relationship_score:.3f}, diversity: {diversity_score:.3f}")

            return is_feasible

        except Exception as e:
            logger.error(f"Error evaluating zone identification feasibility: {str(e)}")
            logger.error(traceback.format_exc())
            return False

    def calculate_functional_relationships(self, detected_objects: List[Dict]) -> float:
        """
        Score the functional relationships present among detected objects.

        Args:
            detected_objects: Detected objects (reads "class_id").

        Returns:
            Relationship score in [0.0, 1.0]: the summed score of every
            predefined pair whose two class ids were both detected, divided
            by the summed score of all predefined pairs. 0.0 on error.
        """
        try:
            detected_class_ids = set(obj.get("class_id") for obj in detected_objects)
            max_possible_score = 0.0
            actual_score = 0.0

            # NOTE(review): the normaliser is the total over ALL predefined
            # pairs, so a scene matching a single pair scores low - confirm
            # this is the intended scale.
            for pair, score in self.relationship_pairs.items():
                max_possible_score += score
                if pair.issubset(detected_class_ids):
                    actual_score += score
                    logger.debug(f"Found functional relationship: {pair} with score {score}")

            relationship_score = actual_score / max_possible_score if max_possible_score > 0 else 0.0

            logger.info(f"Functional relationships calculated: {relationship_score:.3f} (found {actual_score:.1f}/{max_possible_score:.1f} possible relationships)")
            return relationship_score

        except Exception as e:
            logger.error(f"Error calculating functional relationships: {str(e)}")
            logger.error(traceback.format_exc())
            return 0.0

    def calculate_spatial_diversity(self, detected_objects: List[Dict]) -> float:
        """
        Score how widely the objects are spread over regions.

        Args:
            detected_objects: Detected objects (reads "region", defaulting
                to "center").

        Returns:
            Diversity score in [0.0, 1.0]: number of distinct regions divided
            by 2, capped at 1.0. 0.0 on error.
        """
        try:
            regions = set(obj.get("region", "center") for obj in detected_objects)
            unique_regions = len(regions)

            # Two or more distinct regions already give the maximum score.
            diversity_score = min(unique_regions / 2.0, 1.0)

            logger.info(f"Spatial diversity calculated: {diversity_score:.3f} (objects distributed across {unique_regions} regions)")
            return diversity_score

        except Exception as e:
            logger.error(f"Error calculating spatial diversity: {str(e)}")
            logger.error(traceback.format_exc())
            return 0.0

    def get_complexity_threshold(self, scene_type: str) -> float:
        """
        Return the feasibility threshold for a scene type.

        Args:
            scene_type: Scene type.

        Returns:
            0.65 for simple scenes, 0.45 for complex scenes, 0.55 for any
            other scene type or on error.
        """
        try:
            # Simple scenes need a higher score before zones are split.
            simple_scenes = ["bedroom", "bathroom", "closet"]

            # Complex scenes may be split at a lower score.
            complex_scenes = ["living_room", "kitchen", "office_workspace", "dining_area"]

            if scene_type in simple_scenes:
                threshold = 0.65  # high threshold, avoids over-segmentation
                logger.debug(f"Using high threshold {threshold} for simple scene: {scene_type}")
            elif scene_type in complex_scenes:
                threshold = 0.45  # low threshold, allows reasonable splitting
                logger.debug(f"Using low threshold {threshold} for complex scene: {scene_type}")
            else:
                threshold = 0.55  # medium threshold
                logger.debug(f"Using medium threshold {threshold} for scene: {scene_type}")

            return threshold

        except Exception as e:
            logger.error(f"Error getting complexity threshold for scene '{scene_type}': {str(e)}")
            logger.error(traceback.format_exc())
            return 0.55  # default medium threshold

    def analyze_object_clustering(self, detected_objects: List[Dict]) -> Dict:
        """
        Analyse how detected objects cluster by region.

        Args:
            detected_objects: Detected objects (reads "region", defaulting
                to "unknown").

        Returns:
            Dictionary with "has_clusters" (two or more regions hold at
            least two objects), "cluster_count", "cluster_regions" and
            "clustering_score" (clustered regions / occupied regions). The
            all-empty result is returned for fewer than three objects and
            on error.
        """
        try:
            clustering_result = self._empty_clustering_result()

            if len(detected_objects) < 3:
                logger.info("Insufficient objects for clustering analysis")
                return clustering_result

            # Number of objects per region.
            region_counts = {}
            for obj in detected_objects:
                region = obj.get("region", "unknown")
                region_counts[region] = region_counts.get(region, 0) + 1

            # Regions holding a meaningful cluster (two or more objects).
            significant_regions = [region for region, count in region_counts.items() if count >= 2]

            # Every key in region_counts has a count of at least one, so the
            # number of occupied regions is simply the number of keys.
            total_regions_with_objects = len(region_counts)
            clustering_score = len(significant_regions) / max(total_regions_with_objects, 1)

            clustering_result.update({
                "has_clusters": len(significant_regions) >= 2,
                "cluster_count": len(significant_regions),
                "cluster_regions": significant_regions,
                "clustering_score": clustering_score
            })

            logger.info(f"Object clustering analysis: {len(significant_regions)} clusters found in regions {significant_regions}")
            return clustering_result

        except Exception as e:
            logger.error(f"Error analyzing object clustering: {str(e)}")
            logger.error(traceback.format_exc())
            return self._empty_clustering_result()