DawnC commited on
Commit
62cd166
·
verified ·
1 Parent(s): 19c1b92

Upload object_description_generator.py

Browse files
Files changed (1) hide show
  1. object_description_generator.py +235 -121
object_description_generator.py CHANGED
@@ -1,6 +1,5 @@
1
  import logging
2
  import traceback
3
- import re
4
  from typing import Dict, List, Tuple, Optional, Any
5
  import numpy as np
6
 
@@ -389,62 +388,177 @@ class ObjectDescriptionGenerator:
389
 
390
  def optimize_object_description(self, description: str) -> str:
391
  """
392
- 優化物件描述,避免重複列舉相同物件
 
 
 
 
393
 
394
  Args:
395
- description: 原始描述文本
396
 
397
  Returns:
398
- str: 優化後的描述文本
399
  """
400
  try:
401
  import re
402
-
403
- # 處理床鋪重複描述
404
- if "bed in the room" in description:
405
- description = description.replace("a bed in the room", "a bed")
406
-
407
- # 處理重複的物件列表
408
- object_lists = re.findall(r'with ([^\.]+?)(?:\.|\band\b)', description)
409
-
 
 
 
 
 
410
  for obj_list in object_lists:
411
- # 計算每個物件出現次數
412
- items = re.findall(r'([a-zA-Z\s]+)(?:,|\band\b|$)', obj_list)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
413
  item_counts = {}
414
-
415
- for item in items:
 
416
  item = item.strip()
417
- if item and item not in ["and", "with"]:
418
- if item not in item_counts:
419
- item_counts[item] = 0
420
- item_counts[item] += 1
421
-
422
- # 生成優化後的物件列表
 
 
 
 
423
  if item_counts:
424
  new_items = []
 
425
  for item, count in item_counts.items():
426
  if count > 1:
427
- new_items.append(f"{count} {item}s")
 
 
428
  else:
 
429
  new_items.append(item)
430
-
431
- # 格式化新列表
 
432
  if len(new_items) == 1:
433
  new_list = new_items[0]
434
  elif len(new_items) == 2:
435
  new_list = f"{new_items[0]} and {new_items[1]}"
436
  else:
 
437
  new_list = ", ".join(new_items[:-1]) + f", and {new_items[-1]}"
438
-
439
- # 替換原始列表
 
440
  description = description.replace(obj_list, new_list)
441
-
442
  return description
443
-
444
  except Exception as e:
445
  self.logger.warning(f"Error optimizing object description: {str(e)}")
446
  return description
447
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
448
  def generate_dynamic_everyday_description(self,
449
  detected_objects: List[Dict],
450
  lighting_info: Optional[Dict] = None,
@@ -640,15 +754,15 @@ class ObjectDescriptionGenerator:
640
  if object_statistics and class_name in object_statistics:
641
  actual_count = object_statistics[class_name]["count"]
642
  formatted_name_with_exact_count = self._format_object_count_description(
643
- normalized_class_name,
644
  actual_count,
645
- scene_type=scene_type
646
  )
647
  else:
648
  formatted_name_with_exact_count = self._format_object_count_description(
649
- normalized_class_name,
650
  count,
651
- scene_type=scene_type
652
  )
653
 
654
  if formatted_name_with_exact_count == "no specific objects clearly identified" or not formatted_name_with_exact_count:
@@ -747,67 +861,67 @@ class ObjectDescriptionGenerator:
747
  def _remove_repetitive_descriptors(self, description: str) -> str:
748
  """
749
  移除描述中的重複性和不適當的描述詞彙,特別是 "identical" 等詞彙
750
-
751
  Args:
752
  description: 原始描述文本
753
-
754
  Returns:
755
  str: 清理後的描述文本
756
  """
757
  try:
758
  import re
759
-
760
  # 定義需要移除或替換的模式
761
  cleanup_patterns = [
762
  # 移除 "identical" 描述模式
763
  (r'\b(\d+)\s+identical\s+([a-zA-Z\s]+)', r'\1 \2'),
764
  (r'\b(two|three|four|five|six|seven|eight|nine|ten|eleven|twelve)\s+identical\s+([a-zA-Z\s]+)', r'\1 \2'),
765
  (r'\bidentical\s+([a-zA-Z\s]+)', r'\1'),
766
-
767
  # 改善 "comprehensive arrangement" 等過於技術性的表達
768
  (r'\bcomprehensive arrangement of\b', 'arrangement of'),
769
  (r'\bcomprehensive view featuring\b', 'scene featuring'),
770
  (r'\bcomprehensive display of\b', 'display of'),
771
-
772
  # 簡化過度描述性的短語
773
  (r'\bpositioning around\s+(\d+)\s+identical\b', r'positioning around \1'),
774
  (r'\barranged around\s+(\d+)\s+identical\b', r'arranged around \1'),
775
  ]
776
-
777
  processed_description = description
778
  for pattern, replacement in cleanup_patterns:
779
  processed_description = re.sub(pattern, replacement, processed_description, flags=re.IGNORECASE)
780
-
781
  # 進一步清理可能的多餘空格
782
  processed_description = re.sub(r'\s+', ' ', processed_description).strip()
783
-
784
  self.logger.debug(f"Cleaned description: removed repetitive descriptors")
785
  return processed_description
786
-
787
  except Exception as e:
788
  self.logger.warning(f"Error removing repetitive descriptors: {str(e)}")
789
  return description
790
 
791
- def _format_object_count_description(self, class_name: str, count: int,
792
  scene_type: Optional[str] = None,
793
  detected_objects: Optional[List[Dict]] = None,
794
  avg_confidence: float = 0.0) -> str:
795
  """
796
  格式化物件數量描述的核心方法,整合空間排列、材質推斷和場景語境
797
-
798
  這個方法是整個物件描述系統的核心,它將多個子功能整合在一起:
799
  1. 數字到文字的轉換(避免阿拉伯數字)
800
  2. 基於場景的材質推斷
801
  3. 空間排列模式的描述
802
  4. 語境化的物件描述
803
-
804
  Args:
805
  class_name: 標準化後的類別名稱
806
  count: 物件數量
807
  scene_type: 場景類型,用於語境化描述
808
  detected_objects: 該類型的所有檢測物件,用於空間分析
809
  avg_confidence: 平均檢測置信度,影響材質推斷的可信度
810
-
811
  Returns:
812
  str: 完整的格式化數量描述
813
  """
@@ -817,14 +931,14 @@ class ObjectDescriptionGenerator:
817
 
818
  # 獲取基礎的複數形式
819
  plural_form = self._get_plural_form(class_name)
820
-
821
  # 單數情況的處理
822
  if count == 1:
823
- return self._format_single_object_description(class_name, scene_type,
824
  detected_objects, avg_confidence)
825
-
826
  # 複數情況的處理
827
- return self._format_multiple_objects_description(class_name, count, plural_form,
828
  scene_type, detected_objects, avg_confidence)
829
 
830
  except Exception as e:
@@ -832,55 +946,55 @@ class ObjectDescriptionGenerator:
832
  return f"{count} {class_name}s" if count > 1 else class_name
833
 
834
  def _format_single_object_description(self, class_name: str, scene_type: Optional[str],
835
- detected_objects: Optional[List[Dict]],
836
  avg_confidence: float) -> str:
837
  """
838
  處理單個物件的描述生成
839
-
840
  對於單個物件,我們重點在於通過材質推斷和位置描述來豐富描述內容,
841
  避免簡單的 "a chair" 這樣的描述,而是生成 "a wooden dining chair" 這樣的表達
842
-
843
  Args:
844
  class_name: 物件類別名稱
845
  scene_type: 場景類型
846
  detected_objects: 檢測物件列表
847
  avg_confidence: 平均置信度
848
-
849
  Returns:
850
  str: 單個物件的完整描述
851
  """
852
  article = "an" if class_name[0].lower() in 'aeiou' else "a"
853
-
854
  # 獲取材質描述符
855
  material_descriptor = self._get_material_descriptor(class_name, scene_type, avg_confidence)
856
-
857
  # 獲取位置或特徵描述符
858
  feature_descriptor = self._get_single_object_feature(class_name, scene_type, detected_objects)
859
-
860
  # 組合描述
861
  descriptors = []
862
  if material_descriptor:
863
  descriptors.append(material_descriptor)
864
  if feature_descriptor:
865
  descriptors.append(feature_descriptor)
866
-
867
  if descriptors:
868
  return f"{article} {' '.join(descriptors)} {class_name}"
869
  else:
870
  return f"{article} {class_name}"
871
 
872
  def _format_multiple_objects_description(self, class_name: str, count: int, plural_form: str,
873
- scene_type: Optional[str], detected_objects: Optional[List[Dict]],
874
  avg_confidence: float) -> str:
875
  """
876
  處理多個物件的描述生成
877
-
878
  對於多個物件,我們的重點是:
879
  1. 將數字轉換為文字表達
880
  2. 分析空間排列模式
881
  3. 添加適當的材質或功能描述
882
  4. 生成自然流暢的描述
883
-
884
  Args:
885
  class_name: 物件類別名稱
886
  count: 物件數量
@@ -888,17 +1002,17 @@ class ObjectDescriptionGenerator:
888
  scene_type: 場景類型
889
  detected_objects: 檢測物件列表
890
  avg_confidence: 平均置信度
891
-
892
  Returns:
893
  str: 多個物件的完整描述
894
  """
895
  # 數字到文字的轉換映射
896
  number_words = {
897
  2: "two", 3: "three", 4: "four", 5: "five", 6: "six",
898
- 7: "seven", 8: "eight", 9: "nine", 10: "ten",
899
  11: "eleven", 12: "twelve"
900
  }
901
-
902
  # 確定基礎數量表達
903
  if count in number_words:
904
  count_expression = number_words[count]
@@ -906,48 +1020,48 @@ class ObjectDescriptionGenerator:
906
  count_expression = "several"
907
  else:
908
  count_expression = "numerous"
909
-
910
  # 獲取材質或功能描述符
911
  material_descriptor = self._get_material_descriptor(class_name, scene_type, avg_confidence)
912
-
913
  # 獲取空間排列描述
914
- spatial_descriptor = self._get_spatial_arrangement_descriptor(class_name, scene_type,
915
  detected_objects, count)
916
-
917
  # 組合最終描述
918
  descriptors = []
919
  if material_descriptor:
920
  descriptors.append(material_descriptor)
921
-
922
  # 構建基礎描述
923
  base_description = f"{count_expression} {' '.join(descriptors)} {plural_form}".strip()
924
-
925
  # 添加空間排列信息
926
  if spatial_descriptor:
927
  return f"{base_description} {spatial_descriptor}"
928
  else:
929
  return base_description
930
 
931
- def _get_material_descriptor(self, class_name: str, scene_type: Optional[str],
932
  avg_confidence: float) -> Optional[str]:
933
  """
934
  基於場景語境和置信度進行材質推斷
935
-
936
  這個方法實現了智能的材質推斷,它不依賴複雜的圖像分析,
937
  而是基於常識和場景邏輯來推斷最可能的材質描述
938
-
939
  Args:
940
  class_name: 物件類別名稱
941
  scene_type: 場景類型
942
  avg_confidence: 檢測置信度,影響推斷的保守程度
943
-
944
  Returns:
945
  Optional[str]: 材質描述符,如果無法推斷則返回None
946
  """
947
  # 只有在置信度足夠高時才進行材質推斷
948
  if avg_confidence < 0.5:
949
  return None
950
-
951
  # 餐廳和用餐相關場景
952
  if scene_type and scene_type in ["dining_area", "restaurant", "upscale_dining", "cafe"]:
953
  material_mapping = {
@@ -957,7 +1071,7 @@ class ObjectDescriptionGenerator:
957
  "vase": "decorative"
958
  }
959
  return material_mapping.get(class_name)
960
-
961
  # 辦公場景
962
  elif scene_type and scene_type in ["office_workspace", "meeting_room", "conference_room"]:
963
  material_mapping = {
@@ -967,7 +1081,7 @@ class ObjectDescriptionGenerator:
967
  "book": "reference"
968
  }
969
  return material_mapping.get(class_name)
970
-
971
  # 客廳場景
972
  elif scene_type and scene_type in ["living_room"]:
973
  material_mapping = {
@@ -977,7 +1091,7 @@ class ObjectDescriptionGenerator:
977
  "vase": "decorative"
978
  }
979
  return material_mapping.get(class_name)
980
-
981
  # 室外場景
982
  elif scene_type and scene_type in ["city_street", "park_area", "parking_lot"]:
983
  material_mapping = {
@@ -986,7 +1100,7 @@ class ObjectDescriptionGenerator:
986
  "bicycle": "stationed"
987
  }
988
  return material_mapping.get(class_name)
989
-
990
  # 如果沒有特定的場景映射,返回通用描述符
991
  generic_mapping = {
992
  "chair": "comfortable",
@@ -994,30 +1108,30 @@ class ObjectDescriptionGenerator:
994
  "car": "parked",
995
  "person": "present"
996
  }
997
-
998
  return generic_mapping.get(class_name)
999
 
1000
  def _get_spatial_arrangement_descriptor(self, class_name: str, scene_type: Optional[str],
1001
- detected_objects: Optional[List[Dict]],
1002
  count: int) -> Optional[str]:
1003
  """
1004
  分析物件的空間排列模式並生成相應描述
1005
-
1006
  這個方法通過分析物件的位置分布來判斷排列模式,
1007
  然後根據物件類型和場景生成適當的空間描述
1008
-
1009
  Args:
1010
  class_name: 物件類別名稱
1011
  scene_type: 場景類型
1012
  detected_objects: 該類型的所有檢測物件
1013
  count: 物件數量
1014
-
1015
  Returns:
1016
  Optional[str]: 空間排列描述,如果無法分析則返回None
1017
  """
1018
  if not detected_objects or len(detected_objects) < 2:
1019
  return None
1020
-
1021
  try:
1022
  # 提取物件的標準化位置
1023
  positions = []
@@ -1025,17 +1139,17 @@ class ObjectDescriptionGenerator:
1025
  center = obj.get("normalized_center", [0.5, 0.5])
1026
  if isinstance(center, (list, tuple)) and len(center) >= 2:
1027
  positions.append(center)
1028
-
1029
  if len(positions) < 2:
1030
  return None
1031
-
1032
  # 分析排列模式
1033
  arrangement_pattern = self._analyze_arrangement_pattern(positions)
1034
-
1035
  # 根據物件類型和場景生成描述
1036
- return self._generate_arrangement_description(class_name, scene_type,
1037
  arrangement_pattern, count)
1038
-
1039
  except Exception as e:
1040
  self.logger.warning(f"Error analyzing spatial arrangement: {str(e)}")
1041
  return None
@@ -1043,43 +1157,43 @@ class ObjectDescriptionGenerator:
1043
  def _analyze_arrangement_pattern(self, positions: List[List[float]]) -> str:
1044
  """
1045
  分析位置點的排列模式
1046
-
1047
  這個方法使用簡單的幾何分析來判斷物件的排列類型,
1048
  幫助我們理解物件在空間中的組織方式
1049
-
1050
  Args:
1051
  positions: 標準化的位置座標列表
1052
-
1053
  Returns:
1054
  str: 排列模式類型(linear, clustered, scattered, circular等)
1055
  """
1056
  import numpy as np
1057
-
1058
  if len(positions) < 2:
1059
  return "single"
1060
-
1061
  # 轉換為numpy陣列便於計算
1062
  pos_array = np.array(positions)
1063
-
1064
  # 計算位置的分布特徵
1065
  x_coords = pos_array[:, 0]
1066
  y_coords = pos_array[:, 1]
1067
-
1068
  # 分析x和y方向的變異程度
1069
  x_variance = np.var(x_coords)
1070
  y_variance = np.var(y_coords)
1071
-
1072
  # 計算物件間的平均距離
1073
  distances = []
1074
  for i in range(len(positions)):
1075
  for j in range(i + 1, len(positions)):
1076
- dist = np.sqrt((positions[i][0] - positions[j][0])**2 +
1077
  (positions[i][1] - positions[j][1])**2)
1078
  distances.append(dist)
1079
-
1080
  avg_distance = np.mean(distances) if distances else 0
1081
  distance_variance = np.var(distances) if distances else 0
1082
-
1083
  # 判斷排列模式
1084
  if len(positions) >= 4 and self._is_circular_pattern(positions):
1085
  return "circular"
@@ -1097,35 +1211,35 @@ class ObjectDescriptionGenerator:
1097
  def _is_circular_pattern(self, positions: List[List[float]]) -> bool:
1098
  """
1099
  檢查位置是否形成圓形或環形排列
1100
-
1101
  Args:
1102
  positions: 位置座標列表
1103
-
1104
  Returns:
1105
  bool: 是否為圓形排列
1106
  """
1107
  import numpy as np
1108
-
1109
  if len(positions) < 4:
1110
  return False
1111
-
1112
  try:
1113
  pos_array = np.array(positions)
1114
-
1115
  # 計算中心點
1116
  center_x = np.mean(pos_array[:, 0])
1117
  center_y = np.mean(pos_array[:, 1])
1118
-
1119
  # 計算每個點到中心的距離
1120
  distances_to_center = []
1121
  for pos in positions:
1122
  dist = np.sqrt((pos[0] - center_x)**2 + (pos[1] - center_y)**2)
1123
  distances_to_center.append(dist)
1124
-
1125
  # 如果所有距離都相近,可能是圓形排列
1126
  distance_variance = np.var(distances_to_center)
1127
  return distance_variance < 0.05 and np.mean(distances_to_center) > 0.2
1128
-
1129
  except:
1130
  return False
1131
 
@@ -1133,16 +1247,16 @@ class ObjectDescriptionGenerator:
1133
  arrangement_pattern: str, count: int) -> Optional[str]:
1134
  """
1135
  根據物件類型、場景和排列模式生成空間描述
1136
-
1137
  這個方法將抽象的排列模式轉換為自然語言描述,
1138
  並根據具體的物件類型和場景語境進行定制
1139
-
1140
  Args:
1141
  class_name: 物件類別名稱
1142
  scene_type: 場景類型
1143
  arrangement_pattern: 排列模式
1144
  count: 物件數量
1145
-
1146
  Returns:
1147
  Optional[str]: 生成的空間排列描述
1148
  """
@@ -1178,7 +1292,7 @@ class ObjectDescriptionGenerator:
1178
  "distributed": "positioned throughout the scene"
1179
  }
1180
  }
1181
-
1182
  # 獲取對應的描述模板
1183
  if class_name in arrangement_templates:
1184
  template_dict = arrangement_templates[class_name]
@@ -1194,30 +1308,30 @@ class ObjectDescriptionGenerator:
1194
  "distributed": "thoughtfully placed"
1195
  }
1196
  base_description = generic_templates.get(arrangement_pattern, "positioned in the scene")
1197
-
1198
  return base_description
1199
 
1200
  def _get_single_object_feature(self, class_name: str, scene_type: Optional[str],
1201
  detected_objects: Optional[List[Dict]]) -> Optional[str]:
1202
  """
1203
  為單個物件生成特徵描述符
1204
-
1205
  當只有一個物件時,我們可以提供更具體的位置或功能描述
1206
-
1207
  Args:
1208
  class_name: 物件類別名稱
1209
  scene_type: 場景類型
1210
  detected_objects: 檢測物件(單個)
1211
-
1212
  Returns:
1213
  Optional[str]: 特徵描述符
1214
  """
1215
  if not detected_objects or len(detected_objects) != 1:
1216
  return None
1217
-
1218
  obj = detected_objects[0]
1219
  region = obj.get("region", "").lower()
1220
-
1221
  # 基於位置的描述
1222
  if "center" in region:
1223
  if class_name == "dining table":
@@ -1226,14 +1340,14 @@ class ObjectDescriptionGenerator:
1226
  return "centrally placed"
1227
  elif "corner" in region or "left" in region or "right" in region:
1228
  return "positioned"
1229
-
1230
  # 基於場景的功能描述
1231
  if scene_type and scene_type in ["dining_area", "restaurant"]:
1232
  if class_name == "chair":
1233
  return "dining"
1234
  elif class_name == "vase":
1235
  return "decorative"
1236
-
1237
  return None
1238
 
1239
  def _get_plural_form(self, word: str) -> str:
 
1
  import logging
2
  import traceback
 
3
  from typing import Dict, List, Tuple, Optional, Any
4
  import numpy as np
5
 
 
388
 
389
  def optimize_object_description(self, description: str) -> str:
390
  """
391
+ 優化物件描述文本,消除冗餘重複並改善表達流暢度
392
+
393
+ 這個函數是後處理階段的關鍵組件,負責清理和精簡自然語言生成系統
394
+ 產出的描述文字。它專門處理常見的重複問題,如相同物件的重複
395
+ 列舉和冗餘的空間描述,讓最終的描述更簡潔自然。
396
 
397
  Args:
398
+ description: 原始的場景描述文本,可能包含重複或冗餘的表達
399
 
400
  Returns:
401
+ str: 經過優化清理的描述文本,如果處理失敗則返回原始文本
402
  """
403
  try:
404
  import re
405
+
406
+ # 1. 處理冗餘的空間限定表達
407
+ # 使用通用模式來識別和移除不必要的空間描述
408
+ # 例如:"bed in the room" -> "bed",因為床本身就表示是室內環境
409
+ description = self._remove_redundant_spatial_qualifiers(description)
410
+
411
+ # 2. 識別並處理物件列表的重複問題
412
+ # 尋找形如 "with X, Y, Z" 或 "with X and Y" 的物件列表模式
413
+ # 使用正則表達式捕獲 "with" 關鍵字後的物件序列
414
+ # 注意:正則表達式需要修正以避免貪婪匹配的問題
415
+ object_lists = re.findall(r'with ([^.]+?)(?=\.|$)', description)
416
+
417
+ # 遍歷每個找到的物件列表進行重複檢測和優化
418
  for obj_list in object_lists:
419
+ # 3. 解析單個物件列表中的項目
420
+ # 使用更精確的正則表達式來分割物件項目
421
+ # 處理 "X, Y, and Z" 或 "X and Y" 格式的列表
422
+ # 需要特別注意處理最後一個 "and" 的情況
423
+
424
+ # 先處理逗號格式 "A, B, and C"
425
+ if ", and " in obj_list:
426
+ # 分割 ", and " 前後的部分
427
+ before_last_and = obj_list.rsplit(", and ", 1)[0]
428
+ last_item = obj_list.rsplit(", and ", 1)[1]
429
+
430
+ # 處理前面的項目(用逗號分割)
431
+ front_items = [item.strip() for item in before_last_and.split(",")]
432
+ # 添加最後一個項目
433
+ all_items = front_items + [last_item.strip()]
434
+ elif " and " in obj_list:
435
+ # 處理簡單的 "A and B" 格式
436
+ all_items = [item.strip() for item in obj_list.split(" and ")]
437
+ else:
438
+ # 處理純逗號分隔的列表
439
+ all_items = [item.strip() for item in obj_list.split(",")]
440
+
441
+ # 4. 統計物件出現頻率
442
+ # 建立字典來記錄每個物件的出現次數
443
  item_counts = {}
444
+
445
+ for item in all_items:
446
+ # 清理項目文字並過濾無效內容
447
  item = item.strip()
448
+ # 過濾掉連接詞和空白項目
449
+ if item and item not in ["and", "with", ""]:
450
+ # 移除可能的冠詞前綴以便正確計數
451
+ # 例如 "a car" 和 "car" 應該被視為同一項目
452
+ clean_item = self._normalize_item_for_counting(item)
453
+ if clean_item not in item_counts:
454
+ item_counts[clean_item] = 0
455
+ item_counts[clean_item] += 1
456
+
457
+ # 5. 生成優化後的物件列表
458
  if item_counts:
459
  new_items = []
460
+
461
  for item, count in item_counts.items():
462
  if count > 1:
463
+ # 對於重複項目,使用數字加複數形式
464
+ plural_item = self._make_plural(item)
465
+ new_items.append(f"{count} {plural_item}")
466
  else:
467
+ # 單個項目保持原樣
468
  new_items.append(item)
469
+
470
+ # 6. 重新格式化物件列表
471
+ # 使用標準的英文列表連接格式
472
  if len(new_items) == 1:
473
  new_list = new_items[0]
474
  elif len(new_items) == 2:
475
  new_list = f"{new_items[0]} and {new_items[1]}"
476
  else:
477
+ # 使用逗號格式確保清晰度
478
  new_list = ", ".join(new_items[:-1]) + f", and {new_items[-1]}"
479
+
480
+ # 7. 在原文中替換優化後的列表
481
+ # 將原始的冗餘列表替換為優化後的簡潔版本
482
  description = description.replace(obj_list, new_list)
483
+
484
  return description
485
+
486
  except Exception as e:
487
  self.logger.warning(f"Error optimizing object description: {str(e)}")
488
  return description
489
 
490
+ def _remove_redundant_spatial_qualifiers(self, description: str) -> str:
491
+ """
492
+ 移除描述中冗餘的空間限定詞
493
+
494
+ 這個方法使用模式匹配來識別和移除不必要的空間描述,例如
495
+ "bed in the room" 中的 "in the room" 部分通常是多餘的,因為
496
+ 床這個物件本身就是室內環境。
497
+
498
+ Args:
499
+ description: 包含可能多餘空間描述的文本
500
+
501
+ Returns:
502
+ str: 移除多餘空間限定詞後的文本
503
+ """
504
+ import re
505
+
506
+ # 定義常見的多餘空間表達模式
507
+ # 這些模式捕獲「物件 + 不必要的空間限定」的情況
508
+ redundant_patterns = [
509
+ # 室內物件的多餘房間描述
510
+ (r'\b(bed|sofa|couch|chair|table|desk|dresser|nightstand)\s+in\s+the\s+(room|bedroom|living\s+room)', r'\1'),
511
+ # 廚房物件的多餘描述
512
+ (r'\b(refrigerator|stove|oven|sink|microwave)\s+in\s+the\s+kitchen', r'\1'),
513
+ # 浴室物件的多餘描述
514
+ (r'\b(toilet|shower|bathtub|sink)\s+in\s+the\s+(bathroom|restroom)', r'\1'),
515
+ # 一般性的多餘表達:「在場景中」、「在圖片中」等
516
+ (r'\b([\w\s]+)\s+in\s+the\s+(scene|image|picture|frame)', r'\1'),
517
+ ]
518
+
519
+ for pattern, replacement in redundant_patterns:
520
+ description = re.sub(pattern, replacement, description, flags=re.IGNORECASE)
521
+
522
+ return description
523
+
524
+
525
+ def _normalize_item_for_counting(self, item: str) -> str:
526
+ """
527
+ 正規化物件項目以便準確計數
528
+
529
+ 移除冠詞和其他可能影響計數準確性的前綴詞彙,
530
+ 確保 "a car" 和 "car" 被視為同一物件類型。
531
+
532
+ Args:
533
+ item: 原始物件項目字串
534
+
535
+ Returns:
536
+ str: 正規化後的物件項目
537
+ """
538
+ # 移除常見的英文冠詞
539
+ item = re.sub(r'^(a|an|the)\s+', '', item.lower())
540
+ return item.strip()
541
+
542
+ def _make_plural(self, item: str) -> str:
543
+ """
544
+ 將單數名詞轉換為複數形式
545
+
546
+ Args:
547
+ item: 單數形式的名詞
548
+
549
+ Returns:
550
+ str: 複數形式的名詞
551
+ """
552
+ # 重用已經實現的複數化邏輯
553
+ if item.endswith("y") and len(item) > 1 and item[-2].lower() not in 'aeiou':
554
+ return item[:-1] + "ies"
555
+ elif item.endswith(("s", "sh", "ch", "x", "z")):
556
+ return item + "es"
557
+ elif not item.endswith("s"):
558
+ return item + "s"
559
+ else:
560
+ return item
561
+
562
  def generate_dynamic_everyday_description(self,
563
  detected_objects: List[Dict],
564
  lighting_info: Optional[Dict] = None,
 
754
  if object_statistics and class_name in object_statistics:
755
  actual_count = object_statistics[class_name]["count"]
756
  formatted_name_with_exact_count = self._format_object_count_description(
757
+ normalized_class_name,
758
  actual_count,
759
+ scene_type=scene_type
760
  )
761
  else:
762
  formatted_name_with_exact_count = self._format_object_count_description(
763
+ normalized_class_name,
764
  count,
765
+ scene_type=scene_type
766
  )
767
 
768
  if formatted_name_with_exact_count == "no specific objects clearly identified" or not formatted_name_with_exact_count:
 
861
  def _remove_repetitive_descriptors(self, description: str) -> str:
862
  """
863
  移除描述中的重複性和不適當的描述詞彙,特別是 "identical" 等詞彙
864
+
865
  Args:
866
  description: 原始描述文本
867
+
868
  Returns:
869
  str: 清理後的描述文本
870
  """
871
  try:
872
  import re
873
+
874
  # 定義需要移除或替換的模式
875
  cleanup_patterns = [
876
  # 移除 "identical" 描述模式
877
  (r'\b(\d+)\s+identical\s+([a-zA-Z\s]+)', r'\1 \2'),
878
  (r'\b(two|three|four|five|six|seven|eight|nine|ten|eleven|twelve)\s+identical\s+([a-zA-Z\s]+)', r'\1 \2'),
879
  (r'\bidentical\s+([a-zA-Z\s]+)', r'\1'),
880
+
881
  # 改善 "comprehensive arrangement" 等過於技術性的表達
882
  (r'\bcomprehensive arrangement of\b', 'arrangement of'),
883
  (r'\bcomprehensive view featuring\b', 'scene featuring'),
884
  (r'\bcomprehensive display of\b', 'display of'),
885
+
886
  # 簡化過度描述性的短語
887
  (r'\bpositioning around\s+(\d+)\s+identical\b', r'positioning around \1'),
888
  (r'\barranged around\s+(\d+)\s+identical\b', r'arranged around \1'),
889
  ]
890
+
891
  processed_description = description
892
  for pattern, replacement in cleanup_patterns:
893
  processed_description = re.sub(pattern, replacement, processed_description, flags=re.IGNORECASE)
894
+
895
  # 進一步清理可能的多餘空格
896
  processed_description = re.sub(r'\s+', ' ', processed_description).strip()
897
+
898
  self.logger.debug(f"Cleaned description: removed repetitive descriptors")
899
  return processed_description
900
+
901
  except Exception as e:
902
  self.logger.warning(f"Error removing repetitive descriptors: {str(e)}")
903
  return description
904
 
905
+ def _format_object_count_description(self, class_name: str, count: int,
906
  scene_type: Optional[str] = None,
907
  detected_objects: Optional[List[Dict]] = None,
908
  avg_confidence: float = 0.0) -> str:
909
  """
910
  格式化物件數量描述的核心方法,整合空間排列、材質推斷和場景語境
911
+
912
  這個方法是整個物件描述系統的核心,它將多個子功能整合在一起:
913
  1. 數字到文字的轉換(避免阿拉伯數字)
914
  2. 基於場景的材質推斷
915
  3. 空間排列模式的描述
916
  4. 語境化的物件描述
917
+
918
  Args:
919
  class_name: 標準化後的類別名稱
920
  count: 物件數量
921
  scene_type: 場景類型,用於語境化描述
922
  detected_objects: 該類型的所有檢測物件,用於空間分析
923
  avg_confidence: 平均檢測置信度,影響材質推斷的可信度
924
+
925
  Returns:
926
  str: 完整的格式化數量描述
927
  """
 
931
 
932
  # 獲取基礎的複數形式
933
  plural_form = self._get_plural_form(class_name)
934
+
935
  # 單數情況的處理
936
  if count == 1:
937
+ return self._format_single_object_description(class_name, scene_type,
938
  detected_objects, avg_confidence)
939
+
940
  # 複數情況的處理
941
+ return self._format_multiple_objects_description(class_name, count, plural_form,
942
  scene_type, detected_objects, avg_confidence)
943
 
944
  except Exception as e:
 
946
  return f"{count} {class_name}s" if count > 1 else class_name
947
 
948
  def _format_single_object_description(self, class_name: str, scene_type: Optional[str],
949
+ detected_objects: Optional[List[Dict]],
950
  avg_confidence: float) -> str:
951
  """
952
  處理單個物件的描述生成
953
+
954
  對於單個物件,我們重點在於通過材質推斷和位置描述來豐富描述內容,
955
  避免簡單的 "a chair" 這樣的描述,而是生成 "a wooden dining chair" 這樣的表達
956
+
957
  Args:
958
  class_name: 物件類別名稱
959
  scene_type: 場景類型
960
  detected_objects: 檢測物件列表
961
  avg_confidence: 平均置信度
962
+
963
  Returns:
964
  str: 單個物件的完整描述
965
  """
966
  article = "an" if class_name[0].lower() in 'aeiou' else "a"
967
+
968
  # 獲取材質描述符
969
  material_descriptor = self._get_material_descriptor(class_name, scene_type, avg_confidence)
970
+
971
  # 獲取位置或特徵描述符
972
  feature_descriptor = self._get_single_object_feature(class_name, scene_type, detected_objects)
973
+
974
  # 組合描述
975
  descriptors = []
976
  if material_descriptor:
977
  descriptors.append(material_descriptor)
978
  if feature_descriptor:
979
  descriptors.append(feature_descriptor)
980
+
981
  if descriptors:
982
  return f"{article} {' '.join(descriptors)} {class_name}"
983
  else:
984
  return f"{article} {class_name}"
985
 
986
  def _format_multiple_objects_description(self, class_name: str, count: int, plural_form: str,
987
+ scene_type: Optional[str], detected_objects: Optional[List[Dict]],
988
  avg_confidence: float) -> str:
989
  """
990
  處理多個物件的描述生成
991
+
992
  對於多個物件,我們的重點是:
993
  1. 將數字轉換為文字表達
994
  2. 分析空間排列模式
995
  3. 添加適當的材質或功能描述
996
  4. 生成自然流暢的描述
997
+
998
  Args:
999
  class_name: 物件類別名稱
1000
  count: 物件數量
 
1002
  scene_type: 場景類型
1003
  detected_objects: 檢測物件列表
1004
  avg_confidence: 平均置信度
1005
+
1006
  Returns:
1007
  str: 多個物件的完整描述
1008
  """
1009
  # 數字到文字的轉換映射
1010
  number_words = {
1011
  2: "two", 3: "three", 4: "four", 5: "five", 6: "six",
1012
+ 7: "seven", 8: "eight", 9: "nine", 10: "ten",
1013
  11: "eleven", 12: "twelve"
1014
  }
1015
+
1016
  # 確定基礎數量表達
1017
  if count in number_words:
1018
  count_expression = number_words[count]
 
1020
  count_expression = "several"
1021
  else:
1022
  count_expression = "numerous"
1023
+
1024
  # 獲取材質或功能描述符
1025
  material_descriptor = self._get_material_descriptor(class_name, scene_type, avg_confidence)
1026
+
1027
  # 獲取空間排列描述
1028
+ spatial_descriptor = self._get_spatial_arrangement_descriptor(class_name, scene_type,
1029
  detected_objects, count)
1030
+
1031
  # 組合最終描述
1032
  descriptors = []
1033
  if material_descriptor:
1034
  descriptors.append(material_descriptor)
1035
+
1036
  # 構建基礎描述
1037
  base_description = f"{count_expression} {' '.join(descriptors)} {plural_form}".strip()
1038
+
1039
  # 添加空間排列信息
1040
  if spatial_descriptor:
1041
  return f"{base_description} {spatial_descriptor}"
1042
  else:
1043
  return base_description
1044
 
1045
+ def _get_material_descriptor(self, class_name: str, scene_type: Optional[str],
1046
  avg_confidence: float) -> Optional[str]:
1047
  """
1048
  基於場景語境和置信度進行材質推斷
1049
+
1050
  這個方法實現了智能的材質推斷,它不依賴複雜的圖像分析,
1051
  而是基於常識和場景邏輯來推斷最可能的材質描述
1052
+
1053
  Args:
1054
  class_name: 物件類別名稱
1055
  scene_type: 場景類型
1056
  avg_confidence: 檢測置信度,影響推斷的保守程度
1057
+
1058
  Returns:
1059
  Optional[str]: 材質描述符,如果無法推斷則返回None
1060
  """
1061
  # 只有在置信度足夠高時才進行材質推斷
1062
  if avg_confidence < 0.5:
1063
  return None
1064
+
1065
  # 餐廳和用餐相關場景
1066
  if scene_type and scene_type in ["dining_area", "restaurant", "upscale_dining", "cafe"]:
1067
  material_mapping = {
 
1071
  "vase": "decorative"
1072
  }
1073
  return material_mapping.get(class_name)
1074
+
1075
  # 辦公場景
1076
  elif scene_type and scene_type in ["office_workspace", "meeting_room", "conference_room"]:
1077
  material_mapping = {
 
1081
  "book": "reference"
1082
  }
1083
  return material_mapping.get(class_name)
1084
+
1085
  # 客廳場景
1086
  elif scene_type and scene_type in ["living_room"]:
1087
  material_mapping = {
 
1091
  "vase": "decorative"
1092
  }
1093
  return material_mapping.get(class_name)
1094
+
1095
  # 室外場景
1096
  elif scene_type and scene_type in ["city_street", "park_area", "parking_lot"]:
1097
  material_mapping = {
 
1100
  "bicycle": "stationed"
1101
  }
1102
  return material_mapping.get(class_name)
1103
+
1104
  # 如果沒有特定的場景映射,返回通用描述符
1105
  generic_mapping = {
1106
  "chair": "comfortable",
 
1108
  "car": "parked",
1109
  "person": "present"
1110
  }
1111
+
1112
  return generic_mapping.get(class_name)
1113
 
1114
  def _get_spatial_arrangement_descriptor(self, class_name: str, scene_type: Optional[str],
1115
+ detected_objects: Optional[List[Dict]],
1116
  count: int) -> Optional[str]:
1117
  """
1118
  分析物件的空間排列模式並生成相應描述
1119
+
1120
  這個方法通過分析物件的位置分布來判斷排列模式,
1121
  然後根據物件類型和場景生成適當的空間描述
1122
+
1123
  Args:
1124
  class_name: 物件類別名稱
1125
  scene_type: 場景類型
1126
  detected_objects: 該類型的所有檢測物件
1127
  count: 物件數量
1128
+
1129
  Returns:
1130
  Optional[str]: 空間排列描述,如果無法分析則返回None
1131
  """
1132
  if not detected_objects or len(detected_objects) < 2:
1133
  return None
1134
+
1135
  try:
1136
  # 提取物件的標準化位置
1137
  positions = []
 
1139
  center = obj.get("normalized_center", [0.5, 0.5])
1140
  if isinstance(center, (list, tuple)) and len(center) >= 2:
1141
  positions.append(center)
1142
+
1143
  if len(positions) < 2:
1144
  return None
1145
+
1146
  # 分析排列模式
1147
  arrangement_pattern = self._analyze_arrangement_pattern(positions)
1148
+
1149
  # 根據物件類型和場景生成描述
1150
+ return self._generate_arrangement_description(class_name, scene_type,
1151
  arrangement_pattern, count)
1152
+
1153
  except Exception as e:
1154
  self.logger.warning(f"Error analyzing spatial arrangement: {str(e)}")
1155
  return None
 
1157
  def _analyze_arrangement_pattern(self, positions: List[List[float]]) -> str:
1158
  """
1159
  分析位置點的排列模式
1160
+
1161
  這個方法使用簡單的幾何分析來判斷物件的排列類型,
1162
  幫助我們理解物件在空間中的組織方式
1163
+
1164
  Args:
1165
  positions: 標準化的位置座標列表
1166
+
1167
  Returns:
1168
  str: 排列模式類型(linear, clustered, scattered, circular等)
1169
  """
1170
  import numpy as np
1171
+
1172
  if len(positions) < 2:
1173
  return "single"
1174
+
1175
  # 轉換為numpy陣列便於計算
1176
  pos_array = np.array(positions)
1177
+
1178
  # 計算位置的分布特徵
1179
  x_coords = pos_array[:, 0]
1180
  y_coords = pos_array[:, 1]
1181
+
1182
  # 分析x和y方向的變異程度
1183
  x_variance = np.var(x_coords)
1184
  y_variance = np.var(y_coords)
1185
+
1186
  # 計算物件間的平均距離
1187
  distances = []
1188
  for i in range(len(positions)):
1189
  for j in range(i + 1, len(positions)):
1190
+ dist = np.sqrt((positions[i][0] - positions[j][0])**2 +
1191
  (positions[i][1] - positions[j][1])**2)
1192
  distances.append(dist)
1193
+
1194
  avg_distance = np.mean(distances) if distances else 0
1195
  distance_variance = np.var(distances) if distances else 0
1196
+
1197
  # 判斷排列模式
1198
  if len(positions) >= 4 and self._is_circular_pattern(positions):
1199
  return "circular"
 
1211
  def _is_circular_pattern(self, positions: List[List[float]]) -> bool:
1212
  """
1213
  檢查位置是否形成圓形或環形排列
1214
+
1215
  Args:
1216
  positions: 位置座標列表
1217
+
1218
  Returns:
1219
  bool: 是否為圓形排列
1220
  """
1221
  import numpy as np
1222
+
1223
  if len(positions) < 4:
1224
  return False
1225
+
1226
  try:
1227
  pos_array = np.array(positions)
1228
+
1229
  # 計算中心點
1230
  center_x = np.mean(pos_array[:, 0])
1231
  center_y = np.mean(pos_array[:, 1])
1232
+
1233
  # 計算每個點到中心的距離
1234
  distances_to_center = []
1235
  for pos in positions:
1236
  dist = np.sqrt((pos[0] - center_x)**2 + (pos[1] - center_y)**2)
1237
  distances_to_center.append(dist)
1238
+
1239
  # 如果所有距離都相近,可能是圓形排列
1240
  distance_variance = np.var(distances_to_center)
1241
  return distance_variance < 0.05 and np.mean(distances_to_center) > 0.2
1242
+
1243
  except:
1244
  return False
1245
 
 
1247
  arrangement_pattern: str, count: int) -> Optional[str]:
1248
  """
1249
  根據物件類型、場景和排列模式生成空間描述
1250
+
1251
  這個方法將抽象的排列模式轉換為自然語言描述,
1252
  並根據具體的物件類型和場景語境進行定制
1253
+
1254
  Args:
1255
  class_name: 物件類別名稱
1256
  scene_type: 場景類型
1257
  arrangement_pattern: 排列模式
1258
  count: 物件數量
1259
+
1260
  Returns:
1261
  Optional[str]: 生成的空間排列描述
1262
  """
 
1292
  "distributed": "positioned throughout the scene"
1293
  }
1294
  }
1295
+
1296
  # 獲取對應的描述模板
1297
  if class_name in arrangement_templates:
1298
  template_dict = arrangement_templates[class_name]
 
1308
  "distributed": "thoughtfully placed"
1309
  }
1310
  base_description = generic_templates.get(arrangement_pattern, "positioned in the scene")
1311
+
1312
  return base_description
1313
 
1314
  def _get_single_object_feature(self, class_name: str, scene_type: Optional[str],
1315
  detected_objects: Optional[List[Dict]]) -> Optional[str]:
1316
  """
1317
  為單個物件生成特徵描述符
1318
+
1319
  當只有一個物件時,我們可以提供更具體的位置或功能描述
1320
+
1321
  Args:
1322
  class_name: 物件類別名稱
1323
  scene_type: 場景類型
1324
  detected_objects: 檢測物件(單個)
1325
+
1326
  Returns:
1327
  Optional[str]: 特徵描述符
1328
  """
1329
  if not detected_objects or len(detected_objects) != 1:
1330
  return None
1331
+
1332
  obj = detected_objects[0]
1333
  region = obj.get("region", "").lower()
1334
+
1335
  # 基於位置的描述
1336
  if "center" in region:
1337
  if class_name == "dining table":
 
1340
  return "centrally placed"
1341
  elif "corner" in region or "left" in region or "right" in region:
1342
  return "positioned"
1343
+
1344
  # 基於場景的功能描述
1345
  if scene_type and scene_type in ["dining_area", "restaurant"]:
1346
  if class_name == "chair":
1347
  return "dining"
1348
  elif class_name == "vase":
1349
  return "decorative"
1350
+
1351
  return None
1352
 
1353
  def _get_plural_form(self, word: str) -> str: