import os
import numpy as np
import logging
import traceback
from typing import Dict, List, Tuple, Any, Optional
from PIL import Image

from component_initializer import ComponentInitializer
from scene_scoring_engine import SceneScoringEngine
from landmark_processing_manager import LandmarkProcessingManager
from scene_analysis_coordinator import SceneAnalysisCoordinator

class SceneAnalyzer:
    """
    Core class for scene analysis and understanding based on object detection results.
    Analyzes detected objects, their relationships, and infers the scene type.
    This class serves as the central entry point for scene understanding.

    This is the main Facade class that coordinates all scene analysis components
    while maintaining the original public interface for backward compatibility.
    """

    EVERYDAY_SCENE_TYPE_KEYS = [
        "general_indoor_space", "generic_street_view",
        "desk_area_workspace", "outdoor_gathering_spot",
        "kitchen_counter_or_utility_area"
    ]

    def __init__(self, class_names: Optional[Dict[int, str]] = None, use_llm: bool = True,
                 use_clip: bool = True, enable_landmark: bool = True,
                 llm_model_path: Optional[str] = None):
        """
        Initialize the scene analyzer with optional class name mappings.

        Args:
            class_names: Dictionary mapping class IDs to class names (optional)
            use_llm: Whether to enable LLM enhancement functionality
            use_clip: Whether to enable CLIP analysis functionality
            enable_landmark: Whether to enable landmark detection functionality
            llm_model_path: Path to LLM model (optional)
        """
        self.logger = logging.getLogger(__name__)

        try:
            # Initialize all components through the component initializer
            self.component_initializer = ComponentInitializer(
                class_names=class_names,
                use_llm=use_llm,
                use_clip=use_clip,
                enable_landmark=enable_landmark,
                llm_model_path=llm_model_path
            )

            # Get data structures for easy access
            self.SCENE_TYPES = self.component_initializer.get_data_structure('SCENE_TYPES')
            self.OBJECT_CATEGORIES = self.component_initializer.get_data_structure('OBJECT_CATEGORIES')
            self.LANDMARK_ACTIVITIES = self.component_initializer.get_data_structure('LANDMARK_ACTIVITIES')

            # Initialize specialized engines
            self.scene_scoring_engine = SceneScoringEngine(
                scene_types=self.SCENE_TYPES,
                enable_landmark=enable_landmark
            )

            self.landmark_processing_manager = LandmarkProcessingManager(
                enable_landmark=enable_landmark,
                use_clip=use_clip
            )

            # Initialize the main coordinator
            self.scene_analysis_coordinator = SceneAnalysisCoordinator(
                component_initializer=self.component_initializer,
                scene_scoring_engine=self.scene_scoring_engine,
                landmark_processing_manager=self.landmark_processing_manager
            )

            # Store configuration for backward compatibility
            self.class_names = class_names
            self.use_clip = use_clip
            self.use_llm = use_llm
            self.enable_landmark = enable_landmark
            self.use_landmark_detection = enable_landmark

            # Get component references for backward compatibility
            self.spatial_analyzer = self.component_initializer.get_component('spatial_analyzer')
            self.descriptor = self.component_initializer.get_component('descriptor')
            self.scene_describer = self.component_initializer.get_component('scene_describer')
            self.clip_analyzer = self.component_initializer.get_component('clip_analyzer')
            self.llm_enhancer = self.component_initializer.get_component('llm_enhancer')
            self.landmark_classifier = self.component_initializer.get_component('landmark_classifier')

            # Set landmark classifier in the processing manager
            if self.landmark_classifier:
                self.landmark_processing_manager.set_landmark_classifier(self.landmark_classifier)

            self.logger.info("SceneAnalyzer initialized successfully with all components")

        except Exception as e:
            self.logger.error(f"Critical error during SceneAnalyzer initialization: {e}")
            traceback.print_exc()
            raise

    def analyze(self, detection_result: Any, lighting_info: Optional[Dict] = None,
                class_confidence_threshold: float = 0.25, scene_confidence_threshold: float = 0.6,
                enable_landmark: bool = True, places365_info: Optional[Dict] = None) -> Dict:
        """
        Analyze detection results to determine scene type and provide understanding.

        Args:
            detection_result: Detection result from YOLOv8 or similar
            lighting_info: Optional lighting condition analysis results
            class_confidence_threshold: Minimum confidence to consider an object
            scene_confidence_threshold: Minimum confidence to determine a scene
            enable_landmark: Whether to enable landmark detection and recognition for this run
            places365_info: Optional Places365 scene classification results

        Returns:
            Dictionary with scene analysis results
        """
        try:
            return self.scene_analysis_coordinator.analyze(
                detection_result=detection_result,
                lighting_info=lighting_info,
                class_confidence_threshold=class_confidence_threshold,
                scene_confidence_threshold=scene_confidence_threshold,
                enable_landmark=enable_landmark,
                places365_info=places365_info
            )
        except Exception as e:
            self.logger.error(f"Error in scene analysis: {e}")
            traceback.print_exc()
            # Return a safe fallback result
            return {
                "scene_type": "unknown",
                "confidence": 0.0,
                "description": "Scene analysis failed due to an internal error.",
                "enhanced_description": "An error occurred during scene analysis. Please check the system logs for details.",
                "objects_present": [],
                "object_count": 0,
                "regions": {},
                "possible_activities": [],
                "safety_concerns": [],
                "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0.0}
            }

    def generate_scene_description(self, scene_type: str, detected_objects: List[Dict],
                                 confidence: float, lighting_info: Optional[Dict] = None,
                                 functional_zones: Optional[Dict] = None,
                                 enable_landmark: bool = True,
                                 scene_scores: Optional[Dict] = None,
                                 spatial_analysis: Optional[Dict] = None,
                                 image_dimensions: Optional[Tuple[int, int]] = None) -> str:
        """
        Generate scene description and pass all necessary context to the underlying describer.

        Args:
            scene_type: Identified scene type
            detected_objects: List of detected objects
            confidence: Scene classification confidence
            lighting_info: Lighting condition information (optional)
            functional_zones: Functional zone information (optional)
            enable_landmark: Whether to enable landmark description (optional)
            scene_scores: Scene scores (optional)
            spatial_analysis: Spatial analysis results (optional)
            image_dimensions: Image dimensions (width, height) (optional)

        Returns:
            str: Generated scene description
        """
        try:
            # Convert functional_zones from Dict to List[str] and filter technical terms
            functional_zones_list = []
            if functional_zones and isinstance(functional_zones, dict):
                # Filter out technical terms, keep only meaningful descriptions
                filtered_zones = {k: v for k, v in functional_zones.items()
                                if not k.endswith('_zone') or k in ['dining_zone', 'seating_zone', 'work_zone']}
                functional_zones_list = [v.get('description', k) for k, v in filtered_zones.items()
                                       if isinstance(v, dict) and v.get('description')]
            elif functional_zones and isinstance(functional_zones, list):
                # Filter technical terms from list
                functional_zones_list = [zone for zone in functional_zones
                                       if not zone.endswith('_zone') or 'area' in zone]

            # Generate detailed object statistics
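            # Illustrative resulting shape (values are placeholders only):
            #   {"car": {"count": 3, "avg_confidence": 0.71,
            #            "max_confidence": 0.88, "instances": [...]}}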
            object_statistics = {}
            for obj in detected_objects:
                class_name = obj.get("class_name", "unknown")
                if class_name not in object_statistics:
                    object_statistics[class_name] = {
                        "count": 0,
                        "avg_confidence": 0.0,
                        "max_confidence": 0.0,
                        "instances": []
                    }

                stats = object_statistics[class_name]
                stats["count"] += 1
                stats["instances"].append(obj)
                stats["max_confidence"] = max(stats["max_confidence"], obj.get("confidence", 0.0))

            # Calculate average confidence
            for class_name, stats in object_statistics.items():
                if stats["count"] > 0:
                    total_conf = sum(inst.get("confidence", 0.0) for inst in stats["instances"])
                    stats["avg_confidence"] = total_conf / stats["count"]

            if self.scene_describer:
                return self.scene_describer.generate_description(
                    scene_type=scene_type,
                    detected_objects=detected_objects,
                    confidence=confidence,
                    lighting_info=lighting_info,
                    functional_zones=functional_zones_list,
                    enable_landmark=enable_landmark,
                    scene_scores=scene_scores,
                    spatial_analysis=spatial_analysis,
                    image_dimensions=image_dimensions,
                    object_statistics=object_statistics
                )
            else:
                return f"A {scene_type} scene with {len(detected_objects)} detected objects."

        except Exception as e:
            self.logger.error(f"Error generating scene description: {e}")
            return f"A {scene_type} scene."

    def process_unknown_objects(self, detection_result, detected_objects):
        """
        Process objects that YOLO failed to identify, or identified only with low
        confidence, as candidates for landmark detection.

        Args:
            detection_result: YOLO detection results
            detected_objects: List of identified objects

        Returns:
            tuple: (updated object list, landmark object list)
        """
        try:
            return self.landmark_processing_manager.process_unknown_objects(
                detection_result, detected_objects, self.clip_analyzer
            )
        except Exception as e:
            self.logger.error(f"Error processing unknown objects: {e}")
            traceback.print_exc()
            return detected_objects, []

    def _compute_scene_scores(self, detected_objects: List[Dict],
                            spatial_analysis_results: Optional[Dict] = None) -> Dict[str, float]:
        """
        Compute confidence scores for each scene type based on detected objects.

        Args:
            detected_objects: List of detected objects with their details
            spatial_analysis_results: Optional output from SpatialAnalyzer

        Returns:
            Dictionary mapping scene types to confidence scores
        """
        return self.scene_scoring_engine.compute_scene_scores(
            detected_objects, spatial_analysis_results
        )

    def _determine_scene_type(self, scene_scores: Dict[str, float]) -> Tuple[str, float]:
        """
        Determine the most likely scene type based on scores.

        Args:
            scene_scores: Dictionary mapping scene types to confidence scores

        Returns:
            Tuple of (best_scene_type, confidence)
        """
        return self.scene_scoring_engine.determine_scene_type(scene_scores)

    def _fuse_scene_scores(self, yolo_scene_scores: Dict[str, float],
                          clip_scene_scores: Dict[str, float],
                          num_yolo_detections: int = 0,
                          avg_yolo_confidence: float = 0.0,
                          lighting_info: Optional[Dict] = None,
                          places365_info: Optional[Dict] = None) -> Dict[str, float]:
        """
        Fuse scene scores from YOLO-based object detection, CLIP-based analysis, and Places365.

        Args:
            yolo_scene_scores: Scene scores based on YOLO object detection
            clip_scene_scores: Scene scores based on CLIP analysis
            num_yolo_detections: Total number of non-landmark objects detected by YOLO
            avg_yolo_confidence: Average confidence of non-landmark objects detected by YOLO
            lighting_info: Optional lighting condition analysis results
            places365_info: Optional Places365 scene classification results

        Returns:
            Dict: Fused scene scores incorporating all analysis sources
        """
        return self.scene_scoring_engine.fuse_scene_scores(
            yolo_scene_scores, clip_scene_scores, num_yolo_detections,
            avg_yolo_confidence, lighting_info, places365_info
        )

    def _get_alternative_scene_type(self, landmark_scene_type, detected_objects, scene_scores):
        """
        Select appropriate alternative type for landmark scene types.

        Args:
            landmark_scene_type: Original landmark scene type
            detected_objects: List of detected objects
            scene_scores: All scene type scores

        Returns:
            str: Appropriate alternative scene type
        """
        return self.landmark_processing_manager.get_alternative_scene_type(
            landmark_scene_type, detected_objects, scene_scores
        )

    def _remove_landmark_references(self, text):
        """
        Remove all landmark references from text.

        Args:
            text: Input text

        Returns:
            str: Text with landmark references removed
        """
        return self.landmark_processing_manager.remove_landmark_references(text)

    def _define_image_regions(self):
        """Define regions of the image for spatial analysis (3x3 grid)."""
        self.regions = {
            "top_left": (0, 0, 1/3, 1/3),
            "top_center": (1/3, 0, 2/3, 1/3),
            "top_right": (2/3, 0, 1, 1/3),
            "middle_left": (0, 1/3, 1/3, 2/3),
            "middle_center": (1/3, 1/3, 2/3, 2/3),
            "middle_right": (2/3, 1/3, 1, 2/3),
            "bottom_left": (0, 2/3, 1/3, 1),
            "bottom_center": (1/3, 2/3, 2/3, 1),
            "bottom_right": (2/3, 2/3, 1, 1)
        }

    def get_component_status(self) -> Dict[str, bool]:
        """
        Get the initialization status of all components.

        Returns:
            Dictionary mapping component names to their initialization status
        """
        return self.component_initializer.get_initialization_summary()

    def is_component_available(self, component_name: str) -> bool:
        """
        Check if a specific component is available and properly initialized.

        Args:
            component_name: Name of the component to check

        Returns:
            bool: Whether the component is available
        """
        return self.component_initializer.is_component_available(component_name)

    def update_landmark_enable_status(self, enable_landmark: bool):
        """
        Update the landmark detection enable status across all components.

        Args:
            enable_landmark: Whether to enable landmark detection
        """
        self.enable_landmark = enable_landmark
        self.use_landmark_detection = enable_landmark

        # Update all related components
        self.component_initializer.update_landmark_enable_status(enable_landmark)
        self.scene_scoring_engine.update_enable_landmark_status(enable_landmark)
        self.landmark_processing_manager.update_enable_landmark_status(enable_landmark)

        # Update the coordinator's enable_landmark status
        if hasattr(self.scene_analysis_coordinator, 'enable_landmark'):
            self.scene_analysis_coordinator.enable_landmark = enable_landmark
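

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the library API).
# It assumes an `ultralytics` YOLO detector is installed and that the model
# file and image path below exist; any detector producing a compatible
# detection_result can be substituted. LLM, CLIP, and landmark components
# are disabled here to keep the demo lightweight.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    from ultralytics import YOLO  # assumed dependency for this sketch only

    model = YOLO("yolov8n.pt")                       # hypothetical model file
    detection_result = model("street_scene.jpg")[0]  # hypothetical image path

    analyzer = SceneAnalyzer(
        class_names=model.names,   # {class_id: class_name}
        use_llm=False,
        use_clip=False,
        enable_landmark=False
    )

    # Optional components may fail to load; check before relying on them.
    print("Component status:", analyzer.get_component_status())

    result = analyzer.analyze(
        detection_result=detection_result,
        class_confidence_threshold=0.25,
        scene_confidence_threshold=0.6,
        enable_landmark=False
    )
    print("Scene type:", result.get("scene_type"),
          "confidence:", result.get("confidence"))
    print("Description:", result.get("description"))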