Upload 3 files
- clip_analyzer.py +27 -30
- clip_model_manager.py +18 -24
- clip_zero_shot_classifier.py +4 -4
clip_analyzer.py
CHANGED
@@ -1,5 +1,5 @@
 import torch
-import open_clip
+import clip
 import numpy as np
 from PIL import Image
 from typing import Dict, List, Tuple, Any, Optional, Union
@@ -20,14 +20,13 @@ class CLIPAnalyzer:
    Use Clip to intergrate scene understanding function
    """

-    def __init__(self, model_name: str = "ViT-B…
+    def __init__(self, model_name: str = "ViT-B/16", device: str = None):
        """
-        初始化 CLIP …
+        初始化 CLIP 分析器。

        Args:
-            model_name: …
-            device: …
-            pretrained: 預訓練權重,使用 "laion2b_s34b_b79k"
+            model_name: CLIP Model name, 默認 "ViT-B/16"
+            device: Use GPU if it can use
        """
        # 自動選擇設備
        if device is None:
@@ -35,23 +34,12 @@ class CLIPAnalyzer:
        else:
            self.device = device

-        print(f"Loading …")
+        print(f"Loading CLIP model {model_name} on {self.device}...")
        try:
-            self.model, …
-                …
-                pretrained=pretrained,
-                device=self.device
-            )
-            self.tokenizer = open_clip.get_tokenizer(model_name)
-            print(f"OpenCLIP model loaded successfully.")
-            import gc
-            gc.collect()
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-                torch.cuda.synchronize()
-            print("Memory cleanup completed after OpenCLIP loading.")
+            self.model, self.preprocess = clip.load(model_name, device=self.device)
+            print(f"CLIP model loaded successfully.")
        except Exception as e:
-            print(f"Error loading …")
+            print(f"Error loading CLIP model: {e}")
            raise

        self.scene_type_prompts = SCENE_TYPE_PROMPTS
@@ -76,7 +64,7 @@ class CLIPAnalyzer:
        if scene_texts:
            self.text_features_cache["scene_type_keys"] = list(self.scene_type_prompts.keys())
            try:
-                self.text_features_cache["scene_type_tokens"] = …
+                self.text_features_cache["scene_type_tokens"] = clip.tokenize(scene_texts).to(self.device)
            except Exception as e:
                print(f"Warning: Error tokenizing scene_type_prompts: {e}")
                self.text_features_cache["scene_type_tokens"] = None # 標記錯誤或空
@@ -94,7 +82,7 @@ class CLIPAnalyzer:
        for scene_type, prompts in self.cultural_scene_prompts.items():
            if prompts and isinstance(prompts, list) and all(isinstance(p, str) for p in prompts):
                try:
-                    cultural_tokens_dict_val[scene_type] = …
+                    cultural_tokens_dict_val[scene_type] = clip.tokenize(prompts).to(self.device)
                except Exception as e:
                    print(f"Warning: Error tokenizing cultural_scene_prompts for {scene_type}: {e}")
                    cultural_tokens_dict_val[scene_type] = None # 標記錯誤或空
@@ -108,7 +96,7 @@ class CLIPAnalyzer:
        if lighting_texts:
            self.text_features_cache["lighting_condition_keys"] = list(self.lighting_condition_prompts.keys())
            try:
-                self.text_features_cache["lighting_tokens"] = …
+                self.text_features_cache["lighting_tokens"] = clip.tokenize(lighting_texts).to(self.device)
            except Exception as e:
                print(f"Warning: Error tokenizing lighting_condition_prompts: {e}")
                self.text_features_cache["lighting_tokens"] = None
@@ -125,7 +113,7 @@ class CLIPAnalyzer:
        for scene_type, prompts in self.specialized_scene_prompts.items():
            if prompts and isinstance(prompts, list) and all(isinstance(p, str) for p in prompts):
                try:
-                    specialized_tokens_dict_val[scene_type] = …
+                    specialized_tokens_dict_val[scene_type] = clip.tokenize(prompts).to(self.device)
                except Exception as e:
                    print(f"Warning: Error tokenizing specialized_scene_prompts for {scene_type}: {e}")
                    specialized_tokens_dict_val[scene_type] = None
@@ -139,7 +127,7 @@ class CLIPAnalyzer:
        if viewpoint_texts:
            self.text_features_cache["viewpoint_keys"] = list(self.viewpoint_prompts.keys())
            try:
-                self.text_features_cache["viewpoint_tokens"] = …
+                self.text_features_cache["viewpoint_tokens"] = clip.tokenize(viewpoint_texts).to(self.device)
            except Exception as e:
                print(f"Warning: Error tokenizing viewpoint_prompts: {e}")
                self.text_features_cache["viewpoint_tokens"] = None
@@ -156,7 +144,7 @@ class CLIPAnalyzer:
        if object_combination_texts:
            self.text_features_cache["object_combination_keys"] = list(self.object_combination_prompts.keys())
            try:
-                self.text_features_cache["object_combination_tokens"] = …
+                self.text_features_cache["object_combination_tokens"] = clip.tokenize(object_combination_texts).to(self.device)
            except Exception as e:
                print(f"Warning: Error tokenizing object_combination_prompts: {e}")
                self.text_features_cache["object_combination_tokens"] = None
@@ -173,7 +161,7 @@ class CLIPAnalyzer:
        if activity_texts:
            self.text_features_cache["activity_keys"] = list(self.activity_prompts.keys())
            try:
-                self.text_features_cache["activity_tokens"] = …
+                self.text_features_cache["activity_tokens"] = clip.tokenize(activity_texts).to(self.device)
            except Exception as e:
                print(f"Warning: Error tokenizing activity_prompts: {e}")
                self.text_features_cache["activity_tokens"] = None
@@ -192,7 +180,7 @@ class CLIPAnalyzer:
        self.cultural_tokens_dict = self.text_features_cache["cultural_tokens_dict"]
        self.specialized_tokens_dict = self.text_features_cache["specialized_tokens_dict"]

-        print("…")
+        print("CLIP text_features_cache prepared.")

    def analyze_image(self, image, include_cultural_analysis=True, exclude_categories=None, enable_landmark=True, places365_guidance=None):
        """
@@ -593,7 +581,16 @@ class CLIPAnalyzer:
        return image_features.cpu().numpy()[0] if self.device == "cuda" else image_features.numpy()[0]

    def text_to_embedding(self, text: str) -> np.ndarray:
-        …
+        """
+        將文本轉換為 CLIP 嵌入表示
+
+        Args:
+            text: 輸入文本
+
+        Returns:
+            np.ndarray: 文本的 CLIP 特徵向量
+        """
+        text_token = clip.tokenize([text]).to(self.device)

        with torch.no_grad():
            text_features = self.model.encode_text(text_token)
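For reference, the new loading and tokenization path follows the standard openai `clip` package API: `clip.load` returns the model together with its image preprocessing transform, and `clip.tokenize` turns prompt strings into token tensors. A minimal, self-contained sketch of that pattern as used above (the prompt strings and image path are illustrative, not taken from this commit):

    import torch
    import clip
    from PIL import Image

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # clip.load returns (model, preprocess); "ViT-B/16" matches the new default model_name
    model, preprocess = clip.load("ViT-B/16", device=device)

    # Tokenize a list of prompts, as in clip.tokenize(scene_texts).to(self.device)
    prompts = ["a photo of a city street", "a photo of a kitchen"]  # illustrative prompts
    text_tokens = clip.tokenize(prompts).to(device)

    # Encode an image and the prompts, then compare them via normalized dot products
    image = preprocess(Image.open("example.jpg")).unsqueeze(0).to(device)  # illustrative path
    with torch.no_grad():
        image_features = model.encode_image(image)
        text_features = model.encode_text(text_tokens)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)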
clip_model_manager.py
CHANGED
@@ -1,6 +1,6 @@

 import torch
-import open_clip
+import clip
 import numpy as np
 import logging
 import traceback
@@ -12,7 +12,7 @@ class CLIPModelManager:
    專門管理 CLIP 模型相關的操作,包括模型載入、設備管理、圖像和文本的特徵編碼等核心功能
    """

-    def __init__(self, model_name: str = "ViT-B…
+    def __init__(self, model_name: str = "ViT-B/16", device: str = None):
        """
        初始化 CLIP 模型管理器

@@ -22,8 +22,6 @@ class CLIPModelManager:
        """
        self.logger = logging.getLogger(__name__)
        self.model_name = model_name
-        self.pretrained = pretrained
-        self.tokenizer = None

        # 設置運行設備
        if device is None:
@@ -31,32 +29,19 @@ class CLIPModelManager:
        else:
            self.device = device

+        self.model = None
        self.preprocess = None

        self._initialize_model()

    def _initialize_model(self):
        """
-        初始化 …
+        初始化CLIP模型
        """
        try:
-            self.logger.info(f"Initializing …")
-            self.model, …
-                …
-                pretrained=self.pretrained,
-                device=self.device
-            )
-            self.tokenizer = open_clip.get_tokenizer(self.model_name)
-            self.logger.info("Successfully loaded OpenCLIP model")
-
-            # 立即清理 OpenCLIP 載入過程中的記憶體碎片
-            import gc
-            gc.collect()
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-                torch.cuda.synchronize()
-            self.logger.info("Memory cleanup completed after OpenCLIP loading in CLIPModelManager")
-
+            self.logger.info(f"Initializing CLIP model ({self.model_name}) on {self.device}")
+            self.model, self.preprocess = clip.load(self.model_name, device=self.device)
+            self.logger.info("Successfully loaded CLIP model")
        except Exception as e:
            self.logger.error(f"Error loading CLIP model: {e}")
            self.logger.error(traceback.format_exc())
@@ -102,7 +87,7 @@ class CLIPModelManager:

            for i in range(0, len(text_prompts), batch_size):
                batch_prompts = text_prompts[i:i+batch_size]
-                text_tokens = …
+                text_tokens = clip.tokenize(batch_prompts).to(self.device)
                batch_features = self.model.encode_text(text_tokens)
                batch_features = batch_features / batch_features.norm(dim=-1, keepdim=True)
                features_list.append(batch_features)
@@ -121,9 +106,18 @@ class CLIPModelManager:
            raise

    def encode_single_text(self, text_prompts: List[str]) -> torch.Tensor:
+        """
+        編碼單個文本批次的特徵
+
+        Args:
+            text_prompts: 文本提示列表
+
+        Returns:
+            torch.Tensor: 標準化後的文本特徵
+        """
        try:
            with torch.no_grad():
-                text_tokens = …
+                text_tokens = clip.tokenize(text_prompts).to(self.device)
                text_features = self.model.encode_text(text_tokens)
                text_features = text_features / text_features.norm(dim=-1, keepdim=True)
                return text_features
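The manager's text-encoding helpers keep the same batching and L2 normalization as before; only the tokenizer call changes to `clip.tokenize`. A minimal sketch of that batched pattern under the new API (the batch size and prompt list are illustrative):

    import torch
    import clip

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load("ViT-B/16", device=device)

    def encode_prompts_in_batches(prompts, batch_size=64):
        """Tokenize and encode prompts batch by batch, L2-normalizing each batch."""
        features_list = []
        with torch.no_grad():
            for i in range(0, len(prompts), batch_size):
                batch_prompts = prompts[i:i + batch_size]
                text_tokens = clip.tokenize(batch_prompts).to(device)
                batch_features = model.encode_text(text_tokens)
                batch_features = batch_features / batch_features.norm(dim=-1, keepdim=True)
                features_list.append(batch_features)
        return torch.cat(features_list, dim=0)

    # Example: encode a couple of illustrative prompts
    all_features = encode_prompts_in_batches(["a photo of a dog", "a photo of a cat"])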
clip_zero_shot_classifier.py
CHANGED
@@ -1,6 +1,6 @@

 import torch
-import open_clip
+import clip
 from PIL import Image
 import numpy as np
 import logging
@@ -21,18 +21,18 @@ class CLIPZeroShotClassifier:
    這是一個總窗口class,協調各個組件的工作以提供統一的對外接口。
    """

-    def __init__(self, model_name: str = "ViT-B…
+    def __init__(self, model_name: str = "ViT-B/16", device: str = None):
        """
        初始化CLIP零樣本分類器

        Args:
-            model_name: …
+            model_name: CLIP模型名稱,默認為"ViT-B/16"
            device: 運行設備,None則自動選擇
        """
        self.logger = logging.getLogger(__name__)

        # 初始化各個組件
-        self.clip_model_manager = CLIPModelManager(model_name, device…
+        self.clip_model_manager = CLIPModelManager(model_name, device)
        self.landmark_data_manager = LandmarkDataManager()
        self.image_analyzer = ImageAnalyzer()
        self.confidence_manager = ConfidenceManager()