changed to 4bit
app.py CHANGED
@@ -15,14 +15,15 @@ logger = logging.getLogger(__name__)
 # Define output JSON file path
 OUTPUT_JSON_PATH = "captions.json"
 
-# Load the model and processor
+# Load the model and processor with memory optimizations
 MODEL_PATH = "fancyfeast/llama-joycaption-beta-one-hf-llava"
 try:
     processor = AutoProcessor.from_pretrained(MODEL_PATH)
     model = LlavaForConditionalGeneration.from_pretrained(
         MODEL_PATH,
-        torch_dtype=torch.float32, #
-        low_cpu_mem_usage=True
+        torch_dtype=torch.float32, # CPU-compatible dtype
+        low_cpu_mem_usage=True, # Minimize memory usage
+        load_in_4bit=True # Enable 4-bit quantization
     ).to("cpu")
     model.eval()
     logger.info("Model and processor loaded successfully.")
@@ -40,7 +41,6 @@ def save_to_json(image_name, caption, caption_type, caption_length, error=None):
         "timestamp": datetime.now().isoformat(),
         "error": error
     }
-    # Load existing data or initialize empty list
     try:
         if os.path.exists(OUTPUT_JSON_PATH):
             with open(OUTPUT_JSON_PATH, "r") as f:
@@ -51,10 +51,7 @@ def save_to_json(image_name, caption, caption_type, caption_length, error=None):
         logger.error(f"Error reading JSON file: {str(e)}")
         data = []
 
-    # Append new result
     data.append(result)
-
-    # Save to JSON file
     try:
         with open(OUTPUT_JSON_PATH, "w") as f:
             json.dump(data, f, indent=4)
@@ -69,12 +66,12 @@ def generate_caption(input_image: Image.Image, caption_type: str = "descriptive"
         save_to_json("unknown", error_msg, caption_type, caption_length, error=error_msg)
         return error_msg
 
-    # Generate a unique image name
+    # Generate a unique image name
     image_name = f"image_{uuid.uuid4().hex}.jpg"
 
     try:
         # Resize image to reduce memory usage
-        input_image = input_image.resize((
+        input_image = input_image.resize((256, 256)) # Smaller resolution
 
         # Prepare the prompt
         prompt = f"Write a {caption_length} {caption_type} caption for this image."
@@ -92,9 +89,9 @@ def generate_caption(input_image: Image.Image, caption_type: str = "descriptive"
         # Process the image and prompt
         inputs = processor(images=input_image, text=convo[1]["content"], return_tensors="pt").to("cpu")
 
-        # Generate the caption
+        # Generate the caption with reduced max tokens
        with torch.no_grad():
-            output = model.generate(**inputs, max_new_tokens=
+            output = model.generate(**inputs, max_new_tokens=50, temperature=0.7, top_p=0.9)
 
         # Decode the output
         caption = processor.decode(output[0], skip_special_tokens=True).strip()
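A caveat on the 4-bit change above: passing load_in_4bit=True straight to from_pretrained relies on bitsandbytes, which targets CUDA GPUs, and newer transformers releases expect the flag to be supplied through a BitsAndBytesConfig instead. Combining it with .to("cpu") and torch_dtype=torch.float32 will typically fail on a CPU-only Space, since moving a bitsandbytes-quantized model between devices is not supported. The snippet below is a minimal illustrative sketch of the quantization-config route, assuming a CUDA device is available; it is not the committed code, and the config values (nf4, float16 compute dtype) are example choices rather than anything taken from this repository.

# Illustrative sketch only (not from this commit): 4-bit loading via BitsAndBytesConfig.
# Assumes a CUDA GPU, since bitsandbytes 4-bit quantization does not run on CPU.
import torch
from transformers import AutoProcessor, BitsAndBytesConfig, LlavaForConditionalGeneration

MODEL_PATH = "fancyfeast/llama-joycaption-beta-one-hf-llava"

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",             # NormalFloat4 weight format (example choice)
    bnb_4bit_compute_dtype=torch.float16,  # dtype used for matmuls at runtime (example choice)
)

processor = AutoProcessor.from_pretrained(MODEL_PATH)
model = LlavaForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    quantization_config=quant_config,
    device_map="auto",       # let accelerate place the quantized weights on the GPU
    low_cpu_mem_usage=True,
)
model.eval()  # note: no .to("cpu") here; device moves are blocked for bnb-quantized models

Separately, the temperature=0.7 and top_p=0.9 arguments added to model.generate only influence decoding when do_sample=True is also passed; without it, generation stays greedy and transformers warns that the sampling parameters are ignored.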