Cicici1109 committed (verified)
Commit b7130f0 · 1 Parent(s): 5fb32e4

Update utils.py

Files changed (1):
  1. utils.py +46 -43
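Summary of the change: model loading moves out of the per-call paths into process-wide caches. `init_flux_pipeline()` lazily builds a single global FLUX.1-dev `pipe`, and `get_model()` memoizes one `(model, tokenizer)` pair per checkpoint path, with `load_model()` kept as a thin compatibility wrapper. The two GPU entry points, `infer_with_DiT` and `roi_localization`, gain the `@spaces.GPU` decorator (`import spaces` is added), stale LoRA adapters are now unloaded before each `load_lora_weights`, the duplicate `from PIL import Image` is trimmed to `from PIL import ImageFilter`, and dead commented-out code is removed.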
utils.py CHANGED
@@ -5,11 +5,12 @@ from src.flux.condition import Condition
 from PIL import Image
 import argparse
 import os
+import spaces
 import json
 import base64
 import io
 import re
-from PIL import Image, ImageFilter
+from PIL import ImageFilter
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from scipy.ndimage import binary_dilation
 import cv2
@@ -27,6 +28,31 @@ except ImportError:
 
 import re
 
+pipe = None
+model_dict = {}
+
+def init_flux_pipeline():
+    global pipe
+    if pipe is None:
+        pipe = FluxPipeline.from_pretrained(
+            "black-forest-labs/FLUX.1-dev",
+            torch_dtype=torch.bfloat16
+        )
+        pipe = pipe.to("cuda")
+
+def get_model(model_path):
+    global model_dict
+    if model_path not in model_dict:
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            torch_dtype="auto",
+            device_map="auto",
+            trust_remote_code=True
+        ).eval()
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model_dict[model_path] = (model, tokenizer)
+    return model_dict[model_path]
+
 def encode_image_to_datauri(path, size=(512, 512)):
     with Image.open(path).convert('RGB') as img:
         img = img.resize(size, Image.LANCZOS)
@@ -34,8 +60,6 @@ def encode_image_to_datauri(path, size=(512, 512)):
         img.save(buffer, format='PNG')
         b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
         return b64
-        # return f"data:image/png;base64,{b64}"
-
 
 @retry(
     reraise=True,
@@ -93,7 +117,6 @@ def cot_with_gpt(image_uri, instruction):
     categories, instructions = extract_instructions(text)
     return categories, instructions
 
-
 def extract_instructions(text):
     categories = []
     instructions = []
@@ -134,9 +157,9 @@ def extract_last_bbox(result):
     x0, y0, x1, y1 = map(int, last_match[1:])
     return x0, y0, x1, y1
 
-
+@spaces.GPU
 def infer_with_DiT(task, image, instruction, category):
-    # seed_everything(3407)
+    init_flux_pipeline()
 
     if task == 'RoI Inpainting':
         if category == 'Add' or category == 'Replace':
@@ -180,18 +203,14 @@ def infer_with_DiT(task, image, instruction, category):
         condition = Condition("scene", image, position_delta=(0, -32))
     else:
         raise ValueError(f"Invalid task: '{task}'")
-    pipe = FluxPipeline.from_pretrained(
-        "black-forest-labs/FLUX.1-dev",
-        torch_dtype=torch.bfloat16
-    )
-
-    pipe = pipe.to("cuda")
-
+
+    pipe.unload_lora_weights()
     pipe.load_lora_weights(
        "Cicici1109/IEAP",
        weight_name=lora_path,
        adapter_name="scene",
    )
+
    result_img = generate(
        pipe,
        prompt=instruction_dit,
@@ -201,15 +220,13 @@ def infer_with_DiT(task, image, instruction, category):
        height=512,
        width=512,
    ).images[0]
-    # result_img
+
    if task == 'RoI Editing' and category == 'Action Change':
        text_roi = extract_object_with_gpt(instruction)
        instruction_loc = f"<image>Please segment {text_roi}."
-        # (model, tokenizer, image_path, instruction, work_dir, dilate):
        img = result_img
-        # print(f"Instruction: {instruction_loc}")
 
-        model, tokenizer = load_model("ByteDance/Sa2VA-8B")
+        model, tokenizer = get_model("ByteDance/Sa2VA-8B")
 
        result = model.predict_forward(
            image=img,
@@ -218,13 +235,11 @@ def infer_with_DiT(task, image, instruction, category):
        )
 
        prediction = result['prediction']
-        # print(f"Model Output: {prediction}")
 
        if '[SEG]' in prediction and 'prediction_masks' in result:
            pred_mask = result['prediction_masks'][0]
            pred_mask_np = np.squeeze(np.array(pred_mask))
 
-            ## obtain region bbox
            rows = np.any(pred_mask_np, axis=1)
            cols = np.any(pred_mask_np, axis=0)
            if not np.any(rows) or not np.any(cols):
@@ -238,18 +253,10 @@ def infer_with_DiT(task, image, instruction, category):
 
            return changed_instance, x0, y1, 1
 
-
    return result_img
 
 def load_model(model_path):
-    model = AutoModelForCausalLM.from_pretrained(
-        model_path,
-        torch_dtype="auto",
-        device_map="auto",
-        trust_remote_code=True
-    ).eval()
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-    return model, tokenizer
+    return get_model(model_path)
 
 def extract_object_with_gpt(instruction):
    system_prompt = (
@@ -304,7 +311,6 @@ def extract_region_with_gpt(instruction):
            max_tokens=20,
        )
        object_phrase = response.choices[0].message['content'].strip().strip('"')
-        # print(f"Identified object: {object_phrase}")
        return object_phrase
    except Exception as e:
        print(f"GPT extraction failed: {e}")
@@ -372,8 +378,9 @@ def crop_masked_region(image, pred_mask_np):
 
    return Image.fromarray(cropped_image, mode='RGBA')
 
-def roi_localization(image, instruction, category): # add, remove, replace, action change, move, resize
-    model, tokenizer = load_model("ByteDance/Sa2VA-8B")
+@spaces.GPU
+def roi_localization(image, instruction, category):
+    model, tokenizer = get_model("ByteDance/Sa2VA-8B")
    if category == 'Add':
        text_roi = extract_region_with_gpt(instruction)
    else:
@@ -389,13 +396,11 @@ def roi_localization(image, instruction, category): # add, remove, replace, action change, move, resize
    )
 
    prediction = result['prediction']
-    # print(f"Model Output: {prediction}")
 
    if '[SEG]' in prediction and 'prediction_masks' in result:
        pred_mask = result['prediction_masks'][0]
        pred_mask_np = np.squeeze(np.array(pred_mask))
        if category == 'Add':
-            ## obtain region bbox
            rows = np.any(pred_mask_np, axis=1)
            cols = np.any(pred_mask_np, axis=0)
            if not np.any(rows) or not np.any(cols):
@@ -405,17 +410,14 @@ def roi_localization(image, instruction, category): # add, remove, replace, action change, move, resize
            y0, y1 = np.where(rows)[0][[0, -1]]
            x0, x1 = np.where(cols)[0][[0, -1]]
 
-            ## obtain inpainting bbox
-            bbox = combine_bbox(text_roi, x0, y0, x1, y1) #? multiple?
-            # print(bbox)
+            bbox = combine_bbox(text_roi, x0, y0, x1, y1)
            x0, y0, x1, y1 = layout_add(bbox, instruction)
            mask = bbox_to_mask(x0, y0, x1, y1)
-            ## make it black
            masked_img = get_masked(mask, img)
        elif category == 'Move' or category == 'Resize':
            dilated_original_mask = binary_dilation(pred_mask_np, iterations=3)
            masked_img = get_masked(dilated_original_mask, img)
-            ## obtain region bbox
+
            rows = np.any(pred_mask_np, axis=1)
            cols = np.any(pred_mask_np, axis=0)
            if not np.any(rows) or not np.any(cols):
@@ -425,12 +427,10 @@ def roi_localization(image, instruction, category): # add, remove, replace, action change, move, resize
            y0, y1 = np.where(rows)[0][[0, -1]]
            x0, x1 = np.where(cols)[0][[0, -1]]
 
-            ## obtain inpainting bbox
-            bbox = combine_bbox(text_roi, x0, y0, x1, y1) #? multiple?
-            # print(bbox)
+            bbox = combine_bbox(text_roi, x0, y0, x1, y1)
            x0_new, y0_new, x1_new, y1_new, = layout_change(bbox, instruction)
            scale = (y1_new - y0_new) / (y1 - y0)
-            # print(scale)
+
            changed_instance = crop_masked_region(img, pred_mask_np)
 
            return masked_img, changed_instance, x0_new, y1_new, scale
@@ -588,4 +588,7 @@ def layout_change(bbox, instruction):
    result = response.choices[0].message.content.strip()
 
    bbox = extract_last_bbox(result)
-    return bbox
+    return bbox
+
+if __name__ == "__main__":
+    init_flux_pipeline()
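Notes on the change:

1. Cached loaders. Both new helpers memoize heavyweight objects in module-level globals, so repeated calls reuse one pipeline and one (model, tokenizer) pair per checkpoint instead of re-initializing them. A minimal usage sketch; the `warm_up` helper and the `utils` import path are illustrative assumptions, not part of the repo:

    from utils import init_flux_pipeline, get_model

    def warm_up():
        init_flux_pipeline()   # first call: builds FLUX.1-dev and moves it to CUDA
        init_flux_pipeline()   # later calls: no-op, the global `pipe` is already set
        m1, tok = get_model("ByteDance/Sa2VA-8B")   # loads and caches the pair
        m2, _ = get_model("ByteDance/Sa2VA-8B")     # cache hit: same object back
        assert m1 is m2

The new `if __name__ == "__main__": init_flux_pipeline()` block at the bottom only warms the pipeline when the file is run directly, so importing utils.py stays cheap.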
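2. ZeroGPU scheduling. `@spaces.GPU` comes from the `spaces` package that backs Hugging Face ZeroGPU Spaces: a GPU is attached for the duration of each decorated call and released afterwards. That is why `init_flux_pipeline()`, which calls `.to("cuda")`, runs inside the decorated `infer_with_DiT` rather than at import time, where no GPU exists yet. A sketch of the pattern; `demo_infer` is a made-up name and the call is simplified (utils.py goes through `generate(...)` with a Condition):

    import spaces

    @spaces.GPU  # a GPU is granted only while this call runs
    def demo_infer(prompt):
        init_flux_pipeline()                   # safe here: CUDA is available inside the call
        return pipe(prompt=prompt).images[0]   # simplified stand-in for generate(...)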
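3. Adapter hygiene. Because `pipe` is now long-lived, `infer_with_DiT` must drop whatever LoRA a previous request attached before loading the task-specific one; otherwise adapters would pile up across calls. This is the standard diffusers swap idiom, as in the hunk above:

    pipe.unload_lora_weights()      # remove the adapter left by the previous call
    pipe.load_lora_weights(
        "Cicici1109/IEAP",
        weight_name=lora_path,      # task-specific LoRA file
        adapter_name="scene",
    )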
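4. Mask-to-bbox idiom. The segmentation branches repeatedly reduce a boolean mask to a tight bounding box with the same `np.any`/`np.where` pattern. Factored out here for clarity; `mask_to_bbox` is a sketch, not a helper in the repo:

    import numpy as np

    def mask_to_bbox(mask):
        # Tight (x0, y0, x1, y1) around the True pixels, or None if empty.
        rows = np.any(mask, axis=1)          # rows containing mask pixels
        cols = np.any(mask, axis=0)          # columns containing mask pixels
        if not rows.any() or not cols.any():
            return None                      # empty mask: nothing to box
        y0, y1 = np.where(rows)[0][[0, -1]]  # first/last occupied row
        x0, x1 = np.where(cols)[0][[0, -1]]  # first/last occupied column
        return x0, y0, x1, y1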