vidhanm committed on
Commit
2af1927
·
1 Parent(s): fb82462

updated generate function

Browse files
Files changed (1) hide show
  1. app.py +19 -15
app.py CHANGED
@@ -1,5 +1,8 @@
1
  import sys
2
  import os
 
 
 
3
 
4
  # Add the cloned nanoVLM directory to Python's system path
5
  NANOVLM_REPO_PATH = "/app/nanoVLM"
@@ -60,21 +63,20 @@ else:
60
  print("Custom VisionLanguageModel class not imported, cannot load model.")
61
 
62
  def prepare_inputs(text_list, image_input, image_processor_instance, tokenizer_instance, device_to_use):
 
63
  if image_processor_instance is None or tokenizer_instance is None:
64
  raise ValueError("Image processor or tokenizer not initialized.")
 
65
  processed_image = image_processor_instance(images=image_input, return_tensors="pt").pixel_values.to(device_to_use)
 
66
  processed_text = tokenizer_instance(
67
  text=text_list, return_tensors="pt", padding=True, truncation=True, max_length=getattr(tokenizer_instance, 'model_max_length', 512)
68
  )
69
  input_ids = processed_text.input_ids.to(device_to_use)
70
  attention_mask = processed_text.attention_mask.to(device_to_use)
 
71
  return {"pixel_values": processed_image, "input_ids": input_ids, "attention_mask": attention_mask}
72
 
73
- from typing import Optional
74
- from PIL import Image as PILImage # Add at the top of your app.py
75
-
76
- # ... (other imports and model loading) ...
77
-
78
  def generate_text_for_image(image_input: Optional[PILImage.Image], prompt_input: Optional[str]) -> str:
79
  if model is None or image_processor is None or tokenizer is None:
80
  return "Error: Model or processor components not loaded correctly. Check logs."
@@ -93,21 +95,25 @@ def generate_text_for_image(image_input: Optional[PILImage.Image], prompt_input:
93
  image_processor_instance=image_processor, tokenizer_instance=tokenizer, device_to_use=device
94
  )
95
 
96
- print(f"Debug: Passing to model.generate: pixel_values_shape={inputs['pixel_values'].shape}, input_ids_shape={inputs['input_ids'].shape}, attention_mask_shape={inputs['attention_mask'].shape}")
97
 
98
- # Call model.generate with positional arguments matching nanoVLM's VisionLanguageModel.generate
 
99
  generated_ids = model.generate(
100
- inputs['pixel_values'], # pixel_values
101
- inputs['input_ids'], # prompt_token_ids
102
- inputs['attention_mask'], # attention_mask
103
- 150 # max_new_tokens (as a positional argument)
104
- # You can add temperature=..., top_k=... here if desired, as they are keyword args in nanoVLM's generate
 
 
 
 
105
  )
106
 
107
  generated_text_list = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
108
  generated_text = generated_text_list[0] if generated_text_list else ""
109
 
110
- # Clean up prompt if it's echoed (optional, depends on model behavior)
111
  if prompt_input and generated_text.startswith(prompt_input):
112
  cleaned_text = generated_text[len(prompt_input):].lstrip(" ,.:")
113
  else:
@@ -121,8 +127,6 @@ def generate_text_for_image(image_input: Optional[PILImage.Image], prompt_input:
121
  traceback.print_exc()
122
  return f"An error occurred during text generation: {str(e)}"
123
 
124
- # ... (rest of app.py)
125
-
126
  description = "Interactive demo for lusxvr/nanoVLM-222M."
127
  # example_image_url = "http://images.cocodataset.org/val2017/000000039769.jpg" # Not used for now
128
 
 
1
  import sys
2
  import os
3
+ from PIL import Image as PILImage # Add at the top of your app.py if not already there
4
+ from typing import Optional
5
+
6
 
7
  # Add the cloned nanoVLM directory to Python's system path
8
  NANOVLM_REPO_PATH = "/app/nanoVLM"
 
63
  print("Custom VisionLanguageModel class not imported, cannot load model.")
64
 
65
def prepare_inputs(text_list, image_input, image_processor_instance, tokenizer_instance, device_to_use):
    """Preprocess text and image inputs and move the resulting tensors to the target device.

    Args:
        text_list: Prompt string(s) passed to the tokenizer.
        image_input: Image(s) passed to the image processor.
        image_processor_instance: Callable image processor whose result exposes ``.pixel_values``.
        tokenizer_instance: Callable tokenizer whose result exposes ``.input_ids`` and ``.attention_mask``.
        device_to_use: Device the output tensors are moved to.

    Returns:
        Dict with ``"pixel_values"``, ``"input_ids"`` and ``"attention_mask"`` tensors on ``device_to_use``.

    Raises:
        ValueError: If either the image processor or the tokenizer is missing.
    """
    if tokenizer_instance is None or image_processor_instance is None:
        raise ValueError("Image processor or tokenizer not initialized.")

    pixel_values = image_processor_instance(
        images=image_input, return_tensors="pt"
    ).pixel_values.to(device_to_use)

    # Tokenizers that do not declare model_max_length fall back to 512.
    max_len = getattr(tokenizer_instance, 'model_max_length', 512)
    encoded = tokenizer_instance(
        text=text_list,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_len,
    )

    return {
        "pixel_values": pixel_values,
        "input_ids": encoded.input_ids.to(device_to_use),
        "attention_mask": encoded.attention_mask.to(device_to_use),
    }
79
 
 
 
 
 
 
80
  def generate_text_for_image(image_input: Optional[PILImage.Image], prompt_input: Optional[str]) -> str:
81
  if model is None or image_processor is None or tokenizer is None:
82
  return "Error: Model or processor components not loaded correctly. Check logs."
 
95
  image_processor_instance=image_processor, tokenizer_instance=tokenizer, device_to_use=device
96
  )
97
 
98
+ print(f"Debug: Shapes before model.generate: pixel_values={inputs['pixel_values'].shape}, input_ids={inputs['input_ids'].shape}, attention_mask={inputs['attention_mask'].shape}")
99
 
100
+ # --- CORRECTED model.generate CALL ---
101
+ # Match the signature: def generate(self, input_ids, image, attention_mask=None, max_new_tokens=...)
102
  generated_ids = model.generate(
103
+ inputs['input_ids'], # 1st argument: input_ids (text prompt)
104
+ inputs['pixel_values'], # 2nd argument: image (pixel values)
105
+ inputs['attention_mask'], # 3rd argument: attention_mask (for text)
106
+ max_new_tokens=150, # Keyword argument for max_new_tokens
107
+ # Other optional keyword arguments from the signature can be added here:
108
+ # top_k=50,
109
+ # top_p=0.9,
110
+ # temperature=0.7, # Default is 0.5 in the provided signature
111
+ # greedy=False
112
  )
113
 
114
  generated_text_list = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
115
  generated_text = generated_text_list[0] if generated_text_list else ""
116
 
 
117
  if prompt_input and generated_text.startswith(prompt_input):
118
  cleaned_text = generated_text[len(prompt_input):].lstrip(" ,.:")
119
  else:
 
127
  traceback.print_exc()
128
  return f"An error occurred during text generation: {str(e)}"
129
 
 
 
130
  description = "Interactive demo for lusxvr/nanoVLM-222M."
131
  # example_image_url = "http://images.cocodataset.org/val2017/000000039769.jpg" # Not used for now
132