vidhanm committed
Commit 2af1927 · Parent(s): fb82462
updated generate function
app.py
CHANGED
@@ -1,5 +1,8 @@
 import sys
 import os
+from PIL import Image as PILImage  # Add at the top of your app.py if not already there
+from typing import Optional
+
 
 # Add the cloned nanoVLM directory to Python's system path
 NANOVLM_REPO_PATH = "/app/nanoVLM"
@@ -60,21 +63,20 @@ else:
     print("Custom VisionLanguageModel class not imported, cannot load model.")
 
 def prepare_inputs(text_list, image_input, image_processor_instance, tokenizer_instance, device_to_use):
+    # This function is fine
     if image_processor_instance is None or tokenizer_instance is None:
         raise ValueError("Image processor or tokenizer not initialized.")
+
     processed_image = image_processor_instance(images=image_input, return_tensors="pt").pixel_values.to(device_to_use)
+
     processed_text = tokenizer_instance(
         text=text_list, return_tensors="pt", padding=True, truncation=True, max_length=getattr(tokenizer_instance, 'model_max_length', 512)
     )
     input_ids = processed_text.input_ids.to(device_to_use)
     attention_mask = processed_text.attention_mask.to(device_to_use)
+
     return {"pixel_values": processed_image, "input_ids": input_ids, "attention_mask": attention_mask}
 
-from typing import Optional
-from PIL import Image as PILImage  # Add at the top of your app.py
-
-# ... (other imports and model loading) ...
-
 def generate_text_for_image(image_input: Optional[PILImage.Image], prompt_input: Optional[str]) -> str:
     if model is None or image_processor is None or tokenizer is None:
         return "Error: Model or processor components not loaded correctly. Check logs."
@@ -93,21 +95,25 @@ def generate_text_for_image(image_input: Optional[PILImage.Image], prompt_input:
         image_processor_instance=image_processor, tokenizer_instance=tokenizer, device_to_use=device
     )
 
-    print(f"Debug: …
+    print(f"Debug: Shapes before model.generate: pixel_values={inputs['pixel_values'].shape}, input_ids={inputs['input_ids'].shape}, attention_mask={inputs['attention_mask'].shape}")
 
-    # …
+    # --- CORRECTED model.generate CALL ---
+    # Match the signature: def generate(self, input_ids, image, attention_mask=None, max_new_tokens=...)
     generated_ids = model.generate(
-        inputs['…
-        inputs['…
-        inputs['attention_mask'], # attention_mask
-        150
-        # …
+        inputs['input_ids'],      # 1st argument: input_ids (text prompt)
+        inputs['pixel_values'],   # 2nd argument: image (pixel values)
+        inputs['attention_mask'], # 3rd argument: attention_mask (for text)
+        max_new_tokens=150,       # Keyword argument for max_new_tokens
+        # Other optional keyword arguments from the signature can be added here:
+        # top_k=50,
+        # top_p=0.9,
+        # temperature=0.7,  # Default is 0.5 in the provided signature
+        # greedy=False
     )
 
     generated_text_list = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
     generated_text = generated_text_list[0] if generated_text_list else ""
 
-    # Clean up prompt if it's echoed (optional, depends on model behavior)
     if prompt_input and generated_text.startswith(prompt_input):
         cleaned_text = generated_text[len(prompt_input):].lstrip(" ,.:")
     else:
@@ -121,8 +127,6 @@ def generate_text_for_image(image_input: Optional[PILImage.Image], prompt_input:
         traceback.print_exc()
         return f"An error occurred during text generation: {str(e)}"
 
-# ... (rest of app.py)
-
 description = "Interactive demo for lusxvr/nanoVLM-222M."
 # example_image_url = "http://images.cocodataset.org/val2017/000000039769.jpg" # Not used for now
 
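For context, a minimal sketch of how the corrected argument order would be exercised end to end. This is not part of the commit: the dummy image, the prompt string, and the reuse of the app's globals (model, image_processor, tokenizer, device) are illustrative assumptions; only the positional order input_ids, then image, then attention_mask follows the nanoVLM generate signature quoted in the diff's comments.

# Hypothetical usage sketch (not in the commit); assumes the app's globals
# exist and that nanoVLM's generate signature is the one quoted above:
#   def generate(self, input_ids, image, attention_mask=None, max_new_tokens=...)
from PIL import Image

dummy_image = Image.new("RGB", (224, 224))   # stand-in; any RGB PIL image works
prompt = "Describe this image:"              # illustrative prompt

inputs = prepare_inputs([prompt], dummy_image, image_processor, tokenizer, device)
generated_ids = model.generate(
    inputs["input_ids"],       # text token ids first,
    inputs["pixel_values"],    # then the image tensor,
    inputs["attention_mask"],  # then the text attention mask
    max_new_tokens=150,
)
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])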