vidhanm committed on
Commit
2af1927
·
1 Parent(s): fb82462

updated generate function

Browse files
Files changed (1) hide show
  1. app.py +19 -15
app.py CHANGED
@@ -1,5 +1,8 @@
1
  import sys
2
  import os
 
 
 
3
 
4
  # Add the cloned nanoVLM directory to Python's system path
5
  NANOVLM_REPO_PATH = "/app/nanoVLM"
@@ -60,21 +63,20 @@ else:
60
  print("Custom VisionLanguageModel class not imported, cannot load model.")
61
 
62
  def prepare_inputs(text_list, image_input, image_processor_instance, tokenizer_instance, device_to_use):
 
63
  if image_processor_instance is None or tokenizer_instance is None:
64
  raise ValueError("Image processor or tokenizer not initialized.")
 
65
  processed_image = image_processor_instance(images=image_input, return_tensors="pt").pixel_values.to(device_to_use)
 
66
  processed_text = tokenizer_instance(
67
  text=text_list, return_tensors="pt", padding=True, truncation=True, max_length=getattr(tokenizer_instance, 'model_max_length', 512)
68
  )
69
  input_ids = processed_text.input_ids.to(device_to_use)
70
  attention_mask = processed_text.attention_mask.to(device_to_use)
 
71
  return {"pixel_values": processed_image, "input_ids": input_ids, "attention_mask": attention_mask}
72
 
73
- from typing import Optional
74
- from PIL import Image as PILImage # Add at the top of your app.py
75
-
76
- # ... (other imports and model loading) ...
77
-
78
  def generate_text_for_image(image_input: Optional[PILImage.Image], prompt_input: Optional[str]) -> str:
79
  if model is None or image_processor is None or tokenizer is None:
80
  return "Error: Model or processor components not loaded correctly. Check logs."
@@ -93,21 +95,25 @@ def generate_text_for_image(image_input: Optional[PILImage.Image], prompt_input:
93
  image_processor_instance=image_processor, tokenizer_instance=tokenizer, device_to_use=device
94
  )
95
 
96
- print(f"Debug: Passing to model.generate: pixel_values_shape={inputs['pixel_values'].shape}, input_ids_shape={inputs['input_ids'].shape}, attention_mask_shape={inputs['attention_mask'].shape}")
97
 
98
- # Call model.generate with positional arguments matching nanoVLM's VisionLanguageModel.generate
 
99
  generated_ids = model.generate(
100
- inputs['pixel_values'], # pixel_values
101
- inputs['input_ids'], # prompt_token_ids
102
- inputs['attention_mask'], # attention_mask
103
- 150 # max_new_tokens (as a positional argument)
104
- # You can add temperature=..., top_k=... here if desired, as they are keyword args in nanoVLM's generate
 
 
 
 
105
  )
106
 
107
  generated_text_list = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
108
  generated_text = generated_text_list[0] if generated_text_list else ""
109
 
110
- # Clean up prompt if it's echoed (optional, depends on model behavior)
111
  if prompt_input and generated_text.startswith(prompt_input):
112
  cleaned_text = generated_text[len(prompt_input):].lstrip(" ,.:")
113
  else:
@@ -121,8 +127,6 @@ def generate_text_for_image(image_input: Optional[PILImage.Image], prompt_input:
121
  traceback.print_exc()
122
  return f"An error occurred during text generation: {str(e)}"
123
 
124
- # ... (rest of app.py)
125
-
126
  description = "Interactive demo for lusxvr/nanoVLM-222M."
127
  # example_image_url = "http://images.cocodataset.org/val2017/000000039769.jpg" # Not used for now
128
 
 
1
  import sys
2
  import os
3
+ from PIL import Image as PILImage # Add at the top of your app.py if not already there
4
+ from typing import Optional
5
+
6
 
7
  # Add the cloned nanoVLM directory to Python's system path
8
  NANOVLM_REPO_PATH = "/app/nanoVLM"
 
63
  print("Custom VisionLanguageModel class not imported, cannot load model.")
64
 
65
def prepare_inputs(text_list, image_input, image_processor_instance, tokenizer_instance, device_to_use):
    """Preprocess text and image inputs and move the resulting tensors to the target device.

    Args:
        text_list: Prompt string(s) passed to the tokenizer.
        image_input: Image(s) passed to the image processor.
        image_processor_instance: Callable image processor whose result exposes ``.pixel_values``.
        tokenizer_instance: Callable tokenizer whose result exposes ``.input_ids`` and ``.attention_mask``.
        device_to_use: Device the output tensors are moved to.

    Returns:
        Dict with ``"pixel_values"``, ``"input_ids"`` and ``"attention_mask"`` tensors on ``device_to_use``.

    Raises:
        ValueError: If either the image processor or the tokenizer is missing.
    """
    if tokenizer_instance is None or image_processor_instance is None:
        raise ValueError("Image processor or tokenizer not initialized.")

    pixel_values = image_processor_instance(
        images=image_input, return_tensors="pt"
    ).pixel_values.to(device_to_use)

    # Tokenizers that do not declare model_max_length fall back to 512.
    max_len = getattr(tokenizer_instance, 'model_max_length', 512)
    encoded = tokenizer_instance(
        text=text_list,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_len,
    )

    return {
        "pixel_values": pixel_values,
        "input_ids": encoded.input_ids.to(device_to_use),
        "attention_mask": encoded.attention_mask.to(device_to_use),
    }
79
 
 
 
 
 
 
80
  def generate_text_for_image(image_input: Optional[PILImage.Image], prompt_input: Optional[str]) -> str:
81
  if model is None or image_processor is None or tokenizer is None:
82
  return "Error: Model or processor components not loaded correctly. Check logs."
 
95
  image_processor_instance=image_processor, tokenizer_instance=tokenizer, device_to_use=device
96
  )
97
 
98
+ print(f"Debug: Shapes before model.generate: pixel_values={inputs['pixel_values'].shape}, input_ids={inputs['input_ids'].shape}, attention_mask={inputs['attention_mask'].shape}")
99
 
100
+ # --- CORRECTED model.generate CALL ---
101
+ # Match the signature: def generate(self, input_ids, image, attention_mask=None, max_new_tokens=...)
102
  generated_ids = model.generate(
103
+ inputs['input_ids'], # 1st argument: input_ids (text prompt)
104
+ inputs['pixel_values'], # 2nd argument: image (pixel values)
105
+ inputs['attention_mask'], # 3rd argument: attention_mask (for text)
106
+ max_new_tokens=150, # Keyword argument for max_new_tokens
107
+ # Other optional keyword arguments from the signature can be added here:
108
+ # top_k=50,
109
+ # top_p=0.9,
110
+ # temperature=0.7, # Default is 0.5 in the provided signature
111
+ # greedy=False
112
  )
113
 
114
  generated_text_list = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
115
  generated_text = generated_text_list[0] if generated_text_list else ""
116
 
 
117
  if prompt_input and generated_text.startswith(prompt_input):
118
  cleaned_text = generated_text[len(prompt_input):].lstrip(" ,.:")
119
  else:
 
127
  traceback.print_exc()
128
  return f"An error occurred during text generation: {str(e)}"
129
 
 
 
130
  description = "Interactive demo for lusxvr/nanoVLM-222M."
131
  # example_image_url = "http://images.cocodataset.org/val2017/000000039769.jpg" # Not used for now
132