vidhanm committed on
Commit 0b8c303 · 1 Parent(s): 4d396f8

removed examples

Files changed (1)
  1. app.py +51 -65
app.py CHANGED
@@ -9,10 +9,8 @@ if NANOVLM_REPO_PATH not in sys.path:
 import gradio as gr
 from PIL import Image
 import torch
-# Import specific processor components
 from transformers import CLIPImageProcessor, GPT2TokenizerFast
 
-# Import the custom VisionLanguageModel class
 try:
     from models.vision_language_model import VisionLanguageModel
     print("Successfully imported VisionLanguageModel from nanoVLM clone.")
@@ -20,7 +18,6 @@ except ImportError as e:
     print(f"Error importing VisionLanguageModel from nanoVLM clone: {e}.")
     VisionLanguageModel = None
 
-# Determine the device to use
 device_choice = os.environ.get("DEVICE", "auto")
 if device_choice == "auto":
     device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -28,7 +25,6 @@ else:
     device = device_choice
 print(f"Using device: {device}")
 
-# --- Configuration for model components ---
 model_id_for_weights = "lusxvr/nanoVLM-222M"
 image_processor_id = "openai/clip-vit-base-patch32"
 tokenizer_id = "gpt2"
@@ -40,22 +36,18 @@ model = None
 if VisionLanguageModel:
     try:
         print(f"Attempting to load CLIPImageProcessor from: {image_processor_id}")
-        # trust_remote_code for HF's classes is fine if they support it.
-        image_processor = CLIPImageProcessor.from_pretrained(image_processor_id, trust_remote_code=True)
+        image_processor = CLIPImageProcessor.from_pretrained(image_processor_id) # Removed trust_remote_code if not strictly needed by processor
         print("CLIPImageProcessor loaded.")
 
         print(f"Attempting to load GPT2TokenizerFast from: {tokenizer_id}")
-        tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_id, trust_remote_code=True)
+        tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_id) # Removed trust_remote_code if not strictly needed by tokenizer
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
             print("Set tokenizer pad_token to eos_token.")
         print("GPT2TokenizerFast loaded.")
 
         print(f"Attempting to load model weights from {model_id_for_weights} using VisionLanguageModel.from_pretrained")
-        # Removed trust_remote_code=True as the custom VisionLanguageModel.from_pretrained doesn't expect it.
-        model = VisionLanguageModel.from_pretrained(
-            model_id_for_weights
-        ).to(device)
+        model = VisionLanguageModel.from_pretrained(model_id_for_weights).to(device)
         print("Model loaded successfully.")
         model.eval()
 
@@ -63,62 +55,43 @@ if VisionLanguageModel:
         print(f"Error loading model or processor components: {e}")
         import traceback
         traceback.print_exc()
-        image_processor = None
-        tokenizer = None
-        model = None
+        image_processor = None; tokenizer = None; model = None
 else:
     print("Custom VisionLanguageModel class not imported, cannot load model.")
 
-# ... (rest of the app.py remains the same) ...
 def prepare_inputs(text_list, image_input, image_processor_instance, tokenizer_instance, device_to_use):
     if image_processor_instance is None or tokenizer_instance is None:
         raise ValueError("Image processor or tokenizer not initialized.")
-
     processed_image = image_processor_instance(images=image_input, return_tensors="pt").pixel_values.to(device_to_use)
-
     processed_text = tokenizer_instance(
-        text=text_list, return_tensors="pt", padding=True, truncation=True, max_length=tokenizer_instance.model_max_length
+        text=text_list, return_tensors="pt", padding=True, truncation=True, max_length=getattr(tokenizer_instance, 'model_max_length', 512)
     )
     input_ids = processed_text.input_ids.to(device_to_use)
     attention_mask = processed_text.attention_mask.to(device_to_use)
-
     return {"pixel_values": processed_image, "input_ids": input_ids, "attention_mask": attention_mask}
 
 def generate_text_for_image(image_input, prompt_input):
     if model is None or image_processor is None or tokenizer is None:
         return "Error: Model or processor components not loaded correctly. Check logs."
-
-    if image_input is None:
-        return "Please upload an image."
-    if not prompt_input:
-        return "Please provide a prompt."
+    if image_input is None: return "Please upload an image."
+    if not prompt_input: return "Please provide a prompt."
 
     try:
         if not isinstance(image_input, Image.Image):
             pil_image = Image.fromarray(image_input)
         else:
             pil_image = image_input
-
-        if pil_image.mode != "RGB":
-            pil_image = pil_image.convert("RGB")
+        if pil_image.mode != "RGB": pil_image = pil_image.convert("RGB")
 
         inputs = prepare_inputs(
-            text_list=[prompt_input],
-            image_input=pil_image,
-            image_processor_instance=image_processor,
-            tokenizer_instance=tokenizer,
-            device_to_use=device
+            text_list=[prompt_input], image_input=pil_image,
+            image_processor_instance=image_processor, tokenizer_instance=tokenizer, device_to_use=device
         )
 
         generated_ids = model.generate(
-            pixel_values=inputs['pixel_values'],
-            input_ids=inputs['input_ids'],
-            attention_mask=inputs['attention_mask'],
-            max_new_tokens=150,
-            num_beams=3,
-            no_repeat_ngram_size=2,
-            early_stopping=True,
-            pad_token_id=tokenizer.pad_token_id
+            pixel_values=inputs['pixel_values'], input_ids=inputs['input_ids'],
+            attention_mask=inputs['attention_mask'], max_new_tokens=150, num_beams=3,
+            no_repeat_ngram_size=2, early_stopping=True, pad_token_id=tokenizer.pad_token_id
         )
 
         generated_text_list = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
@@ -128,38 +101,51 @@ def generate_text_for_image(image_input, prompt_input):
             cleaned_text = generated_text[len(prompt_input):].lstrip(" ,.:")
         else:
             cleaned_text = generated_text
-
         return cleaned_text.strip()
-
     except Exception as e:
         print(f"Error during generation: {e}")
-        import traceback
-        traceback.print_exc()
+        import traceback; traceback.print_exc()
         return f"An error occurred during text generation: {str(e)}"
 
 description = "Interactive demo for lusxvr/nanoVLM-222M."
-example_image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-
-iface = gr.Interface(
-    fn=generate_text_for_image,
-    inputs=[
-        gr.Image(type="pil", label="Upload Image"),
-        gr.Textbox(label="Your Prompt/Question")
-    ],
-    outputs=gr.Textbox(label="Generated Text", show_copy_button=True),
-    title="Interactive nanoVLM-222M Demo",
-    description=description,
-    examples=[
-        [example_image_url, "a photo of a"],
-        [example_image_url, "Describe the image in detail."],
-    ],
-    # cache_examples=True, # Keep commented out for now
-    allow_flagging="never"
-)
+# example_image_url = "http://images.cocodataset.org/val2017/000000039769.jpg" # Not used for now
+
+print("Defining Gradio interface...")
+try:
+    iface = gr.Interface(
+        fn=generate_text_for_image,
+        inputs=[
+            gr.Image(type="pil", label="Upload Image"),
+            gr.Textbox(label="Your Prompt/Question")
+        ],
+        outputs=gr.Textbox(label="Generated Text", show_copy_button=True),
+        title="Interactive nanoVLM-222M Demo",
+        description=description,
+        # examples=[ # <<<< REMOVED EXAMPLES
+        #     [example_image_url, "a photo of a"],
+        #     [example_image_url, "Describe the image in detail."],
+        # ],
+        allow_flagging="never"
+    )
+    print("Gradio interface defined.")
+except Exception as e:
+    print(f"Error defining Gradio interface: {e}")
+    import traceback; traceback.print_exc()
+    iface = None
+
 
 if __name__ == "__main__":
     if model is None or image_processor is None or tokenizer is None:
-        print("CRITICAL: Model or processor components failed to load.")
-    else:
+        print("CRITICAL: Model or processor components failed to load. Gradio might not work.")
+
+    if iface is not None:
         print("Launching Gradio interface...")
-        iface.launch(server_name="0.0.0.0", server_port=7860)
+        try:
+            iface.launch(server_name="0.0.0.0", server_port=7860)
+        except Exception as e:
+            print(f"Error launching Gradio interface: {e}")
+            import traceback; traceback.print_exc()
+            # This is where the ValueError: When localhost is not accessible... usually comes from
+            # if the underlying TypeError has already happened during iface setup.
+    else:
+        print("Gradio interface could not be defined due to earlier errors.")
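For reference, if the examples are reinstated later, one low-risk variant is to point `examples` at a file bundled with the Space and disable example caching, so nothing is fetched or pre-computed at build time. This is an editor's sketch, not part of this commit; `example.jpg` is a hypothetical local asset, while `examples` and `cache_examples` are standard `gr.Interface` parameters:

    # Sketch only: restore examples with a local image and caching disabled.
    # "example.jpg" is an assumed file in the Space repo, not added by this commit.
    iface = gr.Interface(
        fn=generate_text_for_image,
        inputs=[
            gr.Image(type="pil", label="Upload Image"),
            gr.Textbox(label="Your Prompt/Question")
        ],
        outputs=gr.Textbox(label="Generated Text", show_copy_button=True),
        title="Interactive nanoVLM-222M Demo",
        description=description,
        examples=[["example.jpg", "a photo of a"]],  # local file: no network fetch at startup
        cache_examples=False,  # skip running the model on examples while the Space builds
        allow_flagging="never"
    )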