vidhanm committed
Commit a4ebbec · 1 Parent(s): 984c158
Files changed (2)
  1. Dockerfile +5 -19
  2. app.py +141 -147
Dockerfile CHANGED
@@ -1,33 +1,19 @@
-FROM python:3.9-slim
 
 WORKDIR /app
 
-# Install git
 RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
 
-# Copy requirements and install
 COPY requirements.txt requirements.txt
-RUN echo "DEBUG: Installing packages from requirements.txt for Gradio app" && \
-    pip install --no-cache-dir -r requirements.txt && \
-    echo "DEBUG: Finished installing packages."
 
-# Clone the nanoVLM repository
-RUN echo "DEBUG: Cloning huggingface/nanoVLM repository..." && \
-    git clone https://github.com/huggingface/nanoVLM.git /app/nanoVLM && \
-    echo "DEBUG: nanoVLM repository cloned to /app/nanoVLM."
 
-# Set Python path
-ENV PYTHONPATH="/app/nanoVLM:${PYTHONPATH}"
 ENV HF_HOME=/app/.cache/huggingface
-
-# Create cache directory
 RUN mkdir -p $HF_HOME && chmod -R 777 $HF_HOME
 
-# Copy your Gradio application
-COPY app.py app.py
 
-# Expose the port Gradio runs on
 EXPOSE 7860
-
-# Command to run the Gradio application
 CMD ["python", "-u", "app.py"]

+FROM python:3.9-slim
 
 WORKDIR /app
 
 RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
 
 COPY requirements.txt requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
 
+RUN git clone https://github.com/huggingface/nanoVLM.git /app/nanoVLM
 
+# So generate.py can find 'from models...'
+ENV PYTHONPATH="/app/nanoVLM:${PYTHONPATH}"
 ENV HF_HOME=/app/.cache/huggingface
 RUN mkdir -p $HF_HOME && chmod -R 777 $HF_HOME
 
+# Your new Gradio app.py that calls generate.py
+COPY app.py app.py
 
 EXPOSE 7860
 
 CMD ["python", "-u", "app.py"]
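Side note on the PYTHONPATH line above: it exists so that generate.py's `from models...` imports resolve even though the script lives inside the cloned repo. A minimal sketch of the same effect in plain Python, using the path and module names from this commit (only meaningful inside the built container):

# Sketch: putting the cloned repo root on the module search path makes the
# repo's 'models' package importable, which is what ENV PYTHONPATH achieves
# for every process started in the container.
import sys

sys.path.insert(0, "/app/nanoVLM")  # same effect as the ENV PYTHONPATH line
from models.vision_language_model import VisionLanguageModel  # noqa: E402

print(VisionLanguageModel)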
app.py CHANGED
@@ -1,179 +1,173 @@
 import sys
 import os
 from typing import Optional
 from PIL import Image as PILImage
 
-# Add the cloned nanoVLM directory to Python's system path
-NANOVLM_REPO_PATH = "/app/nanoVLM"  # This path is where your Dockerfile clones huggingface/nanoVLM
 if NANOVLM_REPO_PATH not in sys.path:
     print(f"DEBUG: Adding {NANOVLM_REPO_PATH} to sys.path")
     sys.path.insert(0, NANOVLM_REPO_PATH)
 
-import gradio as gr
-import torch
-from transformers import AutoProcessor  # Using AutoProcessor, as in the successful generate.py
-
-# Import the custom VisionLanguageModel class
-VisionLanguageModel = None
-try:
-    print("DEBUG: Attempting to import VisionLanguageModel from models.vision_language_model")
-    from models.vision_language_model import VisionLanguageModel
-    print("DEBUG: Successfully imported VisionLanguageModel.")
-except ImportError as e:
-    print(f"CRITICAL ERROR: Importing VisionLanguageModel failed: {e}")
-except Exception as e:
-    print(f"CRITICAL ERROR: An unexpected error occurred during VisionLanguageModel import: {e}")
-
-# --- Device Setup ---
-device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"DEBUG: Using device: {device}")
-
-# --- Configuration ---
-model_repo_id = "lusxvr/nanoVLM-222M"  # Used for both processor and model weights
-print(f"DEBUG: Model Repository ID for processor and model: {model_repo_id}")
-
-# --- Initialize ---
-processor = None
-model = None
-
-if VisionLanguageModel:  # Only proceed if the custom model class was imported
     try:
-        # Load the processor using AutoProcessor, mirroring generate.py.
-        print(f"DEBUG: Loading processor using AutoProcessor.from_pretrained('{model_repo_id}')")
-        # generate.py doesn't explicitly use trust_remote_code=True for the processor;
-        # it may be implicitly active in your local transformers, or unnecessary if processor_config is clear.
-        # Try without it first, and add it back if "Unrecognized model" reappears for the processor.
-        processor = AutoProcessor.from_pretrained(model_repo_id)
-        print(f"DEBUG: AutoProcessor loaded: {type(processor)}")
-
-        # Ensure the tokenizer has pad_token set if it's GPT-2 based (AutoProcessor should expose a tokenizer component)
-        if hasattr(processor, 'tokenizer') and processor.tokenizer is not None:
-            current_tokenizer = processor.tokenizer
-            if getattr(current_tokenizer, 'pad_token', None) is None and hasattr(current_tokenizer, 'eos_token'):
-                current_tokenizer.pad_token = current_tokenizer.eos_token
-                print(f"DEBUG: Set processor.tokenizer.pad_token to eos_token (ID: {current_tokenizer.eos_token_id})")
-        else:
-            print("WARN: Processor does not have a 'tokenizer' attribute or it is None. Cannot set pad_token.")
-
-        # Load the model using VisionLanguageModel.from_pretrained, mirroring generate.py.
-        print(f"DEBUG: Loading model VisionLanguageModel.from_pretrained('{model_repo_id}')")
-        # The custom VLM.from_pretrained doesn't take trust_remote_code.
-        model = VisionLanguageModel.from_pretrained(model_repo_id).to(device)
-        print(f"DEBUG: VisionLanguageModel loaded: {type(model)}")
-        model.eval()
-        print("DEBUG: Model set to eval() mode.")
-
     except Exception as e:
-        print(f"CRITICAL ERROR loading model or processor: {e}")
         import traceback
         traceback.print_exc()
-        processor = None; model = None  # Ensure they are None if loading fails
-else:
-    print("CRITICAL ERROR: VisionLanguageModel class not imported. Cannot load model.")
 
 
-# --- Text Generation Function ---
-def generate_text_for_image(image_input_pil: Optional[PILImage.Image], prompt_input_str: Optional[str]) -> str:
-    print(f"DEBUG (generate_text_for_image): Received prompt: '{prompt_input_str}'")
-    if model is None or processor is None:
-        print("ERROR (generate_text_for_image): Model or processor not loaded.")
-        return "Error: Model or processor not loaded. Please check the application logs."
     if image_input_pil is None:
-        print("WARN (generate_text_for_image): No image uploaded.")
         return "Please upload an image."
-    if not prompt_input_str:  # Check for an empty or None prompt
-        print("WARN (generate_text_for_image): No prompt provided.")
         return "Please provide a prompt."
 
     try:
-        current_pil_image = image_input_pil
-        if not isinstance(current_pil_image, PILImage.Image):  # Should be PIL from Gradio's type="pil"
-            print(f"WARN (generate_text_for_image): Input image not PIL, type: {type(current_pil_image)}. Converting.")
-            current_pil_image = PILImage.fromarray(current_pil_image)
-        if current_pil_image.mode != "RGB":
-            print(f"DEBUG (generate_text_for_image): Converting image from {current_pil_image.mode} to RGB.")
-            current_pil_image = current_pil_image.convert("RGB")
-        print(f"DEBUG (generate_text_for_image): Image prepped - size: {current_pil_image.size}, mode: {current_pil_image.mode}")
-
-        # Prepare inputs using the AutoProcessor, as in generate.py.
-        print("DEBUG (generate_text_for_image): Processing inputs with AutoProcessor...")
-        inputs = processor(
-            text=[prompt_input_str], images=current_pil_image, return_tensors="pt"
-        ).to(device)
-        print(f"DEBUG (generate_text_for_image): Inputs from AutoProcessor - keys: {inputs.keys()}")
-        print(f"DEBUG (generate_text_for_image): input_ids shape: {inputs['input_ids'].shape}, values: {inputs['input_ids']}")
-        print(f"DEBUG (generate_text_for_image): pixel_values shape: {inputs['pixel_values'].shape}")
 
-        attention_mask = inputs.get('attention_mask')
-        if attention_mask is None:  # Should be provided by AutoProcessor
-            print("WARN (generate_text_for_image): attention_mask not in processor output. Creating default.")
-            attention_mask = torch.ones_like(inputs['input_ids']).to(device)
-        print(f"DEBUG (generate_text_for_image): attention_mask shape: {attention_mask.shape}")
-
-        print("DEBUG (generate_text_for_image): Calling model.generate...")
-        # Signature for nanoVLM's generate: (self, input_ids, image, attention_mask, max_new_tokens, ...)
-        generated_ids_tensor = model.generate(
-            inputs['input_ids'],
-            inputs['pixel_values'],  # This is the 'image' argument for the model's generate method
-            attention_mask,
-            max_new_tokens=50,  # Consistent with the successful generate.py test
-            temperature=0.7,    # From generate.py defaults (or adjust as preferred)
-            top_k=50,           # From generate.py defaults (or adjust as preferred)
-            # greedy=False is the default in nanoVLM's generate
-        )
-        print(f"DEBUG (generate_text_for_image): Raw generated_ids: {generated_ids_tensor}")
-
-        # Use processor.batch_decode, as in generate.py.
-        generated_text_list = processor.batch_decode(generated_ids_tensor, skip_special_tokens=True)
-        print(f"DEBUG (generate_text_for_image): Decoded text list: {generated_text_list}")
-        generated_text_str = generated_text_list[0] if generated_text_list else ""
 
-        # Optional: clean up the prompt if it is echoed back
-        cleaned_text_str = generated_text_str
-        if prompt_input_str and generated_text_str.startswith(prompt_input_str):
-            cleaned_text_str = generated_text_str[len(prompt_input_str):].lstrip(" ,.:")
-        print(f"DEBUG (generate_text_for_image): Final cleaned text: '{cleaned_text_str}'")
-        return cleaned_text_str.strip()
-
     except Exception as e:
-        print(f"CRITICAL ERROR during generation: {e}")
-        import traceback
-        traceback.print_exc()
-        return f"Error during generation: {str(e)}. Check logs."
-
-# --- Gradio Interface ---
 description_md = """
-## nanoVLM-222M Interactive Demo
-Upload an image and type a prompt to get a description or answer from the model.
-This Space uses the `lusxvr/nanoVLM-222M` model weights with the `huggingface/nanoVLM` model code.
 """
-iface = None
-# Only define the interface if the model and processor loaded successfully
-if VisionLanguageModel and model and processor:
-    try:
-        print("DEBUG: Defining Gradio interface...")
-        iface = gr.Interface(
-            fn=generate_text_for_image,
-            inputs=[
-                gr.Image(type="pil", label="Upload Image"),
-                gr.Textbox(label="Your Prompt / Question", info="e.g., 'describe this image in detail'")
-            ],
-            outputs=gr.Textbox(label="Generated Text", show_copy_button=True),
-            title="nanoVLM-222M Demo",
-            description=description_md,
-            allow_flagging="never"  # No examples or caching for now, to keep it simple
-        )
-        print("DEBUG: Gradio interface defined successfully.")
-    except Exception as e:
-        print(f"CRITICAL ERROR defining Gradio interface: {e}")
-        import traceback; traceback.print_exc()
-else:
-    print("WARN: Model and/or processor did not load. Gradio interface will not be created.")
 
 
 # --- Launch Gradio App ---
 if __name__ == "__main__":
     print("DEBUG: Entered __main__ block for Gradio launch.")
     if iface is not None:
         print("DEBUG: Attempting to launch Gradio interface...")
         try:
 
 import sys
 import os
+import subprocess  # For calling generate.py
+import tempfile    # For handling temporary image files
 from typing import Optional
 from PIL import Image as PILImage
+import gradio as gr
 
+# Add the cloned nanoVLM directory to Python's system path
+# (generate.py may need this too if it imports from 'models')
+NANOVLM_REPO_PATH = "/app/nanoVLM"
 if NANOVLM_REPO_PATH not in sys.path:
     print(f"DEBUG: Adding {NANOVLM_REPO_PATH} to sys.path")
     sys.path.insert(0, NANOVLM_REPO_PATH)
 
+print(f"DEBUG: Python sys.path: {sys.path}")
+
+# Path to the generate.py script within our Docker container
+GENERATE_SCRIPT_PATH = "/app/nanoVLM/generate.py"
+MODEL_REPO_ID = "lusxvr/nanoVLM-222M"  # Model ID for generate.py
+
+print(f"DEBUG: Using generate.py script at: {GENERATE_SCRIPT_PATH}")
+print(f"DEBUG: Using model repo ID: {MODEL_REPO_ID}")
+
+
+def call_generate_script(image_path: str, prompt_text: str) -> str:
+    """Call the generate.py script as a subprocess and return its parsed output."""
+    print(f"DEBUG (call_generate_script): Calling with image_path='{image_path}', prompt='{prompt_text}'")
+
+    # Arguments for generate.py (ensure they match its expected format).
+    # From the previously successful run: --hf_model, --image, --prompt, --generations, --max_new_tokens
+    cmd_args = [
+        "python", "-u", GENERATE_SCRIPT_PATH,
+        "--hf_model", MODEL_REPO_ID,
+        "--image", image_path,
+        "--prompt", prompt_text,
+        "--generations", "1",      # Get one generation for the UI
+        "--max_new_tokens", "70",  # Adjust as needed
+        # --device is handled by generate.py internally
+    ]
+
+    print(f"DEBUG (call_generate_script): Executing command: {' '.join(cmd_args)}")
+
     try:
+        # Execute the command. capture_output=True and text=True require
+        # Python 3.7+, which the python:3.9-slim base image satisfies.
+        process = subprocess.run(
+            cmd_args,
+            capture_output=True,
+            text=True,
+            check=True,   # Raise an exception for non-zero exit codes
+            timeout=120,  # Add a timeout (e.g., 2 minutes)
+        )
+
+        stdout = process.stdout
+        stderr = process.stderr
+
+        print(f"DEBUG (call_generate_script): generate.py STDOUT:\n{stdout}")
+        if stderr:
+            print(f"DEBUG (call_generate_script): generate.py STDERR:\n{stderr}")
+
+        # --- Parse the output from generate.py ---
+        # The generate.py script prints:
+        #     Outputs:
+        #       >> Generation 1: Actual generated text here.
+        # We need to extract "Actual generated text here."
+        output_lines = stdout.splitlines()
+        generated_text = "Error: Could not parse output from generate.py script."  # Default
+
+        parsing_output = False
+        for line in output_lines:
+            if "Outputs:" in line:
+                parsing_output = True
+                continue
+            if parsing_output and line.strip().startswith(">> Generation 1:"):
+                # Extract the text after ">> Generation 1: "
+                generated_text = line.split(">> Generation 1: ", 1)[-1].strip()
+                break  # Found the first generation
+
+        print(f"DEBUG (call_generate_script): Parsed generated text: '{generated_text}'")
+        return generated_text
+
+    except subprocess.CalledProcessError as e:
+        print(f"ERROR (call_generate_script): generate.py exited with error code {e.returncode}")
+        print(f"ERROR (call_generate_script): STDOUT: {e.stdout}")
+        print(f"ERROR (call_generate_script): STDERR: {e.stderr}")
+        return f"Error executing generation script (Code {e.returncode}). Check logs."
+    except subprocess.TimeoutExpired:
+        print("ERROR (call_generate_script): generate.py timed out.")
+        return "Error: Generation script timed out."
     except Exception as e:
+        print(f"ERROR (call_generate_script): An unexpected error occurred: {e}")
         import traceback
         traceback.print_exc()
+        return f"An unexpected error occurred while calling generation script: {str(e)}"
 
 
+def gradio_interface_fn(image_input_pil: Optional[PILImage.Image], prompt_input_str: Optional[str]) -> str:
+    print(f"DEBUG (gradio_interface_fn): Received prompt: '{prompt_input_str}'")
     if image_input_pil is None:
         return "Please upload an image."
+    if not prompt_input_str:
         return "Please provide a prompt."
 
+    # Save the uploaded PIL image to a temporary file.
+    # tempfile.NamedTemporaryFile creates a file that is deleted when closed.
+    # We give it a .jpg suffix in case any image library is picky about extensions,
+    # and 'delete=False' lets us close it, pass its name along, and delete it manually.
     try:
+        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_image_file:
+            image_input_pil.save(tmp_image_file, format="JPEG")
+            tmp_image_path = tmp_image_file.name
 
+        print(f"DEBUG (gradio_interface_fn): Temporary image saved to: {tmp_image_path}")
+
+        # Call the generate.py script with the path to the temporary image
+        result_text = call_generate_script(tmp_image_path, prompt_input_str)
+
+        return result_text
     except Exception as e:
+        print(f"ERROR (gradio_interface_fn): Error processing image or calling script: {e}")
+        import traceback; traceback.print_exc()
+        return f"An error occurred: {str(e)}"
+    finally:
+        # Clean up the temporary image file
+        if 'tmp_image_path' in locals() and os.path.exists(tmp_image_path):
+            try:
+                os.remove(tmp_image_path)
+                print(f"DEBUG (gradio_interface_fn): Temporary image {tmp_image_path} removed.")
+            except Exception as e_remove:
+                print(f"WARN (gradio_interface_fn): Could not remove temporary image {tmp_image_path}: {e_remove}")
 
+# --- Gradio Interface Definition ---
 description_md = """
+## nanoVLM-222M Interactive Demo (via generate.py)
+Upload an image and type a prompt. This interface calls the `generate.py` script from
+`huggingface/nanoVLM` under the hood to perform inference.
 """
 
+print("DEBUG: Defining Gradio interface...")
+iface = None
+try:
+    iface = gr.Interface(
+        fn=gradio_interface_fn,
+        inputs=[
+            gr.Image(type="pil", label="Upload Image"),
+            gr.Textbox(label="Your Prompt / Question", info="e.g., 'describe this image in detail'")
+        ],
+        outputs=gr.Textbox(label="Generated Text", show_copy_button=True),
+        title="nanoVLM-222M Demo (via Script)",
+        description=description_md,
+        allow_flagging="never"
+    )
+    print("DEBUG: Gradio interface defined successfully.")
+except Exception as e:
+    print(f"CRITICAL ERROR defining Gradio interface: {e}")
+    import traceback; traceback.print_exc()
 
 # --- Launch Gradio App ---
 if __name__ == "__main__":
     print("DEBUG: Entered __main__ block for Gradio launch.")
+    if not os.path.exists(GENERATE_SCRIPT_PATH):
+        print(f"CRITICAL ERROR: The script {GENERATE_SCRIPT_PATH} was not found. Cannot launch app.")
+        iface = None  # Prevent launch
+
     if iface is not None:
         print("DEBUG: Attempting to launch Gradio interface...")
         try:
@@ -183,4 +177,4 @@ if __name__ == "__main__":
             print(f"CRITICAL ERROR launching Gradio interface: {e}")
             import traceback; traceback.print_exc()
         else:
-            print("CRITICAL ERROR: Gradio interface (iface) is None or not defined due to loading errors. Cannot launch.")
+            print("CRITICAL ERROR: Gradio interface (iface) is None or not defined. Cannot launch.")