vidhanm committed
Commit a4ebbec · Parent(s): 984c158

Files changed:
- Dockerfile +5 -19
- app.py +141 -147
Dockerfile CHANGED
@@ -1,33 +1,19 @@
 FROM python:3.9-slim
 
 WORKDIR /app
 
-# Install git
 RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
 
-# Copy requirements and install
 COPY requirements.txt requirements.txt
-RUN pip install --no-cache-dir -r requirements.txt && \
-    echo "DEBUG: Finished installing packages."
-
-RUN echo "DEBUG: Cloning huggingface/nanoVLM repository..." && \
-    git clone https://github.com/huggingface/nanoVLM.git /app/nanoVLM && \
-    echo "DEBUG: nanoVLM repository cloned to /app/nanoVLM."
+RUN pip install --no-cache-dir -r requirements.txt
+
+RUN git clone https://github.com/huggingface/nanoVLM.git /app/nanoVLM
 
-ENV PYTHONPATH="/app/nanoVLM:${PYTHONPATH}"
+ENV PYTHONPATH="/app/nanoVLM:${PYTHONPATH}" # So generate.py can find 'from models...'
 ENV HF_HOME=/app/.cache/huggingface
 
-# Create cache directory
 RUN mkdir -p $HF_HOME && chmod -R 777 $HF_HOME
 
-COPY app.py app.py
+COPY app.py app.py # Your new Gradio app.py that calls generate.py
 
-# Expose the port Gradio runs on
 EXPOSE 7860
 
-# Command to run the Gradio application
 CMD ["python", "-u", "app.py"]
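The PYTHONPATH and HF_HOME lines above are what the new app.py relies on at runtime. A minimal sanity check along these lines can be run inside the built container; the paths come from the Dockerfile above, while the "models" package name is only an assumption based on the comment on the PYTHONPATH line.

# Sanity-check sketch for the container layout assumed by this commit.
# Paths are taken from the Dockerfile above; the "models" package name is an
# assumption based on the PYTHONPATH comment, not verified here.
import importlib.util
import os

NANOVLM_DIR = "/app/nanoVLM"
GENERATE_SCRIPT = os.path.join(NANOVLM_DIR, "generate.py")

print("generate.py present:", os.path.isfile(GENERATE_SCRIPT))
print("'models' importable:", importlib.util.find_spec("models") is not None)

hf_home = os.environ.get("HF_HOME", "")
print("HF_HOME writable:", bool(hf_home) and os.access(hf_home, os.W_OK))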
app.py CHANGED
@@ -1,179 +1,173 @@

Removed version (the old app.py loaded the model and processor in-process and called model.generate directly; lines blanked out in the rendered diff are marked with "# …"):

import sys
import os
from typing import Optional
from PIL import Image as PILImage

# Add the cloned nanoVLM directory to Python's system path
NANOVLM_REPO_PATH = "/app/nanoVLM"
if NANOVLM_REPO_PATH not in sys.path:
    print(f"DEBUG: Adding {NANOVLM_REPO_PATH} to sys.path")
    sys.path.insert(0, NANOVLM_REPO_PATH)

# … (further imports and model/processor configuration not recoverable from the rendered diff) …

try:
    # … (model and processor loading not recoverable from the rendered diff) …
except Exception as e:
    print(f"…")  # message not recoverable
    import traceback
    traceback.print_exc()
else:
    print("CRITICAL ERROR: VisionLanguageModel class not imported. Cannot load model.")


def generate_text_for_image(image_input_pil, prompt_input_str):  # definition line inferred from usage
    print(f"DEBUG (generate_text_for_image): Received prompt: '{prompt_input_str}'")
    if model is None or processor is None:
        print("ERROR (generate_text_for_image): Model or processor not loaded.")
        return "Error: Model or processor not loaded. Please check the application logs."
    if image_input_pil is None:
        print("WARN (generate_text_for_image): No image uploaded.")
        return "Please upload an image."
    if not prompt_input_str:
        print("WARN (generate_text_for_image): No prompt provided.")
        return "Please provide a prompt."

    try:
        # … (image unwrapping not recoverable from the rendered diff) …
        current_pil_image = PILImage.fromarray(current_pil_image)
        if current_pil_image.mode != "RGB":
            print(f"DEBUG (generate_text_for_image): Converting image from {current_pil_image.mode} to RGB.")
            current_pil_image = current_pil_image.convert("RGB")
        print(f"DEBUG (generate_text_for_image): Image prepped - size: {current_pil_image.size}, mode: {current_pil_image.mode}")

        # Prepare inputs using the AutoProcessor, as in generate.py
        print("DEBUG (generate_text_for_image): Processing inputs with AutoProcessor...")
        inputs = processor(
            text=[prompt_input_str], images=current_pil_image, return_tensors="pt"
        ).to(device)
        print(f"DEBUG (generate_text_for_image): Inputs from AutoProcessor - keys: {inputs.keys()}")
        print(f"DEBUG (generate_text_for_image): input_ids shape: {inputs['input_ids'].shape}, values: {inputs['input_ids']}")
        print(f"DEBUG (generate_text_for_image): pixel_values shape: {inputs['pixel_values'].shape}")

        # … (attention mask preparation not recoverable from the rendered diff) …

        print("DEBUG (generate_text_for_image): Calling model.generate...")
        # Signature for nanoVLM's generate: (self, input_ids, image, attention_mask, max_new_tokens, ...)
        generated_ids_tensor = model.generate(
            inputs['input_ids'],
            inputs['pixel_values'],  # This is the 'image' argument for the model's generate method
            attention_mask,
            max_new_tokens=50,  # Consistent with successful generate.py test
            temperature=0.7,    # From generate.py defaults (or adjust as preferred)
            top_k=50,           # From generate.py defaults (or adjust as preferred)
            # greedy=False is default in nanoVLM's generate
        )
        print(f"DEBUG (generate_text_for_image): Raw generated_ids: {generated_ids_tensor}")

        # Use processor.batch_decode, as in generate.py
        generated_text_list = processor.batch_decode(generated_ids_tensor, skip_special_tokens=True)
        print(f"DEBUG (generate_text_for_image): Decoded text list: {generated_text_list}")
        generated_text_str = generated_text_list[0] if generated_text_list else ""

        # Optional: Clean up prompt if echoed
        cleaned_text_str = generated_text_str
        if prompt_input_str and generated_text_str.startswith(prompt_input_str):
            cleaned_text_str = generated_text_str[len(prompt_input_str):].lstrip(" ,.:")
        print(f"DEBUG (generate_text_for_image): Final cleaned text: '{cleaned_text_str}'")
        return cleaned_text_str.strip()

    except Exception as e:
        print(f"…")  # message not recoverable
        import traceback
        # … (remainder of the handler not recoverable from the rendered diff) …

description_md = """
## nanoVLM-222M Interactive Demo
Upload an image and type a prompt
"""
iface = None
# Only define the interface if the model and processor loaded successfully
if VisionLanguageModel and model and processor:
    try:
        print("DEBUG: Defining Gradio interface...")
        iface = gr.Interface(
            fn=generate_text_for_image,
            inputs=[
                gr.Image(type="pil", label="Upload Image"),
                gr.Textbox(label="Your Prompt / Question", info="e.g., 'describe this image in detail'")
            ],
            outputs=gr.Textbox(label="Generated Text", show_copy_button=True),
            title="nanoVLM-222M Demo",
            description=description_md,
            allow_flagging="never"  # No examples or caching for now to keep it simple
        )
        print("DEBUG: Gradio interface defined successfully.")
    except Exception as e:
        print(f"CRITICAL ERROR defining Gradio interface: {e}")
        import traceback; traceback.print_exc()
else:
    print("WARN: Model and/or processor did not load. Gradio interface will not be created.")


# --- Launch Gradio App ---
if __name__ == "__main__":
    print("DEBUG: Entered __main__ block for Gradio launch.")
    if iface is not None:
        print("DEBUG: Attempting to launch Gradio interface...")
        try:

@@ -183,4 +177,4 @@ if __name__ == "__main__":
            # … (launch call and exception clause fall outside the hunk) …
            print(f"CRITICAL ERROR launching Gradio interface: {e}")
            import traceback; traceback.print_exc()
    else:
        print("CRITICAL ERROR: Gradio interface (iface) is None or not defined
Added version (the new app.py wraps nanoVLM's generate.py: it writes the uploaded image to a temporary file and calls the script as a subprocess):

import sys
import os
import subprocess  # For calling generate.py
import tempfile    # For handling temporary image files
from typing import Optional
from PIL import Image as PILImage
import gradio as gr

# Add the cloned nanoVLM directory to Python's system path (generate.py might need this too if it imports from 'models')
NANOVLM_REPO_PATH = "/app/nanoVLM"
if NANOVLM_REPO_PATH not in sys.path:
    print(f"DEBUG: Adding {NANOVLM_REPO_PATH} to sys.path")
    sys.path.insert(0, NANOVLM_REPO_PATH)

print(f"DEBUG: Python sys.path: {sys.path}")

# Path to the generate.py script within our Docker container
GENERATE_SCRIPT_PATH = "/app/nanoVLM/generate.py"
MODEL_REPO_ID = "lusxvr/nanoVLM-222M"  # Model ID for generate.py

print(f"DEBUG: Using generate.py script at: {GENERATE_SCRIPT_PATH}")
print(f"DEBUG: Using model repo ID: {MODEL_REPO_ID}")


def call_generate_script(image_path: str, prompt_text: str) -> str:
    """
    Calls the generate.py script as a subprocess and returns its output.
    """
    print(f"DEBUG (call_generate_script): Calling with image_path='{image_path}', prompt='{prompt_text}'")

    # Arguments for generate.py (ensure they match its expected format)
    # From previous success: --hf_model, --image, --prompt, --generations, --max_new_tokens
    cmd_args = [
        "python", "-u", GENERATE_SCRIPT_PATH,
        "--hf_model", MODEL_REPO_ID,
        "--image", image_path,
        "--prompt", prompt_text,
        "--generations", "1",     # Get one generation for the UI
        "--max_new_tokens", "70"  # Adjust as needed
        # --device is handled by generate.py internally
    ]

    print(f"DEBUG (call_generate_script): Executing command: {' '.join(cmd_args)}")

    try:
        # Execute the command
        # capture_output=True, text=True are for Python 3.7+
        # For Python 3.9 (as in the Dockerfile base), this is fine.
        process = subprocess.run(
            cmd_args,
            capture_output=True,
            text=True,
            check=True,   # Raise an exception for non-zero exit codes
            timeout=120   # Add a timeout (e.g., 2 minutes)
        )

        stdout = process.stdout
        stderr = process.stderr

        print(f"DEBUG (call_generate_script): generate.py STDOUT:\n{stdout}")
        if stderr:
            print(f"DEBUG (call_generate_script): generate.py STDERR:\n{stderr}")

        # --- Parse the output from generate.py ---
        # The generate.py script prints:
        #   Outputs:
        #   >> Generation 1: Actual generated text here.
        # We need to extract "Actual generated text here."

        output_lines = stdout.splitlines()
        generated_text = "Error: Could not parse output from generate.py script."  # Default

        parsing_output = False
        for line in output_lines:
            if "Outputs:" in line:
                parsing_output = True
                continue
            if parsing_output and line.strip().startswith(">> Generation 1:"):
                # Extract text after ">> Generation 1: "
                generated_text = line.split(">> Generation 1: ", 1)[-1].strip()
                break  # Found the first generation

        print(f"DEBUG (call_generate_script): Parsed generated text: '{generated_text}'")
        return generated_text

    except subprocess.CalledProcessError as e:
        print(f"ERROR (call_generate_script): generate.py exited with error code {e.returncode}")
        print(f"ERROR (call_generate_script): STDOUT: {e.stdout}")
        print(f"ERROR (call_generate_script): STDERR: {e.stderr}")
        return f"Error executing generation script (Code {e.returncode}). Check logs."
    except subprocess.TimeoutExpired:
        print("ERROR (call_generate_script): generate.py timed out.")
        return "Error: Generation script timed out."
    except Exception as e:
        print(f"ERROR (call_generate_script): An unexpected error occurred: {e}")
        import traceback
        traceback.print_exc()
        return f"An unexpected error occurred while calling generation script: {str(e)}"

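call_generate_script recovers the answer by scanning generate.py's stdout for an "Outputs:" marker followed by ">> Generation 1:". The standalone sketch below exercises that parsing logic on a made-up stdout sample; the sample wording is an assumption based on the comments in the function above, not captured output.

# Standalone sketch of the stdout parsing used in call_generate_script.
# The sample output below is invented for illustration; the real generate.py
# wording may differ.
sample_stdout = """Loading weights for lusxvr/nanoVLM-222M ...
Outputs:
>> Generation 1: A brown dog sitting on a wooden bench in a park.
"""

def parse_first_generation(stdout: str) -> str:
    parsing_output = False
    for line in stdout.splitlines():
        if "Outputs:" in line:
            parsing_output = True
            continue
        if parsing_output and line.strip().startswith(">> Generation 1:"):
            return line.split(">> Generation 1: ", 1)[-1].strip()
    return "Error: Could not parse output from generate.py script."

print(parse_first_generation(sample_stdout))
# Expected: A brown dog sitting on a wooden bench in a park.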
def gradio_interface_fn(image_input_pil: Optional[PILImage.Image], prompt_input_str: Optional[str]) -> str:
    print(f"DEBUG (gradio_interface_fn): Received prompt: '{prompt_input_str}'")
    if image_input_pil is None:
        return "Please upload an image."
    if not prompt_input_str:
        return "Please provide a prompt."

    # Save the uploaded PIL image to a temporary file.
    # tempfile.NamedTemporaryFile creates a file that is deleted when closed.
    # We need to ensure it has a .jpg extension for some image libraries if they are picky.
    # The 'delete=False' allows us to close it, pass its name, and then delete it manually.
    try:
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_image_file:
            image_input_pil.save(tmp_image_file, format="JPEG")
            tmp_image_path = tmp_image_file.name

        print(f"DEBUG (gradio_interface_fn): Temporary image saved to: {tmp_image_path}")

        # Call the generate.py script with the path to the temporary image
        result_text = call_generate_script(tmp_image_path, prompt_input_str)

        return result_text

    except Exception as e:
        print(f"ERROR (gradio_interface_fn): Error processing image or calling script: {e}")
        import traceback; traceback.print_exc()
        return f"An error occurred: {str(e)}"
    finally:
        # Clean up the temporary image file
        if 'tmp_image_path' in locals() and os.path.exists(tmp_image_path):
            try:
                os.remove(tmp_image_path)
                print(f"DEBUG (gradio_interface_fn): Temporary image {tmp_image_path} removed.")
            except Exception as e_remove:
                print(f"WARN (gradio_interface_fn): Could not remove temporary image {tmp_image_path}: {e_remove}")

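Because gradio_interface_fn only needs a PIL image and a prompt string, it can be exercised without starting the web UI. A hypothetical local check is sketched below; the image path is a placeholder, not a file shipped with the Space.

# Hypothetical smoke test for gradio_interface_fn; "sample.jpg" is a
# placeholder path and must exist locally for this to run.
from PIL import Image as PILImage

img = PILImage.open("sample.jpg").convert("RGB")
print(gradio_interface_fn(img, "describe this image in detail"))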
# --- Gradio Interface Definition ---
description_md = """
## nanoVLM-222M Interactive Demo (via generate.py)
Upload an image and type a prompt. This interface calls the `generate.py` script from
`huggingface/nanoVLM` under the hood to perform inference.
"""

print("DEBUG: Defining Gradio interface...")
iface = None
try:
    iface = gr.Interface(
        fn=gradio_interface_fn,
        inputs=[
            gr.Image(type="pil", label="Upload Image"),
            gr.Textbox(label="Your Prompt / Question", info="e.g., 'describe this image in detail'")
        ],
        outputs=gr.Textbox(label="Generated Text", show_copy_button=True),
        title="nanoVLM-222M Demo (via Script)",
        description=description_md,
        allow_flagging="never"
    )
    print("DEBUG: Gradio interface defined successfully.")
except Exception as e:
    print(f"CRITICAL ERROR defining Gradio interface: {e}")
    import traceback; traceback.print_exc()

# --- Launch Gradio App ---
if __name__ == "__main__":
    print("DEBUG: Entered __main__ block for Gradio launch.")
    if not os.path.exists(GENERATE_SCRIPT_PATH):
        print(f"CRITICAL ERROR: The script {GENERATE_SCRIPT_PATH} was not found. Cannot launch app.")
        iface = None  # Prevent launch

    if iface is not None:
        print("DEBUG: Attempting to launch Gradio interface...")
        try:
            # … (launch call and exception clause fall outside the diff hunks) …
            print(f"CRITICAL ERROR launching Gradio interface: {e}")
            import traceback; traceback.print_exc()
    else:
        print("CRITICAL ERROR: Gradio interface (iface) is None or not defined. Cannot launch.")
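The launch call itself sits between the hunks shown above, so it does not appear in this diff. A typical launch for a Space whose Dockerfile exposes port 7860 is sketched below; the server_name and server_port values are assumptions, not taken from the commit.

# Hedged sketch only; the actual launch call is not visible in the diff hunks.
# Binding to 0.0.0.0:7860 matches the EXPOSE 7860 line in the Dockerfile.
if iface is not None:
    iface.launch(server_name="0.0.0.0", server_port=7860)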