vidhanm committed
Commit · 137b7f1
1 Parent(s): 876ed10

trying new approach

Files changed:
- Dockerfile  +36 -34
- app.py  +0 -162
- b01847e1e13f032d8a7309a460d5d2c5.jpg  +0 -0
- requirements.txt  +15 -76
Dockerfile
CHANGED
@@ -1,39 +1,41 @@
-#
-FROM python:3.10-slim
+FROM python:3.9-slim # Or your preferred Python version matching local
 
-# Set the working directory in the container
 WORKDIR /app
 
-#
-
-ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp
-ENV GRADIO_FLAGGING_DIR=/tmp/gradio_flags
-
-#
-RUN apt-get update && apt-get install -y \
-    git \
-    build-essential \
-    && rm -rf /var/lib/apt/lists/*
-
-# Clone the original nanoVLM repository for its model definition files
-# This makes the `models` directory from nanoVLM available under /app/nanoVLM
-RUN git clone https://github.com/huggingface/nanoVLM.git /app/nanoVLM
-
-# Create the cache and temp directories and make them writable
-RUN mkdir -p $HF_HOME $GRADIO_TEMP_DIR $GRADIO_FLAGGING_DIR && \
-    chmod -R 777 $HF_HOME $GRADIO_TEMP_DIR $GRADIO_FLAGGING_DIR
-
-# Copy the requirements file first
+# Install git
+RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
+
+# Copy requirements and install
 COPY requirements.txt requirements.txt
-
-
-
-
-#
-
-
-
-
-
-#
-
+RUN echo "DEBUG: Installing packages from requirements.txt" && \
+    pip install --no-cache-dir -r requirements.txt && \
+    echo "DEBUG: Finished installing packages."
+
+# Clone the nanoVLM repository which contains generate.py and the models directory
+# This also ensures the 'models' module is available for VisionLanguageModel import
+RUN echo "DEBUG: Cloning huggingface/nanoVLM repository..." && \
+    git clone https://github.com/huggingface/nanoVLM.git /app/nanoVLM && \
+    echo "DEBUG: nanoVLM repository cloned to /app/nanoVLM."
+
+# Add a test image to the Space.
+# You need to create a simple 'test_image.jpg' and add it to the root of your Space repo.
+COPY ./test_image.jpg /app/test_image.jpg
+RUN if [ ! -f /app/test_image.jpg ]; then echo "ERROR: test_image.jpg not found!"; exit 1; fi
+
+# Set Python path to include the nanoVLM models directory, so `from models...` works
+ENV PYTHONPATH="/app/nanoVLM:${PYTHONPATH}"
+ENV HF_HOME=/app/.cache/huggingface # Define a writable cache directory
+
+# Create cache directory with write permissions
+RUN mkdir -p $HF_HOME && chmod -R 777 $HF_HOME
+
+# The generate.py script is at /app/nanoVLM/generate.py
+# It takes arguments like --model_path, --image_path, --prompt, --device
+# We will run it directly. Its output will go to the Space's container logs.
+CMD ["python", "-u", "/app/nanoVLM/generate.py", \
+     "--model_path", "lusxvr/nanoVLM-222M", \
+     "--image_path", "/app/test_image.jpg", \
+     "--prompt", "describe this image in detail", \
+     "--device", "cpu", \
+     "--num_generations", "1", \
+     "--max_new_tokens", "50"]
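For reference, the container's CMD can be reproduced locally with a minimal sketch like the following (illustrative, not part of the commit; it assumes nanoVLM has been cloned into ./nanoVLM and a test_image.jpg sits in the working directory):

# local_smoke_test.py -- illustrative sketch, not part of the Space.
# Assumes: `git clone https://github.com/huggingface/nanoVLM.git nanoVLM` was run
# in the current directory and test_image.jpg sits next to it.
import subprocess

subprocess.run(
    [
        "python", "-u", "nanoVLM/generate.py",
        "--model_path", "lusxvr/nanoVLM-222M",
        "--image_path", "test_image.jpg",
        "--prompt", "describe this image in detail",
        "--device", "cpu",
        "--num_generations", "1",
        "--max_new_tokens", "50",
    ],
    check=True,  # fail loudly, mirroring the container exiting on error
)

If that run produces a caption locally, the same invocation inside the image should as well, with the output showing up in the Space's container logs.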
app.py
DELETED
@@ -1,162 +0,0 @@
import sys
import os
from typing import Optional
from PIL import Image as PILImage

# Add the cloned nanoVLM directory to Python's system path
NANOVLM_REPO_PATH = "/app/nanoVLM"
if NANOVLM_REPO_PATH not in sys.path:
    print(f"DEBUG: Adding {NANOVLM_REPO_PATH} to sys.path")
    sys.path.insert(0, NANOVLM_REPO_PATH)

import gradio as gr
import torch
from transformers import AutoProcessor  # Using AutoProcessor as in generate.py

VisionLanguageModel = None
try:
    print("DEBUG: Attempting to import VisionLanguageModel")
    from models.vision_language_model import VisionLanguageModel
    print("DEBUG: Successfully imported VisionLanguageModel.")
except ImportError as e:
    print(f"CRITICAL ERROR: Importing VisionLanguageModel: {e}")

# --- Device Setup ---
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"DEBUG: Using device: {device}")

# --- Configuration ---
# This will be used for both model and processor, as in generate.py
model_repo_id = "lusxvr/nanoVLM-222M"
print(f"DEBUG: Model Repository ID for model and processor: {model_repo_id}")

# --- Initialize ---
processor = None
model = None

if VisionLanguageModel:  # Only proceed if custom model class was imported
    try:
        # Load processor using AutoProcessor, like in generate.py
        print(f"DEBUG: Loading processor using AutoProcessor.from_pretrained('{model_repo_id}')")
        # Using trust_remote_code=True here as a precaution,
        # though ideally not needed if processor_config.json is complete.
        processor = AutoProcessor.from_pretrained(model_repo_id, trust_remote_code=True)
        print(f"DEBUG: AutoProcessor loaded: {type(processor)}")

        # Ensure tokenizer has pad_token set if it's GPT-2 based
        if hasattr(processor, 'tokenizer') and processor.tokenizer is not None:
            if getattr(processor.tokenizer, 'pad_token', None) is None:  # Check if pad_token attribute exists and is None
                processor.tokenizer.pad_token = processor.tokenizer.eos_token
                print(f"DEBUG: Set processor.tokenizer.pad_token to eos_token (ID: {processor.tokenizer.eos_token_id})")
        else:
            print("DEBUG: Processor does not have a 'tokenizer' attribute or it is None.")

        # Load model, like in generate.py
        print(f"DEBUG: Loading model VisionLanguageModel.from_pretrained('{model_repo_id}')")
        model = VisionLanguageModel.from_pretrained(model_repo_id).to(device)
        print(f"DEBUG: VisionLanguageModel loaded: {type(model)}")
        model.eval()
        print("DEBUG: Model set to eval() mode.")

    except Exception as e:
        print(f"CRITICAL ERROR loading model or processor with AutoProcessor: {e}")
        import traceback
        traceback.print_exc()
        processor = None; model = None
else:
    print("CRITICAL ERROR: VisionLanguageModel class not imported. Cannot load model.")


# --- Text Generation Function ---
def generate_text_for_image(image_input_pil: Optional[PILImage.Image], prompt_input_str: Optional[str]) -> str:
    print(f"DEBUG (generate_text_for_image): Received prompt: '{prompt_input_str}'")
    if model is None or processor is None:
        return "Error: Model or processor not loaded. Check logs."
    if image_input_pil is None: return "Please upload an image."
    if not prompt_input_str: return "Please provide a prompt."

    try:
        current_pil_image = image_input_pil
        if not isinstance(current_pil_image, PILImage.Image):
            current_pil_image = PILImage.fromarray(current_pil_image)
        if current_pil_image.mode != "RGB":
            current_pil_image = current_pil_image.convert("RGB")
        print(f"DEBUG: Image prepped - size: {current_pil_image.size}, mode: {current_pil_image.mode}")

        # Prepare inputs using the AutoProcessor, as in generate.py
        print("DEBUG: Processing inputs with AutoProcessor...")
        inputs = processor(
            text=[prompt_input_str], images=current_pil_image, return_tensors="pt"
        ).to(device)
        print(f"DEBUG: Inputs from AutoProcessor - keys: {inputs.keys()}")
        print(f"DEBUG: input_ids shape: {inputs['input_ids'].shape}, values: {inputs['input_ids']}")
        print(f"DEBUG: pixel_values shape: {inputs['pixel_values'].shape}")

        # Ensure attention_mask is present, default to ones if not (though AutoProcessor should provide it)
        attention_mask = inputs.get('attention_mask')
        if attention_mask is None:
            print("WARN: attention_mask not found in processor output, creating a default one of all 1s.")
            attention_mask = torch.ones_like(inputs['input_ids']).to(device)
        print(f"DEBUG: attention_mask shape: {attention_mask.shape}")

        print("DEBUG: Calling model.generate (aligning with nanoVLM's generate.py)...")
        # Signature for nanoVLM's generate: (self, input_ids, image, attention_mask, max_new_tokens, ...)
        # `image` parameter in generate() corresponds to `pixel_values` from processor output
        generated_ids_tensor = model.generate(
            inputs['input_ids'],      # 1st argument to model.generate: input_ids (text prompt)
            inputs['pixel_values'],   # 2nd argument to model.generate: image (pixel values)
            attention_mask,           # 3rd argument to model.generate: attention_mask
            max_new_tokens=30,        # Corresponds to 4th argument in model.generate
            temperature=0.7,          # Match generate.py default or your choice
            top_k=50,                 # Match generate.py default or your choice
            greedy=False              # Match generate.py default or your choice
            # top_p is also an option from generate.py's model.generate
        )
        print(f"DEBUG: Raw generated_ids: {generated_ids_tensor}")

        generated_text_list = processor.batch_decode(generated_ids_tensor, skip_special_tokens=True)
        print(f"DEBUG: Decoded text list: {generated_text_list}")
        generated_text_str = generated_text_list[0] if generated_text_list else ""

        cleaned_text_str = generated_text_str
        if prompt_input_str and generated_text_str.startswith(prompt_input_str):
            cleaned_text_str = generated_text_str[len(prompt_input_str):].lstrip(" ,.:")
        print(f"DEBUG: Final cleaned text: '{cleaned_text_str}'")
        return cleaned_text_str.strip()

    except Exception as e:
        print(f"CRITICAL ERROR during generation: {e}")
        import traceback
        traceback.print_exc()
        return f"Error during generation: {str(e)}"

# --- Gradio Interface ---
description_md = """
## Interactive nanoVLM-222M Demo (Mirroring generate.py)
Trying to replicate the working `generate.py` script from `huggingface/nanoVLM`.
Using AutoProcessor for inputs.
"""
iface = None
if processor and model:
    try:
        iface = gr.Interface(
            fn=generate_text_for_image,
            inputs=[gr.Image(type="pil", label="Upload Image"), gr.Textbox(label="Your Prompt")],
            outputs=gr.Textbox(label="Generated Text", show_copy_button=True),
            title="nanoVLM-222M Demo (generate.py Alignment)",
            description=description_md,
            allow_flagging="never"
        )
        print("DEBUG: Gradio interface defined.")
    except Exception as e:
        print(f"CRITICAL ERROR defining Gradio interface: {e}")
        import traceback; traceback.print_exc()

if __name__ == "__main__":
    if iface:
        print("DEBUG: Launching Gradio...")
        iface.launch(server_name="0.0.0.0", server_port=7860)
    else:
        print("CRITICAL ERROR: Gradio interface not defined or model/processor failed to load. Cannot launch.")
b01847e1e13f032d8a7309a460d5d2c5.jpg
ADDED
requirements.txt
CHANGED
@@ -1,78 +1,17 @@
-
-sentencepiece
-accelerate
-aiohappyeyeballs==2.6.1
-aiohttp==3.11.18
-aiosignal==1.3.2
-annotated-types==0.7.0
-attrs==25.3.0
-certifi==2025.4.26
-charset-normalizer==3.4.2
-click==8.2.1
-datasets==3.6.0
-dill==0.3.8
-docker-pycreds==0.4.0
-filelock==3.18.0
-frozenlist==1.6.0
-fsspec==2025.3.0
-gitdb==4.0.12
-GitPython==3.1.44
-hf-xet==1.1.2
-huggingface-hub==0.32.0
-idna==3.10
-Jinja2==3.1.6
-MarkupSafe==2.0
-mpmath==1.3.0
-multidict==6.4.4
-multiprocess==0.70.16
-networkx==3.4.2
-numpy
-nvidia-cublas-cu12==12.6.4.1
-nvidia-cuda-cupti-cu12==12.6.80
-nvidia-cuda-nvrtc-cu12==12.6.77
-nvidia-cuda-runtime-cu12==12.6.77
-nvidia-cudnn-cu12==9.5.1.17
-nvidia-cufft-cu12==11.3.0.4
-nvidia-cufile-cu12==1.11.1.6
-nvidia-curand-cu12==10.3.7.77
-nvidia-cusolver-cu12==11.7.1.2
-nvidia-cusparse-cu12==12.5.4.2
-nvidia-cusparselt-cu12==0.6.3
-nvidia-nccl-cu12==2.26.2
-nvidia-nvjitlink-cu12==12.6.85
-nvidia-nvtx-cu12==12.6.77
-packaging==25.0
-pandas==2.2.3
-pillow==10.4.0
-platformdirs==4.3.8
-propcache==0.3.1
-protobuf==6.31.0
-psutil==7.0.0
-pyarrow==20.0.0
-pydantic==2.11.5
-pydantic_core==2.33.2
-python-dateutil==2.9.0.post0
-pytz==2025.2
-PyYAML==6.0.2
-regex==2024.11.6
-requests==2.32.3
-safetensors==0.5.3
-sentry-sdk==2.29.1
-setproctitle==1.3.6
-setuptools==80.8.0
-six==1.17.0
-smmap==5.0.2
-sympy==1.14.0
-tokenizers==0.21.1
+# Try to match your local working environment for generate.py
 torch==2.7.0
-
-
+# If 'transformers==4.52.3' and 'tokenizers==0.21.1' are from custom/dev builds,
+# you MUST find a way to install those exact versions in Docker, or use the
+# closest standard PyPI versions and test generate.py locally with THOSE first.
+# For this example, I'm assuming they are pip-installable. If not, adjust.
 transformers==4.52.3
-
-
-
-
-
-
-
-
+tokenizers==0.21.1
+huggingface-hub==0.32.0
+safetensors==0.5.3
+Pillow==11.2.1 # generate.py uses PIL.Image
+# For protobuf, if your local 6.31.0 is confirmed, use it. Otherwise, a standard one:
+protobuf==4.25.3 # Or your confirmed local 6.31.0 if pip-installable
+accelerate # Good to include, though generate.py might not explicitly use it
+sentencepiece # Often a dependency for tokenizers
+
+# NO Gradio needed for this test