vidhanm committed
Commit 137b7f1 · 1 Parent(s): 876ed10

trying new approach

Files changed (4)
  1. Dockerfile +36 -34
  2. app.py +0 -162
  3. b01847e1e13f032d8a7309a460d5d2c5.jpg +0 -0
  4. requirements.txt +15 -76
Dockerfile CHANGED
@@ -1,39 +1,41 @@
- # Use a slim Python base image.
- FROM python:3.10-slim

- # Set the working directory in the container
  WORKDIR /app

- # Set Hugging Face cache directory and Gradio temp/flagging dir
- ENV HF_HOME=/app/.cache/huggingface
- ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp
- ENV GRADIO_FLAGGING_DIR=/tmp/gradio_flags

- # Install git and build-essential
- RUN apt-get update && apt-get install -y \
-     git \
-     build-essential \
-     && rm -rf /var/lib/apt/lists/*
-
- # Clone the original nanoVLM repository for its model definition files
- # This makes the `models` directory from nanoVLM available under /app/nanoVLM
- RUN git clone https://github.com/huggingface/nanoVLM.git /app/nanoVLM
-
- # Create the cache and temp directories and make them writable
- RUN mkdir -p $HF_HOME $GRADIO_TEMP_DIR $GRADIO_FLAGGING_DIR && \
-     chmod -R 777 $HF_HOME $GRADIO_TEMP_DIR $GRADIO_FLAGGING_DIR
-
- # Copy the requirements file first
  COPY requirements.txt requirements.txt
-
- # Install Python dependencies
- RUN pip install --no-cache-dir --prefer-binary -r requirements.txt
-
- # Copy the application code into the container
- COPY app.py app.py
-
- # Expose the port Gradio will run on
- EXPOSE 7860
-
- # Set the default command to run the Gradio application
- CMD ["python", "-u", "app.py"]
+ # Use your preferred Python version, matching the local environment
+ FROM python:3.9-slim

  WORKDIR /app

+ # Install git
+ RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements and install
  COPY requirements.txt requirements.txt
+ RUN echo "DEBUG: Installing packages from requirements.txt" && \
+     pip install --no-cache-dir -r requirements.txt && \
+     echo "DEBUG: Finished installing packages."
+
+ # Clone the nanoVLM repository, which contains generate.py and the models directory.
+ # This also ensures the 'models' module is available for the VisionLanguageModel import.
+ RUN echo "DEBUG: Cloning huggingface/nanoVLM repository..." && \
+     git clone https://github.com/huggingface/nanoVLM.git /app/nanoVLM && \
+     echo "DEBUG: nanoVLM repository cloned to /app/nanoVLM."
+
+ # Add a test image to the Space.
+ # You need to create a simple 'test_image.jpg' and add it to the root of your Space repo.
+ COPY ./test_image.jpg /app/test_image.jpg
+ RUN if [ ! -f /app/test_image.jpg ]; then echo "ERROR: test_image.jpg not found!"; exit 1; fi
+
+ # Set the Python path to include the nanoVLM repo so `from models...` imports work
+ ENV PYTHONPATH="/app/nanoVLM:${PYTHONPATH}"
+ # Define a writable cache directory
+ ENV HF_HOME=/app/.cache/huggingface
+
+ # Create the cache directory with write permissions
+ RUN mkdir -p $HF_HOME && chmod -R 777 $HF_HOME
+
+ # The generate.py script is at /app/nanoVLM/generate.py.
+ # It takes arguments such as --model_path, --image_path, --prompt, and --device.
+ # We run it directly; its output goes to the Space's container logs.
+ CMD ["python", "-u", "/app/nanoVLM/generate.py", \
+      "--model_path", "lusxvr/nanoVLM-222M", \
+      "--image_path", "/app/test_image.jpg", \
+      "--prompt", "describe this image in detail", \
+      "--device", "cpu", \
+      "--num_generations", "1", \
+      "--max_new_tokens", "50"]
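For a quick sanity check before building the image, the same invocation can be exercised outside the container. A minimal sketch, assuming a local clone of nanoVLM next to the script and a local test_image.jpg (these local paths are hypothetical; the flags mirror the CMD above):

import subprocess

# Mirror the container CMD against a local checkout of nanoVLM (hypothetical local paths).
cmd = [
    "python", "-u", "nanoVLM/generate.py",
    "--model_path", "lusxvr/nanoVLM-222M",
    "--image_path", "test_image.jpg",
    "--prompt", "describe this image in detail",
    "--device", "cpu",
    "--num_generations", "1",
    "--max_new_tokens", "50",
]
result = subprocess.run(cmd, capture_output=True, text=True)
print(result.stdout)
print(result.stderr)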
app.py DELETED
@@ -1,162 +0,0 @@
- import sys
- import os
- from typing import Optional
- from PIL import Image as PILImage
-
- # Add the cloned nanoVLM directory to Python's system path
- NANOVLM_REPO_PATH = "/app/nanoVLM"
- if NANOVLM_REPO_PATH not in sys.path:
-     print(f"DEBUG: Adding {NANOVLM_REPO_PATH} to sys.path")
-     sys.path.insert(0, NANOVLM_REPO_PATH)
-
- import gradio as gr
- import torch
- from transformers import AutoProcessor  # Using AutoProcessor as in generate.py
-
- VisionLanguageModel = None
- try:
-     print("DEBUG: Attempting to import VisionLanguageModel")
-     from models.vision_language_model import VisionLanguageModel
-     print("DEBUG: Successfully imported VisionLanguageModel.")
- except ImportError as e:
-     print(f"CRITICAL ERROR: Importing VisionLanguageModel: {e}")
-
- # --- Device Setup ---
- device = "cuda" if torch.cuda.is_available() else "cpu"
- print(f"DEBUG: Using device: {device}")
-
- # --- Configuration ---
- # This will be used for both model and processor, as in generate.py
- model_repo_id = "lusxvr/nanoVLM-222M"
- print(f"DEBUG: Model Repository ID for model and processor: {model_repo_id}")
-
- # --- Initialize ---
- processor = None
- model = None
-
- if VisionLanguageModel:  # Only proceed if custom model class was imported
-     try:
-         # Load processor using AutoProcessor, like in generate.py
-         print(f"DEBUG: Loading processor using AutoProcessor.from_pretrained('{model_repo_id}')")
-         # Using trust_remote_code=True here as a precaution,
-         # though ideally not needed if processor_config.json is complete.
-         processor = AutoProcessor.from_pretrained(model_repo_id, trust_remote_code=True)
-         print(f"DEBUG: AutoProcessor loaded: {type(processor)}")
-
-         # Ensure tokenizer has pad_token set if it's GPT-2 based
-         if hasattr(processor, 'tokenizer') and processor.tokenizer is not None:
-             if getattr(processor.tokenizer, 'pad_token', None) is None:  # Check if pad_token attribute exists and is None
-                 processor.tokenizer.pad_token = processor.tokenizer.eos_token
-                 print(f"DEBUG: Set processor.tokenizer.pad_token to eos_token (ID: {processor.tokenizer.eos_token_id})")
-         else:
-             print("DEBUG: Processor does not have a 'tokenizer' attribute or it is None.")
-
-         # Load model, like in generate.py
-         print(f"DEBUG: Loading model VisionLanguageModel.from_pretrained('{model_repo_id}')")
-         model = VisionLanguageModel.from_pretrained(model_repo_id).to(device)
-         print(f"DEBUG: VisionLanguageModel loaded: {type(model)}")
-         model.eval()
-         print("DEBUG: Model set to eval() mode.")
-
-     except Exception as e:
-         print(f"CRITICAL ERROR loading model or processor with AutoProcessor: {e}")
-         import traceback
-         traceback.print_exc()
-         processor = None; model = None
- else:
-     print("CRITICAL ERROR: VisionLanguageModel class not imported. Cannot load model.")
-
-
- # --- Text Generation Function ---
- def generate_text_for_image(image_input_pil: Optional[PILImage.Image], prompt_input_str: Optional[str]) -> str:
-     print(f"DEBUG (generate_text_for_image): Received prompt: '{prompt_input_str}'")
-     if model is None or processor is None:
-         return "Error: Model or processor not loaded. Check logs."
-     if image_input_pil is None: return "Please upload an image."
-     if not prompt_input_str: return "Please provide a prompt."
-
-     try:
-         current_pil_image = image_input_pil
-         if not isinstance(current_pil_image, PILImage.Image):
-             current_pil_image = PILImage.fromarray(current_pil_image)
-         if current_pil_image.mode != "RGB":
-             current_pil_image = current_pil_image.convert("RGB")
-         print(f"DEBUG: Image prepped - size: {current_pil_image.size}, mode: {current_pil_image.mode}")
-
-         # Prepare inputs using the AutoProcessor, as in generate.py
-         print("DEBUG: Processing inputs with AutoProcessor...")
-         inputs = processor(
-             text=[prompt_input_str], images=current_pil_image, return_tensors="pt"
-         ).to(device)
-         print(f"DEBUG: Inputs from AutoProcessor - keys: {inputs.keys()}")
-         print(f"DEBUG: input_ids shape: {inputs['input_ids'].shape}, values: {inputs['input_ids']}")
-         print(f"DEBUG: pixel_values shape: {inputs['pixel_values'].shape}")
-
-         # Ensure attention_mask is present, default to ones if not (though AutoProcessor should provide it)
-         attention_mask = inputs.get('attention_mask')
-         if attention_mask is None:
-             print("WARN: attention_mask not found in processor output, creating a default one of all 1s.")
-             attention_mask = torch.ones_like(inputs['input_ids']).to(device)
-         print(f"DEBUG: attention_mask shape: {attention_mask.shape}")
-
-         print("DEBUG: Calling model.generate (aligning with nanoVLM's generate.py)...")
-         # Signature for nanoVLM's generate: (self, input_ids, image, attention_mask, max_new_tokens, ...)
-         # `image` parameter in generate() corresponds to `pixel_values` from processor output
-         generated_ids_tensor = model.generate(
-             inputs['input_ids'],      # 1st argument to model.generate: input_ids (text prompt)
-             inputs['pixel_values'],   # 2nd argument to model.generate: image (pixel values)
-             attention_mask,           # 3rd argument to model.generate: attention_mask
-             max_new_tokens=30,        # Corresponds to 4th argument in model.generate
-             temperature=0.7,          # Match generate.py default or your choice
-             top_k=50,                 # Match generate.py default or your choice
-             greedy=False              # Match generate.py default or your choice
-             # top_p is also an option from generate.py's model.generate
-         )
-         print(f"DEBUG: Raw generated_ids: {generated_ids_tensor}")
-
-         generated_text_list = processor.batch_decode(generated_ids_tensor, skip_special_tokens=True)
-         print(f"DEBUG: Decoded text list: {generated_text_list}")
-         generated_text_str = generated_text_list[0] if generated_text_list else ""
-
-         cleaned_text_str = generated_text_str
-         if prompt_input_str and generated_text_str.startswith(prompt_input_str):
-             cleaned_text_str = generated_text_str[len(prompt_input_str):].lstrip(" ,.:")
-         print(f"DEBUG: Final cleaned text: '{cleaned_text_str}'")
-         return cleaned_text_str.strip()
-
-     except Exception as e:
-         print(f"CRITICAL ERROR during generation: {e}")
-         import traceback
-         traceback.print_exc()
-         return f"Error during generation: {str(e)}"
-
- # --- Gradio Interface ---
- description_md = """
- ## Interactive nanoVLM-222M Demo (Mirroring generate.py)
- Trying to replicate the working `generate.py` script from `huggingface/nanoVLM`.
- Using AutoProcessor for inputs.
- """
- iface = None
- if processor and model:
-     try:
-         iface = gr.Interface(
-             fn=generate_text_for_image,
-             inputs=[gr.Image(type="pil", label="Upload Image"), gr.Textbox(label="Your Prompt")],
-             outputs=gr.Textbox(label="Generated Text", show_copy_button=True),
-             title="nanoVLM-222M Demo (generate.py Alignment)",
-             description=description_md,
-             allow_flagging="never"
-         )
-         print("DEBUG: Gradio interface defined.")
-     except Exception as e:
-         print(f"CRITICAL ERROR defining Gradio interface: {e}")
-         import traceback; traceback.print_exc()
-
- if __name__ == "__main__":
-     if iface:
-         print("DEBUG: Launching Gradio...")
-         iface.launch(server_name="0.0.0.0", server_port=7860)
-     else:
-         print("CRITICAL ERROR: Gradio interface not defined or model/processor failed to load. Cannot launch.")
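The deleted app.py above wrapped the model in a Gradio UI; the new approach delegates inference to nanoVLM's generate.py instead. For comparison, a minimal sketch of the same load-and-generate sequence without Gradio, assuming the cloned nanoVLM repo is on PYTHONPATH, that the processor returns an attention_mask, and that model.generate accepts the positional arguments used in the deleted code (the local image path is hypothetical):

import torch
from PIL import Image
from transformers import AutoProcessor
from models.vision_language_model import VisionLanguageModel  # provided by the cloned nanoVLM repo

device = "cpu"
repo_id = "lusxvr/nanoVLM-222M"
processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
model = VisionLanguageModel.from_pretrained(repo_id).to(device).eval()

image = Image.open("test_image.jpg").convert("RGB")  # hypothetical local image path
inputs = processor(text=["describe this image in detail"], images=image, return_tensors="pt").to(device)

with torch.no_grad():
    generated_ids = model.generate(
        inputs["input_ids"],       # text prompt tokens
        inputs["pixel_values"],    # image tensor
        inputs["attention_mask"],  # assumed present in the processor output
        max_new_tokens=50,
    )
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])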
b01847e1e13f032d8a7309a460d5d2c5.jpg ADDED
requirements.txt CHANGED
@@ -1,78 +1,17 @@
- gradio==3.50.2  # Pin to a specific, widely used Gradio 3.x version
- sentencepiece
- accelerate
- aiohappyeyeballs==2.6.1
- aiohttp==3.11.18
- aiosignal==1.3.2
- annotated-types==0.7.0
- attrs==25.3.0
- certifi==2025.4.26
- charset-normalizer==3.4.2
- click==8.2.1
- datasets==3.6.0
- dill==0.3.8
- docker-pycreds==0.4.0
- filelock==3.18.0
- frozenlist==1.6.0
- fsspec==2025.3.0
- gitdb==4.0.12
- GitPython==3.1.44
- hf-xet==1.1.2
- huggingface-hub==0.32.0
- idna==3.10
- Jinja2==3.1.6
- MarkupSafe==2.0
- mpmath==1.3.0
- multidict==6.4.4
- multiprocess==0.70.16
- networkx==3.4.2
- numpy
- nvidia-cublas-cu12==12.6.4.1
- nvidia-cuda-cupti-cu12==12.6.80
- nvidia-cuda-nvrtc-cu12==12.6.77
- nvidia-cuda-runtime-cu12==12.6.77
- nvidia-cudnn-cu12==9.5.1.17
- nvidia-cufft-cu12==11.3.0.4
- nvidia-cufile-cu12==1.11.1.6
- nvidia-curand-cu12==10.3.7.77
- nvidia-cusolver-cu12==11.7.1.2
- nvidia-cusparse-cu12==12.5.4.2
- nvidia-cusparselt-cu12==0.6.3
- nvidia-nccl-cu12==2.26.2
- nvidia-nvjitlink-cu12==12.6.85
- nvidia-nvtx-cu12==12.6.77
- packaging==25.0
- pandas==2.2.3
- pillow==10.4.0
- platformdirs==4.3.8
- propcache==0.3.1
- protobuf==6.31.0
- psutil==7.0.0
- pyarrow==20.0.0
- pydantic==2.11.5
- pydantic_core==2.33.2
- python-dateutil==2.9.0.post0
- pytz==2025.2
- PyYAML==6.0.2
- regex==2024.11.6
- requests==2.32.3
- safetensors==0.5.3
- sentry-sdk==2.29.1
- setproctitle==1.3.6
- setuptools==80.8.0
- six==1.17.0
- smmap==5.0.2
- sympy==1.14.0
- tokenizers==0.21.1
  torch==2.7.0
- torchvision==0.22.0
- tqdm==4.67.1
  transformers==4.52.3
- triton==3.3.0
- typing-inspection==0.4.1
- typing_extensions==4.13.2
- tzdata==2025.2
- urllib3==2.4.0
- wandb==0.19.11
- xxhash==3.5.0
- yarl==1.20.0
+ # Try to match your local working environment for generate.py
  torch==2.7.0
+ # If 'transformers==4.52.3' and 'tokenizers==0.21.1' are from custom/dev builds,
+ # you MUST find a way to install those exact versions in Docker, or use the
+ # closest standard PyPI versions and test generate.py locally with THOSE first.
+ # For this example, I'm assuming they are pip-installable. If not, adjust.
  transformers==4.52.3
+ tokenizers==0.21.1
+ huggingface-hub==0.32.0
+ safetensors==0.5.3
+ Pillow==11.2.1  # generate.py uses PIL.Image
+ # For protobuf, if your local 6.31.0 is confirmed, use it. Otherwise, a standard one:
+ protobuf==4.25.3  # Or your confirmed local 6.31.0 if pip-installable
+ accelerate  # Good to include, though generate.py might not explicitly use it
+ sentencepiece  # Often a dependency for tokenizers
+
+ # NO Gradio needed for this test
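Since these pins are meant to mirror the working local environment for generate.py, it can help to print the locally installed versions of the same packages and compare them against this file before building. A minimal sketch using importlib.metadata (package names taken from the pins above):

from importlib.metadata import version, PackageNotFoundError

# Compare locally installed versions against the pins in requirements.txt.
for pkg in ["torch", "transformers", "tokenizers", "huggingface-hub",
            "safetensors", "Pillow", "protobuf", "accelerate", "sentencepiece"]:
    try:
        print(f"{pkg}=={version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg}: not installed locally")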