vidhanm committed on
Commit
e198913
·
1 Parent(s): fbe5121

now cloning github repo for its files

Browse files
Files changed (2) hide show
  1. Dockerfile +8 -7
  2. app.py +67 -34
Dockerfile CHANGED
@@ -5,34 +5,35 @@ FROM python:3.9-slim
5
  WORKDIR /app
6
 
7
  # Set Hugging Face cache directory and Gradio temp/flagging dir
8
- # These will be within /app or /tmp, which we can make writable.
9
  ENV HF_HOME=/app/.cache/huggingface
10
  ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp
11
  ENV GRADIO_FLAGGING_DIR=/tmp/gradio_flags
12
 
13
- # Install git and build-essential (good practice for some pip installs)
14
  RUN apt-get update && apt-get install -y \
15
  git \
16
  build-essential \
17
  && rm -rf /var/lib/apt/lists/*
18
 
19
- # Create the cache and temp directories and make them writable by any user.
 
 
 
 
20
  RUN mkdir -p $HF_HOME $GRADIO_TEMP_DIR $GRADIO_FLAGGING_DIR && \
21
  chmod -R 777 $HF_HOME $GRADIO_TEMP_DIR $GRADIO_FLAGGING_DIR
22
 
23
- # Copy the requirements file first to leverage Docker layer caching
24
  COPY requirements.txt requirements.txt
25
 
26
  # Install Python dependencies
27
- # --no-cache-dir reduces image size
28
  RUN pip install --no-cache-dir --prefer-binary -r requirements.txt
29
 
30
  # Copy the application code into the container
31
  COPY app.py app.py
32
 
33
- # Expose the port Gradio will run on (default is 7860)
34
  EXPOSE 7860
35
 
36
  # Set the default command to run the Gradio application
37
- # Using `python -u` for unbuffered output, which is good for logging
38
  CMD ["python", "-u", "app.py"]
 
5
  WORKDIR /app
6
 
7
  # Set Hugging Face cache directory and Gradio temp/flagging dir
 
8
  ENV HF_HOME=/app/.cache/huggingface
9
  ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp
10
  ENV GRADIO_FLAGGING_DIR=/tmp/gradio_flags
11
 
12
+ # Install git and build-essential
13
  RUN apt-get update && apt-get install -y \
14
  git \
15
  build-essential \
16
  && rm -rf /var/lib/apt/lists/*
17
 
18
+ # Clone the original nanoVLM repository for its model definition files
19
+ # This makes the `models` directory from nanoVLM available under /app/nanoVLM
20
+ RUN git clone https://github.com/huggingface/nanoVLM.git /app/nanoVLM
21
+
22
+ # Create the cache and temp directories and make them writable
23
  RUN mkdir -p $HF_HOME $GRADIO_TEMP_DIR $GRADIO_FLAGGING_DIR && \
24
  chmod -R 777 $HF_HOME $GRADIO_TEMP_DIR $GRADIO_FLAGGING_DIR
25
 
26
+ # Copy the requirements file first
27
  COPY requirements.txt requirements.txt
28
 
29
  # Install Python dependencies
 
30
  RUN pip install --no-cache-dir --prefer-binary -r requirements.txt
31
 
32
  # Copy the application code into the container
33
  COPY app.py app.py
34
 
35
+ # Expose the port Gradio will run on
36
  EXPOSE 7860
37
 
38
  # Set the default command to run the Gradio application
 
39
  CMD ["python", "-u", "app.py"]
app.py CHANGED
@@ -1,8 +1,27 @@
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  from PIL import Image
3
  import torch
4
- from transformers import AutoProcessor, AutoModelForVision2Seq # Keep these for now
5
- import os
 
 
 
 
 
 
 
 
 
 
6
 
7
  # Determine the device to use
8
  device_choice = os.environ.get("DEVICE", "auto")
@@ -17,25 +36,44 @@ model_id = "lusxvr/nanoVLM-222M"
17
  processor = None
18
  model = None
19
 
20
- try:
21
- print(f"Attempting to load processor for {model_id} with trust_remote_code=True")
22
- # For custom models like nanoVLM, trust_remote_code=True is often needed.
23
- processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
24
- print(f"Processor loaded. Attempting to load model for {model_id} with trust_remote_code=True")
25
- model = AutoModelForVision2Seq.from_pretrained(model_id, trust_remote_code=True).to(device)
26
- print("Model and processor loaded successfully.")
27
- except Exception as e:
28
- print(f"Error loading model/processor: {e}")
29
- # More detailed error logging or fallback could be added here.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  def generate_text_for_image(image_input, prompt_input):
32
- if model is None or processor is None:
33
- return "Error: Model or processor not loaded. Check the Space logs. This might be due to missing 'trust_remote_code=True' or model compatibility issues."
34
 
35
  if image_input is None:
36
  return "Please upload an image."
37
  if not prompt_input:
38
- return "Please provide a prompt (e.g., 'Describe this image' or 'What color is the car?')."
39
 
40
  try:
41
  if not isinstance(image_input, Image.Image):
@@ -46,19 +84,26 @@ def generate_text_for_image(image_input, prompt_input):
46
  if pil_image.mode != "RGB":
47
  pil_image = pil_image.convert("RGB")
48
 
 
 
 
49
  inputs = processor(text=[prompt_input], images=[pil_image], return_tensors="pt").to(device)
50
 
 
 
51
  generated_ids = model.generate(
52
- **inputs,
 
 
53
  max_new_tokens=150,
54
  num_beams=3,
55
  no_repeat_ngram_size=2,
56
  early_stopping=True
 
57
  )
58
 
59
  generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
60
 
61
- # Basic cleaning of the prompt if the model includes it in the output
62
  if prompt_input and generated_text.startswith(prompt_input):
63
  cleaned_text = generated_text[len(prompt_input):].lstrip(" ,.:")
64
  else:
@@ -68,26 +113,17 @@ def generate_text_for_image(image_input, prompt_input):
68
 
69
  except Exception as e:
70
  print(f"Error during generation: {e}")
71
- # Provide a more user-friendly error if possible
72
  return f"An error occurred during text generation: {str(e)}"
73
 
74
- description = """
75
- Upload an image and provide a text prompt (e.g., "What is in this image?", "Describe the animal in detail.").
76
- The model will generate a textual response based on the visual content and your query.
77
- This Space uses the `lusxvr/nanoVLM-222M` model.
78
- """
79
- example_image_url = "http://images.cocodataset.org/val2017/000000039769.jpg" # A cat and a remote
80
-
81
- # Get the pre-defined writable directory for Gradio's temporary files/cache
82
- # This environment variable is set in your Dockerfile.
83
  gradio_cache_dir = os.environ.get("GRADIO_TEMP_DIR", "/tmp/gradio_tmp")
84
 
85
-
86
  iface = gr.Interface(
87
  fn=generate_text_for_image,
88
  inputs=[
89
  gr.Image(type="pil", label="Upload Image"),
90
- gr.Textbox(label="Your Prompt/Question", info="e.g., 'What is this a picture of?', 'Describe the main subject.', 'How many animals are there?'")
91
  ],
92
  outputs=gr.Textbox(label="Generated Text", show_copy_button=True),
93
  title="Interactive nanoVLM-222M Demo",
@@ -95,18 +131,15 @@ iface = gr.Interface(
95
  examples=[
96
  [example_image_url, "a photo of a"],
97
  [example_image_url, "Describe the image in detail."],
98
- [example_image_url, "What objects are on the sofa?"],
99
  ],
100
  cache_examples=True,
101
- # Use the writable directory for caching examples
102
  examples_cache_folder=gradio_cache_dir,
103
  allow_flagging="never"
104
  )
105
 
106
  if __name__ == "__main__":
107
  if model is None or processor is None:
108
- print("CRITICAL: Model or processor failed to load. Gradio interface will not start.")
109
- # You could raise an error here or sys.exit(1) to make the Space fail clearly if loading is essential.
110
  else:
111
  print("Launching Gradio interface...")
112
- iface.launch(server_name="0.0.0.0", server_port=7860)
 
1
+ import sys
2
+ import os
3
+
4
+ # Add the cloned nanoVLM directory to Python's system path
5
+ # This allows us to import from the 'models' directory within nanoVLM
6
+ NANOVLM_REPO_PATH = "/app/nanoVLM" # Path where we cloned it in Dockerfile
7
+ if NANOVLM_REPO_PATH not in sys.path:
8
+ sys.path.insert(0, NANOVLM_REPO_PATH)
9
+
10
  import gradio as gr
11
  from PIL import Image
12
  import torch
13
+ from transformers import AutoProcessor # AutoProcessor might still work
14
+
15
+ # Now import the custom classes from the cloned nanoVLM repository
16
+ try:
17
+ from models.vision_language_model import VisionLanguageModel
18
+ from models.configurations import VisionLanguageConfig # Or the specific config class used by nanoVLM
19
+ print("Successfully imported VisionLanguageModel and VisionLanguageConfig from nanoVLM clone.")
20
+ except ImportError as e:
21
+ print(f"Error importing from nanoVLM clone: {e}. Check NANOVLM_REPO_PATH and ensure nanoVLM cloned correctly.")
22
+ VisionLanguageModel = None
23
+ VisionLanguageConfig = None
24
+
25
 
26
  # Determine the device to use
27
  device_choice = os.environ.get("DEVICE", "auto")
 
36
  processor = None
37
  model = None
38
 
39
+ if VisionLanguageModel and VisionLanguageConfig:
40
+ try:
41
+ print(f"Attempting to load processor for {model_id}")
42
+ # Processor loading might still be okay with AutoProcessor,
43
+ # as processor_config.json is usually standard.
44
+ # trust_remote_code might be needed if processor has custom code too.
45
+ processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
46
+ print("Processor loaded.")
47
+
48
+ print(f"Attempting to load model config for {model_id} using VisionLanguageConfig")
49
+ # Load the configuration using the custom config class, pointing to your model_id
50
+ # trust_remote_code=True allows it to use any specific code paths from your model_id if needed for config.
51
+ config = VisionLanguageConfig.from_pretrained(model_id, trust_remote_code=True)
52
+ print("Model config loaded.")
53
+
54
+ print(f"Attempting to load model weights for {model_id} using VisionLanguageModel")
55
+ # Load the model weights using the custom model class and the loaded config
56
+ model = VisionLanguageModel.from_pretrained(model_id, config=config, trust_remote_code=True).to(device)
57
+ print("Model weights loaded successfully.")
58
+ model.eval() # Set to evaluation mode
59
+
60
+ except Exception as e:
61
+ print(f"Error loading model, processor, or config: {e}")
62
+ # Fallback if any step fails
63
+ processor = None
64
+ model = None
65
+ else:
66
+ print("Custom nanoVLM classes not imported, cannot load model.")
67
+
68
 
69
  def generate_text_for_image(image_input, prompt_input):
70
+ if model is None or processor is None or not hasattr(model, 'generate'): # Check if model has generate
71
+ return "Error: Model or processor not loaded correctly or model doesn't have 'generate' method. Check logs."
72
 
73
  if image_input is None:
74
  return "Please upload an image."
75
  if not prompt_input:
76
+ return "Please provide a prompt."
77
 
78
  try:
79
  if not isinstance(image_input, Image.Image):
 
84
  if pil_image.mode != "RGB":
85
  pil_image = pil_image.convert("RGB")
86
 
87
+ # Prepare inputs for the model using the processor
88
+ # The exact format for nanoVLM's custom model might require specific handling.
89
+ # The processor from AutoProcessor should generally work.
90
  inputs = processor(text=[prompt_input], images=[pil_image], return_tensors="pt").to(device)
91
 
92
+ # Generate text using the model's generate method
93
+ # Common parameters for generation:
94
  generated_ids = model.generate(
95
+ inputs['pixel_values'], # Assuming processor output has 'pixel_values'
96
+ inputs['input_ids'], # Assuming processor output has 'input_ids'
97
+ attention_mask=inputs.get('attention_mask'), # Optional, but good to include
98
  max_new_tokens=150,
99
  num_beams=3,
100
  no_repeat_ngram_size=2,
101
  early_stopping=True
102
+ # Check nanoVLM's VisionLanguageModel.generate() for specific parameters
103
  )
104
 
105
  generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
106
 
 
107
  if prompt_input and generated_text.startswith(prompt_input):
108
  cleaned_text = generated_text[len(prompt_input):].lstrip(" ,.:")
109
  else:
 
113
 
114
  except Exception as e:
115
  print(f"Error during generation: {e}")
 
116
  return f"An error occurred during text generation: {str(e)}"
117
 
118
+ description = "Interactive demo for lusxvr/nanoVLM-222M."
119
+ example_image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 
 
 
 
 
 
 
120
  gradio_cache_dir = os.environ.get("GRADIO_TEMP_DIR", "/tmp/gradio_tmp")
121
 
 
122
  iface = gr.Interface(
123
  fn=generate_text_for_image,
124
  inputs=[
125
  gr.Image(type="pil", label="Upload Image"),
126
+ gr.Textbox(label="Your Prompt/Question")
127
  ],
128
  outputs=gr.Textbox(label="Generated Text", show_copy_button=True),
129
  title="Interactive nanoVLM-222M Demo",
 
131
  examples=[
132
  [example_image_url, "a photo of a"],
133
  [example_image_url, "Describe the image in detail."],
 
134
  ],
135
  cache_examples=True,
 
136
  examples_cache_folder=gradio_cache_dir,
137
  allow_flagging="never"
138
  )
139
 
140
  if __name__ == "__main__":
141
  if model is None or processor is None:
142
+ print("CRITICAL: Model or processor failed to load. Gradio interface may not function correctly.")
 
143
  else:
144
  print("Launching Gradio interface...")
145
+ iface.launch(server_name="0.0.0.0", server_port=7860)