vidhanm committed on
Commit
e198913
·
1 Parent(s): fbe5121

now cloning github repo for its files

Browse files
Files changed (2) hide show
  1. Dockerfile +8 -7
  2. app.py +67 -34
Dockerfile CHANGED
@@ -5,34 +5,35 @@ FROM python:3.9-slim
5
  WORKDIR /app
6
 
7
  # Set Hugging Face cache directory and Gradio temp/flagging dir
8
- # These will be within /app or /tmp, which we can make writable.
9
  ENV HF_HOME=/app/.cache/huggingface
10
  ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp
11
  ENV GRADIO_FLAGGING_DIR=/tmp/gradio_flags
12
 
13
- # Install git and build-essential (good practice for some pip installs)
14
  RUN apt-get update && apt-get install -y \
15
  git \
16
  build-essential \
17
  && rm -rf /var/lib/apt/lists/*
18
 
19
- # Create the cache and temp directories and make them writable by any user.
 
 
 
 
20
  RUN mkdir -p $HF_HOME $GRADIO_TEMP_DIR $GRADIO_FLAGGING_DIR && \
21
  chmod -R 777 $HF_HOME $GRADIO_TEMP_DIR $GRADIO_FLAGGING_DIR
22
 
23
- # Copy the requirements file first to leverage Docker layer caching
24
  COPY requirements.txt requirements.txt
25
 
26
  # Install Python dependencies
27
- # --no-cache-dir reduces image size
28
  RUN pip install --no-cache-dir --prefer-binary -r requirements.txt
29
 
30
  # Copy the application code into the container
31
  COPY app.py app.py
32
 
33
- # Expose the port Gradio will run on (default is 7860)
34
  EXPOSE 7860
35
 
36
  # Set the default command to run the Gradio application
37
- # Using `python -u` for unbuffered output, which is good for logging
38
  CMD ["python", "-u", "app.py"]
 
5
  WORKDIR /app
6
 
7
  # Set Hugging Face cache directory and Gradio temp/flagging dir
 
8
  ENV HF_HOME=/app/.cache/huggingface
9
  ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp
10
  ENV GRADIO_FLAGGING_DIR=/tmp/gradio_flags
11
 
12
+ # Install git and build-essential
13
  RUN apt-get update && apt-get install -y \
14
  git \
15
  build-essential \
16
  && rm -rf /var/lib/apt/lists/*
17
 
18
+ # Clone the original nanoVLM repository for its model definition files
19
+ # This makes the `models` directory from nanoVLM available under /app/nanoVLM
20
+ RUN git clone https://github.com/huggingface/nanoVLM.git /app/nanoVLM
21
+
22
+ # Create the cache and temp directories and make them writable
23
  RUN mkdir -p $HF_HOME $GRADIO_TEMP_DIR $GRADIO_FLAGGING_DIR && \
24
  chmod -R 777 $HF_HOME $GRADIO_TEMP_DIR $GRADIO_FLAGGING_DIR
25
 
26
+ # Copy the requirements file first
27
  COPY requirements.txt requirements.txt
28
 
29
  # Install Python dependencies
 
30
  RUN pip install --no-cache-dir --prefer-binary -r requirements.txt
31
 
32
  # Copy the application code into the container
33
  COPY app.py app.py
34
 
35
+ # Expose the port Gradio will run on
36
  EXPOSE 7860
37
 
38
  # Set the default command to run the Gradio application
 
39
  CMD ["python", "-u", "app.py"]
app.py CHANGED
@@ -1,8 +1,27 @@
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  from PIL import Image
3
  import torch
4
- from transformers import AutoProcessor, AutoModelForVision2Seq # Keep these for now
5
- import os
 
 
 
 
 
 
 
 
 
 
6
 
7
  # Determine the device to use
8
  device_choice = os.environ.get("DEVICE", "auto")
@@ -17,25 +36,44 @@ model_id = "lusxvr/nanoVLM-222M"
17
  processor = None
18
  model = None
19
 
20
- try:
21
- print(f"Attempting to load processor for {model_id} with trust_remote_code=True")
22
- # For custom models like nanoVLM, trust_remote_code=True is often needed.
23
- processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
24
- print(f"Processor loaded. Attempting to load model for {model_id} with trust_remote_code=True")
25
- model = AutoModelForVision2Seq.from_pretrained(model_id, trust_remote_code=True).to(device)
26
- print("Model and processor loaded successfully.")
27
- except Exception as e:
28
- print(f"Error loading model/processor: {e}")
29
- # More detailed error logging or fallback could be added here.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  def generate_text_for_image(image_input, prompt_input):
32
- if model is None or processor is None:
33
- return "Error: Model or processor not loaded. Check the Space logs. This might be due to missing 'trust_remote_code=True' or model compatibility issues."
34
 
35
  if image_input is None:
36
  return "Please upload an image."
37
  if not prompt_input:
38
- return "Please provide a prompt (e.g., 'Describe this image' or 'What color is the car?')."
39
 
40
  try:
41
  if not isinstance(image_input, Image.Image):
@@ -46,19 +84,26 @@ def generate_text_for_image(image_input, prompt_input):
46
  if pil_image.mode != "RGB":
47
  pil_image = pil_image.convert("RGB")
48
 
 
 
 
49
  inputs = processor(text=[prompt_input], images=[pil_image], return_tensors="pt").to(device)
50
 
 
 
51
  generated_ids = model.generate(
52
- **inputs,
 
 
53
  max_new_tokens=150,
54
  num_beams=3,
55
  no_repeat_ngram_size=2,
56
  early_stopping=True
 
57
  )
58
 
59
  generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
60
 
61
- # Basic cleaning of the prompt if the model includes it in the output
62
  if prompt_input and generated_text.startswith(prompt_input):
63
  cleaned_text = generated_text[len(prompt_input):].lstrip(" ,.:")
64
  else:
@@ -68,26 +113,17 @@ def generate_text_for_image(image_input, prompt_input):
68
 
69
  except Exception as e:
70
  print(f"Error during generation: {e}")
71
- # Provide a more user-friendly error if possible
72
  return f"An error occurred during text generation: {str(e)}"
73
 
74
- description = """
75
- Upload an image and provide a text prompt (e.g., "What is in this image?", "Describe the animal in detail.").
76
- The model will generate a textual response based on the visual content and your query.
77
- This Space uses the `lusxvr/nanoVLM-222M` model.
78
- """
79
- example_image_url = "http://images.cocodataset.org/val2017/000000039769.jpg" # A cat and a remote
80
-
81
- # Get the pre-defined writable directory for Gradio's temporary files/cache
82
- # This environment variable is set in your Dockerfile.
83
  gradio_cache_dir = os.environ.get("GRADIO_TEMP_DIR", "/tmp/gradio_tmp")
84
 
85
-
86
  iface = gr.Interface(
87
  fn=generate_text_for_image,
88
  inputs=[
89
  gr.Image(type="pil", label="Upload Image"),
90
- gr.Textbox(label="Your Prompt/Question", info="e.g., 'What is this a picture of?', 'Describe the main subject.', 'How many animals are there?'")
91
  ],
92
  outputs=gr.Textbox(label="Generated Text", show_copy_button=True),
93
  title="Interactive nanoVLM-222M Demo",
@@ -95,18 +131,15 @@ iface = gr.Interface(
95
  examples=[
96
  [example_image_url, "a photo of a"],
97
  [example_image_url, "Describe the image in detail."],
98
- [example_image_url, "What objects are on the sofa?"],
99
  ],
100
  cache_examples=True,
101
- # Use the writable directory for caching examples
102
  examples_cache_folder=gradio_cache_dir,
103
  allow_flagging="never"
104
  )
105
 
106
  if __name__ == "__main__":
107
  if model is None or processor is None:
108
- print("CRITICAL: Model or processor failed to load. Gradio interface will not start.")
109
- # You could raise an error here or sys.exit(1) to make the Space fail clearly if loading is essential.
110
  else:
111
  print("Launching Gradio interface...")
112
- iface.launch(server_name="0.0.0.0", server_port=7860)
 
1
+ import sys
2
+ import os
3
+
4
+ # Add the cloned nanoVLM directory to Python's system path
5
+ # This allows us to import from the 'models' directory within nanoVLM
6
+ NANOVLM_REPO_PATH = "/app/nanoVLM" # Path where we cloned it in Dockerfile
7
+ if NANOVLM_REPO_PATH not in sys.path:
8
+ sys.path.insert(0, NANOVLM_REPO_PATH)
9
+
10
  import gradio as gr
11
  from PIL import Image
12
  import torch
13
+ from transformers import AutoProcessor # AutoProcessor might still work
14
+
15
+ # Now import the custom classes from the cloned nanoVLM repository
16
+ try:
17
+ from models.vision_language_model import VisionLanguageModel
18
+ from models.configurations import VisionLanguageConfig # Or the specific config class used by nanoVLM
19
+ print("Successfully imported VisionLanguageModel and VisionLanguageConfig from nanoVLM clone.")
20
+ except ImportError as e:
21
+ print(f"Error importing from nanoVLM clone: {e}. Check NANOVLM_REPO_PATH and ensure nanoVLM cloned correctly.")
22
+ VisionLanguageModel = None
23
+ VisionLanguageConfig = None
24
+
25
 
26
  # Determine the device to use
27
  device_choice = os.environ.get("DEVICE", "auto")
 
36
  processor = None
37
  model = None
38
 
39
+ if VisionLanguageModel and VisionLanguageConfig:
40
+ try:
41
+ print(f"Attempting to load processor for {model_id}")
42
+ # Processor loading might still be okay with AutoProcessor,
43
+ # as processor_config.json is usually standard.
44
+ # trust_remote_code might be needed if processor has custom code too.
45
+ processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
46
+ print("Processor loaded.")
47
+
48
+ print(f"Attempting to load model config for {model_id} using VisionLanguageConfig")
49
+ # Load the configuration using the custom config class, pointing to your model_id
50
+ # trust_remote_code=True allows it to use any specific code paths from your model_id if needed for config.
51
+ config = VisionLanguageConfig.from_pretrained(model_id, trust_remote_code=True)
52
+ print("Model config loaded.")
53
+
54
+ print(f"Attempting to load model weights for {model_id} using VisionLanguageModel")
55
+ # Load the model weights using the custom model class and the loaded config
56
+ model = VisionLanguageModel.from_pretrained(model_id, config=config, trust_remote_code=True).to(device)
57
+ print("Model weights loaded successfully.")
58
+ model.eval() # Set to evaluation mode
59
+
60
+ except Exception as e:
61
+ print(f"Error loading model, processor, or config: {e}")
62
+ # Fallback if any step fails
63
+ processor = None
64
+ model = None
65
+ else:
66
+ print("Custom nanoVLM classes not imported, cannot load model.")
67
+
68
 
69
  def generate_text_for_image(image_input, prompt_input):
70
+ if model is None or processor is None or not hasattr(model, 'generate'): # Check if model has generate
71
+ return "Error: Model or processor not loaded correctly or model doesn't have 'generate' method. Check logs."
72
 
73
  if image_input is None:
74
  return "Please upload an image."
75
  if not prompt_input:
76
+ return "Please provide a prompt."
77
 
78
  try:
79
  if not isinstance(image_input, Image.Image):
 
84
  if pil_image.mode != "RGB":
85
  pil_image = pil_image.convert("RGB")
86
 
87
+ # Prepare inputs for the model using the processor
88
+ # The exact format for nanoVLM's custom model might require specific handling.
89
+ # The processor from AutoProcessor should generally work.
90
  inputs = processor(text=[prompt_input], images=[pil_image], return_tensors="pt").to(device)
91
 
92
+ # Generate text using the model's generate method
93
+ # Common parameters for generation:
94
  generated_ids = model.generate(
95
+ inputs['pixel_values'], # Assuming processor output has 'pixel_values'
96
+ inputs['input_ids'], # Assuming processor output has 'input_ids'
97
+ attention_mask=inputs.get('attention_mask'), # Optional, but good to include
98
  max_new_tokens=150,
99
  num_beams=3,
100
  no_repeat_ngram_size=2,
101
  early_stopping=True
102
+ # Check nanoVLM's VisionLanguageModel.generate() for specific parameters
103
  )
104
 
105
  generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
106
 
 
107
  if prompt_input and generated_text.startswith(prompt_input):
108
  cleaned_text = generated_text[len(prompt_input):].lstrip(" ,.:")
109
  else:
 
113
 
114
  except Exception as e:
115
  print(f"Error during generation: {e}")
 
116
  return f"An error occurred during text generation: {str(e)}"
117
 
118
+ description = "Interactive demo for lusxvr/nanoVLM-222M."
119
+ example_image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 
 
 
 
 
 
 
120
  gradio_cache_dir = os.environ.get("GRADIO_TEMP_DIR", "/tmp/gradio_tmp")
121
 
 
122
  iface = gr.Interface(
123
  fn=generate_text_for_image,
124
  inputs=[
125
  gr.Image(type="pil", label="Upload Image"),
126
+ gr.Textbox(label="Your Prompt/Question")
127
  ],
128
  outputs=gr.Textbox(label="Generated Text", show_copy_button=True),
129
  title="Interactive nanoVLM-222M Demo",
 
131
  examples=[
132
  [example_image_url, "a photo of a"],
133
  [example_image_url, "Describe the image in detail."],
 
134
  ],
135
  cache_examples=True,
 
136
  examples_cache_folder=gradio_cache_dir,
137
  allow_flagging="never"
138
  )
139
 
140
  if __name__ == "__main__":
141
  if model is None or processor is None:
142
+ print("CRITICAL: Model or processor failed to load. Gradio interface may not function correctly.")
 
143
  else:
144
  print("Launching Gradio interface...")
145
+ iface.launch(server_name="0.0.0.0", server_port=7860)