vidhanm committed
Commit · 137b7f1
1 Parent(s): 876ed10

trying new approach

Files changed:
- Dockerfile  +36 -34
- app.py  +0 -162
- b01847e1e13f032d8a7309a460d5d2c5.jpg  +0 -0
- requirements.txt  +15 -76
Dockerfile
CHANGED
@@ -1,39 +1,41 @@
-#
-FROM python:3.10-slim
+FROM python:3.9-slim # Or your preferred Python version matching local
 
-# Set the working directory in the container
 WORKDIR /app
 
-#
-
-ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp
-ENV GRADIO_FLAGGING_DIR=/tmp/gradio_flags
-
-#
-RUN apt-get update && apt-get install -y \
-    git \
-    build-essential \
-    && rm -rf /var/lib/apt/lists/*
-
-# Clone the original nanoVLM repository for its model definition files
-# This makes the `models` directory from nanoVLM available under /app/nanoVLM
-RUN git clone https://github.com/huggingface/nanoVLM.git /app/nanoVLM
-
-# Create the cache and temp directories and make them writable
-RUN mkdir -p $HF_HOME $GRADIO_TEMP_DIR $GRADIO_FLAGGING_DIR && \
-    chmod -R 777 $HF_HOME $GRADIO_TEMP_DIR $GRADIO_FLAGGING_DIR
-
-# Copy the requirements file first
+# Install git
+RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
+
+# Copy requirements and install
 COPY requirements.txt requirements.txt
-
-
-
-
-#
-
-
-
-
-
-#
-
+RUN echo "DEBUG: Installing packages from requirements.txt" && \
+    pip install --no-cache-dir -r requirements.txt && \
+    echo "DEBUG: Finished installing packages."
+
+# Clone the nanoVLM repository which contains generate.py and the models directory
+# This also ensures the 'models' module is available for VisionLanguageModel import
+RUN echo "DEBUG: Cloning huggingface/nanoVLM repository..." && \
+    git clone https://github.com/huggingface/nanoVLM.git /app/nanoVLM && \
+    echo "DEBUG: nanoVLM repository cloned to /app/nanoVLM."
+
+# Add a test image to the Space.
+# You need to create a simple 'test_image.jpg' and add it to the root of your Space repo.
+COPY ./test_image.jpg /app/test_image.jpg
+RUN if [ ! -f /app/test_image.jpg ]; then echo "ERROR: test_image.jpg not found!"; exit 1; fi
+
+# Set Python path to include the nanoVLM models directory, so `from models...` works
+ENV PYTHONPATH="/app/nanoVLM:${PYTHONPATH}"
+ENV HF_HOME=/app/.cache/huggingface # Define a writable cache directory
+
+# Create cache directory with write permissions
+RUN mkdir -p $HF_HOME && chmod -R 777 $HF_HOME
+
+# The generate.py script is at /app/nanoVLM/generate.py
+# It takes arguments like --model_path, --image_path, --prompt, --device
+# We will run it directly. Its output will go to the Space's container logs.
+CMD ["python", "-u", "/app/nanoVLM/generate.py", \
+     "--model_path", "lusxvr/nanoVLM-222M", \
+     "--image_path", "/app/test_image.jpg", \
+     "--prompt", "describe this image in detail", \
+     "--device", "cpu", \
+     "--num_generations", "1", \
+     "--max_new_tokens", "50"]
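For reference, the container's CMD can be reproduced locally with a minimal sketch like the following (illustrative, not part of the commit; it assumes nanoVLM has been cloned into ./nanoVLM and a test_image.jpg sits in the working directory):

# local_smoke_test.py -- illustrative sketch, not part of the Space.
# Assumes: `git clone https://github.com/huggingface/nanoVLM.git nanoVLM` was run
# in the current directory and test_image.jpg sits next to it.
import subprocess

subprocess.run(
    [
        "python", "-u", "nanoVLM/generate.py",
        "--model_path", "lusxvr/nanoVLM-222M",
        "--image_path", "test_image.jpg",
        "--prompt", "describe this image in detail",
        "--device", "cpu",
        "--num_generations", "1",
        "--max_new_tokens", "50",
    ],
    check=True,  # fail loudly, mirroring the container exiting on error
)

If that run produces a caption locally, the same invocation inside the image should as well, with the output showing up in the Space's container logs.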
app.py
DELETED
@@ -1,162 +0,0 @@
import sys
import os
from typing import Optional
from PIL import Image as PILImage

# Add the cloned nanoVLM directory to Python's system path
NANOVLM_REPO_PATH = "/app/nanoVLM"
if NANOVLM_REPO_PATH not in sys.path:
    print(f"DEBUG: Adding {NANOVLM_REPO_PATH} to sys.path")
    sys.path.insert(0, NANOVLM_REPO_PATH)

import gradio as gr
import torch
from transformers import AutoProcessor  # Using AutoProcessor as in generate.py

VisionLanguageModel = None
try:
    print("DEBUG: Attempting to import VisionLanguageModel")
    from models.vision_language_model import VisionLanguageModel
    print("DEBUG: Successfully imported VisionLanguageModel.")
except ImportError as e:
    print(f"CRITICAL ERROR: Importing VisionLanguageModel: {e}")

# --- Device Setup ---
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"DEBUG: Using device: {device}")

# --- Configuration ---
# This will be used for both model and processor, as in generate.py
model_repo_id = "lusxvr/nanoVLM-222M"
print(f"DEBUG: Model Repository ID for model and processor: {model_repo_id}")

# --- Initialize ---
processor = None
model = None

if VisionLanguageModel:  # Only proceed if custom model class was imported
    try:
        # Load processor using AutoProcessor, like in generate.py
        print(f"DEBUG: Loading processor using AutoProcessor.from_pretrained('{model_repo_id}')")
        # Using trust_remote_code=True here as a precaution,
        # though ideally not needed if processor_config.json is complete.
        processor = AutoProcessor.from_pretrained(model_repo_id, trust_remote_code=True)
        print(f"DEBUG: AutoProcessor loaded: {type(processor)}")

        # Ensure tokenizer has pad_token set if it's GPT-2 based
        if hasattr(processor, 'tokenizer') and processor.tokenizer is not None:
            if getattr(processor.tokenizer, 'pad_token', None) is None:  # Check if pad_token attribute exists and is None
                processor.tokenizer.pad_token = processor.tokenizer.eos_token
                print(f"DEBUG: Set processor.tokenizer.pad_token to eos_token (ID: {processor.tokenizer.eos_token_id})")
        else:
            print("DEBUG: Processor does not have a 'tokenizer' attribute or it is None.")

        # Load model, like in generate.py
        print(f"DEBUG: Loading model VisionLanguageModel.from_pretrained('{model_repo_id}')")
        model = VisionLanguageModel.from_pretrained(model_repo_id).to(device)
        print(f"DEBUG: VisionLanguageModel loaded: {type(model)}")
        model.eval()
        print("DEBUG: Model set to eval() mode.")

    except Exception as e:
        print(f"CRITICAL ERROR loading model or processor with AutoProcessor: {e}")
        import traceback
        traceback.print_exc()
        processor = None; model = None
else:
    print("CRITICAL ERROR: VisionLanguageModel class not imported. Cannot load model.")


# --- Text Generation Function ---
def generate_text_for_image(image_input_pil: Optional[PILImage.Image], prompt_input_str: Optional[str]) -> str:
    print(f"DEBUG (generate_text_for_image): Received prompt: '{prompt_input_str}'")
    if model is None or processor is None:
        return "Error: Model or processor not loaded. Check logs."
    if image_input_pil is None: return "Please upload an image."
    if not prompt_input_str: return "Please provide a prompt."

    try:
        current_pil_image = image_input_pil
        if not isinstance(current_pil_image, PILImage.Image):
            current_pil_image = PILImage.fromarray(current_pil_image)
        if current_pil_image.mode != "RGB":
            current_pil_image = current_pil_image.convert("RGB")
        print(f"DEBUG: Image prepped - size: {current_pil_image.size}, mode: {current_pil_image.mode}")

        # Prepare inputs using the AutoProcessor, as in generate.py
        print("DEBUG: Processing inputs with AutoProcessor...")
        inputs = processor(
            text=[prompt_input_str], images=current_pil_image, return_tensors="pt"
        ).to(device)
        print(f"DEBUG: Inputs from AutoProcessor - keys: {inputs.keys()}")
        print(f"DEBUG: input_ids shape: {inputs['input_ids'].shape}, values: {inputs['input_ids']}")
        print(f"DEBUG: pixel_values shape: {inputs['pixel_values'].shape}")

        # Ensure attention_mask is present, default to ones if not (though AutoProcessor should provide it)
        attention_mask = inputs.get('attention_mask')
        if attention_mask is None:
            print("WARN: attention_mask not found in processor output, creating a default one of all 1s.")
            attention_mask = torch.ones_like(inputs['input_ids']).to(device)
        print(f"DEBUG: attention_mask shape: {attention_mask.shape}")

        print("DEBUG: Calling model.generate (aligning with nanoVLM's generate.py)...")
        # Signature for nanoVLM's generate: (self, input_ids, image, attention_mask, max_new_tokens, ...)
        # `image` parameter in generate() corresponds to `pixel_values` from processor output
        generated_ids_tensor = model.generate(
            inputs['input_ids'],      # 1st argument to model.generate: input_ids (text prompt)
            inputs['pixel_values'],   # 2nd argument to model.generate: image (pixel values)
            attention_mask,           # 3rd argument to model.generate: attention_mask
            max_new_tokens=30,        # Corresponds to 4th argument in model.generate
            temperature=0.7,          # Match generate.py default or your choice
            top_k=50,                 # Match generate.py default or your choice
            greedy=False              # Match generate.py default or your choice
            # top_p is also an option from generate.py's model.generate
        )
        print(f"DEBUG: Raw generated_ids: {generated_ids_tensor}")

        generated_text_list = processor.batch_decode(generated_ids_tensor, skip_special_tokens=True)
        print(f"DEBUG: Decoded text list: {generated_text_list}")
        generated_text_str = generated_text_list[0] if generated_text_list else ""

        cleaned_text_str = generated_text_str
        if prompt_input_str and generated_text_str.startswith(prompt_input_str):
            cleaned_text_str = generated_text_str[len(prompt_input_str):].lstrip(" ,.:")
        print(f"DEBUG: Final cleaned text: '{cleaned_text_str}'")
        return cleaned_text_str.strip()

    except Exception as e:
        print(f"CRITICAL ERROR during generation: {e}")
        import traceback
        traceback.print_exc()
        return f"Error during generation: {str(e)}"

# --- Gradio Interface ---
description_md = """
## Interactive nanoVLM-222M Demo (Mirroring generate.py)
Trying to replicate the working `generate.py` script from `huggingface/nanoVLM`.
Using AutoProcessor for inputs.
"""
iface = None
if processor and model:
    try:
        iface = gr.Interface(
            fn=generate_text_for_image,
            inputs=[gr.Image(type="pil", label="Upload Image"), gr.Textbox(label="Your Prompt")],
            outputs=gr.Textbox(label="Generated Text", show_copy_button=True),
            title="nanoVLM-222M Demo (generate.py Alignment)",
            description=description_md,
            allow_flagging="never"
        )
        print("DEBUG: Gradio interface defined.")
    except Exception as e:
        print(f"CRITICAL ERROR defining Gradio interface: {e}")
        import traceback; traceback.print_exc()

if __name__ == "__main__":
    if iface:
        print("DEBUG: Launching Gradio...")
        iface.launch(server_name="0.0.0.0", server_port=7860)
    else:
        print("CRITICAL ERROR: Gradio interface not defined or model/processor failed to load. Cannot launch.")
b01847e1e13f032d8a7309a460d5d2c5.jpg
ADDED
requirements.txt
CHANGED
@@ -1,78 +1,17 @@
-
-sentencepiece
-accelerate
-aiohappyeyeballs==2.6.1
-aiohttp==3.11.18
-aiosignal==1.3.2
-annotated-types==0.7.0
-attrs==25.3.0
-certifi==2025.4.26
-charset-normalizer==3.4.2
-click==8.2.1
-datasets==3.6.0
-dill==0.3.8
-docker-pycreds==0.4.0
-filelock==3.18.0
-frozenlist==1.6.0
-fsspec==2025.3.0
-gitdb==4.0.12
-GitPython==3.1.44
-hf-xet==1.1.2
-huggingface-hub==0.32.0
-idna==3.10
-Jinja2==3.1.6
-MarkupSafe==2.0
-mpmath==1.3.0
-multidict==6.4.4
-multiprocess==0.70.16
-networkx==3.4.2
-numpy
-nvidia-cublas-cu12==12.6.4.1
-nvidia-cuda-cupti-cu12==12.6.80
-nvidia-cuda-nvrtc-cu12==12.6.77
-nvidia-cuda-runtime-cu12==12.6.77
-nvidia-cudnn-cu12==9.5.1.17
-nvidia-cufft-cu12==11.3.0.4
-nvidia-cufile-cu12==1.11.1.6
-nvidia-curand-cu12==10.3.7.77
-nvidia-cusolver-cu12==11.7.1.2
-nvidia-cusparse-cu12==12.5.4.2
-nvidia-cusparselt-cu12==0.6.3
-nvidia-nccl-cu12==2.26.2
-nvidia-nvjitlink-cu12==12.6.85
-nvidia-nvtx-cu12==12.6.77
-packaging==25.0
-pandas==2.2.3
-pillow==10.4.0
-platformdirs==4.3.8
-propcache==0.3.1
-protobuf==6.31.0
-psutil==7.0.0
-pyarrow==20.0.0
-pydantic==2.11.5
-pydantic_core==2.33.2
-python-dateutil==2.9.0.post0
-pytz==2025.2
-PyYAML==6.0.2
-regex==2024.11.6
-requests==2.32.3
-safetensors==0.5.3
-sentry-sdk==2.29.1
-setproctitle==1.3.6
-setuptools==80.8.0
-six==1.17.0
-smmap==5.0.2
-sympy==1.14.0
-tokenizers==0.21.1
+# Try to match your local working environment for generate.py
 torch==2.7.0
-
-
+# If 'transformers==4.52.3' and 'tokenizers==0.21.1' are from custom/dev builds,
+# you MUST find a way to install those exact versions in Docker, or use the
+# closest standard PyPI versions and test generate.py locally with THOSE first.
+# For this example, I'm assuming they are pip-installable. If not, adjust.
 transformers==4.52.3
-
-
-
-
-
-
-
-
+tokenizers==0.21.1
+huggingface-hub==0.32.0
+safetensors==0.5.3
+Pillow==11.2.1 # generate.py uses PIL.Image
+# For protobuf, if your local 6.31.0 is confirmed, use it. Otherwise, a standard one:
+protobuf==4.25.3 # Or your confirmed local 6.31.0 if pip-installable
+accelerate # Good to include, though generate.py might not explicitly use it
+sentencepiece # Often a dependency for tokenizers
+
+# NO Gradio needed for this test