vidhanm committed
Commit a4ebbec · 1 Parent(s): 984c158
Files changed (2)
  1. Dockerfile +5 -19
  2. app.py +141 -147
Dockerfile CHANGED
@@ -1,33 +1,19 @@
-FROM python:3.9-slim
 
 WORKDIR /app
 
-# Install git
 RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
 
-# Copy requirements and install
 COPY requirements.txt requirements.txt
-RUN echo "DEBUG: Installing packages from requirements.txt for Gradio app" && \
-    pip install --no-cache-dir -r requirements.txt && \
-    echo "DEBUG: Finished installing packages."
 
-# Clone the nanoVLM repository
-RUN echo "DEBUG: Cloning huggingface/nanoVLM repository..." && \
-    git clone https://github.com/huggingface/nanoVLM.git /app/nanoVLM && \
-    echo "DEBUG: nanoVLM repository cloned to /app/nanoVLM."
 
-# Set Python path
-ENV PYTHONPATH="/app/nanoVLM:${PYTHONPATH}"
 ENV HF_HOME=/app/.cache/huggingface
-
-# Create cache directory
 RUN mkdir -p $HF_HOME && chmod -R 777 $HF_HOME
 
-# Copy your Gradio application
-COPY app.py app.py
 
-# Expose the port Gradio runs on
 EXPOSE 7860
-
-# Command to run the Gradio application
 CMD ["python", "-u", "app.py"]

+FROM python:3.9-slim
 
 WORKDIR /app
 
 RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
 
 COPY requirements.txt requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
 
+RUN git clone https://github.com/huggingface/nanoVLM.git /app/nanoVLM
 
+# So generate.py can find 'from models...'
+ENV PYTHONPATH="/app/nanoVLM:${PYTHONPATH}"
 ENV HF_HOME=/app/.cache/huggingface
 RUN mkdir -p $HF_HOME && chmod -R 777 $HF_HOME
 
+# Your new Gradio app.py that calls generate.py
+COPY app.py app.py
 
 EXPOSE 7860
 
 CMD ["python", "-u", "app.py"]
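Side note on the PYTHONPATH line above: it exists so that generate.py's `from models...` imports resolve even though the script lives inside the cloned repo. A minimal sketch of the same effect in plain Python, using the path and module names from this commit (only meaningful inside the built container):

# Sketch: putting the cloned repo root on the module search path makes the
# repo's 'models' package importable, which is what ENV PYTHONPATH achieves
# for every process started in the container.
import sys

sys.path.insert(0, "/app/nanoVLM")  # same effect as the ENV PYTHONPATH line
from models.vision_language_model import VisionLanguageModel  # noqa: E402

print(VisionLanguageModel)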
app.py CHANGED
@@ -1,179 +1,173 @@
 import sys
 import os
 from typing import Optional
 from PIL import Image as PILImage
 
-# Add the cloned nanoVLM directory to Python's system path
-NANOVLM_REPO_PATH = "/app/nanoVLM"  # This path is where your Dockerfile clones huggingface/nanoVLM
 if NANOVLM_REPO_PATH not in sys.path:
     print(f"DEBUG: Adding {NANOVLM_REPO_PATH} to sys.path")
     sys.path.insert(0, NANOVLM_REPO_PATH)
 
-import gradio as gr
-import torch
-from transformers import AutoProcessor  # Using AutoProcessor, as in the successful generate.py
-
-# Import the custom VisionLanguageModel class
-VisionLanguageModel = None
-try:
-    print("DEBUG: Attempting to import VisionLanguageModel from models.vision_language_model")
-    from models.vision_language_model import VisionLanguageModel
-    print("DEBUG: Successfully imported VisionLanguageModel.")
-except ImportError as e:
-    print(f"CRITICAL ERROR: Importing VisionLanguageModel failed: {e}")
-except Exception as e:
-    print(f"CRITICAL ERROR: An unexpected error occurred during VisionLanguageModel import: {e}")
-
-# --- Device Setup ---
-device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"DEBUG: Using device: {device}")
-
-# --- Configuration ---
-model_repo_id = "lusxvr/nanoVLM-222M"  # Used for both processor and model weights
-print(f"DEBUG: Model Repository ID for processor and model: {model_repo_id}")
-
-# --- Initialize ---
-processor = None
-model = None
-
-if VisionLanguageModel:  # Only proceed if the custom model class was imported
     try:
-        # Load the processor using AutoProcessor, mirroring generate.py.
-        print(f"DEBUG: Loading processor using AutoProcessor.from_pretrained('{model_repo_id}')")
-        # generate.py doesn't explicitly use trust_remote_code=True for the processor;
-        # it may be implicitly active in your local transformers, or unnecessary if processor_config is clear.
-        # Try without it first, and add it back if "Unrecognized model" reappears for the processor.
-        processor = AutoProcessor.from_pretrained(model_repo_id)
-        print(f"DEBUG: AutoProcessor loaded: {type(processor)}")
-
-        # Ensure the tokenizer has pad_token set if it's GPT-2 based (AutoProcessor should expose a tokenizer component)
-        if hasattr(processor, 'tokenizer') and processor.tokenizer is not None:
-            current_tokenizer = processor.tokenizer
-            if getattr(current_tokenizer, 'pad_token', None) is None and hasattr(current_tokenizer, 'eos_token'):
-                current_tokenizer.pad_token = current_tokenizer.eos_token
-                print(f"DEBUG: Set processor.tokenizer.pad_token to eos_token (ID: {current_tokenizer.eos_token_id})")
-        else:
-            print("WARN: Processor does not have a 'tokenizer' attribute or it is None. Cannot set pad_token.")
-
-        # Load the model using VisionLanguageModel.from_pretrained, mirroring generate.py.
-        print(f"DEBUG: Loading model VisionLanguageModel.from_pretrained('{model_repo_id}')")
-        # The custom VLM.from_pretrained doesn't take trust_remote_code.
-        model = VisionLanguageModel.from_pretrained(model_repo_id).to(device)
-        print(f"DEBUG: VisionLanguageModel loaded: {type(model)}")
-        model.eval()
-        print("DEBUG: Model set to eval() mode.")
-
     except Exception as e:
-        print(f"CRITICAL ERROR loading model or processor: {e}")
         import traceback
         traceback.print_exc()
-        processor = None; model = None  # Ensure they are None if loading fails
-else:
-    print("CRITICAL ERROR: VisionLanguageModel class not imported. Cannot load model.")
 
 
-# --- Text Generation Function ---
-def generate_text_for_image(image_input_pil: Optional[PILImage.Image], prompt_input_str: Optional[str]) -> str:
-    print(f"DEBUG (generate_text_for_image): Received prompt: '{prompt_input_str}'")
-    if model is None or processor is None:
-        print("ERROR (generate_text_for_image): Model or processor not loaded.")
-        return "Error: Model or processor not loaded. Please check the application logs."
     if image_input_pil is None:
-        print("WARN (generate_text_for_image): No image uploaded.")
         return "Please upload an image."
-    if not prompt_input_str:  # Check for an empty or None prompt
-        print("WARN (generate_text_for_image): No prompt provided.")
         return "Please provide a prompt."
 
     try:
-        current_pil_image = image_input_pil
-        if not isinstance(current_pil_image, PILImage.Image):  # Should be PIL from Gradio's type="pil"
-            print(f"WARN (generate_text_for_image): Input image not PIL, type: {type(current_pil_image)}. Converting.")
-            current_pil_image = PILImage.fromarray(current_pil_image)
-        if current_pil_image.mode != "RGB":
-            print(f"DEBUG (generate_text_for_image): Converting image from {current_pil_image.mode} to RGB.")
-            current_pil_image = current_pil_image.convert("RGB")
-        print(f"DEBUG (generate_text_for_image): Image prepped - size: {current_pil_image.size}, mode: {current_pil_image.mode}")
-
-        # Prepare inputs using the AutoProcessor, as in generate.py.
-        print("DEBUG (generate_text_for_image): Processing inputs with AutoProcessor...")
-        inputs = processor(
-            text=[prompt_input_str], images=current_pil_image, return_tensors="pt"
-        ).to(device)
-        print(f"DEBUG (generate_text_for_image): Inputs from AutoProcessor - keys: {inputs.keys()}")
-        print(f"DEBUG (generate_text_for_image): input_ids shape: {inputs['input_ids'].shape}, values: {inputs['input_ids']}")
-        print(f"DEBUG (generate_text_for_image): pixel_values shape: {inputs['pixel_values'].shape}")
 
-        attention_mask = inputs.get('attention_mask')
-        if attention_mask is None:  # Should be provided by AutoProcessor
-            print("WARN (generate_text_for_image): attention_mask not in processor output. Creating default.")
-            attention_mask = torch.ones_like(inputs['input_ids']).to(device)
-        print(f"DEBUG (generate_text_for_image): attention_mask shape: {attention_mask.shape}")
-
-        print("DEBUG (generate_text_for_image): Calling model.generate...")
-        # Signature for nanoVLM's generate: (self, input_ids, image, attention_mask, max_new_tokens, ...)
-        generated_ids_tensor = model.generate(
-            inputs['input_ids'],
-            inputs['pixel_values'],  # This is the 'image' argument for the model's generate method
-            attention_mask,
-            max_new_tokens=50,  # Consistent with the successful generate.py test
-            temperature=0.7,    # From generate.py defaults (or adjust as preferred)
-            top_k=50,           # From generate.py defaults (or adjust as preferred)
-            # greedy=False is the default in nanoVLM's generate
-        )
-        print(f"DEBUG (generate_text_for_image): Raw generated_ids: {generated_ids_tensor}")
-
-        # Use processor.batch_decode, as in generate.py.
-        generated_text_list = processor.batch_decode(generated_ids_tensor, skip_special_tokens=True)
-        print(f"DEBUG (generate_text_for_image): Decoded text list: {generated_text_list}")
-        generated_text_str = generated_text_list[0] if generated_text_list else ""
 
-        # Optional: clean up the prompt if it is echoed back
-        cleaned_text_str = generated_text_str
-        if prompt_input_str and generated_text_str.startswith(prompt_input_str):
-            cleaned_text_str = generated_text_str[len(prompt_input_str):].lstrip(" ,.:")
-        print(f"DEBUG (generate_text_for_image): Final cleaned text: '{cleaned_text_str}'")
-        return cleaned_text_str.strip()
-
     except Exception as e:
-        print(f"CRITICAL ERROR during generation: {e}")
-        import traceback
-        traceback.print_exc()
-        return f"Error during generation: {str(e)}. Check logs."
-
-# --- Gradio Interface ---
 description_md = """
-## nanoVLM-222M Interactive Demo
-Upload an image and type a prompt to get a description or answer from the model.
-This Space uses the `lusxvr/nanoVLM-222M` model weights with the `huggingface/nanoVLM` model code.
 """
-iface = None
-# Only define the interface if the model and processor loaded successfully
-if VisionLanguageModel and model and processor:
-    try:
-        print("DEBUG: Defining Gradio interface...")
-        iface = gr.Interface(
-            fn=generate_text_for_image,
-            inputs=[
-                gr.Image(type="pil", label="Upload Image"),
-                gr.Textbox(label="Your Prompt / Question", info="e.g., 'describe this image in detail'")
-            ],
-            outputs=gr.Textbox(label="Generated Text", show_copy_button=True),
-            title="nanoVLM-222M Demo",
-            description=description_md,
-            allow_flagging="never"  # No examples or caching for now, to keep it simple
-        )
-        print("DEBUG: Gradio interface defined successfully.")
-    except Exception as e:
-        print(f"CRITICAL ERROR defining Gradio interface: {e}")
-        import traceback; traceback.print_exc()
-else:
-    print("WARN: Model and/or processor did not load. Gradio interface will not be created.")
 
 
 # --- Launch Gradio App ---
 if __name__ == "__main__":
     print("DEBUG: Entered __main__ block for Gradio launch.")
     if iface is not None:
         print("DEBUG: Attempting to launch Gradio interface...")
         try:
 
 import sys
 import os
+import subprocess  # For calling generate.py
+import tempfile    # For handling temporary image files
 from typing import Optional
 from PIL import Image as PILImage
+import gradio as gr
 
+# Add the cloned nanoVLM directory to Python's system path
+# (generate.py may need this too if it imports from 'models')
+NANOVLM_REPO_PATH = "/app/nanoVLM"
 if NANOVLM_REPO_PATH not in sys.path:
     print(f"DEBUG: Adding {NANOVLM_REPO_PATH} to sys.path")
     sys.path.insert(0, NANOVLM_REPO_PATH)
 
+print(f"DEBUG: Python sys.path: {sys.path}")
+
+# Path to the generate.py script within our Docker container
+GENERATE_SCRIPT_PATH = "/app/nanoVLM/generate.py"
+MODEL_REPO_ID = "lusxvr/nanoVLM-222M"  # Model ID for generate.py
+
+print(f"DEBUG: Using generate.py script at: {GENERATE_SCRIPT_PATH}")
+print(f"DEBUG: Using model repo ID: {MODEL_REPO_ID}")
+
+
+def call_generate_script(image_path: str, prompt_text: str) -> str:
+    """Call the generate.py script as a subprocess and return its parsed output."""
+    print(f"DEBUG (call_generate_script): Calling with image_path='{image_path}', prompt='{prompt_text}'")
+
+    # Arguments for generate.py (ensure they match its expected format).
+    # From the previously successful run: --hf_model, --image, --prompt, --generations, --max_new_tokens
+    cmd_args = [
+        "python", "-u", GENERATE_SCRIPT_PATH,
+        "--hf_model", MODEL_REPO_ID,
+        "--image", image_path,
+        "--prompt", prompt_text,
+        "--generations", "1",      # Get one generation for the UI
+        "--max_new_tokens", "70",  # Adjust as needed
+        # --device is handled by generate.py internally
+    ]
+
+    print(f"DEBUG (call_generate_script): Executing command: {' '.join(cmd_args)}")
+
     try:
+        # Execute the command. capture_output=True and text=True require
+        # Python 3.7+, which the python:3.9-slim base image satisfies.
+        process = subprocess.run(
+            cmd_args,
+            capture_output=True,
+            text=True,
+            check=True,   # Raise an exception for non-zero exit codes
+            timeout=120,  # Add a timeout (e.g., 2 minutes)
+        )
+
+        stdout = process.stdout
+        stderr = process.stderr
+
+        print(f"DEBUG (call_generate_script): generate.py STDOUT:\n{stdout}")
+        if stderr:
+            print(f"DEBUG (call_generate_script): generate.py STDERR:\n{stderr}")
+
+        # --- Parse the output from generate.py ---
+        # The generate.py script prints:
+        #     Outputs:
+        #       >> Generation 1: Actual generated text here.
+        # We need to extract "Actual generated text here."
+        output_lines = stdout.splitlines()
+        generated_text = "Error: Could not parse output from generate.py script."  # Default
+
+        parsing_output = False
+        for line in output_lines:
+            if "Outputs:" in line:
+                parsing_output = True
+                continue
+            if parsing_output and line.strip().startswith(">> Generation 1:"):
+                # Extract the text after ">> Generation 1: "
+                generated_text = line.split(">> Generation 1: ", 1)[-1].strip()
+                break  # Found the first generation
+
+        print(f"DEBUG (call_generate_script): Parsed generated text: '{generated_text}'")
+        return generated_text
+
+    except subprocess.CalledProcessError as e:
+        print(f"ERROR (call_generate_script): generate.py exited with error code {e.returncode}")
+        print(f"ERROR (call_generate_script): STDOUT: {e.stdout}")
+        print(f"ERROR (call_generate_script): STDERR: {e.stderr}")
+        return f"Error executing generation script (Code {e.returncode}). Check logs."
+    except subprocess.TimeoutExpired:
+        print("ERROR (call_generate_script): generate.py timed out.")
+        return "Error: Generation script timed out."
     except Exception as e:
+        print(f"ERROR (call_generate_script): An unexpected error occurred: {e}")
         import traceback
         traceback.print_exc()
+        return f"An unexpected error occurred while calling generation script: {str(e)}"
 
 
+def gradio_interface_fn(image_input_pil: Optional[PILImage.Image], prompt_input_str: Optional[str]) -> str:
+    print(f"DEBUG (gradio_interface_fn): Received prompt: '{prompt_input_str}'")
     if image_input_pil is None:
         return "Please upload an image."
+    if not prompt_input_str:
         return "Please provide a prompt."
 
+    # Save the uploaded PIL image to a temporary file.
+    # tempfile.NamedTemporaryFile creates a file that is deleted when closed.
+    # We give it a .jpg suffix in case any image library is picky about extensions,
+    # and 'delete=False' lets us close it, pass its name along, and delete it manually.
     try:
+        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_image_file:
+            image_input_pil.save(tmp_image_file, format="JPEG")
+            tmp_image_path = tmp_image_file.name
 
+        print(f"DEBUG (gradio_interface_fn): Temporary image saved to: {tmp_image_path}")
+
+        # Call the generate.py script with the path to the temporary image
+        result_text = call_generate_script(tmp_image_path, prompt_input_str)
+
+        return result_text
     except Exception as e:
+        print(f"ERROR (gradio_interface_fn): Error processing image or calling script: {e}")
+        import traceback; traceback.print_exc()
+        return f"An error occurred: {str(e)}"
+    finally:
+        # Clean up the temporary image file
+        if 'tmp_image_path' in locals() and os.path.exists(tmp_image_path):
+            try:
+                os.remove(tmp_image_path)
+                print(f"DEBUG (gradio_interface_fn): Temporary image {tmp_image_path} removed.")
+            except Exception as e_remove:
+                print(f"WARN (gradio_interface_fn): Could not remove temporary image {tmp_image_path}: {e_remove}")
 
+# --- Gradio Interface Definition ---
 description_md = """
+## nanoVLM-222M Interactive Demo (via generate.py)
+Upload an image and type a prompt. This interface calls the `generate.py` script from
+`huggingface/nanoVLM` under the hood to perform inference.
 """
 
+print("DEBUG: Defining Gradio interface...")
+iface = None
+try:
+    iface = gr.Interface(
+        fn=gradio_interface_fn,
+        inputs=[
+            gr.Image(type="pil", label="Upload Image"),
+            gr.Textbox(label="Your Prompt / Question", info="e.g., 'describe this image in detail'")
+        ],
+        outputs=gr.Textbox(label="Generated Text", show_copy_button=True),
+        title="nanoVLM-222M Demo (via Script)",
+        description=description_md,
+        allow_flagging="never"
+    )
+    print("DEBUG: Gradio interface defined successfully.")
+except Exception as e:
+    print(f"CRITICAL ERROR defining Gradio interface: {e}")
+    import traceback; traceback.print_exc()
 
 # --- Launch Gradio App ---
 if __name__ == "__main__":
     print("DEBUG: Entered __main__ block for Gradio launch.")
+    if not os.path.exists(GENERATE_SCRIPT_PATH):
+        print(f"CRITICAL ERROR: The script {GENERATE_SCRIPT_PATH} was not found. Cannot launch app.")
+        iface = None  # Prevent launch
+
     if iface is not None:
         print("DEBUG: Attempting to launch Gradio interface...")
         try:
@@ -183,4 +177,4 @@ if __name__ == "__main__":
             print(f"CRITICAL ERROR launching Gradio interface: {e}")
             import traceback; traceback.print_exc()
         else:
-            print("CRITICAL ERROR: Gradio interface (iface) is None or not defined due to loading errors. Cannot launch.")
+            print("CRITICAL ERROR: Gradio interface (iface) is None or not defined. Cannot launch.")