vidhanm committed on
Commit
4670dfa
·
1 Parent(s): 1313dd4

Add application files for nanoVLM

Browse files
Files changed (3) hide show
  1. Dockerfile +30 -0
  2. app.py +113 -0
  3. requirements.txt +6 -0
Dockerfile ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use a slim Python base image. For GPU, you'd need a CUDA-enabled base.
FROM python:3.9-slim

# Install git (useful for some Hugging Face model/tokenizer downloads that might use it)
# and the common build tools often needed to compile Python packages.
# --no-install-recommends keeps the layer free of optional packages.
RUN apt-get update && apt-get install -y --no-install-recommends \
        git \
        build-essential \
    && rm -rf /var/lib/apt/lists/*

# Hugging Face Docker Spaces run the container as UID 1000 rather than root.
# Create that user up front and hand it the working directory, so the app can
# write its caches (downloaded model files, cached Gradio examples) at runtime.
RUN useradd --create-home --uid 1000 user \
    && mkdir -p /app \
    && chown user:user /app

# Set the working directory in the container
WORKDIR /app

# Everything below runs as the unprivileged user. pip falls back to a per-user
# install, so the user's bin directory must be on PATH.
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Copy the requirements file first to leverage Docker layer caching
COPY --chown=user:user requirements.txt requirements.txt

# Install Python dependencies
# --no-cache-dir reduces image size
# --prefer-binary can speed up builds for packages with binary distributions
RUN pip install --no-cache-dir --prefer-binary -r requirements.txt

# Copy the application code into the container
COPY --chown=user:user app.py app.py

# Expose the port Gradio will run on (default is 7860)
EXPOSE 7860

# Set the default command to run the Gradio application
# Using `python -u` for unbuffered output, which is good for logging
CMD ["python", "-u", "app.py"]
app.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from PIL import Image
3
+ import torch
4
+ from transformers import AutoProcessor, AutoModelForVision2Seq
5
+ import os
6
+
7
# Determine the device to use.
# The DEVICE environment variable allows an override from the Space hardware
# config; "auto" (the default) picks CUDA when available and CPU otherwise.
def resolve_device(requested):
    """Return the torch device string to run the model on.

    Args:
        requested: Raw value of the ``DEVICE`` override. ``None``, an empty
            string or ``"auto"`` (any case, surrounding whitespace ignored)
            selects CUDA when it is available and CPU otherwise.

    Returns:
        A lower-cased device string such as ``"cpu"``, ``"cuda"`` or
        ``"cuda:0"``. An explicit CUDA request falls back to ``"cpu"`` when
        CUDA is not available, so a stale override cannot break model loading.
    """
    choice = (requested or "auto").strip().lower()
    cuda_available = torch.cuda.is_available()

    if choice in ("", "auto"):
        return "cuda" if cuda_available else "cpu"

    if choice.startswith("cuda") and not cuda_available:
        print(f"Requested device '{choice}' but CUDA is not available; falling back to CPU.")
        return "cpu"

    return choice


device = resolve_device(os.environ.get("DEVICE", "auto"))

print(f"Using device: {device}")
17
+
18
# Load the model and processor once at import time.
# NOTE(review): confirm that this checkpoint can be loaded through the
# transformers Auto classes; if it needs a project-specific loader, the
# except branch below will fire and the UI will only report a load error.
model_id = "lusxvr/nanoVLM-222M"
try:
    processor = AutoProcessor.from_pretrained(model_id)
    model = AutoModelForVision2Seq.from_pretrained(model_id).to(device)
    print("Model and processor loaded successfully.")
except Exception as e:
    # Top-level boundary: keep the app importable so Gradio can still start
    # and show an error, but log the full traceback (not just the message)
    # so the Space logs are actionable.
    import traceback

    print(f"Error loading model/processor: {e}")
    traceback.print_exc()
    processor = None
    model = None
30
+
31
def _strip_prompt_echo(generated_text, prompt):
    """Drop a leading copy of *prompt* from *generated_text*, if present."""
    if generated_text.startswith(prompt):
        return generated_text[len(prompt):].lstrip(" ,.:")
    return generated_text


def generate_text_for_image(image_input, prompt_input):
    """Generate text for an image and a text prompt.

    Args:
        image_input: The uploaded image, either a ``PIL.Image.Image`` or an
            array accepted by ``Image.fromarray``. ``None`` means no upload.
        prompt_input: The user's question or instruction. Leading and
            trailing whitespace is ignored; a blank prompt is rejected.

    Returns:
        The generated text with any leading echo of the prompt removed, or a
        human-readable message when the model is not loaded, an input is
        missing, or generation fails. This function never raises; errors are
        printed and returned as text so the UI can display them.
    """
    if model is None or processor is None:
        return "Error: Model or processor not loaded. Check the Space logs for details."

    if image_input is None:
        return "Please upload an image."

    # Treat whitespace-only prompts as empty instead of sending them to the model.
    prompt = prompt_input.strip() if isinstance(prompt_input, str) else prompt_input
    if not prompt:
        return "Please provide a prompt (e.g., 'Describe this image' or 'What color is the car?')."

    try:
        # Ensure the image is in PIL format and RGB
        if isinstance(image_input, Image.Image):
            pil_image = image_input
        else:
            pil_image = Image.fromarray(image_input)

        if pil_image.mode != "RGB":
            pil_image = pil_image.convert("RGB")

        # Prepare a batch of one (prompt, image) pair on the model's device.
        inputs = processor(text=[prompt], images=[pil_image], return_tensors="pt").to(device)

        # Generate text
        # You can adjust max_new_tokens, temperature, top_k, etc.
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=150,  # Room for longer descriptions
            num_beams=3,  # Beam search
            no_repeat_ngram_size=2,
            early_stopping=True,
        )

        # Decode the generated tokens; the batch has a single entry.
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        # The output might include the prompt itself, depending on the model,
        # so remove it when it appears at the very beginning.
        return _strip_prompt_echo(generated_text, prompt).strip()

    except Exception as e:
        # UI boundary: report the failure to the user rather than crashing the request.
        print(f"Error during generation: {e}")
        return f"An error occurred: {str(e)}"
82
+
83
# Create the Gradio interface
description = """
Upload an image and provide a text prompt (e.g., "What is in this image?", "Describe the animal in detail.").
The model will generate a textual response based on the visual content and your query.
This Space uses the `lusxvr/nanoVLM-222M` model.
"""

# Example image from COCO dataset
example_image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # A cat and a remote

iface = gr.Interface(
    fn=generate_text_for_image,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(
            label="Your Prompt/Question",
            info="e.g., 'What is this a picture of?', 'Describe the main subject.', 'How many animals are there?'",
        ),
    ],
    outputs=gr.Textbox(label="Generated Text", show_copy_button=True),
    title="Interactive nanoVLM-222M Demo",
    description=description,
    examples=[
        [example_image_url, "a photo of a"],
        [example_image_url, "Describe the image in detail."],
        [example_image_url, "What objects are on the sofa?"],
    ],
    # Caching runs the callback on every example at startup. Only do that when
    # the model actually loaded; otherwise the "not loaded" error text would be
    # stored as the cached example output.
    cache_examples=model is not None,
)

if __name__ == "__main__":
    # For Hugging Face Spaces, it's common to launch with server_name="0.0.0.0"
    # The Space infrastructure handles the public URL and port mapping.
    iface.launch(server_name="0.0.0.0", server_port=7860)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
torch>=2.0.0
transformers>=4.36.0
Pillow>=10.0.0
# app.py passes version-sensitive keyword arguments to Gradio components, so
# bound the major version to keep rebuilds reproducible.
# NOTE(review): the Dockerfile uses Python 3.9; confirm this range before
# changing the base image or relaxing the upper bound.
gradio>=4.0.0,<5.0.0
sentencepiece
accelerate