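"""Gradio demo for nvidia/Llama-Nemotron-Nano-VL-8B-V1 on a Hugging Face ZeroGPU Space.

Three tabs: text-only chat, single-image Q&A, and two-image comparison.
"""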
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModel, AutoImageProcessor
from PIL import Image
import gc
import os
import spaces
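# `spaces.GPU` is the ZeroGPU decorator: each decorated handler is given a GPU
# for the duration of the call and the allocation is released afterwards.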
# Model configuration
MODEL_PATH = "nvidia/Llama-Nemotron-Nano-VL-8B-V1"

# Load model globally
print("Loading model...")
model = AutoModel.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
).eval()

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
image_processor = AutoImageProcessor.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True
)
print("Model loaded successfully!")
def move_to_device(obj, device):
    """Recursively move tensors (and nested containers of tensors) to a device."""
    if torch.is_tensor(obj):
        return obj.to(device)
    elif isinstance(obj, dict):
        return {k: move_to_device(v, device) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [move_to_device(v, device) for v in obj]
    elif isinstance(obj, tuple):
        return tuple(move_to_device(v, device) for v in obj)
    elif hasattr(obj, 'to'):
        return obj.to(device)
    else:
        return obj
@spaces.GPU  # request a ZeroGPU allocation for this call
def chat_text_only(message):
    try:
        device = "cuda"
        # Move the model to the GPU for this request
        model.to(device)

        generation_config = dict(
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            eos_token_id=tokenizer.eos_token_id
        )

        # model.chat tokenizes the prompt internally, so no manual tokenization is needed
        with torch.no_grad():
            response, _ = model.chat(
                tokenizer,
                None,
                message,
                generation_config,
                history=None,
                return_history=True
            )

        # Move the model back to the CPU and release GPU memory
        model.to("cpu")
        torch.cuda.empty_cache()
        gc.collect()
        return response
    except Exception as e:
        # Ensure the model is back on the CPU even if an error occurs
        model.to("cpu")
        torch.cuda.empty_cache()
        gc.collect()
        return f"Error: {str(e)}"
@spaces.GPU  # request a ZeroGPU allocation for this call
def chat_with_image(image, message):
    if image is None:
        return "Please upload an image."
    try:
        device = "cuda"
        # Move the model to the GPU for this request
        model.to(device)

        generation_config = dict(
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            eos_token_id=tokenizer.eos_token_id
        )

        # Preprocess the image and move the resulting tensors to the GPU
        image_features = image_processor(image)
        image_features = move_to_device(image_features, device)

        # Add the image token to the message if it is not already present
        if "<image>" not in message:
            message = f"<image>\n{message}"

        # Generate
        with torch.no_grad():
            response = model.chat(
                tokenizer=tokenizer,
                question=message,
                generation_config=generation_config,
                **image_features
            )

        # Move the model back to the CPU and release GPU memory
        model.to("cpu")
        torch.cuda.empty_cache()
        gc.collect()
        return response
    except Exception as e:
        # Ensure the model is back on the CPU even if an error occurs
        model.to("cpu")
        torch.cuda.empty_cache()
        gc.collect()
        return f"Error: {str(e)}"
@spaces.GPU  # request a ZeroGPU allocation for this call
def chat_with_two_images(image1, image2, message):
    if image1 is None or image2 is None:
        return "Please upload both images."
    try:
        device = "cuda"
        # Move the model to the GPU for this request
        model.to(device)

        generation_config = dict(
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            eos_token_id=tokenizer.eos_token_id
        )

        # Preprocess both images and move the resulting tensors to the GPU
        image_features = image_processor([image1, image2])
        image_features = move_to_device(image_features, device)

        # Add image placeholders to the message if they are not already present
        if "<image-1>" not in message and "<image-2>" not in message:
            message = f"<image-1>: <image>\n<image-2>: <image>\n{message}"

        # Generate
        with torch.no_grad():
            response = model.chat(
                tokenizer=tokenizer,
                question=message,
                generation_config=generation_config,
                **image_features
            )

        # Move the model back to the CPU and release GPU memory
        model.to("cpu")
        torch.cuda.empty_cache()
        gc.collect()
        return response
    except Exception as e:
        # Ensure the model is back on the CPU even if an error occurs
        model.to("cpu")
        torch.cuda.empty_cache()
        gc.collect()
        return f"Error: {str(e)}"
# Create Gradio interface
def create_interface():
    with gr.Blocks(title="Llama Nemotron Nano VL 8B", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🦙 Llama Nemotron Nano VL 8B Vision-Language Model")
        gr.Markdown("Chat with a powerful vision-language model that can understand both text and images!")

        with gr.Tabs():
            # Text-only chat tab
            with gr.TabItem("💬 Text Chat"):
                gr.Markdown("### Chat with the model using text only")
                with gr.Row():
                    with gr.Column():
                        text_input = gr.Textbox(
                            label="Your message",
                            placeholder="Ask me anything...",
                            lines=3
                        )
                        text_submit = gr.Button("Send", variant="primary")
                    with gr.Column():
                        text_output = gr.Textbox(
                            label="Model Response",
                            lines=10,
                            max_lines=20
                        )

                text_submit.click(
                    chat_text_only,
                    inputs=[text_input],
                    outputs=[text_output]
                )

                # Example questions
                gr.Examples(
                    examples=[
                        ["What is artificial intelligence?"],
                        ["Explain quantum computing in simple terms."],
                        ["What happened in 1969?"],
                        ["Write a short story about a robot."]
                    ],
                    inputs=[text_input]
                )

            # Single image chat tab
            with gr.TabItem("🖼️ Image + Text Chat"):
                gr.Markdown("### Upload an image and ask questions about it")
                with gr.Row():
                    with gr.Column():
                        image_input = gr.Image(
                            label="Upload Image",
                            type="pil"
                        )
                        image_text_input = gr.Textbox(
                            label="Your question about the image",
                            placeholder="What do you see in this image?",
                            lines=3
                        )
                        image_submit = gr.Button("Analyze", variant="primary")
                    with gr.Column():
                        image_output = gr.Textbox(
                            label="Model Response",
                            lines=10,
                            max_lines=20
                        )

                image_submit.click(
                    chat_with_image,
                    inputs=[image_input, image_text_input],
                    outputs=[image_output]
                )

                # Example prompts
                gr.Examples(
                    examples=[
                        ["Describe what you see in this image."],
                        ["What objects are in this image?"],
                        ["Extract any text from this image."],
                        ["What is the main subject of this image?"]
                    ],
                    inputs=[image_text_input]
                )

            # Two images comparison tab
            with gr.TabItem("🖼️🖼️ Compare Two Images"):
                gr.Markdown("### Upload two images and ask the model to compare them")
                with gr.Row():
                    with gr.Column():
                        image1_input = gr.Image(
                            label="First Image",
                            type="pil"
                        )
                        image2_input = gr.Image(
                            label="Second Image",
                            type="pil"
                        )
                        two_images_text_input = gr.Textbox(
                            label="Your question about both images",
                            placeholder="Compare these two images...",
                            lines=3
                        )
                        two_images_submit = gr.Button("Compare", variant="primary")
                    with gr.Column():
                        two_images_output = gr.Textbox(
                            label="Model Response",
                            lines=10,
                            max_lines=20
                        )

                two_images_submit.click(
                    chat_with_two_images,
                    inputs=[image1_input, image2_input, two_images_text_input],
                    outputs=[two_images_output]
                )

                # Example prompts
                gr.Examples(
                    examples=[
                        ["What are the main differences between these two images?"],
                        ["Describe both images briefly."],
                        ["Which image is more colorful?"],
                        ["Compare the subjects in these images."]
                    ],
                    inputs=[two_images_text_input]
                )

        # Footer
        gr.Markdown("---")
        gr.Markdown("⚡ Powered by NVIDIA Llama Nemotron Nano VL 8B")

    return demo
# Create and launch the interface
if __name__ == "__main__":
    demo = create_interface()
    demo.queue()  # Enable queuing for Zero GPU
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        ssr_mode=False
    )
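# A requirements.txt for this Space would likely include the packages imported
# above (exact entries and pins are an assumption, not taken from the original):
#   torch
#   transformers
#   accelerate   # needed for low_cpu_mem_usage=True
#   gradio
#   pillow
#   spaces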