Spaces:

Gapeleon
/

Llama-3.1-Nemotron-Nano-VL-8B-V1-Demo

Running on Zero

File size: 11,003 Bytes

import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModel, AutoImageProcessor
from PIL import Image
import gc
import os
import spaces

# Identifier handed to every from_pretrained() call below
MODEL_PATH = "nvidia/Llama-Nemotron-Nano-VL-8B-V1"

# The model, tokenizer and image processor are created once at import time
# and shared by all request handlers in this module.
print("Loading model...")
model = AutoModel.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
)
# Switch to evaluation mode; eval() hands back the module it was called on.
model = model.eval()

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
image_processor = AutoImageProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
print("Model loaded successfully!")

def move_to_device(obj, device):
    """Return ``obj`` with every tensor it contains placed on ``device``.

    Dicts, lists and tuples are rebuilt as plain ``dict``/``list``/``tuple``
    with each element converted recursively. Any other object that exposes a
    ``to`` attribute is converted by calling ``obj.to(device)``; anything else
    is handed back unchanged.
    """
    # Tensors are checked first so they never reach the container branches.
    if torch.is_tensor(obj):
        return obj.to(device)

    if isinstance(obj, dict):
        return {key: move_to_device(value, device) for key, value in obj.items()}

    if isinstance(obj, list):
        return [move_to_device(item, device) for item in obj]

    if isinstance(obj, tuple):
        return tuple(move_to_device(item, device) for item in obj)

    # Duck-typed fallback for non-container objects that provide their own ``to``.
    if hasattr(obj, 'to'):
        return obj.to(device)

    return obj

@spaces.GPU(duration=60)
def chat_text_only(message):
    """Generate a reply to a text-only prompt.

    The globally loaded model is moved to the GPU for the duration of the
    call and moved back to the CPU afterwards, whether generation succeeded
    or raised.

    Args:
        message: The user's prompt text.

    Returns:
        The model's response string; ``"Please enter a message."`` when the
        prompt is empty or whitespace-only; or an ``"Error: ..."`` string
        when anything raised while moving the model or generating.
    """
    # Nothing useful can be generated from a blank prompt.
    if not message or not message.strip():
        return "Please enter a message."

    device = "cuda"

    try:
        # Move entire model to GPU
        model.to(device)

        generation_config = dict(
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            eos_token_id=tokenizer.eos_token_id,
        )

        # model.chat is handed the tokenizer and the raw message, so no
        # separate tokenization step is performed here. The second positional
        # argument is None because this path carries no image input.
        with torch.no_grad():
            response, _ = model.chat(
                tokenizer,
                None,
                message,
                generation_config,
                history=None,
                return_history=True,
            )

        return response

    except Exception as e:
        # UI boundary: surface the failure as text in the response box.
        return f"Error: {str(e)}"

    finally:
        # Always move the model back to the CPU and release cached GPU memory,
        # on both the success and the error path.
        model.to("cpu")
        torch.cuda.empty_cache()
        gc.collect()

@spaces.GPU(duration=60)
def chat_with_image(image, message):
    """Answer a question about a single uploaded image.

    The globally loaded model is moved to the GPU for the duration of the
    call and moved back to the CPU afterwards, whether generation succeeded
    or raised.

    Args:
        image: The uploaded image, or ``None`` when nothing was uploaded.
        message: The user's question. An ``<image>`` placeholder line is
            prepended when the text does not already contain one.

    Returns:
        The model's response; ``"Please upload an image."`` when ``image`` is
        ``None``; or an ``"Error: ..."`` string when anything raised during
        preprocessing or generation.
    """
    if image is None:
        return "Please upload an image."

    device = "cuda"

    try:
        # Move entire model to GPU
        model.to(device)

        generation_config = dict(
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            eos_token_id=tokenizer.eos_token_id,
        )

        # Preprocess the image, then move every tensor it produced to the GPU.
        image_features = move_to_device(image_processor(image), device)

        # Add image token to message if not present
        if "<image>" not in message:
            message = f"<image>\n{message}"

        # The processor output is expanded into keyword arguments of model.chat.
        with torch.no_grad():
            response = model.chat(
                tokenizer=tokenizer,
                question=message,
                generation_config=generation_config,
                **image_features
            )

        return response

    except Exception as e:
        # UI boundary: surface the failure as text in the response box.
        return f"Error: {str(e)}"

    finally:
        # Always move the model back to the CPU and release cached GPU memory,
        # on both the success and the error path.
        model.to("cpu")
        torch.cuda.empty_cache()
        gc.collect()

@spaces.GPU(duration=60)
def chat_with_two_images(image1, image2, message):
    """Answer a question about a pair of uploaded images.

    The globally loaded model is moved to the GPU for the duration of the
    call and moved back to the CPU afterwards, whether generation succeeded
    or raised.

    Args:
        image1: The first uploaded image, or ``None`` when missing.
        image2: The second uploaded image, or ``None`` when missing.
        message: The user's question. When it contains no ``<image>``
            placeholder, a two-line header labelling and placing both images
            is prepended.

    Returns:
        The model's response; ``"Please upload both images."`` when either
        image is ``None``; or an ``"Error: ..."`` string when anything raised
        during preprocessing or generation.
    """
    if image1 is None or image2 is None:
        return "Please upload both images."

    device = "cuda"

    try:
        # Move entire model to GPU
        model.to(device)

        generation_config = dict(
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            eos_token_id=tokenizer.eos_token_id,
        )

        # Preprocess both images together, then move the tensors to the GPU.
        image_features = move_to_device(image_processor([image1, image2]), device)

        # Decide on the "<image>" placeholder itself rather than on the
        # "<image-N>" labels: a message that already places its images must
        # not receive two extra placeholders, and a message that merely
        # mentions a label still needs them.
        if "<image>" not in message:
            message = f"<image-1>: <image>\n<image-2>: <image>\n{message}"

        # The processor output is expanded into keyword arguments of model.chat.
        with torch.no_grad():
            response = model.chat(
                tokenizer=tokenizer,
                question=message,
                generation_config=generation_config,
                **image_features
            )

        return response

    except Exception as e:
        # UI boundary: surface the failure as text in the response box.
        return f"Error: {str(e)}"

    finally:
        # Always move the model back to the CPU and release cached GPU memory,
        # on both the success and the error path.
        model.to("cpu")
        torch.cuda.empty_cache()
        gc.collect()

# Build the Gradio UI
def create_interface():
    """Assemble the three-tab Gradio demo and return the ``gr.Blocks`` app.

    The tabs are text-only chat, single-image chat and two-image comparison.
    Each tab wires its button to the matching ``chat_*`` handler and lists a
    few example prompts that fill the tab's text box.
    """
    # Example prompts: one single-column row per example.
    text_examples = [
        ["What is artificial intelligence?"],
        ["Explain quantum computing in simple terms."],
        ["What happened in 1969?"],
        ["Write a short story about a robot."],
    ]
    single_image_examples = [
        ["Describe what you see in this image."],
        ["What objects are in this image?"],
        ["Extract any text from this image."],
        ["What is the main subject of this image?"],
    ]
    two_image_examples = [
        ["What are the main differences between these two images?"],
        ["Describe both images briefly."],
        ["Which image is more colorful?"],
        ["Compare the subjects in these images."],
    ]

    def response_box():
        # Every tab shows its result in an identically configured textbox.
        return gr.Textbox(label="Model Response", lines=10, max_lines=20)

    with gr.Blocks(title="Llama Nemotron Nano VL 8B", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🦙 Llama Nemotron Nano VL 8B Vision-Language Model")
        gr.Markdown("Chat with a powerful vision-language model that can understand both text and images!")

        with gr.Tabs():
            # --- Tab 1: text only ---
            with gr.TabItem("💬 Text Chat"):
                gr.Markdown("### Chat with the model using text only")

                with gr.Row():
                    with gr.Column():
                        prompt_box = gr.Textbox(
                            label="Your message",
                            placeholder="Ask me anything...",
                            lines=3,
                        )
                        send_button = gr.Button("Send", variant="primary")

                    with gr.Column():
                        reply_box = response_box()

                send_button.click(
                    fn=chat_text_only,
                    inputs=[prompt_box],
                    outputs=[reply_box],
                )

                gr.Examples(examples=text_examples, inputs=[prompt_box])

            # --- Tab 2: one image plus a question ---
            with gr.TabItem("🖼️ Image + Text Chat"):
                gr.Markdown("### Upload an image and ask questions about it")

                with gr.Row():
                    with gr.Column():
                        picture = gr.Image(label="Upload Image", type="pil")
                        picture_question = gr.Textbox(
                            label="Your question about the image",
                            placeholder="What do you see in this image?",
                            lines=3,
                        )
                        analyze_button = gr.Button("Analyze", variant="primary")

                    with gr.Column():
                        picture_reply = response_box()

                analyze_button.click(
                    fn=chat_with_image,
                    inputs=[picture, picture_question],
                    outputs=[picture_reply],
                )

                gr.Examples(examples=single_image_examples, inputs=[picture_question])

            # --- Tab 3: two images compared ---
            with gr.TabItem("🖼️🖼️ Compare Two Images"):
                gr.Markdown("### Upload two images and ask the model to compare them")

                with gr.Row():
                    with gr.Column():
                        first_picture = gr.Image(label="First Image", type="pil")
                        second_picture = gr.Image(label="Second Image", type="pil")
                        comparison_question = gr.Textbox(
                            label="Your question about both images",
                            placeholder="Compare these two images...",
                            lines=3,
                        )
                        compare_button = gr.Button("Compare", variant="primary")

                    with gr.Column():
                        comparison_reply = response_box()

                compare_button.click(
                    fn=chat_with_two_images,
                    inputs=[first_picture, second_picture, comparison_question],
                    outputs=[comparison_reply],
                )

                gr.Examples(examples=two_image_examples, inputs=[comparison_question])

        # Footer
        gr.Markdown("---")
        gr.Markdown("⚡ Powered by NVIDIA Llama Nemotron Nano VL 8B")

    return demo

# Script entry point: build the UI and serve it
if __name__ == "__main__":
    # Server settings passed to launch()
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "ssr_mode": False,
    }

    demo = create_interface()
    demo.queue()  # Enable queuing for Zero GPU
    demo.launch(**launch_options)