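"""Gradio demo for nvidia/Llama-Nemotron-Nano-VL-8B-V1 on a Hugging Face ZeroGPU Space.

Three tabs: text-only chat, single-image Q&A, and two-image comparison.
"""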
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModel, AutoImageProcessor
from PIL import Image
import gc
import os
import spaces
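# `spaces.GPU` is the ZeroGPU decorator: each decorated handler is given a GPU
# for the duration of the call and the allocation is released afterwards.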
# Model configuration
MODEL_PATH = "nvidia/Llama-Nemotron-Nano-VL-8B-V1"

# Load model globally
print("Loading model...")
model = AutoModel.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
).eval()

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
image_processor = AutoImageProcessor.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True
)
print("Model loaded successfully!")
def move_to_device(obj, device):
    """Recursively move tensors (and nested containers of tensors) to a device."""
    if torch.is_tensor(obj):
        return obj.to(device)
    elif isinstance(obj, dict):
        return {k: move_to_device(v, device) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [move_to_device(v, device) for v in obj]
    elif isinstance(obj, tuple):
        return tuple(move_to_device(v, device) for v in obj)
    elif hasattr(obj, 'to'):
        return obj.to(device)
    else:
        return obj
@spaces.GPU  # request a ZeroGPU allocation for this call
def chat_text_only(message):
    try:
        device = "cuda"
        # Move the model to the GPU for this request
        model.to(device)

        generation_config = dict(
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            eos_token_id=tokenizer.eos_token_id
        )

        # model.chat tokenizes the prompt internally, so no manual tokenization is needed
        with torch.no_grad():
            response, _ = model.chat(
                tokenizer,
                None,
                message,
                generation_config,
                history=None,
                return_history=True
            )

        # Move the model back to the CPU and release GPU memory
        model.to("cpu")
        torch.cuda.empty_cache()
        gc.collect()
        return response
    except Exception as e:
        # Ensure the model is back on the CPU even if an error occurs
        model.to("cpu")
        torch.cuda.empty_cache()
        gc.collect()
        return f"Error: {str(e)}"
@spaces.GPU  # request a ZeroGPU allocation for this call
def chat_with_image(image, message):
    if image is None:
        return "Please upload an image."
    try:
        device = "cuda"
        # Move the model to the GPU for this request
        model.to(device)

        generation_config = dict(
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            eos_token_id=tokenizer.eos_token_id
        )

        # Preprocess the image and move the resulting tensors to the GPU
        image_features = image_processor(image)
        image_features = move_to_device(image_features, device)

        # Add the image token to the message if it is not already present
        if "<image>" not in message:
            message = f"<image>\n{message}"

        # Generate
        with torch.no_grad():
            response = model.chat(
                tokenizer=tokenizer,
                question=message,
                generation_config=generation_config,
                **image_features
            )

        # Move the model back to the CPU and release GPU memory
        model.to("cpu")
        torch.cuda.empty_cache()
        gc.collect()
        return response
    except Exception as e:
        # Ensure the model is back on the CPU even if an error occurs
        model.to("cpu")
        torch.cuda.empty_cache()
        gc.collect()
        return f"Error: {str(e)}"
@spaces.GPU  # request a ZeroGPU allocation for this call
def chat_with_two_images(image1, image2, message):
    if image1 is None or image2 is None:
        return "Please upload both images."
    try:
        device = "cuda"
        # Move the model to the GPU for this request
        model.to(device)

        generation_config = dict(
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            eos_token_id=tokenizer.eos_token_id
        )

        # Preprocess both images and move the resulting tensors to the GPU
        image_features = image_processor([image1, image2])
        image_features = move_to_device(image_features, device)

        # Add image placeholders to the message if they are not already present
        if "<image-1>" not in message and "<image-2>" not in message:
            message = f"<image-1>: <image>\n<image-2>: <image>\n{message}"

        # Generate
        with torch.no_grad():
            response = model.chat(
                tokenizer=tokenizer,
                question=message,
                generation_config=generation_config,
                **image_features
            )

        # Move the model back to the CPU and release GPU memory
        model.to("cpu")
        torch.cuda.empty_cache()
        gc.collect()
        return response
    except Exception as e:
        # Ensure the model is back on the CPU even if an error occurs
        model.to("cpu")
        torch.cuda.empty_cache()
        gc.collect()
        return f"Error: {str(e)}"
# Create Gradio interface
def create_interface():
    with gr.Blocks(title="Llama Nemotron Nano VL 8B", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🦙 Llama Nemotron Nano VL 8B Vision-Language Model")
        gr.Markdown("Chat with a powerful vision-language model that can understand both text and images!")

        with gr.Tabs():
            # Text-only chat tab
            with gr.TabItem("💬 Text Chat"):
                gr.Markdown("### Chat with the model using text only")
                with gr.Row():
                    with gr.Column():
                        text_input = gr.Textbox(
                            label="Your message",
                            placeholder="Ask me anything...",
                            lines=3
                        )
                        text_submit = gr.Button("Send", variant="primary")
                    with gr.Column():
                        text_output = gr.Textbox(
                            label="Model Response",
                            lines=10,
                            max_lines=20
                        )

                text_submit.click(
                    chat_text_only,
                    inputs=[text_input],
                    outputs=[text_output]
                )

                # Example questions
                gr.Examples(
                    examples=[
                        ["What is artificial intelligence?"],
                        ["Explain quantum computing in simple terms."],
                        ["What happened in 1969?"],
                        ["Write a short story about a robot."]
                    ],
                    inputs=[text_input]
                )

            # Single image chat tab
            with gr.TabItem("🖼️ Image + Text Chat"):
                gr.Markdown("### Upload an image and ask questions about it")
                with gr.Row():
                    with gr.Column():
                        image_input = gr.Image(
                            label="Upload Image",
                            type="pil"
                        )
                        image_text_input = gr.Textbox(
                            label="Your question about the image",
                            placeholder="What do you see in this image?",
                            lines=3
                        )
                        image_submit = gr.Button("Analyze", variant="primary")
                    with gr.Column():
                        image_output = gr.Textbox(
                            label="Model Response",
                            lines=10,
                            max_lines=20
                        )

                image_submit.click(
                    chat_with_image,
                    inputs=[image_input, image_text_input],
                    outputs=[image_output]
                )

                # Example prompts
                gr.Examples(
                    examples=[
                        ["Describe what you see in this image."],
                        ["What objects are in this image?"],
                        ["Extract any text from this image."],
                        ["What is the main subject of this image?"]
                    ],
                    inputs=[image_text_input]
                )

            # Two images comparison tab
            with gr.TabItem("🖼️🖼️ Compare Two Images"):
                gr.Markdown("### Upload two images and ask the model to compare them")
                with gr.Row():
                    with gr.Column():
                        image1_input = gr.Image(
                            label="First Image",
                            type="pil"
                        )
                        image2_input = gr.Image(
                            label="Second Image",
                            type="pil"
                        )
                        two_images_text_input = gr.Textbox(
                            label="Your question about both images",
                            placeholder="Compare these two images...",
                            lines=3
                        )
                        two_images_submit = gr.Button("Compare", variant="primary")
                    with gr.Column():
                        two_images_output = gr.Textbox(
                            label="Model Response",
                            lines=10,
                            max_lines=20
                        )

                two_images_submit.click(
                    chat_with_two_images,
                    inputs=[image1_input, image2_input, two_images_text_input],
                    outputs=[two_images_output]
                )

                # Example prompts
                gr.Examples(
                    examples=[
                        ["What are the main differences between these two images?"],
                        ["Describe both images briefly."],
                        ["Which image is more colorful?"],
                        ["Compare the subjects in these images."]
                    ],
                    inputs=[two_images_text_input]
                )

        # Footer
        gr.Markdown("---")
        gr.Markdown("⚡ Powered by NVIDIA Llama Nemotron Nano VL 8B")

    return demo
# Create and launch the interface
if __name__ == "__main__":
    demo = create_interface()
    demo.queue()  # Enable queuing for Zero GPU
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        ssr_mode=False
    )
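# A requirements.txt for this Space would likely include the packages imported
# above (exact entries and pins are an assumption, not taken from the original):
#   torch
#   transformers
#   accelerate   # needed for low_cpu_mem_usage=True
#   gradio
#   pillow
#   spaces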