# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Quick test script for image inference with NVIDIA Nemotron Nano VL model."""

import argparse
from typing import List

import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer


def load_model(model_path: str, device: str = "cuda:0"):
    """Load the VLM model and processor.

    Args:
        model_path: Path to the pretrained model
        device: Device to load the model on

    Returns:
        Tuple of (model, tokenizer, processor)
    """
    print(f"Loading model from {model_path}...")
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        trust_remote_code=True,
        device_map=device,
        torch_dtype=torch.bfloat16,
    ).eval()
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
    print("Model loaded successfully!")
    return model, tokenizer, processor


def test_single_image(
    model,
    tokenizer,
    processor,
    image_path: str,
    prompt_text: str = "Describe the image.",
    device: str = "cuda:0",
    max_new_tokens: int = 1024,
    do_sample: bool = False,
):
    """Test model inference on a single image.

    Args:
        model: The VLM model
        tokenizer: The tokenizer
        processor: The processor
        image_path: Path to the image file
        prompt_text: Text prompt for the model
        device: Device to run inference on
        max_new_tokens: Maximum number of tokens to generate
        do_sample: Whether to use sampling for generation
    """
    print(f"\nProcessing: {image_path}")

    # Load image
    image = Image.open(image_path)

    # Prepare messages
    messages = [
        {"role": "system", "content": "/no_think"},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": ""},
                {"type": "text", "text": prompt_text},
            ],
        },
    ]

    # Generate prompt
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Process inputs
    inputs = processor(
        text=[prompt],
        images=[image],
        return_tensors="pt",
    ).to(device)

    # Generate output
    generated_ids = model.generate(
        pixel_values=inputs.pixel_values,
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Decode output
    output_text = processor.batch_decode(
        generated_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
    )[0]
    print(f"Output: {output_text}\n")


def test_multi_images(
    model,
    tokenizer,
    processor,
    image_paths: List[str],
    prompt_text: str = "Describe the images in detail.",
    device: str = "cuda:0",
    max_new_tokens: int = 1024,
    do_sample: bool = False,
):
    """Test model inference on multiple images.
    Args:
        model: The VLM model
        tokenizer: The tokenizer
        processor: The processor
        image_paths: List of paths to image files
        prompt_text: Text prompt for the model
        device: Device to run inference on
        max_new_tokens: Maximum number of tokens to generate
        do_sample: Whether to use sampling for generation
    """
    print(f"\nProcessing {len(image_paths)} images: {image_paths}")

    # Load images
    images = [Image.open(img_path) for img_path in image_paths]

    # Prepare messages with multiple image placeholders
    content = [{"type": "image", "image": f"/path/to/image{i+1}"} for i in range(len(images))]
    content.append({"type": "text", "text": f"\n{prompt_text}"})
    messages = [
        {"role": "system", "content": "/no_think"},
        {"role": "user", "content": content},
    ]

    # Generate prompt
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Process inputs
    inputs = processor(
        text=[prompt],
        images=images,
        return_tensors="pt",
    ).to(device)

    # Generate output
    generated_ids = model.generate(
        pixel_values=inputs.pixel_values,
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Decode output
    output_text = processor.batch_decode(
        generated_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
    )[0]
    print(f"Output: {output_text}\n")


def main():
    parser = argparse.ArgumentParser(description="Test image inference with VLM model")
    parser.add_argument(
        "--model_path", type=str, required=True, help="Path to the pretrained model"
    )
    parser.add_argument(
        "--device", type=str, default="cuda:0", help="Device to run inference on (e.g., cuda:0, cpu)"
    )
    parser.add_argument(
        "--max_new_tokens", type=int, default=1024, help="Maximum number of tokens to generate"
    )
    args = parser.parse_args()

    # Load model
    model, tokenizer, processor = load_model(args.model_path, args.device)

    # Test single-image inference
    print("=" * 50)
    print("Testing Single Image Inference")
    print("=" * 50)
    single_image_paths = [
        "images/example1a.jpeg",
        "images/example1b.jpeg",
        "images/table.png",
        "images/tech.png",
    ]
    for img_path in single_image_paths:
        test_single_image(
            model,
            tokenizer,
            processor,
            img_path,
            device=args.device,
            max_new_tokens=args.max_new_tokens,
        )

    # Test multi-image inference
    print("=" * 50)
    print("Testing Multi-Image Inference")
    print("=" * 50)
    multi_image_paths = [
        "images/example1a.jpeg",
        "images/example1b.jpeg",
    ]
    test_multi_images(
        model,
        tokenizer,
        processor,
        multi_image_paths,
        device=args.device,
        max_new_tokens=args.max_new_tokens,
    )


if __name__ == "__main__":
    main()
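
# Example invocation (illustrative only: the script filename and checkpoint path below are
# placeholders, not values defined by this file; the flags are the ones declared in main()):
#
#   python nano_vl_inference_test.py \
#       --model_path /path/to/nemotron-nano-vl-checkpoint \
#       --device cuda:0 \
#       --max_new_tokens 512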