# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Quick test script for image inference with NVIDIA Nemotron Nano VL model."""

import argparse
from typing import List

import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer


def load_model(model_path: str, device: str = "cuda:0"):
    """Load the VLM model and processor.

    Args:
        model_path: Path to the pretrained model
        device: Device to load the model on

    Returns:
        Tuple of (model, tokenizer, processor)
    """
    print(f"Loading model from {model_path}...")
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        trust_remote_code=True,
        device_map=device,
        torch_dtype=torch.bfloat16,
    ).eval()
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
    print("Model loaded successfully!")
    return model, tokenizer, processor


def test_single_image(
    model,
    tokenizer,
    processor,
    image_path: str,
    prompt_text: str = "Describe the image.",
    device: str = "cuda:0",
    max_new_tokens: int = 1024,
    do_sample: bool = False,
):
    """Test model inference on a single image.

    Args:
        model: The VLM model
        tokenizer: The tokenizer
        processor: The processor
        image_path: Path to the image file
        prompt_text: Text prompt for the model
        device: Device to run inference on
        max_new_tokens: Maximum number of tokens to generate
        do_sample: Whether to use sampling for generation
    """
    print(f"\nProcessing: {image_path}")

    # Load image
    image = Image.open(image_path)

    # Prepare messages
    messages = [
        {"role": "system", "content": "/no_think"},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": ""},
                {"type": "text", "text": prompt_text},
            ],
        },
    ]

    # Generate prompt
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Process inputs
    inputs = processor(
        text=[prompt],
        images=[image],
        return_tensors="pt",
    ).to(device)

    # Generate output
    generated_ids = model.generate(
        pixel_values=inputs.pixel_values,
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Decode output
    output_text = processor.batch_decode(
        generated_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
    )[0]
    print(f"Output: {output_text}\n")


def test_multi_images(
    model,
    tokenizer,
    processor,
    image_paths: List[str],
    prompt_text: str = "Describe the images in detail.",
    device: str = "cuda:0",
    max_new_tokens: int = 1024,
    do_sample: bool = False,
):
    """Test model inference on multiple images.
    Args:
        model: The VLM model
        tokenizer: The tokenizer
        processor: The processor
        image_paths: List of paths to image files
        prompt_text: Text prompt for the model
        device: Device to run inference on
        max_new_tokens: Maximum number of tokens to generate
        do_sample: Whether to use sampling for generation
    """
    print(f"\nProcessing {len(image_paths)} images: {image_paths}")

    # Load images
    images = [Image.open(img_path) for img_path in image_paths]

    # Prepare messages with multiple image placeholders
    content = [{"type": "image", "image": f"/path/to/image{i+1}"} for i in range(len(images))]
    content.append({"type": "text", "text": f"\n{prompt_text}"})
    messages = [
        {"role": "system", "content": "/no_think"},
        {"role": "user", "content": content},
    ]

    # Generate prompt
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Process inputs
    inputs = processor(
        text=[prompt],
        images=images,
        return_tensors="pt",
    ).to(device)

    # Generate output
    generated_ids = model.generate(
        pixel_values=inputs.pixel_values,
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Decode output
    output_text = processor.batch_decode(
        generated_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
    )[0]
    print(f"Output: {output_text}\n")


def main():
    parser = argparse.ArgumentParser(description="Test image inference with VLM model")
    parser.add_argument(
        "--model_path", type=str, required=True, help="Path to the pretrained model"
    )
    parser.add_argument(
        "--device", type=str, default="cuda:0", help="Device to run inference on (e.g., cuda:0, cpu)"
    )
    parser.add_argument(
        "--max_new_tokens", type=int, default=1024, help="Maximum number of tokens to generate"
    )
    args = parser.parse_args()

    # Load model
    model, tokenizer, processor = load_model(args.model_path, args.device)

    # Test single-image inference
    print("=" * 50)
    print("Testing Single Image Inference")
    print("=" * 50)
    single_image_paths = [
        "images/example1a.jpeg",
        "images/example1b.jpeg",
        "images/table.png",
        "images/tech.png",
    ]
    for img_path in single_image_paths:
        test_single_image(
            model,
            tokenizer,
            processor,
            img_path,
            device=args.device,
            max_new_tokens=args.max_new_tokens,
        )

    # Test multi-image inference
    print("=" * 50)
    print("Testing Multi-Image Inference")
    print("=" * 50)
    multi_image_paths = [
        "images/example1a.jpeg",
        "images/example1b.jpeg",
    ]
    test_multi_images(
        model,
        tokenizer,
        processor,
        multi_image_paths,
        device=args.device,
        max_new_tokens=args.max_new_tokens,
    )


if __name__ == "__main__":
    main()
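
# Example invocation (illustrative only: the script filename and checkpoint path below are
# placeholders, not values defined by this file; the flags are the ones declared in main()):
#
#   python nano_vl_inference_test.py \
#       --model_path /path/to/nemotron-nano-vl-checkpoint \
#       --device cuda:0 \
#       --max_new_tokens 512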