# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from typing import Optional, Union, List import numpy as np from transformers.feature_extraction_utils import BatchFeature from transformers.image_utils import ImageInput from transformers.processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs from transformers.tokenization_utils_base import PreTokenizedInput, TextInput from transformers.video_utils import VideoInput class NemotronNanoVLV2ImagesKwargs(ImagesKwargs): min_pixels: Optional[int] max_pixels: Optional[int] patch_size: Optional[int] temporal_patch_size: Optional[int] merge_size: Optional[int] class NemotronNanoVLV2ProcessorKwargs(ProcessingKwargs, total=False): images_kwargs: NemotronNanoVLV2ImagesKwargs videos_kwargs: VideosKwargs _defaults = { "text_kwargs": { "padding": False, }, } class NemotronNanoVLV2Processor(ProcessorMixin): r""" Constructs a Nemotron Nano VL V2 processor which wraps an image processor and a tokenizer into a single processor. [`NemotronNanoVLV2Processor`] offers all the functionalities of the image processor and tokenizer. See the [`~NemotronNanoVLV2Processor.__call__`] and [`~NemotronNanoVLV2Processor.decode`] for more information. Args: image_processor ([`AutoImageProcessor`], *optional*): The image processor is a required input. tokenizer ([`AutoTokenizer`], *optional*): The tokenizer is a required input. chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string. """ attributes = ["image_processor", "tokenizer"] image_processor_class = "AutoImageProcessor" video_processor_class = "AutoVideoProcessor" tokenizer_class = ("AutoTokenizer") def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): self.image_token = "" if not hasattr(tokenizer, "image_token") else tokenizer.image_token self.video_token = "