Upload folder using huggingface_hub

Browse files

Files changed (15) hide show

.gitattributes +1 -0
chat_template.jinja +3 -0
config.json +242 -0
configuration_gar.py +63 -0
image_processing_perception_lm_fast.py +378 -0
model.safetensors +3 -0
modeling_gar.py +352 -0
modeling_perception_lm.py +865 -0
preprocessor_config.json +40 -0
processing_gar.py +316 -0
processor_config.json +9 -0
special_tokens_map.json +19 -0
tokenizer.json +3 -0
tokenizer_config.json +2118 -0
video_preprocessor_config.json +37 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,3 @@

+{#- Chat template. Emits bos_token, then a system turn (the first message's trimmed content if its role is 'system', otherwise the built-in default text), then one turn per remaining message. Within a turn, one '<|image|>' is written per content item of type 'image', then one '<|video|>' per item of type 'video', then the trimmed text of every item of type 'text' (so media placeholders always precede text, regardless of their order in the content list), followed by '<|eot_id|>'. An empty assistant header is appended when add_generation_prompt is set. NOTE(review): non-system message content is filtered with selectattr('type', ...), so it is presumably expected to be a list of typed items; confirm plain-string content is never passed. -#}
+{{- bos_token }}{%- if messages[0]['role'] == 'system' -%}    {%- set system_message = messages[0]['content']|trim %}
+    {%- set messages = messages[1:] %}
+{%- else %}    {%- set system_message = 'You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.' %}{%- endif %}{{- '<|start_header_id|>system<|end_header_id|>\n\n' }}{{- system_message }}{{- '<|eot_id|>' }}{%- for message in messages %}{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }}{%- for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<|image|>' }}{%- endfor %}{%- for content in message['content'] | selectattr('type', 'equalto', 'video') %}{{ '<|video|>' }}{%- endfor %}{%- for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{- content['text'] | trim }}{%- endfor %}{{'<|eot_id|>' }}{%- endfor %}{%- if add_generation_prompt %}{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{%- endif %}

config.json ADDED Viewed

	@@ -0,0 +1,242 @@

+{
+  "architectures": [
+    "GARModel"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_gar.GARConfig",
+    "AutoModel": "modeling_gar.GARModel",
+    "AutoModelForCausalLM": "modeling_gar.GARModel"
+  },
+  "crop_tokens_ids": [
+    128004,
+    128005,
+    128008,
+    128010,
+    128011
+  ],
+  "kernel_size": [
+    14,
+    14
+  ],
+  "mask_path_embedding_out_channels": 1024,
+  "mllm_config": {
+    "_name_or_path": "/mnt/bn/zilongdata-us/wangyuhao/model/Perception-LM-1B",
+    "architectures": [
+      "PerceptionLMForConditionalGeneration"
+    ],
+    "image_token_id": 128002,
+    "model_type": "perception_lm",
+    "projector_pooling_ratio": 2,
+    "text_config": {
+      "_name_or_path": "",
+      "add_cross_attention": false,
+      "architectures": null,
+      "attention_bias": false,
+      "attention_dropout": 0.0,
+      "bad_words_ids": null,
+      "begin_suppress_tokens": null,
+      "bos_token_id": 128000,
+      "chunk_size_feed_forward": 0,
+      "cross_attention_hidden_size": null,
+      "decoder_start_token_id": null,
+      "diversity_penalty": 0.0,
+      "do_sample": false,
+      "early_stopping": false,
+      "encoder_no_repeat_ngram_size": 0,
+      "eos_token_id": [
+        128001,
+        128009
+      ],
+      "exponential_decay_length_penalty": null,
+      "finetuning_task": null,
+      "forced_bos_token_id": null,
+      "forced_eos_token_id": null,
+      "head_dim": 64,
+      "hidden_act": "silu",
+      "hidden_size": 2048,
+      "id2label": {
+        "0": "LABEL_0",
+        "1": "LABEL_1"
+      },
+      "initializer_range": 0.02,
+      "intermediate_size": 8192,
+      "is_decoder": false,
+      "is_encoder_decoder": false,
+      "label2id": {
+        "LABEL_0": 0,
+        "LABEL_1": 1
+      },
+      "length_penalty": 1.0,
+      "max_length": 20,
+      "max_position_embeddings": 11520,
+      "min_length": 0,
+      "mlp_bias": false,
+      "model_type": "llama",
+      "no_repeat_ngram_size": 0,
+      "num_attention_heads": 32,
+      "num_beam_groups": 1,
+      "num_beams": 1,
+      "num_hidden_layers": 16,
+      "num_key_value_heads": 8,
+      "num_return_sequences": 1,
+      "output_attentions": false,
+      "output_hidden_states": false,
+      "output_scores": false,
+      "pad_token_id": null,
+      "prefix": null,
+      "pretraining_tp": 1,
+      "problem_type": null,
+      "pruned_heads": {},
+      "remove_invalid_values": false,
+      "repetition_penalty": 1.0,
+      "return_dict": true,
+      "return_dict_in_generate": false,
+      "rms_norm_eps": 1e-05,
+      "rope_scaling": {
+        "factor": 32.0,
+        "high_freq_factor": 4.0,
+        "low_freq_factor": 1.0,
+        "original_max_position_embeddings": 8192,
+        "rope_type": "llama3"
+      },
+      "rope_theta": 500000.0,
+      "sep_token_id": null,
+      "suppress_tokens": null,
+      "task_specific_params": null,
+      "temperature": 1.0,
+      "tf_legacy_loss": false,
+      "tie_encoder_decoder": false,
+      "tie_word_embeddings": true,
+      "tokenizer_class": null,
+      "top_k": 50,
+      "top_p": 1.0,
+      "torch_dtype": "bfloat16",
+      "torchscript": false,
+      "typical_p": 1.0,
+      "use_bfloat16": false,
+      "use_cache": true,
+      "use_flash_attn": true,
+      "vocab_size": 128262
+    },
+    "torch_dtype": "bfloat16",
+    "use_flash_attn": true,
+    "video_token_id": 128003,
+    "vision_config": {
+      "_name_or_path": "",
+      "add_cross_attention": false,
+      "architecture": "vit_pe_core_large_patch14_336",
+      "architectures": null,
+      "bad_words_ids": null,
+      "begin_suppress_tokens": null,
+      "bos_token_id": null,
+      "chunk_size_feed_forward": 0,
+      "cross_attention_hidden_size": null,
+      "decoder_start_token_id": null,
+      "diversity_penalty": 0.0,
+      "do_pooling": true,
+      "do_sample": false,
+      "early_stopping": false,
+      "encoder_no_repeat_ngram_size": 0,
+      "eos_token_id": null,
+      "exponential_decay_length_penalty": null,
+      "finetuning_task": null,
+      "forced_bos_token_id": null,
+      "forced_eos_token_id": null,
+      "global_pool": "map",
+      "initializer_range": 0.02,
+      "is_decoder": false,
+      "is_encoder_decoder": false,
+      "label_names": [
+        "LABEL_0",
+        "LABEL_1"
+      ],
+      "length_penalty": 1.0,
+      "max_length": 20,
+      "min_length": 0,
+      "model_args": {
+        "depth": 23,
+        "embed_dim": 1024,
+        "global_pool": "",
+        "img_size": [
+          448,
+          448
+        ],
+        "init_values": 0.1,
+        "ref_feat_shape": [
+          32,
+          32
+        ],
+        "use_post_transformer_norm": false
+      },
+      "model_type": "timm_wrapper",
+      "no_repeat_ngram_size": 0,
+      "num_beam_groups": 1,
+      "num_beams": 1,
+      "num_classes": 2,
+      "num_features": 1024,
+      "num_return_sequences": 1,
+      "output_attentions": false,
+      "output_hidden_states": false,
+      "output_scores": false,
+      "pad_token_id": null,
+      "prefix": null,
+      "pretrained_cfg": {
+        "classifier": "head",
+        "crop_mode": "center",
+        "crop_pct": 1.0,
+        "custom_load": false,
+        "first_conv": "patch_embed.proj",
+        "fixed_input_size": true,
+        "input_size": [
+          3,
+          336,
+          336
+        ],
+        "interpolation": "bicubic",
+        "license": "custom",
+        "mean": [
+          0.5,
+          0.5,
+          0.5
+        ],
+        "pool_size": null,
+        "std": [
+          0.5,
+          0.5,
+          0.5
+        ],
+        "tag": "fb"
+      },
+      "problem_type": null,
+      "pruned_heads": {},
+      "remove_invalid_values": false,
+      "repetition_penalty": 1.0,
+      "return_dict": true,
+      "return_dict_in_generate": false,
+      "sep_token_id": null,
+      "suppress_tokens": null,
+      "task_specific_params": null,
+      "temperature": 1.0,
+      "tf_legacy_loss": false,
+      "tie_encoder_decoder": false,
+      "tie_word_embeddings": true,
+      "tokenizer_class": null,
+      "top_k": 50,
+      "top_p": 1.0,
+      "torch_dtype": "bfloat16",
+      "torchscript": false,
+      "typical_p": 1.0,
+      "use_bfloat16": false,
+      "use_flash_attn": false
+    },
+    "vision_use_cls_token": true
+  },
+  "model_type": "GAR",
+  "output_attentions": false,
+  "patch_size_h": 14,
+  "patch_size_w": 14,
+  "prompt_numbers": 5,
+  "max_num_tiles": 16,
+  "torch_dtype": "bfloat16",
+  "transformers_version": null
+}

configuration_gar.py ADDED Viewed

	@@ -0,0 +1,63 @@

+import copy
+from transformers.utils import logging
+from transformers.configuration_utils import PretrainedConfig
+from transformers import AutoConfig, PerceptionLMConfig
+logger = logging.get_logger(__name__)
+class GARConfig(PretrainedConfig):
+    model_type = 'GAR'
+    is_composition = True
+    def __init__(
+        self,
+        mllm_config=None,
+        prompt_numbers=5,
+        crop_tokens_ids=[128004, 128005, 128008, 128010, 128011],
+        use_flash_attn=True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if mllm_config is None:
+            mllm_config = {}
+            logger.info('mllm_config is None. Initializing the PerceptionLM with default values.')
+        if mllm_config is None:
+            self.mllm_config = AutoConfig.from_pretrained("facebook/Perception-LM-1B")
+        else:
+            self.mllm_config = PerceptionLMConfig(**mllm_config)
+        self.prompt_numbers = prompt_numbers
+        self.crop_tokens_ids = crop_tokens_ids
+        assert len(self.crop_tokens_ids) == self.prompt_numbers, f'{self.crop_tokens_ids} crop_tokens_ids length should be {self.prompt_numbers}'
+        try:
+            self.patch_size_h = self.mllm_config.vision_config.model_args["img_size"][0] // self.mllm_config.vision_config.model_args["ref_feat_shape"][0]
+            self.patch_size_w = self.mllm_config.vision_config.model_args["img_size"][1] // self.mllm_config.vision_config.model_args["ref_feat_shape"][1]
+            self.kernel_size = [self.patch_size_h, self.patch_size_w]
+        except:
+            self.patch_size_h = 16
+            self.patch_size_w = 16
+            self.kernel_size = [self.patch_size_h, self.patch_size_w]
+        try:
+            self.mask_path_embedding_out_channels = self.mllm_config.vision_config.num_features
+        except:
+            self.mask_path_embedding_out_channels = 1280
+        self.mllm_config.use_flash_attn = True if use_flash_attn else False
+        self.mllm_config.text_config.use_flash_attn = True if use_flash_attn else False
+        self.mllm_config.vision_config.use_flash_attn = False
+    def to_dict(self):
+        """
+        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+        Returns:
+            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+        """
+        output = copy.deepcopy(self.__dict__)
+        output['mllm_config'] = self.mllm_config.to_dict()
+        output['model_type'] = self.__class__.model_type
+        return output

image_processing_perception_lm_fast.py ADDED Viewed

	@@ -0,0 +1,378 @@

+# *************************************************************************
+# This file may have been modified by Bytedance Inc. (“Bytedance Inc.'s Mo-
+# difications”). All Bytedance Inc.'s Modifications are Copyright (2025) B-
+# ytedance Inc..
+# *************************************************************************
+# Adapted from https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py
+# Copyright 2025 Meta Platforms, Inc. and the HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Image processor class for PerceptionLM."""
+import math
+from functools import reduce
+from typing import Optional, Union
+import numpy as np
+from transformers.image_processing_utils import BatchFeature
+from transformers.image_processing_utils_fast import (
+    BaseImageProcessorFast,
+    DefaultFastImageProcessorKwargs,
+    get_image_size,
+    group_images_by_shape,
+    reorder_images,
+)
+from transformers.image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ChannelDimension,
+    PILImageResampling,
+)
+from transformers.processing_utils import Unpack
+from transformers.utils import (
+    TensorType,
+    auto_docstring,
+    is_torch_available,
+    is_torchvision_available,
+)
+if is_torch_available():
+    import torch
+if is_torchvision_available():
+    from torchvision.transforms import functional as F
+# Keyword arguments accepted by `PerceptionLMImageProcessorFast` on top of the
+# ones inherited from `DefaultFastImageProcessorKwargs`. The processor class
+# registers this type as its `valid_kwargs`.
+# NOTE(review): the values assigned to the fields below are plain class
+# attributes of this kwargs type; whether the base processor picks them up as
+# defaults is not visible in this file -- confirm against BaseImageProcessorFast.
+class PerceptionLMFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    r"""
+    vision_input_type (`str`, *optional*, defaults to `"thumb+tile"`):
+        Vision processing strategy. `"thumb+tile"` uses both thumbnails and multiple tiles for
+        multi-scale processing, otherwise uses single tile for lower memory usage.
+    tile_size (`int`, *optional*, defaults to `448`):
+        Height and width dimension (in pixels) of each tile used for image processing.
+    max_num_tiles (`int`, *optional*, defaults to `36`):
+        Maximum number of tiles an image can be split into based on its aspect ratio.
+    """
+    # Any value other than "thumb+tile" selects the single-tile path in `_preprocess`.
+    vision_input_type: str = "thumb+tile"
+    # Side length, in pixels, of the square tiles (and of the thumbnail).
+    tile_size: int = 448
+    # Upper bound on the number of tiles in a canvas arrangement.
+    max_num_tiles: int = 36
+# Fast image processor built on torch / torchvision ops. With
+# `vision_input_type == "thumb+tile"` every image becomes one `tile_size`
+# thumbnail followed by a grid of `tile_size` tiles; otherwise it becomes a
+# single `tile_size` image. Besides `pixel_values`, the output also carries the
+# chosen tile grid as `aspect_ratio`.
+@auto_docstring
+class PerceptionLMImageProcessorFast(BaseImageProcessorFast):
+    resample = PILImageResampling.BICUBIC
+    image_mean = IMAGENET_STANDARD_MEAN
+    image_std = IMAGENET_STANDARD_STD
+    do_resize = True
+    do_center_crop = False
+    do_rescale = True
+    do_normalize = True
+    do_convert_rgb = True
+    size = {"width": 448, "height": 448}  # for backward compatibility in tests
+    valid_kwargs = PerceptionLMFastImageProcessorKwargs
+    # Thin pass-through; all keyword handling is done by the base class.
+    def __init__(self, **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs]) -> None:
+        super().__init__(**kwargs)
+    # Thin pass-through kept so `auto_docstring` can document the custom kwargs type.
+    @auto_docstring
+    def preprocess(
+        self, images, **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs]
+    ) -> BatchFeature:
+        return super().preprocess(images, **kwargs)
+    @staticmethod
+    def _factors(n: int):
+        """Return the set of all positive divisors of `n`."""
+        return set(
+            reduce(
+                list.__add__,
+                # Each divisor i <= sqrt(n) is paired with its cofactor n // i.
+                ([i, n // i] for i in range(1, int(n**0.5) + 1) if n % i == 0),
+            )
+        )
+    def _find_supported_aspect_ratios(self):
+        """
+        Compute every (tiles_wide, tiles_high) arrangement whose tile count is a divisor pair of some
+        `chunk_size` in `1..self.max_num_tiles`, grouped by the ratio `tiles_wide / tiles_high`.
+        The order of returned items matters for the result of `_fit_image_to_canvas` function.
+        If tie exists in `_fit_image_to_canvas`, the latter in `_find_supported_aspect_ratios` wins.
+        For example, with `max_num_tiles=5`, it will return:
+        {
+            0.2: [(1, 5)],
+            5.0: [(5, 1)],
+            0.25: [(1, 4)],
+            1.0: [(2, 2), (1, 1)],
+            4.0: [(4, 1)],
+            0.3333333333333333: [(1, 3)],
+            3.0: [(3, 1)],
+            0.5: [(1, 2)],
+            2.0: [(2, 1)]
+        }
+        """
+        asp_dict = {}
+        # Largest tile counts first; dict insertion order is what callers iterate over.
+        for chunk_size in range(self.max_num_tiles, 0, -1):
+            _factors = sorted(self._factors(chunk_size))
+            _asp_ratios = [(x, chunk_size // x) for x in _factors]
+            for ratio in _asp_ratios:
+                k = ratio[0] / ratio[1]
+                if k not in asp_dict:
+                    asp_dict[k] = [ratio]
+                else:
+                    asp_dict[k].append(ratio)
+        return asp_dict
+    def _get_image_height_width(
+        self, image_width: int, image_height: int, target_width: int, target_height: int
+    ) -> tuple[int, int]:
+        """
+        Given image width, height and target width, height for the canvas, return the dimensions of how the image would be resized
+        with aspect ratio preservation.
+
+        Returns `(new_width, new_height)`. Only the side derived through `scale` is floored; the other side is
+        the plain product of the rescaling factor and the original side and is returned unrounded.
+        """
+        scale = image_width / image_height
+        # Both branches use the same rescaling factor; they differ only in which side is derived from the other.
+        if scale > 1.0:
+            # Width is larger than height
+            # Rescaling factor is the minimum of the two scaling factors. Else one side would be outside of the canvas.
+            rescaling_factor = min(
+                target_width / image_width, target_height / image_height
+            )
+            # Scale the width directly and derive the height from it via the aspect ratio.
+            new_w = rescaling_factor * image_width
+            new_h = math.floor(new_w / scale)
+        else:
+            # Height is larger than (or equal to) width
+            # Rescaling factor is the minimum of the two scaling factors. Else one side would be outside of the canvas.
+            rescaling_factor = min(
+                target_width / image_width, target_height / image_height
+            )
+            # Scale the height directly and derive the width from it via the aspect ratio.
+            new_h = rescaling_factor * image_height
+            new_w = math.floor(new_h * scale)
+        return new_w, new_h
+    def _fit_image_to_canvas(self, img_width: int, img_height: int, tile_size: int):
+        """
+        Given an image width, height and tile size this function will see if the image
+        can be fit into any of the canvases that can be built from arranging the tiles in a grid.
+        If the image can be fit onto several canvases, it will return the canvas where the shorter edge
+        of the image will be largest.
+
+        Returns the chosen `(tiles_wide, tiles_high)` tuple, or `None` when no canvas is at least as large as
+        the image in both dimensions.
+        """
+        # Initialize the optimal canvas to None. If no canvas is found where image fits, function returns None.
+        optimal_canvas = None
+        optimal_image_width_height = None
+        scale = img_width / img_height
+        # Gather all potential supported image resolutions and iterate through them to find best match
+        potential_arrangements = [
+            item
+            for sublist in self._find_supported_aspect_ratios().values()
+            for item in sublist
+        ]
+        for n_w, n_h in potential_arrangements:
+            # Compute the canvas size
+            canvas_width, canvas_height = n_w * tile_size, n_h * tile_size
+            # Check if image can fit into the canvas without downsampling
+            if canvas_width >= img_width and canvas_height >= img_height:
+                # If we did not find a good canvas yet, we will use the current one
+                if optimal_canvas is None:
+                    # Set optimal canvas and determine the actual image height and width in the canvas with aspect ratio preserving resampling
+                    optimal_canvas = (n_w, n_h)
+                    optimal_image_width_height = self._get_image_height_width(
+                        image_width=img_width,
+                        image_height=img_height,
+                        target_width=n_w * tile_size,
+                        target_height=n_h * tile_size,
+                    )
+                else:
+                    # If we already found an optimal canvas before, we will check if the shorter edge of the image will be larger than the current optimal canvas.
+                    # This means we can potentially upsample the image resolution which is beneficial to performance.
+                    image_width_height = self._get_image_height_width(
+                        image_width=img_width,
+                        image_height=img_height,
+                        target_width=n_w * tile_size,
+                        target_height=n_h * tile_size,
+                    )
+                    # Llama3V dynamic tiling. Prioritize biggest canvas.
+                    # `>=` (not `>`) means a later arrangement replaces an earlier one on ties.
+                    if (
+                        scale < 1.0
+                        and (image_width_height[0] >= optimal_image_width_height[0])
+                    ) or (
+                        scale >= 1.0
+                        and (image_width_height[1] >= optimal_image_width_height[1])
+                    ):
+                        optimal_canvas = (n_w, n_h)
+                        optimal_image_width_height = image_width_height
+        return optimal_canvas
+    def _find_closest_aspect_ratio(
+        self, img_width: int, img_height: int, tile_size: int
+    ) -> tuple:
+        """
+        Given an image width and height, find the supported tile arrangement whose aspect ratio is closest
+        to the image's, without exceeding it for landscape/square images and strictly above it for portrait
+        images. `tile_size` is accepted but not used by this method.
+
+        Returns a `(tiles_wide, tiles_high)` tuple.
+        """
+        target_aspect_ratio = img_width / img_height
+        asp_dict = self._find_supported_aspect_ratios()
+        closest_aspect_ratio = None
+        if target_aspect_ratio >= 1:
+            # Candidates are ratios not wider than the image; distance is measured on the ratio itself.
+            closest_aspect_ratio = min(
+                [k for k in asp_dict if k <= target_aspect_ratio],
+                key=lambda x: abs(x - target_aspect_ratio),
+            )
+            tiles_given_aspect_ratio = asp_dict[closest_aspect_ratio]
+            # select largest width
+            return max(tiles_given_aspect_ratio, key=lambda x: x[0])
+        else:
+            # Candidates are ratios wider than the image; distance is measured on the inverse ratio.
+            closest_aspect_ratio = min(
+                [k for k in asp_dict if k > target_aspect_ratio],
+                key=lambda x: abs(1 / x - 1 / target_aspect_ratio),
+            )
+            tiles_given_aspect_ratio = asp_dict[closest_aspect_ratio]
+            # select largest height
+            return max(tiles_given_aspect_ratio, key=lambda x: x[1])
+    def _split(self, image: torch.Tensor, ncw: int, nch: int) -> torch.Tensor:
+        """
+        Cut a batch of images into an `nch` x `ncw` grid of equally sized tiles.
+
+        Args:
+            image: Tensor unpacked as `(batch_size, num_channels, height, width)`.
+            ncw: Number of tiles along the width.
+            nch: Number of tiles along the height.
+
+        Returns:
+            Tensor of shape `(batch_size, ncw * nch, num_channels, height // nch, width // ncw)`, with tiles
+            ordered row by row.
+        """
+        # Split image into number of required tiles (width x height)
+        batch_size, num_channels, height, width = image.size()
+        image = image.view(
+            batch_size, num_channels, nch, height // nch, ncw, width // ncw
+        )
+        # Permute dimensions so the two grid axes come right after the batch axis
+        image = image.permute(0, 2, 4, 1, 3, 5).contiguous()
+        # Merge the two grid axes into a single tile axis:
+        # (batch_size, ncw * nch, num_channels, height // nch, width // ncw)
+        image = image.view(
+            batch_size, ncw * nch, num_channels, height // nch, width // ncw
+        )
+        return image
+    def resize(
+        self,
+        image: np.ndarray,
+        tile_size: int,
+        max_num_tiles: int,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        """
+        Resize `image` to a canvas made of `tile_size` tiles and return it with the chosen grid.
+
+        Args:
+            image: Image (or stacked batch) to resize. NOTE(review): annotated as `np.ndarray`, but
+                `_preprocess` passes stacked torch tensors -- confirm the intended type.
+            tile_size: Side length of one tile in pixels.
+            max_num_tiles: When `<= 1` the canvas is a single tile. NOTE(review): when `> 1` the
+                candidate grids are derived from `self.max_num_tiles` by the helper methods, not from
+                this argument, so a per-call override only toggles tiling on or off.
+            resample: Interpolation passed to `F.resize`.
+            input_data_format: Channel layout hint used when reading the image size.
+
+        Returns:
+            `(resized_image, (tiles_wide, tiles_high))`.
+        """
+        height, width = get_image_size(image, channel_dim=input_data_format)
+        if max_num_tiles > 1:
+            aspect_ratio = self._fit_image_to_canvas(
+                img_width=width, img_height=height, tile_size=tile_size
+            )
+            if aspect_ratio is None:
+                # If we did not find a canvas, we have to find the closest aspect ratio and downsample the image
+                aspect_ratio = self._find_closest_aspect_ratio(
+                    img_width=width, img_height=height, tile_size=tile_size
+                )
+        else:
+            aspect_ratio = (1, 1)
+        new_width, new_height = aspect_ratio[0] * tile_size, aspect_ratio[1] * tile_size
+        image = F.resize(image, (new_height, new_width), interpolation=resample)
+        return image, aspect_ratio
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_resize: bool,
+        do_rescale: Optional[bool],
+        rescale_factor: Optional[Union[int, float]],
+        do_normalize: Optional[bool],
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        vision_input_type: str,
+        tile_size: int,
+        max_num_tiles: int,
+        return_tensors: Optional[Union[str, TensorType]],
+        disable_grouping: bool,
+        **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Resize (thumbnail + tiles, or single tile), then rescale/normalize the images.
+
+        Returns a `BatchFeature` with `pixel_values` and `aspect_ratio`. `aspect_ratio` is `[tiles_w, tiles_h]`
+        in "thumb+tile" mode and `[1, 1]` otherwise (including when `do_resize` is false).
+        NOTE(review): `aspect_ratio` is a single value overwritten once per shape group, so with inputs of
+        several different shapes only the last processed group's grid is reported.
+        """
+        # Group images by size for batched transformation
+        # The interpolation is taken from `kwargs["resample"]` when present, else from the class default.
+        resample = kwargs.pop("resample", self.resample)
+        grouped_images, grouped_images_index = group_images_by_shape(
+            images, disable_grouping=disable_grouping
+        )
+        resized_images_grouped = {}
+        aspect_ratio = [1, 1]
+        for shape, stacked_images in grouped_images.items():
+            if do_resize:
+                if vision_input_type == "thumb+tile":
+                    # Thumbnail: the whole image squeezed into one tile.
+                    thumbnails, _ = self.resize(
+                        stacked_images,
+                        tile_size,
+                        max_num_tiles=1,
+                        resample=resample,
+                    )
+                    # Tiled view: the image resized to the chosen grid canvas, then cut into tiles.
+                    images_for_tiling, (tiles_w, tiles_h) = self.resize(
+                        stacked_images,
+                        tile_size,
+                        max_num_tiles=max_num_tiles,
+                        resample=resample,
+                    )
+                    image_tiles = self._split(images_for_tiling, tiles_w, tiles_h)
+                    # Thumbnail goes first along the tile axis, followed by the grid tiles.
+                    stacked_images = torch.cat(
+                        [thumbnails.unsqueeze(1), image_tiles], dim=1
+                    )
+                    aspect_ratio = [tiles_w, tiles_h]
+                else:  # vanilla single tile for low memory devices
+                    stacked_images, _ = self.resize(
+                        stacked_images,
+                        tile_size,
+                        max_num_tiles=1,
+                        resample=resample,
+                    )
+            resized_images_grouped[shape] = stacked_images
+        resized_images = reorder_images(resized_images_grouped, grouped_images_index)
+        # Regroup: resizing may have changed which images share a shape.
+        grouped_images, grouped_images_index = group_images_by_shape(
+            resized_images, disable_grouping=disable_grouping
+        )
+        processed_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            # Fused rescale and normalize
+            stacked_images = self.rescale_and_normalize(
+                stacked_images,
+                do_rescale,
+                rescale_factor,
+                do_normalize,
+                image_mean,
+                image_std,
+            )
+            processed_images_grouped[shape] = stacked_images
+        processed_images = reorder_images(
+            processed_images_grouped, grouped_images_index
+        )
+        processed_images = [
+            p[None] if p.ndim == 3 else p for p in processed_images
+        ]  # add tiles dimension if needed
+        # Stacking requires every image to have the same number of tiles.
+        processed_images = (
+            torch.stack(processed_images, dim=0) if return_tensors else processed_images
+        )
+        return BatchFeature(
+            data={"pixel_values": processed_images, "aspect_ratio": aspect_ratio},
+            tensor_type=return_tensors,
+        )
+__all__ = ["PerceptionLMImageProcessorFast"]

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9314ba927fcef833a56fd3e3d664ff4093e27af6c8415ce808951a34061f393b
+size 3068342248

modeling_gar.py ADDED Viewed

	@@ -0,0 +1,352 @@

+from typing import List, Optional, Tuple, Union
+from torch import nn
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.utils import logging
+from typing import Optional, Union
+import torch
+import torchvision
+from torch import nn
+from einops import rearrange
+from transformers.modeling_utils import PreTrainedModel
+from transformers import GenerationConfig
+from .configuration_gar import GARConfig
+from .modeling_perception_lm import PerceptionLMForConditionalGeneration
+logger = logging.get_logger(__name__)
+class GARModel(PreTrainedModel):
+    config_class = GARConfig
+    main_input_name = 'pixel_values'
+    base_model_prefix = 'language_model'
+    _no_split_modules = ['LlamaDecoderLayer']
+    _supports_flash_attn_2 = True
+    supports_gradient_checkpointing = True
+    def __init__(
+        self,
+        config: GARConfig,
+        mllm=None,
+        mask_patch_embedding=None,
+        use_flash_attn=True,
+    ):
+        super().__init__(config)
+        use_flash_attn = use_flash_attn
+        config.mllm_config.use_flash_attn = True if use_flash_attn else False
+        config.mllm_config.text_config.use_flash_attn = True if use_flash_attn else False
+        config.mllm_config.vision_config.use_flash_attn = False
+        config.mllm_config._attn_implementation = 'flash_attention_2' if use_flash_attn else 'eager'
+        config.mllm_config.vision_config._attn_implementation = 'eager'
+        self.prompt_numbers = config.prompt_numbers
+        if mllm is not None:
+            self.mllm = mllm
+        else:
+            self.mllm = PerceptionLMForConditionalGeneration(config.mllm_config)
+        if mask_patch_embedding is not None:
+            self.mask_patch_embedding = mask_patch_embedding
+        else:
+            self.mask_patch_embedding = nn.Conv2d(
+                in_channels=3,
+                out_channels=config.mask_path_embedding_out_channels,
+                kernel_size=config.kernel_size,
+                stride=config.kernel_size,
+                bias=False,
+            )
+        self.crop_tokens_ids = config.crop_tokens_ids
+    @property
+    def lm_head(self):
+        return self.mllm.model.language_model.get_output_embeddings()
+    def get_input_embeddings(self):
+        return self.mllm.model.language_model.get_input_embeddings()
+    def get_output_embeddings(self):
+        return self.mllm.model.language_model.get_output_embeddings()
+    def forward(self, data, data_samples=None, mode='loss'):
+        crop_tokens = self.crop_tokens_ids
+        # (batch_size, num_tiles, channels, height, width)
+        pixel_values = data['pixel_values'].to(self.mllm.device).to(self.mllm.dtype)
+        mask_values = torch.round((data['global_mask_values'] + 1.) / 2. * 255.).long().to(self.mllm.device)
+        mask_values = torch.clamp(mask_values, min=0, max=self.prompt_numbers)
+        assert mask_values.max() < self.prompt_numbers + 1 and mask_values.min() >= 0
+        mask_embeds = self.mask_patch_embedding((mask_values != self.prompt_numbers).to(self.mllm.dtype))     # binary mask
+        input_ids = data['input_ids']
+        aspect_ratios = data['aspect_ratios']
+        bboxes = data['bboxes']
+        assert input_ids.shape[0] == 1, "Currently only support batch_size=1"
+        inputs_embeds = self.mllm.get_input_embeddings()(input_ids)
+        labels = data['labels']
+        image_features = None
+        if pixel_values is not None:
+            image_features = self.mllm.get_image_features(
+                pixel_values=pixel_values.unsqueeze(0),
+                mask_embeds=mask_embeds,
+            )
+            image_features = image_features.to(inputs_embeds.device, dtype=inputs_embeds.dtype)
+            special_image_mask, _ = self.mllm.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+        # feature replay
+        new_inputs_embeds = []
+        new_labels = []
+        image_features_tiles = rearrange(image_features[1:].unsqueeze(0), 'b n (h w) c -> b n c h w', h=16, w=16)
+        for batch_idx in range(inputs_embeds.shape[0]):
+            curr_inputs_embeds = inputs_embeds[batch_idx]
+            curr_labels = labels[batch_idx]
+            for crop_token in crop_tokens:
+                if crop_token in input_ids[batch_idx]:
+                    target_mask = input_ids[batch_idx].eq(crop_token)
+                    target_indices = target_mask.nonzero().squeeze()
+                    head_idx = target_indices.min().item()
+                    tail_idx = target_indices.max().item()
+                    image_features_recover = self._merge(image_features_tiles, aspect_ratios[batch_idx][0], aspect_ratios[batch_idx][1])
+                    feat_h, feat_w = image_features_recover.shape[2:]
+                    x1, y1, x2, y2 = bboxes[batch_idx][str(crop_token)]
+                    orig_h, orig_w = feat_h * 28, feat_w * 28
+                    # origin box
+                    roi_orig_x1 = x1 * orig_w
+                    roi_orig_y1 = y1 * orig_h
+                    roi_orig_x2 = x2 * orig_w
+                    roi_orig_y2 = y2 * orig_h
+                    # feat box
+                    spatial_scale = feat_w / orig_w
+                    roi_feat_x1 = roi_orig_x1 * spatial_scale
+                    roi_feat_y1 = roi_orig_y1 * spatial_scale
+                    roi_feat_x2 = roi_orig_x2 * spatial_scale
+                    roi_feat_y2 = roi_orig_y2 * spatial_scale
+                    roi = torch.tensor(
+                        [0, roi_feat_x1, roi_feat_y1, roi_feat_x2, roi_feat_y2],
+                        dtype=torch.float32, device=image_features_recover.device,
+                    )
+                    roi_features = torchvision.ops.roi_align(
+                        input=image_features_recover.float(),
+                        boxes=roi.unsqueeze(0),
+                        output_size=(16, 16),
+                        spatial_scale=spatial_scale,
+                        sampling_ratio=2,
+                        aligned=True,
+                    )
+                    image_features_replay = roi_features.permute(0, 2, 3, 1).flatten(1, 2).to(image_features_recover.dtype).squeeze()
+                    curr_inputs_embeds = torch.cat([
+                        curr_inputs_embeds[:head_idx],
+                        image_features_replay,
+                        curr_inputs_embeds[tail_idx+1:],
+                    ])
+                    curr_labels = torch.cat([
+                        curr_labels[:head_idx],
+                        -100 * torch.ones(image_features_replay.shape[0], dtype=torch.long, device=labels.device),
+                        curr_labels[tail_idx+1:],
+                    ])
+                    assert curr_inputs_embeds.shape[0] == curr_labels.shape[0], f"shape mismatch, got {curr_inputs_embeds.shape[0]} != {curr_labels.shape[0]}"
+            new_inputs_embeds.append(curr_inputs_embeds.unsqueeze(0))
+            new_labels.append(curr_labels)
+        inputs_embeds = torch.cat(new_inputs_embeds, dim=0)
+        labels = torch.cat(new_labels, dim=0)
+        skip_this_batch = False
+        if mode == "loss":
+            position_ids = torch.arange(0, inputs_embeds.shape[1], dtype=torch.long, device=inputs_embeds.device).unsqueeze(0).repeat(inputs_embeds.shape[0], 1)
+            attention_mask = torch.ones(inputs_embeds.shape[0], inputs_embeds.shape[1], dtype=torch.long, device=inputs_embeds.device)
+            use_cache = False
+            outputs, _skip_this_case = self._llm_forward(
+                inputs_embeds=inputs_embeds,
+                position_ids=position_ids,
+                attention_mask=attention_mask,
+                labels=labels,
+                use_cache=use_cache
+            )
+            if skip_this_batch or _skip_this_case:
+                print("skip this batch!")
+                loss_dict = {'loss': outputs.loss * 0.0}
+            else:
+                loss_dict = {'loss': outputs.loss}
+            return loss_dict
+        elif mode == "predict":
+            pass
+        elif mode == "tensor":
+            pass
+        else:
+            raise NotImplementedError
+        return outputs
+    def _merge(self, tiles: torch.Tensor, ncw: int, nch: int) -> torch.Tensor:
+        batch_size, num_tiles, num_channels, tile_height, tile_width = tiles.size()
+        assert num_tiles == ncw * nch, f"{ncw * nch} != {num_tiles}"
+        tiles = tiles.view(batch_size, nch, ncw, num_channels, tile_height, tile_width)
+        tiles = tiles.permute(0, 3, 1, 4, 2, 5).contiguous()
+        original_height = nch * tile_height
+        original_width = ncw * tile_width
+        image = tiles.view(batch_size, num_channels, original_height, original_width)
+        return image
+    def _llm_forward(
+        self,
+        inputs_embeds: torch.FloatTensor,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        image_flags: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        return_dict = return_dict if return_dict is not None \
+            else self.mllm.config.use_return_dict
+        skip_this_case = False
+        outputs = self.mllm(
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            labels=labels,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        return outputs, skip_this_case
+    @torch.no_grad()
+    def generate(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        global_mask_values: Optional[torch.LongTensor] = None,
+        aspect_ratios: Optional[torch.FloatTensor] = None,
+        bboxes: Optional[torch.FloatTensor] = None,
+        input_ids: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        generation_config: Optional[GenerationConfig] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **generate_kwargs,
+    ) -> torch.LongTensor:
+        """Generate text, replaying RoI-aligned image features at crop-token positions.
+        When `pixel_values` is given, image features are scattered into the image
+        placeholder positions and, for every id in `self.crop_tokens_ids` found in
+        `input_ids`, the span from its first to its last occurrence is replaced by a
+        16x16 grid (256 rows) of features RoI-aligned from the merged tile feature map.
+        When `pixel_values` is None, only the token embeddings of `input_ids` are used.
+        Args:
+            pixel_values: Image tiles; a leading dimension is added before encoding.
+            global_mask_values: Optional mask; mapped with (x + 1) / 2 * 255, rounded,
+                clamped to [0, prompt_numbers] and binarised as `!= prompt_numbers`.
+            aspect_ratios: Indexed as `[batch_idx][0]` / `[batch_idx][1]` and passed to
+                `_merge` as `ncw` / `nch`.
+            bboxes: Indexed as `[batch_idx][str(crop_token)]` and unpacked as
+                `(x1, y1, x2, y2)`; presumably normalised to [0, 1] -- TODO confirm.
+            input_ids: Token ids (annotated as FloatTensor, but used for embedding
+                lookup and id comparison).
+            attention_mask: Forwarded unchanged to `self.mllm.generate`.
+            generation_config: Forwarded to `self.mllm.generate`.
+            output_hidden_states: Forwarded to `self.mllm.generate`.
+            return_dict: Accepted but not used.
+            **generate_kwargs: Accepted but NOT forwarded to `self.mllm.generate`.
+        Returns:
+            Whatever `self.mllm.generate` returns with `return_dict_in_generate=True`
+            (the `torch.LongTensor` annotation looks inaccurate -- TODO confirm).
+        """
+        device = self.device
+        if pixel_values is not None:
+            pixel_values = pixel_values.to(device).to(self.mllm.dtype)
+            if global_mask_values is not None:
+                # -1 -> 0 and 1 -> 255 before rounding; then clamp to [0, prompt_numbers].
+                mask_values = torch.round((global_mask_values + 1.) / 2. * 255.).long().to(device)
+                mask_values = torch.clamp(mask_values, min=0, max=self.prompt_numbers)
+                assert mask_values.max() < self.prompt_numbers + 1 and mask_values.min() >= 0, f"max: {mask_values.max()}, min: {mask_values.min()}"
+                # Binary mask: 1 wherever the value differs from `prompt_numbers`.
+                mask_embeds = self.mask_patch_embedding((mask_values != self.prompt_numbers).to(self.mllm.dtype))
+            else:
+                mask_embeds = None
+            inputs_embeds = self.mllm.get_input_embeddings()(input_ids)
+            image_features = self.mllm.get_image_features(
+                pixel_values=pixel_values.unsqueeze(0),
+                mask_embeds=mask_embeds,
+            )
+            image_features = image_features.to(inputs_embeds.device, dtype=inputs_embeds.dtype)
+            special_image_mask, _ = self.mllm.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+            # feature replay
+            new_inputs_embeds = []
+            # The first entry of image_features is skipped (presumably the global thumbnail
+            # tile -- TODO confirm); each remaining tile becomes a 16x16 feature grid.
+            image_features_tiles = rearrange(image_features[1:].unsqueeze(0), 'b n (h w) c -> b n c h w', h=16, w=16)
+            for batch_idx in range(inputs_embeds.shape[0]):
+                curr_inputs_embeds = inputs_embeds[batch_idx]
+                for crop_token in self.crop_tokens_ids:
+                    if crop_token in input_ids[batch_idx]:
+                        # First and last position of this crop token delimit the span to replace.
+                        target_mask = input_ids[batch_idx].eq(crop_token)
+                        target_indices = target_mask.nonzero().squeeze()
+                        head_idx = target_indices.min().item()
+                        tail_idx = target_indices.max().item()
+                        image_features_recover = self._merge(image_features_tiles, aspect_ratios[batch_idx][0], aspect_ratios[batch_idx][1])
+                        feat_h, feat_w = image_features_recover.shape[2:]
+                        x1, y1, x2, y2 = bboxes[batch_idx][str(crop_token)]
+                        orig_h, orig_w = feat_h * 28, feat_w * 28
+                        # origin box
+                        roi_orig_x1 = x1 * orig_w
+                        roi_orig_y1 = y1 * orig_h
+                        roi_orig_x2 = x2 * orig_w
+                        roi_orig_y2 = y2 * orig_h
+                        # feat box
+                        spatial_scale = feat_w / orig_w
+                        roi_feat_x1 = roi_orig_x1 * spatial_scale
+                        roi_feat_y1 = roi_orig_y1 * spatial_scale
+                        roi_feat_x2 = roi_orig_x2 * spatial_scale
+                        roi_feat_y2 = roi_orig_y2 * spatial_scale
+                        roi = torch.tensor(
+                            [0, roi_feat_x1, roi_feat_y1, roi_feat_x2, roi_feat_y2],
+                            dtype=torch.float32, device=image_features_recover.device,
+                        )
+                        # NOTE(review): the box is already multiplied by `spatial_scale` above
+                        # and roi_align applies `spatial_scale` again. Released weights
+                        # presumably depend on this -- confirm before changing.
+                        roi_features = torchvision.ops.roi_align(
+                            input=image_features_recover.float(),
+                            boxes=roi.unsqueeze(0),
+                            output_size=(16, 16),
+                            spatial_scale=spatial_scale,
+                            sampling_ratio=2,
+                            aligned=True,
+                        )
+                        image_features_replay = roi_features.permute(0, 2, 3, 1).flatten(1, 2).to(image_features_recover.dtype).squeeze()
+                        curr_inputs_embeds = torch.cat([
+                            curr_inputs_embeds[:head_idx],
+                            image_features_replay,
+                            curr_inputs_embeds[tail_idx+1:],
+                        ])
+                new_inputs_embeds.append(curr_inputs_embeds.unsqueeze(0))
+            inputs_embeds = torch.cat(new_inputs_embeds, dim=0)
+        else:
+            inputs_embeds = self.mllm.get_input_embeddings()(input_ids)
+        # NOTE(review): `attention_mask` is passed through unchanged; it only matches
+        # `inputs_embeds` if every replaced span was already 256 positions long -- confirm.
+        # NOTE(review): `generate_kwargs` (e.g. a caller's max_new_tokens) is dropped here.
+        outputs = self.mllm.generate(
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            generation_config=generation_config,
+            output_hidden_states=output_hidden_states,
+            # return_dict=return_dict,
+            use_cache=True,
+            return_dict_in_generate=True,
+        )
+        return outputs

modeling_perception_lm.py ADDED Viewed

	@@ -0,0 +1,865 @@

+# *************************************************************************
+# This file may have been modified by Bytedance Inc. (“Bytedance Inc.'s Mo-
+# difications”). All Bytedance Inc.'s Modifications are Copyright (2025) B-
+# ytedance Inc..
+# *************************************************************************
+# Adapted from https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/perception_lm/modeling_perception_lm.py
+# coding=utf-8
+# Copyright 2025 Meta Platforms, Inc. and the HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from dataclasses import dataclass
+from typing import Optional, Union
+import torch
+import torch.nn.functional as F
+import torchvision
+from einops import rearrange
+from timm.models._manipulate import checkpoint
+from torch import nn
+from transformers import AutoModel, PerceptionLMConfig
+from transformers.generation import GenerationMixin
+from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import auto_docstring, can_return_tuple
+class PerceptionLMAdaptiveAvgPooling(nn.Module):
+    def __init__(self, pooling_ratio=2):
+        super().__init__()
+        self.pooling_ratio = pooling_ratio
+    def forward(self, hidden_states):
+        b, num_tokens, c = hidden_states.shape
+        h = int(math.sqrt(num_tokens))
+        if h * h != num_tokens:
+            raise ValueError(
+                f"num_tokens {num_tokens} is expected to be a square number"
+            )
+        shape = (h // self.pooling_ratio, h // self.pooling_ratio)
+        hidden_states = hidden_states.permute(0, 2, 1).reshape(b, -1, h, h)
+        hidden_states = F.adaptive_avg_pool2d(hidden_states, shape)
+        hidden_states = hidden_states.flatten(2).transpose(1, 2)
+        return hidden_states
+class PerceptionLMMultiModalProjector(nn.Module):
+    def __init__(self, config: PerceptionLMConfig):
+        super().__init__()
+        input_size = config.vision_config.model_args["embed_dim"]
+        output_size = config.text_config.hidden_size
+        self.linear_1 = nn.Linear(
+            in_features=input_size,
+            out_features=output_size,
+            bias=True,
+        )
+        self.gelu = nn.GELU()
+        self.linear_2 = nn.Linear(
+            in_features=output_size,
+            out_features=output_size,
+            bias=True,
+        )
+        self.pooling = (
+            PerceptionLMAdaptiveAvgPooling(config.projector_pooling_ratio)
+            if config.projector_pooling_ratio > 1
+            else nn.Identity()
+        )
+    def forward(self, features):
+        features = features.permute(1, 0, 2)  # NLD -> LND
+        features = self.linear_1(features)
+        features = self.gelu(features)
+        features = self.linear_2(features)
+        features = features.permute(1, 0, 2)  # LND -> NLD
+        features = self.pooling(features)
+        return features
+# Shared base for the PerceptionLM models in this file. It only declares class-level
+# settings; none of them is read in the visible part of this file, so they are
+# presumably consumed by the transformers `PreTrainedModel` machinery -- TODO confirm.
+@auto_docstring
+class PerceptionLMPreTrainedModel(PreTrainedModel):
+    config: PerceptionLMConfig
+    # PerceptionLMForConditionalGeneration stores its backbone under `self.model`.
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _skip_keys_device_placement = "past_key_values"
+    # Capability flags (attention backends / compilation) advertised to the framework.
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _can_compile_fullgraph = True
+    _supports_flex_attn = True
+    _supports_attention_backend = True
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for PerceptionLM outputs, with hidden states and attentions.
+    """
+)
+class PerceptionLMModelOutputWithPast(BaseModelOutputWithPast):
+    r"""
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        Image hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    video_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_videos, sequence_length, hidden_size)`.
+        Video hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    """
+    # Fields added on top of BaseModelOutputWithPast. PerceptionLMModel.forward fills each
+    # one only when the matching pixel input was supplied; otherwise it stays None.
+    image_hidden_states: Optional[torch.FloatTensor] = None
+    video_hidden_states: Optional[torch.FloatTensor] = None
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for PerceptionLM causal language model (or autoregressive) outputs.
+    """
+)
+class PerceptionLMCausalLMOutputWithPast(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        Image hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    video_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_videos, sequence_length, hidden_size)`.
+        Video hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    """
+    # Every field is optional and defaults to None.
+    # NOTE: `hidden_states` and `attentions` are declared below but are not described in
+    # the docstring above.
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[list[torch.FloatTensor]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[torch.FloatTensor] = None
+    video_hidden_states: Optional[torch.FloatTensor] = None
+@auto_docstring
+class PerceptionLMModel(PerceptionLMPreTrainedModel):
+    _checkpoint_conversion_mapping = {}
+    def __init__(self, config: PerceptionLMConfig):
+        # Builds the vision tower, replaces the timm model's `forward_features` with a
+        # variant that accepts `mask_embeds`, then builds the projector and language model.
+        super().__init__(config)
+        self.vision_tower = AutoModel.from_config(config.vision_config)
+        def custom_forward_features(
+            self,
+            x: torch.Tensor,
+            mask_embeds: Optional[torch.Tensor] = None,
+        ) -> torch.Tensor:
+            """Forward pass through feature extraction layers.
+            Here `self` is the timm model this function gets bound to below, not the
+            enclosing PerceptionLMModel.
+            Args:
+                x: Input tensor.
+                mask_embeds: Optional tensor that is flattened from dim 2, transposed to
+                    token-major layout and added to the patch embeddings before the
+                    positional embedding is applied.
+            Returns:
+                Feature tensor.
+            """
+            x = self.patch_embed(x)
+            if mask_embeds is not None:
+                # Inject the mask signal right after patch embedding ("pre-mask").
+                x = x + mask_embeds.flatten(2).transpose(1, 2)
+            x, rot_pos_embed = self._pos_embed(x)
+            x = self.norm_pre(x)
+            if getattr(self, "rope_mixed", False) and rot_pos_embed is not None:
+                # Handle depth-dependent embeddings for mixed mode
+                # pos embed has shape (depth, num_heads, H*W, dim) or (depth, batch_size, num_heads, H*W, dim)
+                for i, blk in enumerate(self.blocks):
+                    if self.grad_checkpointing and not torch.jit.is_scripting():
+                        x = checkpoint(blk, x, rope=rot_pos_embed[i])
+                    else:
+                        x = blk(x, rope=rot_pos_embed[i])
+            else:
+                # Standard path for non-mixed mode
+                for blk in self.blocks:
+                    if self.grad_checkpointing and not torch.jit.is_scripting():
+                        x = checkpoint(blk, x, rope=rot_pos_embed)
+                    else:
+                        x = blk(x, rope=rot_pos_embed)
+            x = self.norm(x)
+            return x
+        # Bind the function to the timm model instance so it is called as a method and
+        # shadows the original `forward_features` on that instance only.
+        self.vision_tower.timm_model.forward_features = custom_forward_features.__get__(
+            self.vision_tower.timm_model
+        )
+        self.multi_modal_projector = PerceptionLMMultiModalProjector(config)
+        self.language_model = AutoModel.from_config(config.text_config)
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+    def set_input_embeddings(self, value):
+        self.language_model.set_input_embeddings(value)
+    def set_decoder(self, decoder):
+        """Replace the wrapped language model with `decoder`."""
+        self.language_model = decoder
+    def get_decoder(self):
+        """Return the wrapped language model."""
+        return self.language_model
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        mask_embeds: Optional[torch.FloatTensor] = None,
+        **kwargs,
+    ):
+        """
+        Obtains image last hidden states from the vision tower and apply multimodal projection.
+        Args:
+            pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_tiles, channels, height, width)`)
+               The tensors corresponding to the input images.
+        Returns:
+            image_features (`torch.Tensor`): Image feature tensor of shape `(num_tiles, num_patches, embed_dim)`).
+        """
+        if len(pixel_values.shape) == 5:
+            pixel_values = pixel_values.flatten(0, 1)
+        assert (
+            len(pixel_values.shape) == 4
+        ), f"pixel_values should be of shape (batch_size * num_tiles, channels, height, width). But got {pixel_values.shape}."
+        # pre-mask
+        image_outputs = self.vision_tower(pixel_values, mask_embeds=mask_embeds)
+        # image_outputs = self.vision_tower(pixel_values)
+        image_outputs = image_outputs.last_hidden_state
+        if self.config.vision_use_cls_token:
+            image_outputs = image_outputs[:, 1:, :]
+        # post-mask
+        # if mask_embeds is not None:
+        #     image_outputs = image_outputs + mask_embeds.flatten(2).transpose(1, 2)
+        image_features = self.multi_modal_projector(image_outputs)
+        return image_features
+    def get_placeholder_mask(
+        self,
+        input_ids: torch.LongTensor,
+        inputs_embeds: torch.FloatTensor,
+        image_features: torch.FloatTensor = None,
+        video_features: torch.FloatTensor = None,
+    ):
+        """
+        Obtains multimodal placeholdr mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+        """
+        if input_ids is None:
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(
+                    self.config.image_token_id,
+                    dtype=torch.long,
+                    device=inputs_embeds.device,
+                )
+            )
+            special_image_mask = special_image_mask.all(-1)
+            special_video_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(
+                    self.config.video_token_id,
+                    dtype=torch.long,
+                    device=inputs_embeds.device,
+                )
+            )
+            special_video_mask = special_video_mask.all(-1)
+        else:
+            special_image_mask = input_ids == self.config.image_token_id
+            special_video_mask = input_ids == self.config.video_token_id
+        n_image_tokens = special_image_mask.sum()
+        special_image_mask = (
+            special_image_mask.unsqueeze(-1)
+            .expand_as(inputs_embeds)
+            .to(inputs_embeds.device)
+        )
+        if (
+            image_features is not None
+            and inputs_embeds[special_image_mask].numel() != image_features.numel()
+        ):
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.size()[:-1].numel()}"
+            )
+        n_video_tokens = special_video_mask.sum()
+        special_video_mask = (
+            special_video_mask.unsqueeze(-1)
+            .expand_as(inputs_embeds)
+            .to(inputs_embeds.device)
+        )
+        if (
+            video_features is not None
+            and inputs_embeds[special_video_mask].numel() != video_features.numel()
+        ):
+            raise ValueError(
+                f"Videos features and image tokens do not match: tokens: {n_video_tokens}, features {video_features.size()[:-1].numel()}"
+            )
+        return special_image_mask, special_video_mask
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        mask_embeds: Optional[torch.FloatTensor] = None,
+        pixel_values_videos: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,  # need
+        position_ids: Optional[torch.LongTensor] = None,  # need
+        past_key_values: Optional[list[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,  # need
+        use_cache: Optional[bool] = None,  # need
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **lm_kwargs,
+    ) -> Union[tuple, PerceptionLMModelOutputWithPast]:
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError(
+                "You must specify exactly one of input_ids or inputs_embeds"
+            )
+        if (
+            pixel_values is not None or pixel_values_videos is not None
+        ) and inputs_embeds is not None:
+            raise ValueError(
+                "You cannot specify both (pixel_values or pixel_values_videos) and inputs_embeds at the same time, and must specify either one"
+            )
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+        image_features = None
+        if pixel_values is not None:
+            image_features = self.get_image_features(
+                pixel_values=pixel_values, mask_embeds=mask_embeds
+            )
+            image_features = image_features.to(
+                inputs_embeds.device, dtype=inputs_embeds.dtype
+            )
+            special_image_mask, _ = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(
+                special_image_mask, image_features
+            )
+        video_features = None
+        if pixel_values_videos is not None:
+            video_features = self.get_image_features(pixel_values=pixel_values_videos)
+            video_features = video_features.to(
+                inputs_embeds.device, dtype=inputs_embeds.dtype
+            )
+            _, special_video_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, video_features=video_features
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(
+                special_video_mask, video_features
+            )
+        outputs = self.language_model(
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **lm_kwargs,
+        )
+        return PerceptionLMModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            hidden_states=outputs.hidden_states,
+            past_key_values=outputs.past_key_values,
+            attentions=outputs.attentions,
+            image_hidden_states=image_features if pixel_values is not None else None,
+            video_hidden_states=(
+                video_features if pixel_values_videos is not None else None
+            ),
+        )
+@auto_docstring
+class PerceptionLMForConditionalGeneration(
+    PerceptionLMPreTrainedModel, GenerationMixin
+):
+    _checkpoint_conversion_mapping = {}
+    _tied_weights_keys = ["lm_head.weight"]
+    def __init__(self, config: PerceptionLMConfig):
+        super().__init__(config)
+        self.model = PerceptionLMModel(config)
+        self.lm_head = nn.Linear(
+            config.text_config.hidden_size, config.text_config.vocab_size, bias=False
+        )
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.model.get_input_embeddings()
+    def set_input_embeddings(self, value):
+        self.model.set_input_embeddings(value)
+    def get_output_embeddings(self) -> nn.Module:
+        """Return the LM head (`self.lm_head`)."""
+        return self.lm_head
+    def set_decoder(self, decoder):
+        self.model.set_decoder(decoder)
+    def get_decoder(self):
+        return self.model.get_decoder()
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        mask_embeds: Optional[torch.FloatTensor] = None,
+        **kwargs,
+    ):
+        return self.model.get_image_features(
+            pixel_values=pixel_values, mask_embeds=mask_embeds, **kwargs
+        )
+    def get_placeholder_mask(
+        self,
+        input_ids: torch.LongTensor,
+        inputs_embeds: torch.FloatTensor,
+        image_features: torch.FloatTensor = None,
+        video_features: torch.FloatTensor = None,
+    ):
+        return self.model.get_placeholder_mask(
+            input_ids=input_ids,
+            inputs_embeds=inputs_embeds,
+            image_features=image_features,
+            video_features=video_features,
+        )
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,  # no need
+        pixel_values: Optional[torch.FloatTensor] = None,  # no need
+        pixel_values_videos: Optional[torch.FloatTensor] = None,  # no need
+        attention_mask: Optional[torch.Tensor] = None,  # need
+        position_ids: Optional[torch.LongTensor] = None,  # need
+        past_key_values: Optional[list[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,  # need
+        labels: Optional[torch.LongTensor] = None,  # need
+        use_cache: Optional[bool] = None,  # need
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **lm_kwargs,
+    ) -> Union[tuple, PerceptionLMCausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        Example:
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, PerceptionLMForConditionalGeneration
+        >>> model = PerceptionLMForConditionalGeneration.from_pretrained("perception_lm-hf/perception_lm-1.5-7b-hf")
+        >>> processor = AutoProcessor.from_pretrained("perception_lm-hf/perception_lm-1.5-7b-hf")
+        >>> prompt = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
+        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "USER:  \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed"
+        ```"""
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            pixel_values_videos=pixel_values_videos,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **lm_kwargs,
+        )
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = (
+            slice(-logits_to_keep, None)
+            if isinstance(logits_to_keep, int)
+            else logits_to_keep
+        )
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(
+                logits=logits,
+                labels=labels,
+                vocab_size=self.config.text_config.vocab_size,
+                **lm_kwargs,
+            )
+        return PerceptionLMCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=outputs.image_hidden_states,
+            video_hidden_states=outputs.video_hidden_states,
+        )
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        pixel_values=None,
+        mask_embeds=None,
+        pixel_values_videos=None,
+        attention_mask=None,
+        cache_position=None,
+        logits_to_keep=None,
+        feature_replay=None,
+        feature_replay_video=None,
+        crop_tokens=[128004],
+        roi_align=None,
+        bboxes=None,
+        aspect_ratios=True,
+        processor=None,
+        **kwargs,
+    ):
+        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+        assert not (feature_replay and feature_replay_video)
+        if cache_position[0] == 0:
+            inputs_embeds = model_inputs["inputs_embeds"]
+            if inputs_embeds is None:
+                inputs_embeds = self.get_input_embeddings()(input_ids)
+            image_features = None
+            if pixel_values is not None:
+                image_features = self.get_image_features(
+                    pixel_values=pixel_values, mask_embeds=mask_embeds
+                )
+                image_features = image_features.to(
+                    inputs_embeds.device, dtype=inputs_embeds.dtype
+                )
+                special_image_mask, _ = self.get_placeholder_mask(
+                    input_ids,
+                    inputs_embeds=inputs_embeds,
+                    image_features=image_features,
+                )
+                inputs_embeds = inputs_embeds.masked_scatter(
+                    special_image_mask, image_features
+                )
+            video_features = None
+            if pixel_values_videos is not None:
+                video_features = self.get_image_features(
+                    pixel_values=pixel_values_videos
+                )
+                video_features = video_features.to(
+                    inputs_embeds.device, dtype=inputs_embeds.dtype
+                )
+                _, special_video_mask = self.get_placeholder_mask(
+                    input_ids,
+                    inputs_embeds=inputs_embeds,
+                    video_features=video_features,
+                )
+                inputs_embeds = inputs_embeds.masked_scatter(
+                    special_video_mask, video_features
+                )
+            if feature_replay:
+                assert (
+                    inputs_embeds.shape[0] == 1
+                ), "Currently only support batch_size=1 for feature replay"
+                def _merge(tiles: torch.Tensor, ncw: int, nch: int) -> torch.Tensor:
+                    # merge image tiles to the original image
+                    # input: (batch_size, ncw * nch, num_channels, height//nch, width//ncw)
+                    # output: (batch_size, num_channels, height, width)
+                    batch_size, num_tiles, num_channels, tile_height, tile_width = (
+                        tiles.size()
+                    )
+                    assert num_tiles == ncw * nch, f"{ncw * nch} != {num_tiles}"
+                    tiles = tiles.view(
+                        batch_size, nch, ncw, num_channels, tile_height, tile_width
+                    )
+                    tiles = tiles.permute(0, 3, 1, 4, 2, 5).contiguous()
+                    original_height = nch * tile_height
+                    original_width = ncw * tile_width
+                    image = tiles.view(
+                        batch_size, num_channels, original_height, original_width
+                    )
+                    return image
+                new_inputs_embeds = []
+                image_features_tiles = rearrange(
+                    image_features[1:].unsqueeze(0),
+                    "b n (h w) c -> b n c h w",
+                    h=16,
+                    w=16,
+                )
+                for batch_idx in range(inputs_embeds.shape[0]):
+                    curr_inputs_emebds = inputs_embeds[batch_idx]
+                    for crop_token in crop_tokens:
+                        if crop_token in input_ids[batch_idx]:
+                            target_mask = input_ids[batch_idx].eq(crop_token)
+                            target_indices = target_mask.nonzero().squeeze()
+                            head_idx = target_indices.min().item()
+                            tail_idx = target_indices.max().item()
+                            image_features_recover = _merge(
+                                image_features_tiles,
+                                aspect_ratios[batch_idx][0],
+                                aspect_ratios[batch_idx][1],
+                            )
+                            x1, y1, x2, y2 = bboxes[batch_idx][str(crop_token)]
+                            feat_h, feat_w = image_features_recover.shape[2:]
+                            orig_h, orig_w = feat_h * 28, feat_w * 28  # 原图尺寸
+                            # origin box
+                            roi_orig_x1 = x1 * orig_w
+                            roi_orig_y1 = y1 * orig_h
+                            roi_orig_x2 = x2 * orig_w
+                            roi_orig_y2 = y2 * orig_h
+                            # feat box
+                            spatial_scale = feat_w / orig_w
+                            roi_feat_x1 = roi_orig_x1 * spatial_scale
+                            roi_feat_y1 = roi_orig_y1 * spatial_scale
+                            roi_feat_x2 = roi_orig_x2 * spatial_scale
+                            roi_feat_y2 = roi_orig_y2 * spatial_scale
+                            roi = torch.tensor(
+                                [0, roi_feat_x1, roi_feat_y1, roi_feat_x2, roi_feat_y2],
+                                dtype=torch.float32,
+                                device=image_features_recover.device,
+                            )
+                            roi_features = torchvision.ops.roi_align(
+                                input=image_features_recover.float(),
+                                boxes=roi.unsqueeze(0),
+                                output_size=(16, 16),
+                                spatial_scale=spatial_scale,
+                                sampling_ratio=2,
+                                aligned=True,
+                            )
+                            image_features_replay = (
+                                roi_features.permute(0, 2, 3, 1)
+                                .flatten(1, 2)
+                                .to(image_features_recover.dtype)
+                                .squeeze()
+                            )
+                            curr_inputs_emebds = torch.cat(
+                                [
+                                    inputs_embeds[batch_idx][:head_idx],
+                                    image_features_replay,
+                                    inputs_embeds[batch_idx][tail_idx + 1 :],
+                                ]
+                            )
+                    new_inputs_embeds.append(curr_inputs_emebds.unsqueeze(0))
+                inputs_embeds = torch.cat(new_inputs_embeds, dim=0)
+                model_inputs["position_ids"] = (
+                    torch.arange(
+                        0,
+                        inputs_embeds.shape[1],
+                        dtype=torch.long,
+                        device=inputs_embeds.device,
+                    )
+                    .unsqueeze(0)
+                    .repeat(inputs_embeds.shape[0], 1)
+                )
+                model_inputs["attention_mask"] = torch.ones(
+                    inputs_embeds.shape[0],
+                    inputs_embeds.shape[1],
+                    dtype=torch.long,
+                    device=inputs_embeds.device,
+                )
+                model_inputs["cache_position"] = model_inputs["position_ids"].clone()
+            elif feature_replay_video:
+                assert (
+                    inputs_embeds.shape[0] == 1
+                ), "Currently only support batch_size=1 for feature replay"
+                assert processor is not None, "Need processor"
+                new_inputs_embeds = []
+                image_features_tiles = rearrange(
+                    image_features.unsqueeze(0), "b n (h w) c -> b n c h w", h=16, w=16
+                )
+                for batch_idx in range(inputs_embeds.shape[0]):
+                    curr_inputs_emebds = inputs_embeds[batch_idx]
+                    for frame_idx in range(image_features.shape[0]):
+                        crop_token = processor.tokenizer.convert_tokens_to_ids(
+                            f"<|reserved_special_token_{2 + frame_idx}|>"
+                        )
+                        if crop_token in input_ids[batch_idx]:
+                            target_mask = input_ids[batch_idx].eq(crop_token)
+                            target_indices = target_mask.nonzero().squeeze()
+                            head_idx = target_indices.min().item()
+                            tail_idx = target_indices.max().item()
+                            x1, y1, x2, y2 = bboxes[batch_idx][str(crop_token)]
+                            feat_h, feat_w = 16, 16
+                            orig_h, orig_w = feat_h * 28, feat_w * 28
+                            # origin box
+                            roi_orig_x1 = x1 * orig_w
+                            roi_orig_y1 = y1 * orig_h
+                            roi_orig_x2 = x2 * orig_w
+                            roi_orig_y2 = y2 * orig_h
+                            # feat box
+                            spatial_scale = feat_w / orig_w
+                            roi_feat_x1 = roi_orig_x1 * spatial_scale
+                            roi_feat_y1 = roi_orig_y1 * spatial_scale
+                            roi_feat_x2 = roi_orig_x2 * spatial_scale
+                            roi_feat_y2 = roi_orig_y2 * spatial_scale
+                            roi = torch.tensor(
+                                [0, roi_feat_x1, roi_feat_y1, roi_feat_x2, roi_feat_y2],
+                                dtype=torch.float32,
+                                device=image_features_tiles.device,
+                            )
+                            roi_features = torchvision.ops.roi_align(
+                                input=image_features_tiles[:, frame_idx].float(),
+                                boxes=roi.unsqueeze(0),
+                                output_size=(16, 16),
+                                spatial_scale=spatial_scale,
+                                sampling_ratio=2,
+                                aligned=True,
+                            )
+                            image_features_replay = (
+                                roi_features.permute(0, 2, 3, 1)
+                                .flatten(1, 2)
+                                .to(image_features_tiles.dtype)
+                                .squeeze()
+                            )
+                            curr_inputs_emebds = torch.cat(
+                                [
+                                    curr_inputs_emebds[:head_idx],
+                                    image_features_replay,
+                                    curr_inputs_emebds[tail_idx + 1 :],
+                                ]
+                            )
+                    new_inputs_embeds.append(curr_inputs_emebds.unsqueeze(0))
+                inputs_embeds = torch.cat(new_inputs_embeds, dim=0)
+                model_inputs["position_ids"] = (
+                    torch.arange(
+                        0,
+                        inputs_embeds.shape[1],
+                        dtype=torch.long,
+                        device=inputs_embeds.device,
+                    )
+                    .unsqueeze(0)
+                    .repeat(inputs_embeds.shape[0], 1)
+                )
+                model_inputs["attention_mask"] = torch.ones(
+                    inputs_embeds.shape[0],
+                    inputs_embeds.shape[1],
+                    dtype=torch.long,
+                    device=inputs_embeds.device,
+                )
+                model_inputs["cache_position"] = model_inputs["position_ids"].clone()
+            model_inputs["inputs_embeds"] = inputs_embeds
+            model_inputs["input_ids"] = None
+            model_inputs["pixel_values"] = None
+            model_inputs["pixel_values_videos"] = None
+            model_inputs["mask_embeds"] = None
+        return model_inputs
+# Public names exported by this module.
+__all__ = [
+    "PerceptionLMForConditionalGeneration",
+    "PerceptionLMPreTrainedModel",
+    "PerceptionLMModel",
+]

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,40 @@

+{
+  "auto_map": {
+    "AutoImageProcessor": "image_processing_perception_lm_fast.PerceptionLMImageProcessorFast",
+    "AutoProcessor": "processing_gar.GARPerceptionLMProcessor"
+  },
+  "crop_size": null,
+  "data_format": "channels_first",
+  "default_to_square": true,
+  "device": null,
+  "disable_grouping": null,
+  "do_center_crop": false,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "PerceptionLMImageProcessorFast",
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "input_data_format": null,
+  "max_frame_tiles": 1,
+  "max_num_tiles": 16,
+  "processor_class": "GARPerceptionLMProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "return_tensors": null,
+  "size": {
+    "height": 448,
+    "width": 448
+  },
+  "tile_size": 448,
+  "vision_input_type": "thumb+tile"
+}

processing_gar.py ADDED Viewed

	@@ -0,0 +1,316 @@

+# coding=utf-8
+# Copyright 2025 Meta Platforms, Inc. and the HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for PerceptionLM.
+"""
+from typing import Iterable, Union
+import numpy as np
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.image_utils import ImageInput, get_image_size, to_numpy_array
+from transformers.processing_utils import (
+    MultiModalData,
+    ProcessingKwargs,
+    ProcessorMixin,
+    Unpack,
+)
+from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+from transformers.utils import logging
+from transformers.video_utils import VideoInput
+from transformers.image_utils import PILImageResampling
+from .image_processing_perception_lm_fast import PerceptionLMImageProcessorFast
+from transformers import AutoTokenizer, AutoProcessor, AutoImageProcessor
+logger = logging.get_logger(__name__)
+class PerceptionLMProcessorKwargs(ProcessingKwargs, total=False):
+    _defaults = {
+        "text_kwargs": {
+            "padding": False,
+            "return_mm_token_type_ids": False,
+        },
+    }
+class GARPerceptionLMProcessor(ProcessorMixin):
+    r"""
+    Constructs a PerceptionLM processor which wraps a PerceptionLM image processor, a PerceptionLM video processor, and a tokenizer into a single processor.
+    [`PerceptionLMProcessor`] offers all the functionalities of [`PerceptionLMImageProcessorFast`], [`PerceptionLMVideoProcessor`], and the tokenizer (e.g. [`LlamaTokenizerFast`]). See the
+    [`~PerceptionLMProcessor.__call__`] and [`~PerceptionLMProcessor.decode`] for more information.
+    Args:
+        video_processor ([`PerceptionLMVideoProcessor`], *optional*):
+            The video processor to process video inputs.
+        image_processor ([`PerceptionLMImageProcessorFast`], *optional*):
+            The image processor to process image inputs.
+        tokenizer ([`LlamaTokenizerFast`] or similar, *optional*):
+            The tokenizer to process text inputs.
+        patch_size (`int`, *optional*):
+            Patch size from the vision tower.
+        chat_template (`str`, *optional*):
+            A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string.
+        pooling_ratio (`int`, *optional*, defaults to 2):
+            Pooling ratio for vision tokens. If not 1, 2D adaptive pooling is applied over projected vision tokens.
+    """
+    attributes = ["video_processor", "image_processor", "tokenizer"]
+    image_processor_class = "AutoImageProcessor"
+    video_processor_class = "AutoVideoProcessor"
+    tokenizer_class = "AutoTokenizer"
+    def __init__(
+        self,
+        video_processor=None,
+        image_processor=None,
+        tokenizer=None,
+        patch_size=None,
+        chat_template=None,
+        pooling_ratio=2,
+        **kwargs,
+    ):
+        self.patch_size = patch_size
+        self.pooling_ratio = pooling_ratio
+        self.image_token = tokenizer.image_token
+        self.video_token = tokenizer.video_token
+        self.image_token_id = tokenizer.image_token_id
+        self.video_token_id = tokenizer.video_token_id
+        super().__init__(
+            video_processor, image_processor, tokenizer, chat_template=chat_template,
+        )
+    def __call__(
+        self,
+        images: ImageInput = None,
+        visual_prompts: ImageInput = None,
+        text: Union[
+            TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]
+        ] = None,
+        audio=None,
+        videos: VideoInput = None,
+        **kwargs: Unpack[PerceptionLMProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Prepares a batch containing one or more sequences of text and/or images and/or videos.
+        If `text` is provided, it is tokenized using the tokenizer.
+        If `images` is provided, they are processed using the image processor.
+        If `videos` is provided, they are processed using the video processor.
+        Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*):
+                The image or batch of images to be processed. Each image can be a PIL image, NumPy array, or PyTorch tensor.
+                Both channels-first and channels-last formats are supported.
+            text (`str`, `List[str]`, *optional*):
+                The sequence or batch of sequences to be tokenized. Each sequence can be a string.
+            videos (`Any`, *optional*):
+                The video or batch of videos to be processed.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is provided.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is provided).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is provided.
+            - **pixel_values_videos** -- Video pixel values to be fed to a model. Returned when `videos` is provided.
+        """
+        if text is None:
+            raise ValueError(
+                "You have to specify at least `text` input. Optionally, you can also specify `images` or `videos`."
+            )
+        output_kwargs = self._merge_kwargs(
+            PerceptionLMProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        if images is not None:
+            image_inputs = self.image_processor(
+                images=images, **output_kwargs["images_kwargs"]
+            )
+        else:
+            image_inputs = {}
+        if visual_prompts is not None:
+            visual_prompts_inputs = self.image_processor(
+                images=visual_prompts, **output_kwargs["images_kwargs"], resample=PILImageResampling.NEAREST
+            )
+            image_inputs["mask_values"] = visual_prompts_inputs["pixel_values"]
+        else:
+            image_inputs["mask_values"] = None
+        if videos is not None:
+            videos_inputs = self.video_processor(
+                videos, **output_kwargs["videos_kwargs"]
+            )
+        else:
+            videos_inputs = {}
+        if isinstance(text, str):
+            text = [text]
+        elif not isinstance(text, list) and not isinstance(text[0], str):
+            raise ValueError(
+                "Invalid input text. Please provide a string, or a list of strings"
+            )
+        # try to expand inputs in processing if we have the necessary parts
+        prompt_strings = []
+        pixel_values = iter(image_inputs.get("pixel_values", []))
+        pixel_values_videos = iter(videos_inputs.get("pixel_values_videos", []))
+        for sample in text:
+            # Replace the media token with the expanded media token sequence
+            sample = self._expand_media_tokens(
+                sample, self.tokenizer.image_token, pixel_values
+            )
+            sample = self._expand_media_tokens(
+                sample, self.tokenizer.video_token, pixel_values_videos
+            )
+            prompt_strings.append(sample)
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop(
+            "return_mm_token_type_ids", False
+        )
+        text_inputs = self.tokenizer(
+            prompt_strings, **output_kwargs["text_kwargs"], return_tensors=None
+        )
+        self._check_special_mm_tokens(
+            prompt_strings, text_inputs, modalities=["image", "video"]
+        )
+        if return_mm_token_type_ids:
+            array_ids = np.array(text_inputs["input_ids"])
+            mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
+            mm_token_type_ids[array_ids == self.image_token_id] = 1
+            text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
+        return BatchFeature(
+            data={**text_inputs, **image_inputs, **videos_inputs},
+            tensor_type=return_tensors,
+        )
+    def _expand_media_tokens(self, sample, media_token: str, media_iter: Iterable):
+        media_count = sample.count(media_token)
+        if media_count > 0:
+            media_list = [next(media_iter) for _ in range(media_count)]
+            sample_splits = sample.split(media_token)
+            media_token_list = []
+            for media in media_list:
+                height, width = get_image_size(to_numpy_array(media))
+                num_tiles = media.shape[0]
+                num_media_tokens = (
+                    (height // self.patch_size // self.pooling_ratio)
+                    * (width // self.patch_size // self.pooling_ratio)
+                    * num_tiles
+                )
+                media_token_list.append(num_media_tokens)
+            sample = ""
+            for i, num_media_tokens in enumerate(media_token_list):
+                sample += sample_splits[i]
+                sample += media_token * num_media_tokens
+            sample += sample_splits[-1]
+        return sample
+    def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
+        """
+        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
+        Args:
+            image_sizes (`list[list[int]]`, *optional*):
+                The input sizes formatted as (height, width) per each image.
+        Returns:
+            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
+            input modalities, along with other useful data.
+        """
+        vision_data = {}
+        if image_sizes is not None:
+            images_kwargs = PerceptionLMProcessorKwargs._defaults.get(
+                "images_kwargs", {}
+            )
+            images_kwargs.update(kwargs)
+            tile_size = (
+                images_kwargs.get("tile_size", None) or self.image_processor.tile_size
+            )
+            num_image_tokens = []
+            num_image_patches = []
+            for height, width in image_sizes:
+                if self.image_processor.vision_input_type == "thumb+tile":
+                    aspect_ratio = self.image_processor._fit_image_to_canvas(
+                        img_width=width, img_height=height, tile_size=tile_size
+                    )
+                    if aspect_ratio is None:
+                        aspect_ratio = self.image_processor._find_closest_aspect_ratio(
+                            img_width=width, img_height=height, tile_size=tile_size
+                        )
+                    num_tiles = (
+                        aspect_ratio[0] * aspect_ratio[1] + 1
+                    )  # base image and tiles
+                else:
+                    num_tiles = 1
+                num_image_tokens.append(
+                    (tile_size // self.patch_size // self.pooling_ratio)
+                    * (tile_size // self.patch_size // self.pooling_ratio)
+                    * num_tiles
+                )
+                num_image_patches.append(num_tiles)
+            vision_data.update(
+                {
+                    "num_image_tokens": num_image_tokens,
+                    "num_image_patches": num_image_patches,
+                }
+            )
+        return MultiModalData(**vision_data)
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to PerceptionLMTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to PerceptionLMTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+    @property
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+# Import-time side effect: register the custom processor and the fast image processor
+# with the transformers Auto* factories.
+# NOTE(review): both `register` calls receive a string as their first argument; the
+# transformers API describes that argument as a config class -- confirm these
+# registrations have the intended effect.
+AutoProcessor.register("GARPerceptionLMProcessor", GARPerceptionLMProcessor)
+AutoImageProcessor.register(
+    "GARPerceptionLMImageProcessorFast",
+    slow_image_processor_class=None,
+    fast_image_processor_class=PerceptionLMImageProcessorFast
+)
+# Public names exported by this module.
+__all__ = ["GARPerceptionLMProcessor"]

processor_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "patch_size": 14,
+  "pooling_ratio": 2,
+  "processor_class": "GARPerceptionLMProcessor",
+  "auto_map": {
+    "AutoImageProcessor": "image_processing_perception_lm_fast.PerceptionLMImageProcessorFast",
+    "AutoProcessor": "processing_gar.GARPerceptionLMProcessor"
+  }
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "bos_token": {
+    "content": "<|begin_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|eot_id|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "image_token": "<|image|>",
+  "pad_token": "<|end_of_text|>",
+  "video_token": "<|video|>"
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a5531cfd169b9f439ecb1339ada499771bf9a7391217dfbb51fd3a03a9fa0ce0
+size 17211041

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,2118 @@

+{
+  "added_tokens_decoder": {
+    "128000": {
+      "content": "<|begin_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128001": {
+      "content": "<|end_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128002": {
+      "content": "<|image|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128003": {
+      "content": "<|video|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128004": {
+      "content": "<|reserved_special_token_2|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128005": {
+      "content": "<|reserved_special_token_3|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128006": {
+      "content": "<|start_header_id|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128007": {
+      "content": "<|end_header_id|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128008": {
+      "content": "<|reserved_special_token_4|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128009": {
+      "content": "<|eot_id|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128010": {
+      "content": "<|reserved_special_token_5|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128011": {
+      "content": "<|reserved_special_token_6|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128012": {
+      "content": "<|reserved_special_token_7|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128013": {
+      "content": "<|reserved_special_token_8|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128014": {
+      "content": "<|reserved_special_token_9|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128015": {
+      "content": "<|reserved_special_token_10|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128016": {
+      "content": "<|reserved_special_token_11|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128017": {
+      "content": "<|reserved_special_token_12|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128018": {
+      "content": "<|reserved_special_token_13|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128019": {
+      "content": "<|reserved_special_token_14|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128020": {
+      "content": "<|reserved_special_token_15|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128021": {
+      "content": "<|reserved_special_token_16|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128022": {
+      "content": "<|reserved_special_token_17|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128023": {
+      "content": "<|reserved_special_token_18|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128024": {
+      "content": "<|reserved_special_token_19|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128025": {
+      "content": "<|reserved_special_token_20|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128026": {
+      "content": "<|reserved_special_token_21|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128027": {
+      "content": "<|reserved_special_token_22|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128028": {
+      "content": "<|reserved_special_token_23|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128029": {
+      "content": "<|reserved_special_token_24|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128030": {
+      "content": "<|reserved_special_token_25|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128031": {
+      "content": "<|reserved_special_token_26|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128032": {
+      "content": "<|reserved_special_token_27|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128033": {
+      "content": "<|reserved_special_token_28|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128034": {
+      "content": "<|reserved_special_token_29|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128035": {
+      "content": "<|reserved_special_token_30|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128036": {
+      "content": "<|reserved_special_token_31|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128037": {
+      "content": "<|reserved_special_token_32|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128038": {
+      "content": "<|reserved_special_token_33|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128039": {
+      "content": "<|reserved_special_token_34|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128040": {
+      "content": "<|reserved_special_token_35|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128041": {
+      "content": "<|reserved_special_token_36|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128042": {
+      "content": "<|reserved_special_token_37|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128043": {
+      "content": "<|reserved_special_token_38|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128044": {
+      "content": "<|reserved_special_token_39|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128045": {
+      "content": "<|reserved_special_token_40|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128046": {
+      "content": "<|reserved_special_token_41|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128047": {
+      "content": "<|reserved_special_token_42|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128048": {
+      "content": "<|reserved_special_token_43|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128049": {
+      "content": "<|reserved_special_token_44|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128050": {
+      "content": "<|reserved_special_token_45|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128051": {
+      "content": "<|reserved_special_token_46|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128052": {
+      "content": "<|reserved_special_token_47|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128053": {
+      "content": "<|reserved_special_token_48|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128054": {
+      "content": "<|reserved_special_token_49|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128055": {
+      "content": "<|reserved_special_token_50|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128056": {
+      "content": "<|reserved_special_token_51|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128057": {
+      "content": "<|reserved_special_token_52|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128058": {
+      "content": "<|reserved_special_token_53|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128059": {
+      "content": "<|reserved_special_token_54|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128060": {
+      "content": "<|reserved_special_token_55|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128061": {
+      "content": "<|reserved_special_token_56|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128062": {
+      "content": "<|reserved_special_token_57|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128063": {
+      "content": "<|reserved_special_token_58|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128064": {
+      "content": "<|reserved_special_token_59|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128065": {
+      "content": "<|reserved_special_token_60|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128066": {
+      "content": "<|reserved_special_token_61|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128067": {
+      "content": "<|reserved_special_token_62|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128068": {
+      "content": "<|reserved_special_token_63|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128069": {
+      "content": "<|reserved_special_token_64|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128070": {
+      "content": "<|reserved_special_token_65|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128071": {
+      "content": "<|reserved_special_token_66|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128072": {
+      "content": "<|reserved_special_token_67|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128073": {
+      "content": "<|reserved_special_token_68|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128074": {
+      "content": "<|reserved_special_token_69|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128075": {
+      "content": "<|reserved_special_token_70|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128076": {
+      "content": "<|reserved_special_token_71|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128077": {
+      "content": "<|reserved_special_token_72|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128078": {
+      "content": "<|reserved_special_token_73|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128079": {
+      "content": "<|reserved_special_token_74|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128080": {
+      "content": "<|reserved_special_token_75|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128081": {
+      "content": "<|reserved_special_token_76|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128082": {
+      "content": "<|reserved_special_token_77|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128083": {
+      "content": "<|reserved_special_token_78|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128084": {
+      "content": "<|reserved_special_token_79|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128085": {
+      "content": "<|reserved_special_token_80|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128086": {
+      "content": "<|reserved_special_token_81|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128087": {
+      "content": "<|reserved_special_token_82|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128088": {
+      "content": "<|reserved_special_token_83|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128089": {
+      "content": "<|reserved_special_token_84|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128090": {
+      "content": "<|reserved_special_token_85|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128091": {
+      "content": "<|reserved_special_token_86|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128092": {
+      "content": "<|reserved_special_token_87|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128093": {
+      "content": "<|reserved_special_token_88|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128094": {
+      "content": "<|reserved_special_token_89|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128095": {
+      "content": "<|reserved_special_token_90|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128096": {
+      "content": "<|reserved_special_token_91|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128097": {
+      "content": "<|reserved_special_token_92|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128098": {
+      "content": "<|reserved_special_token_93|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128099": {
+      "content": "<|reserved_special_token_94|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128100": {
+      "content": "<|reserved_special_token_95|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128101": {
+      "content": "<|reserved_special_token_96|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128102": {
+      "content": "<|reserved_special_token_97|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128103": {
+      "content": "<|reserved_special_token_98|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128104": {
+      "content": "<|reserved_special_token_99|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128105": {
+      "content": "<|reserved_special_token_100|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128106": {
+      "content": "<|reserved_special_token_101|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128107": {
+      "content": "<|reserved_special_token_102|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128108": {
+      "content": "<|reserved_special_token_103|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128109": {
+      "content": "<|reserved_special_token_104|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128110": {
+      "content": "<|reserved_special_token_105|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128111": {
+      "content": "<|reserved_special_token_106|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128112": {
+      "content": "<|reserved_special_token_107|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128113": {
+      "content": "<|reserved_special_token_108|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128114": {
+      "content": "<|reserved_special_token_109|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128115": {
+      "content": "<|reserved_special_token_110|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128116": {
+      "content": "<|reserved_special_token_111|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128117": {
+      "content": "<|reserved_special_token_112|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128118": {
+      "content": "<|reserved_special_token_113|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128119": {
+      "content": "<|reserved_special_token_114|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128120": {
+      "content": "<|reserved_special_token_115|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128121": {
+      "content": "<|reserved_special_token_116|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128122": {
+      "content": "<|reserved_special_token_117|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128123": {
+      "content": "<|reserved_special_token_118|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128124": {
+      "content": "<|reserved_special_token_119|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128125": {
+      "content": "<|reserved_special_token_120|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128126": {
+      "content": "<|reserved_special_token_121|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128127": {
+      "content": "<|reserved_special_token_122|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128128": {
+      "content": "<|reserved_special_token_123|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128129": {
+      "content": "<|reserved_special_token_124|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128130": {
+      "content": "<|reserved_special_token_125|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128131": {
+      "content": "<|reserved_special_token_126|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128132": {
+      "content": "<|reserved_special_token_127|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128133": {
+      "content": "<|reserved_special_token_128|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128134": {
+      "content": "<|reserved_special_token_129|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128135": {
+      "content": "<|reserved_special_token_130|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128136": {
+      "content": "<|reserved_special_token_131|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128137": {
+      "content": "<|reserved_special_token_132|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128138": {
+      "content": "<|reserved_special_token_133|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128139": {
+      "content": "<|reserved_special_token_134|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128140": {
+      "content": "<|reserved_special_token_135|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128141": {
+      "content": "<|reserved_special_token_136|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128142": {
+      "content": "<|reserved_special_token_137|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128143": {
+      "content": "<|reserved_special_token_138|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128144": {
+      "content": "<|reserved_special_token_139|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128145": {
+      "content": "<|reserved_special_token_140|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128146": {
+      "content": "<|reserved_special_token_141|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128147": {
+      "content": "<|reserved_special_token_142|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128148": {
+      "content": "<|reserved_special_token_143|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128149": {
+      "content": "<|reserved_special_token_144|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128150": {
+      "content": "<|reserved_special_token_145|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128151": {
+      "content": "<|reserved_special_token_146|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128152": {
+      "content": "<|reserved_special_token_147|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128153": {
+      "content": "<|reserved_special_token_148|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128154": {
+      "content": "<|reserved_special_token_149|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128155": {
+      "content": "<|reserved_special_token_150|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128156": {
+      "content": "<|reserved_special_token_151|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128157": {
+      "content": "<|reserved_special_token_152|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128158": {
+      "content": "<|reserved_special_token_153|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128159": {
+      "content": "<|reserved_special_token_154|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128160": {
+      "content": "<|reserved_special_token_155|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128161": {
+      "content": "<|reserved_special_token_156|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128162": {
+      "content": "<|reserved_special_token_157|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128163": {
+      "content": "<|reserved_special_token_158|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128164": {
+      "content": "<|reserved_special_token_159|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128165": {
+      "content": "<|reserved_special_token_160|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128166": {
+      "content": "<|reserved_special_token_161|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128167": {
+      "content": "<|reserved_special_token_162|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128168": {
+      "content": "<|reserved_special_token_163|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128169": {
+      "content": "<|reserved_special_token_164|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128170": {
+      "content": "<|reserved_special_token_165|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128171": {
+      "content": "<|reserved_special_token_166|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128172": {
+      "content": "<|reserved_special_token_167|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128173": {
+      "content": "<|reserved_special_token_168|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128174": {
+      "content": "<|reserved_special_token_169|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128175": {
+      "content": "<|reserved_special_token_170|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128176": {
+      "content": "<|reserved_special_token_171|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128177": {
+      "content": "<|reserved_special_token_172|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128178": {
+      "content": "<|reserved_special_token_173|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128179": {
+      "content": "<|reserved_special_token_174|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128180": {
+      "content": "<|reserved_special_token_175|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128181": {
+      "content": "<|reserved_special_token_176|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128182": {
+      "content": "<|reserved_special_token_177|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128183": {
+      "content": "<|reserved_special_token_178|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128184": {
+      "content": "<|reserved_special_token_179|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128185": {
+      "content": "<|reserved_special_token_180|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128186": {
+      "content": "<|reserved_special_token_181|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128187": {
+      "content": "<|reserved_special_token_182|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128188": {
+      "content": "<|reserved_special_token_183|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128189": {
+      "content": "<|reserved_special_token_184|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128190": {
+      "content": "<|reserved_special_token_185|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128191": {
+      "content": "<|reserved_special_token_186|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128192": {
+      "content": "<|reserved_special_token_187|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128193": {
+      "content": "<|reserved_special_token_188|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128194": {
+      "content": "<|reserved_special_token_189|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128195": {
+      "content": "<|reserved_special_token_190|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128196": {
+      "content": "<|reserved_special_token_191|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128197": {
+      "content": "<|reserved_special_token_192|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128198": {
+      "content": "<|reserved_special_token_193|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128199": {
+      "content": "<|reserved_special_token_194|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128200": {
+      "content": "<|reserved_special_token_195|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128201": {
+      "content": "<|reserved_special_token_196|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128202": {
+      "content": "<|reserved_special_token_197|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128203": {
+      "content": "<|reserved_special_token_198|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128204": {
+      "content": "<|reserved_special_token_199|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128205": {
+      "content": "<|reserved_special_token_200|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128206": {
+      "content": "<|reserved_special_token_201|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128207": {
+      "content": "<|reserved_special_token_202|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128208": {
+      "content": "<|reserved_special_token_203|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128209": {
+      "content": "<|reserved_special_token_204|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128210": {
+      "content": "<|reserved_special_token_205|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128211": {
+      "content": "<|reserved_special_token_206|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128212": {
+      "content": "<|reserved_special_token_207|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128213": {
+      "content": "<|reserved_special_token_208|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128214": {
+      "content": "<|reserved_special_token_209|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128215": {
+      "content": "<|reserved_special_token_210|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128216": {
+      "content": "<|reserved_special_token_211|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128217": {
+      "content": "<|reserved_special_token_212|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128218": {
+      "content": "<|reserved_special_token_213|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128219": {
+      "content": "<|reserved_special_token_214|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128220": {
+      "content": "<|reserved_special_token_215|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128221": {
+      "content": "<|reserved_special_token_216|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128222": {
+      "content": "<|reserved_special_token_217|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128223": {
+      "content": "<|reserved_special_token_218|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128224": {
+      "content": "<|reserved_special_token_219|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128225": {
+      "content": "<|reserved_special_token_220|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128226": {
+      "content": "<|reserved_special_token_221|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128227": {
+      "content": "<|reserved_special_token_222|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128228": {
+      "content": "<|reserved_special_token_223|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128229": {
+      "content": "<|reserved_special_token_224|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128230": {
+      "content": "<|reserved_special_token_225|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128231": {
+      "content": "<|reserved_special_token_226|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128232": {
+      "content": "<|reserved_special_token_227|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128233": {
+      "content": "<|reserved_special_token_228|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128234": {
+      "content": "<|reserved_special_token_229|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128235": {
+      "content": "<|reserved_special_token_230|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128236": {
+      "content": "<|reserved_special_token_231|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128237": {
+      "content": "<|reserved_special_token_232|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128238": {
+      "content": "<|reserved_special_token_233|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128239": {
+      "content": "<|reserved_special_token_234|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128240": {
+      "content": "<|reserved_special_token_235|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128241": {
+      "content": "<|reserved_special_token_236|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128242": {
+      "content": "<|reserved_special_token_237|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128243": {
+      "content": "<|reserved_special_token_238|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128244": {
+      "content": "<|reserved_special_token_239|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128245": {
+      "content": "<|reserved_special_token_240|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128246": {
+      "content": "<|reserved_special_token_241|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128247": {
+      "content": "<|reserved_special_token_242|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128248": {
+      "content": "<|reserved_special_token_243|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128249": {
+      "content": "<|reserved_special_token_244|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128250": {
+      "content": "<|reserved_special_token_245|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128251": {
+      "content": "<|reserved_special_token_246|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128252": {
+      "content": "<|reserved_special_token_247|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128253": {
+      "content": "<|reserved_special_token_248|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128254": {
+      "content": "<|reserved_special_token_249|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128255": {
+      "content": "<|reserved_special_token_250|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128256": {
+      "content": "<Prompt0>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128257": {
+      "content": "<Prompt1>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128258": {
+      "content": "<Prompt2>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128259": {
+      "content": "<Prompt3>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128260": {
+      "content": "<Prompt4>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128261": {
+      "content": "<NO_Prompt>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|begin_of_text|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|eot_id|>",
+  "extra_special_tokens": {
+    "image_token": "<|image|>",
+    "pad_token": "<|end_of_text|>",
+    "video_token": "<|video|>"
+  },
+  "image_token": "<|image|>",
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 11520,
+  "pad_token": "<|end_of_text|>",
+  "processor_class": "GARPerceptionLMProcessor",
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "video_token": "<|video|>"
+}

video_preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "crop_size": null,
+  "data_format": "channels_first",
+  "default_to_square": true,
+  "device": null,
+  "do_center_crop": false,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_pad": null,
+  "do_rescale": true,
+  "do_resize": true,
+  "do_sample_frames": null,
+  "fps": null,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "input_data_format": null,
+  "num_frames": null,
+  "processor_class": "GARPerceptionLMProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "return_metadata": false,
+  "size": {
+    "height": 448,
+    "width": 448
+  },
+  "size_divisor": null,
+  "video_metadata": null,
+  "video_processor_type": "PerceptionLMVideoProcessor"
+}