from transformers import AutoConfig, LlamaConfig
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
from transformers.dynamic_module_utils import get_class_from_dynamic_module

logger = logging.get_logger(__name__)


class Llama_Nemotron_Nano_VL_Config(PretrainedConfig):
    """Composite configuration for Llama-Nemotron-Nano-VL: a dynamically loaded
    vision-encoder config plus a `LlamaConfig` for the language model, along with
    the projector and image-preprocessing settings shared between them."""

    model_type = 'Llama_Nemotron_Nano_VL'
    is_composition = True

    def __init__(
        self,
        vision_config=None,
        llm_config=None,
        force_image_size=None,
        downsample_ratio=0.5,
        template=None,
        ps_version='v1',
        image_tag_type="internvl",
        projector_hidden_size=4096,
        vit_hidden_size=1280,
        attn_implementation="flash_attention_2",
        **kwargs
    ):
        super().__init__(**kwargs)

        if vision_config is not None:
            # The vision encoder defines its own config class, so the dict must
            # carry an "auto_map" entry of the form "<repo>--<module.ClassName>"
            # that the dynamic-module loader can resolve.
            assert "auto_map" in vision_config and "AutoConfig" in vision_config["auto_map"]
            vision_auto_config = get_class_from_dynamic_module(
                *vision_config["auto_map"]["AutoConfig"].split("--")[::-1]
            )
            self.vision_config = vision_auto_config(**vision_config)
        else:
            self.vision_config = PretrainedConfig()

        if llm_config is None:
            self.llm_config = LlamaConfig()
        else:
            self.llm_config = LlamaConfig(**llm_config)

        # Image-preprocessing and projector hyperparameters.
        self.force_image_size = force_image_size
        self.downsample_ratio = downsample_ratio
        self.template = template
        self.ps_version = ps_version
        self.image_tag_type = image_tag_type
        self.projector_hidden_size = projector_hidden_size
        self.vit_hidden_size = vit_hidden_size

        # Propagate the requested attention backend to both sub-configs.
        self._attn_implementation = attn_implementation
        self.vision_config.use_flash_attn = "flash_attention" in self._attn_implementation
        self.llm_config._attn_implementation = self._attn_implementation
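

# Minimal usage sketch (illustrative only, not part of the original module): the
# toy hyperparameter values below are assumptions chosen just to show how the
# sub-configs and the attention setting are wired together.
if __name__ == "__main__":
    example = Llama_Nemotron_Nano_VL_Config(
        llm_config={"hidden_size": 2048, "num_hidden_layers": 16},  # assumed toy LlamaConfig overrides
        force_image_size=512,          # assumed input resolution
        attn_implementation="eager",   # keeps the demo independent of flash-attn
    )
    print(type(example.llm_config).__name__)        # -> "LlamaConfig"
    print(example.llm_config._attn_implementation)  # -> "eager"
    print(example.vision_config.use_flash_attn)     # -> False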