davenliu committed · Commit e99bd86 · verified · 1 Parent(s): bd67af4

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "</img>": 151653,
+   "</think>": 151668,
+   "</tool_call>": 151658,
+   "</tool_response>": 151666,
+   "<img>": 151652,
+   "<think>": 151667,
+   "<tool_call>": 151657,
+   "<tool_response>": 151665,
+   "<|box_end|>": 151649,
+   "<|box_start|>": 151648,
+   "<|endoftext|>": 151643,
+   "<|file_sep|>": 151664,
+   "<|fim_middle|>": 151660,
+   "<|fim_pad|>": 151662,
+   "<|fim_prefix|>": 151659,
+   "<|fim_suffix|>": 151661,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644,
+   "<|image_pad|>": 151655,
+   "<|object_ref_end|>": 151647,
+   "<|object_ref_start|>": 151646,
+   "<|quad_end|>": 151651,
+   "<|quad_start|>": 151650,
+   "<|repo_name|>": 151663,
+   "<|video_pad|>": 151656,
+   "<|vision_pad|>": 151654
+ }
chat_template.jinja ADDED
@@ -0,0 +1,86 @@
+ {%- if tools %}
+     {{- '<|im_start|>system\n' }}
+     {%- if messages[0].role == 'system' %}
+         {{- messages[0].content + '\n\n' }}
+     {%- endif %}
+     {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+     {%- for tool in tools %}
+         {{- "\n" }}
+         {{- tool | tojson }}
+     {%- endfor %}
+     {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+ {%- else %}
+     {%- if messages[0].role == 'system' %}
+         {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+     {%- endif %}
+ {%- endif %}
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+ {%- for message in messages[::-1] %}
+     {%- set index = (messages|length - 1) - loop.index0 %}
+     {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+         {%- set ns.multi_step_tool = false %}
+         {%- set ns.last_query_index = index %}
+     {%- endif %}
+ {%- endfor %}
+ {%- for message in messages %}
+     {%- if message.content is string %}
+         {%- set content = message.content %}
+     {%- else %}
+         {%- set content = '' %}
+     {%- endif %}
+     {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+         {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+     {%- elif message.role == "assistant" %}
+         {%- set reasoning_content = '' %}
+         {%- if message.reasoning_content is string %}
+             {%- set reasoning_content = message.reasoning_content %}
+         {%- else %}
+             {%- if '</think>' in content %}
+                 {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                 {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+             {%- endif %}
+         {%- endif %}
+         {%- if loop.index0 > ns.last_query_index %}
+             {%- if loop.last or (not loop.last and reasoning_content) %}
+                 {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+             {%- else %}
+                 {{- '<|im_start|>' + message.role + '\n' + content }}
+             {%- endif %}
+         {%- else %}
+             {{- '<|im_start|>' + message.role + '\n' + content }}
+         {%- endif %}
+         {%- if message.tool_calls %}
+             {%- for tool_call in message.tool_calls %}
+                 {%- if (loop.first and content) or (not loop.first) %}
+                     {{- '\n' }}
+                 {%- endif %}
+                 {%- if tool_call.function %}
+                     {%- set tool_call = tool_call.function %}
+                 {%- endif %}
+                 {{- '<tool_call>\n{"name": "' }}
+                 {{- tool_call.name }}
+                 {{- '", "arguments": ' }}
+                 {%- if tool_call.arguments is string %}
+                     {{- tool_call.arguments }}
+                 {%- else %}
+                     {{- tool_call.arguments | tojson }}
+                 {%- endif %}
+                 {{- '}\n</tool_call>' }}
+             {%- endfor %}
+         {%- endif %}
+         {{- '<|im_end|>\n' }}
+     {%- elif message.role == "tool" %}
+         {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+             {{- '<|im_start|>user' }}
+         {%- endif %}
+         {{- '\n<tool_response>\n' }}
+         {{- content }}
+         {{- '\n</tool_response>' }}
+         {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+             {{- '<|im_end|>\n' }}
+         {%- endif %}
+     {%- endif %}
+ {%- endfor %}
+ {%- if add_generation_prompt %}
+     {{- '<|im_start|>assistant\n' }}
+ {%- endif %}
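Note: chat_template.jinja follows the Qwen3-style ChatML layout (tool schemas injected into the system block, <think>…</think> reasoning kept only for the final assistant turn, tool results wrapped as user turns), so it is exercised through the standard tokenizer.apply_chat_template API. A minimal sketch, assuming a hypothetical local path to this repository:

    from transformers import AutoTokenizer

    # "path/to/AndesVL" is a placeholder for this repository once downloaded locally.
    tokenizer = AutoTokenizer.from_pretrained("path/to/AndesVL", trust_remote_code=True)
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Describe this image."},
    ]
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,  # appends '<|im_start|>assistant\n' per the template above
    )
    print(prompt)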
config.json ADDED
@@ -0,0 +1,66 @@
+ {
+   "architectures": [
+     "AndesVLForConditionalGeneration"
+   ],
+   "auto_map": {
+     "AutoConfig": "configuration_andesvl.AndesVLConfig",
+     "AutoModel": "modeling_andesvl.AndesVLForConditionalGeneration",
+     "AutoModelForCausalLM": "modeling_andesvl.AndesVLForConditionalGeneration"
+   },
+   "model_type": "andesvl-aimv2-qwen3",
+   "text_config": {
+     "vocab_size": 151936,
+     "max_position_embeddings": 262144,
+     "hidden_size": 2560,
+     "intermediate_size": 9728,
+     "num_hidden_layers": 36,
+     "num_attention_heads": 32,
+     "use_sliding_window": false,
+     "sliding_window": null,
+     "max_window_layers": 36,
+     "num_key_value_heads": 8,
+     "head_dim": 128,
+     "hidden_act": "silu",
+     "initializer_range": 0.02,
+     "rms_norm_eps": 1e-06,
+     "use_cache": true,
+     "rope_theta": 5000000,
+     "rope_scaling": null,
+     "attention_bias": false,
+     "attention_dropout": 0.0,
+     "tie_word_embeddings": true,
+     "architectures": [
+       "Qwen3ForCausalLM"
+     ],
+     "bos_token_id": 151643,
+     "eos_token_id": 151645,
+     "model_type": "qwen3"
+   },
+   "vision_config": {
+     "attention_dropout": 0.0,
+     "disable_rope": false,
+     "fullatt_block_indexes": null,
+     "hidden_size": 1024,
+     "hidden_stride": 2,
+     "image_size": 448,
+     "intermediate_size": 2816,
+     "interpolate_pe_method": "two_dim",
+     "model_type": "aimv2",
+     "num_attention_heads": 8,
+     "num_channels": 3,
+     "num_hidden_layers": 24,
+     "patch_size": 14,
+     "preserve_original_pe": true,
+     "projection_dropout": 0.0,
+     "qkv_bias": false,
+     "rms_norm_eps": 1e-05,
+     "temporal_patch_size": 1,
+     "torch_dtype": "bfloat16",
+     "transformers_version": "4.52.4",
+     "use_bias": false,
+     "window_size": 112
+   },
+   "tie_word_embeddings": true,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.51.0"
+ }
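Note: the auto_map entries above bind the custom classes to transformers' Auto factories, so the checkpoint is meant to be loaded with trust_remote_code=True. A minimal loading sketch (the local path is a placeholder, and flash_attn must be installed because the vision tower imports it unconditionally):

    import torch
    from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

    path = "path/to/AndesVL"  # placeholder for the downloaded repository
    config = AutoConfig.from_pretrained(path, trust_remote_code=True)   # -> AndesVLConfig
    model = AutoModelForCausalLM.from_pretrained(
        path, torch_dtype=torch.bfloat16, trust_remote_code=True        # -> AndesVLForConditionalGeneration
    ).cuda().eval()
    tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)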
configuration_aimv2_navit_rope.py ADDED
@@ -0,0 +1,59 @@
+ from typing import Any
+
+ from transformers.configuration_utils import PretrainedConfig
+
+ __all__ = ["Aimv2VisionConfig"]
+
+
+ class Aimv2VisionConfig(PretrainedConfig):
+     model_type: str = "aimv2"
+
+     def __init__(
+         self,
+         hidden_size: int = 1024,
+         intermediate_size: int = 2816,
+         num_hidden_layers: int = 24,
+         num_attention_heads: int = 8,
+         num_channels: int = 3,
+         image_size: int = 224,
+         patch_size: int = 14,
+         rms_norm_eps: float = 1e-5,
+         attention_dropout: float = 0.0,
+         projection_dropout: float = 0.0,
+         qkv_bias: bool = False,
+         use_bias: bool = False,
+         hidden_stride: int = 2,
+         window_size: int = 112,
+         fullatt_block_indexes: list = None,
+         temporal_patch_size: int = 1,
+         preserve_original_pe: bool = False,
+         interpolate_pe_method: str = 'one_dim',
+         disable_rope: bool = False,
+         min_pixels: int = 3136,
+         max_pixels: int = 1960000,
+         **kwargs: Any,
+     ):
+         super().__init__(**kwargs)
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.num_channels = num_channels
+         self.patch_size = patch_size
+         self.image_size = image_size
+         self.attention_dropout = attention_dropout
+         self.rms_norm_eps = rms_norm_eps
+
+         self.projection_dropout = projection_dropout
+         self.qkv_bias = qkv_bias
+         self.use_bias = use_bias
+
+         self.hidden_stride = hidden_stride
+         self.window_size = window_size
+         self.fullatt_block_indexes = fullatt_block_indexes
+         self.temporal_patch_size = temporal_patch_size
+         self.preserve_original_pe = preserve_original_pe
+         self.interpolate_pe_method = interpolate_pe_method
+         self.disable_rope = disable_rope
+         self.min_pixels = min_pixels
+         self.max_pixels = max_pixels
configuration_andesvl.py ADDED
@@ -0,0 +1,34 @@
+ import copy
+
+ from transformers import Qwen3Config
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+ from .configuration_aimv2_navit_rope import Aimv2VisionConfig
+
+ logger = logging.get_logger(__name__)
+
+
+ class AndesVLConfig(PretrainedConfig):
+     model_type = 'andesvl-aimv2-qwen3'
+
+     def __init__(
+         self,
+         vision_config=None,
+         text_config=None,
+         **kwargs):
+         super().__init__(**kwargs)
+
+         self.vision_config = Aimv2VisionConfig(**vision_config) if vision_config is not None else Aimv2VisionConfig()
+         self.text_config = Qwen3Config(**text_config) if text_config is not None else Qwen3Config()
+
+     def to_dict(self):
+         """
+         Serializes this instance to a Python dictionary. Overrides the default [`~PretrainedConfig.to_dict`].
+         Returns:
+             `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
+         """
+         output = copy.deepcopy(self.__dict__)
+         output['vision_config'] = self.vision_config.to_dict()
+         output['text_config'] = self.text_config.to_dict()
+         output['model_type'] = self.__class__.model_type
+         return output
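Note: because AndesVLConfig overrides to_dict, the nested vision and text configs serialize back to plain dictionaries, which is what produces the layout of config.json above. An illustrative check, reusing the placeholder path from earlier:

    from transformers import AutoConfig

    cfg = AutoConfig.from_pretrained("path/to/AndesVL", trust_remote_code=True)  # placeholder path
    d = cfg.to_dict()
    assert d["model_type"] == "andesvl-aimv2-qwen3"
    assert d["vision_config"]["image_size"] == 448       # nested configs become plain dicts
    assert d["text_config"]["num_hidden_layers"] == 36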
generation_config.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "do_sample": true,
+   "temperature": 0.7,
+   "top_k": 20,
+   "top_p": 0.8,
+   "pad_token_id": 151643,
+   "bos_token_id": 151643,
+   "eos_token_id": [
+     151645,
+     151643
+   ]
+ }
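Note: these sampling defaults are loaded into model.generation_config automatically; they can also be inspected or overridden per call. A small sketch, again with a placeholder path:

    from transformers import GenerationConfig

    gen_cfg = GenerationConfig.from_pretrained("path/to/AndesVL")    # placeholder path
    print(gen_cfg.temperature, gen_cfg.top_p, gen_cfg.eos_token_id)  # 0.7 0.8 [151645, 151643]
    # output_ids = model.generate(..., generation_config=gen_cfg, max_new_tokens=512)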
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
modeling_aimv2_navit_rope.py ADDED
@@ -0,0 +1,388 @@
+ # adapted from https://huggingface.co/apple/aimv2-huge-patch14-448 (modification: add gradient checkpoint support)
+ from typing import Optional, Tuple, Union
+
+ import torch
+ from torch import nn
+ from torch.nn import functional as F
+ from transformers.modeling_outputs import BaseModelOutputWithNoAttention
+ from transformers.modeling_utils import PreTrainedModel
+ from flash_attn.layers.rotary import apply_rotary_emb
+ from flash_attn import flash_attn_varlen_func
+
+ from .configuration_aimv2_navit_rope import Aimv2VisionConfig
+
+
+ class RMSNorm(nn.Module):
+     def __init__(self, dim: int, eps: float = 1e-6):
+         super().__init__()
+         self.weight = nn.Parameter(torch.ones(dim))
+         self.eps = eps
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         output = self._norm(x.float()).type_as(x)
+         return output * self.weight
+
+     def extra_repr(self) -> str:
+         return f"{tuple(self.weight.shape)}, eps={self.eps}"
+
+     def _norm(self, x: torch.Tensor) -> torch.Tensor:
+         return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+
+
+ try:
+     from flash_attn.ops.rms_norm import RMSNorm
+ except Exception as e:
+     pass
+
+
+ class AIMv2SwiGLUFFN(nn.Module):
+     def __init__(self, config: Aimv2VisionConfig):
+         super().__init__()
+         hidden_features = config.intermediate_size
+         in_features = config.hidden_size
+         bias = config.use_bias
+
+         self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
+         self.fc2 = nn.Linear(hidden_features, in_features, bias=bias)
+         self.fc3 = nn.Linear(in_features, hidden_features, bias=bias)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = F.silu(self.fc1(x)) * self.fc3(x)
+         x = self.fc2(x)
+         return x
+
+
+ # copied from qwen2.5-vl
+ class VisionRotaryEmbedding(nn.Module):
+     def __init__(self, dim: int, theta: float = 10000.0) -> None:
+         super().__init__()
+         inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
+         self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+     def forward(self, seqlen: int) -> torch.Tensor:
+         seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
+         freqs = torch.outer(seq, self.inv_freq)
+         return freqs
+
+ # Note: in qwen2-vl and qwen2.5-vl, 3d convolution is used.
+ class AIMv2PatchEmbed(nn.Module):
+     def __init__(self, config: Aimv2VisionConfig):
+         super().__init__()
+         self.config = config
+         self.proj = nn.Conv2d(
+             config.num_channels,
+             config.hidden_size,
+             kernel_size=(config.patch_size, config.patch_size),
+             stride=(config.patch_size, config.patch_size),
+         )
+         assert self.config.temporal_patch_size == 1  # always equal to 1
+         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+     # NOTE: this reformulates the conv2d as a linear operation, which is more efficient.
+     def _get_2d_weight(self):
+         # Get 2d conv weight and bias, convert to format that linear function can use directly
+         weight = self.proj.weight.view(self.config.hidden_size, -1)  # [hidden_size, c*patch_size*patch_size]
+         bias = self.proj.bias if self.proj.bias is not None else torch.zeros(self.config.hidden_size, device=weight.device)
+         return weight, bias
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         # Expected input shape: (num_patches, c*temporal_patch_size*patch_size*patch_size)
+         # When temporal_patch_size=1: (num_patches, c*patch_size*patch_size)
+         x = torch.nn.functional.linear(x, *self._get_2d_weight())
+         x = self.norm(x)
+         return x
+
+
+ class AIMv2ViTPreprocessor(nn.Module):
+     def __init__(self, config: Aimv2VisionConfig):
+         super().__init__()
+
+         num_patches = (config.image_size // config.patch_size) ** 2
+
+         self.patchifier = AIMv2PatchEmbed(config)
+
+         self.preserve_original_pe = config.preserve_original_pe
+         self.hidden_stride = config.hidden_stride
+
+         if self.preserve_original_pe:
+             self.interpolate_pe_method = config.interpolate_pe_method
+             self.pos_embed = nn.Parameter(torch.zeros((1, num_patches, config.hidden_size)))
+
+     def forward(self, x: torch.Tensor, grid_thws: Optional[torch.Tensor] = None) -> torch.Tensor:
+         tokens = self.patchifier(x)
+
+         if self.preserve_original_pe:
+             assert grid_thws is not None
+             pos_embed_new = torch.zeros_like(tokens)
+             if self.interpolate_pe_method == 'one_dim':
+                 pos_embed = self.pos_embed.transpose(1,2).to(tokens.device)
+             elif self.interpolate_pe_method == 'two_dim':
+                 ori_h = ori_w = int(self.pos_embed.shape[1] ** 0.5)
+                 pos_embed = self.pos_embed.reshape(1, ori_h, ori_w, -1).permute(0,3,1,2)
+             else:
+                 raise TypeError("The interpolation method for pe should be one_dim, two_dim.")
+             cnt = 0
+             for t, h, w in grid_thws:
+                 num_patches = h * w
+                 thw = t * h * w
+                 if self.interpolate_pe_method == 'one_dim':
+                     pe = F.interpolate(pos_embed, size=num_patches, mode='linear', align_corners=False).transpose(1,2)
+                 elif self.interpolate_pe_method == 'two_dim':
+                     # 1, 1024, 32, 32
+                     pe = F.interpolate(pos_embed, size=(h,w), mode='bicubic', align_corners=False)
+                     # 1, 1024, 1024
+                     pe = pe.permute(0,2,3,1).reshape(1, h*w, -1)
+                     # 1024, 1024
+                     pe = pe[0].repeat(t,1)
+                     # 1, 16, 2, 16, 2, 1024
+                     pe = pe.reshape(t, h//self.hidden_stride, self.hidden_stride, w//self.hidden_stride, self.hidden_stride, -1)
+                     # 1024, 1024
+                     pe = pe.permute(0,1,3,2,4,5).reshape(thw,-1)
+                 pos_embed_new[cnt:cnt+thw] = pe
+
+                 cnt += thw
+
+             tokens = tokens + pos_embed_new
+         return tokens
+
+ # copied from qwen2.5-vl
+ def apply_rotary_pos_emb_flashatt(
+     q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+     cos = cos.chunk(2, dim=-1)[0].contiguous()
+     sin = sin.chunk(2, dim=-1)[0].contiguous()
+     q_embed = apply_rotary_emb(q.float(), cos.float(), sin.float()).type_as(q)
+     k_embed = apply_rotary_emb(k.float(), cos.float(), sin.float()).type_as(k)
+     return q_embed, k_embed
+
+ class AIMv2FlashAttention2(nn.Module):
+     def __init__(self, config: Aimv2VisionConfig) -> None:
+         super().__init__()
+         dim = config.hidden_size
+         self.num_heads = config.num_attention_heads
+         self.qkv = nn.Linear(dim, dim * 3, bias=config.qkv_bias)
+         self.proj = nn.Linear(dim, dim, bias=config.use_bias)
+
+         self.use_rope = not config.disable_rope
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         cu_seqlens: torch.Tensor,
+         position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+     ) -> torch.Tensor:
+
+         seq_length = hidden_states.shape[0]
+         q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
+         if self.use_rope:
+             cos, sin = position_embeddings
+             q, k = apply_rotary_pos_emb_flashatt(q.unsqueeze(0), k.unsqueeze(0), cos, sin)
+             q = q.squeeze(0)
+             k = k.squeeze(0)
+
+         max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
+         attn_output = flash_attn_varlen_func(q, k, v, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen).reshape(
+             seq_length, -1
+         )
+         attn_output = self.proj(attn_output)
+         return attn_output
+
+ class AIMv2Block(nn.Module):
+     def __init__(self, config: Aimv2VisionConfig):
+         super().__init__()
+         self.attn = AIMv2FlashAttention2(config)
+         self.norm_1 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+         self.mlp = AIMv2SwiGLUFFN(config)
+         self.norm_2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+     def forward(
+         self, x: torch.Tensor, cu_seqlens: torch.Tensor, position_embeddings: torch.Tensor
+     ) -> torch.Tensor:
+         x = x + self.attn(self.norm_1(x), cu_seqlens=cu_seqlens, position_embeddings=position_embeddings)
+         x = x + self.mlp(self.norm_2(x))
+         return x
+
+
+ class AIMv2Transformer(nn.Module):
+     def __init__(self, config: Aimv2VisionConfig):
+         super().__init__()
+         self.blocks = nn.ModuleList(
+             [AIMv2Block(config) for _ in range(config.num_hidden_layers)]
+         )
+         self.post_trunk_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+         self.gradient_checkpointing = False
+
+         self.rotary_pos_emb = VisionRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2)
+
+         self.hidden_stride = config.hidden_stride
+         self.patch_size = config.patch_size
+         self.window_size = config.window_size
+         self.spatial_merge_unit = config.hidden_stride * config.hidden_stride
+
+         self.fullatt_block_indexes = config.fullatt_block_indexes
+
+     # copied from qwen2.5_vl
+     def rot_pos_emb(self, grid_thw):
+         pos_ids = []
+         for t, h, w in grid_thw:
+             hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
+             hpos_ids = hpos_ids.reshape(
+                 h // self.hidden_stride,
+                 self.hidden_stride,
+                 w // self.hidden_stride,
+                 self.hidden_stride,
+             )
+             hpos_ids = hpos_ids.permute(0, 2, 1, 3)
+             hpos_ids = hpos_ids.flatten()
+
+             wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+             wpos_ids = wpos_ids.reshape(
+                 h // self.hidden_stride,
+                 self.hidden_stride,
+                 w // self.hidden_stride,
+                 self.hidden_stride,
+             )
+             wpos_ids = wpos_ids.permute(0, 2, 1, 3)
+             wpos_ids = wpos_ids.flatten()
+             pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
+         pos_ids = torch.cat(pos_ids, dim=0)
+         max_grid_size = grid_thw[:, 1:].max()
+         rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
+         rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
+         return rotary_pos_emb
+
+     def get_window_index(self, grid_thw):
+         window_index: list = []
+         cu_window_seqlens: list = [0]
+         window_index_id = 0
+         vit_merger_window_size = self.window_size // self.hidden_stride // self.patch_size  # patch (after merge) number in each window
+
+         for grid_t, grid_h, grid_w in grid_thw:
+             llm_grid_h, llm_grid_w = (
+                 grid_h // self.hidden_stride,  # number of patches after merge
+                 grid_w // self.hidden_stride,
+             )
+             index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(grid_t, llm_grid_h, llm_grid_w)
+             pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size
+             pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size
+             num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size
+             num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size
+             index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100)
+             index_padded = index_padded.reshape(
+                 grid_t,
+                 num_windows_h,
+                 vit_merger_window_size,
+                 num_windows_w,
+                 vit_merger_window_size,
+             )
+             index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape(
+                 grid_t,
+                 num_windows_h * num_windows_w,
+                 vit_merger_window_size,
+                 vit_merger_window_size,
+             )
+             seqlens = (index_padded != -100).sum([2, 3]).reshape(-1)
+             index_padded = index_padded.reshape(-1)
+             index_new = index_padded[index_padded != -100]
+             window_index.append(index_new + window_index_id)
+             cu_seqlens_tmp = seqlens.cumsum(0) * self.spatial_merge_unit + cu_window_seqlens[-1]
+             cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
+             window_index_id += (grid_t * llm_grid_h * llm_grid_w).item()
+         window_index = torch.cat(window_index, dim=0)
+
+         return window_index, cu_window_seqlens
+
+     def forward(
+         self,
+         tokens: torch.Tensor,
+         grid_thws: torch.Tensor,
+         output_hidden_states: bool = False,
+     ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, ...]]]:
+         # RoPE, modified from qwen2.5_vl
+         rotary_pos_emb = self.rot_pos_emb(grid_thws)
+         window_index, cu_window_seqlens = self.get_window_index(grid_thws)
+         cu_window_seqlens = torch.tensor(
+             cu_window_seqlens,
+             device=tokens.device,
+             dtype=grid_thws.dtype if torch.jit.is_tracing() else torch.int32,
+         )
+         cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens)
+
+         seq_len, _ = tokens.size()
+         tokens = tokens.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
+         tokens = tokens[window_index, :, :]
+         tokens = tokens.reshape(seq_len, -1)
+         rotary_pos_emb = rotary_pos_emb.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
+         rotary_pos_emb = rotary_pos_emb[window_index, :, :]
+         rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
+         emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+         position_embeddings = (emb.cos(), emb.sin())
+
+         cu_seqlens = torch.repeat_interleave(grid_thws[:, 1] * grid_thws[:, 2], grid_thws[:, 0]).cumsum(
+             dim=0,
+             # Select dtype based on the following factors:
+             # - FA2 requires that cu_seqlens_q must have dtype int32
+             # - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw
+             # See https://github.com/huggingface/transformers/pull/34852 for more information
+             dtype=grid_thws.dtype if torch.jit.is_tracing() else torch.int32,
+         )
+         cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
+
+         reverse_indices = torch.argsort(window_index)
+
+         hidden_states = () if output_hidden_states else None
+         for index, block in enumerate(self.blocks):
+             if self.fullatt_block_indexes is None or index in self.fullatt_block_indexes:
+                 cu_seqlens_tmp = cu_seqlens
+             else:
+                 cu_seqlens_tmp = cu_window_seqlens
+             if self.gradient_checkpointing and self.training:
+                 tokens = self._gradient_checkpointing_func(block.__call__, tokens, cu_seqlens_tmp, position_embeddings)
+             else:
+                 tokens = block(tokens, cu_seqlens_tmp, position_embeddings)
+             if output_hidden_states:
+                 tokens_ = tokens.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
+                 hidden_states += (tokens_[reverse_indices,:].reshape(seq_len, -1),)
+         tokens = self.post_trunk_norm(tokens)
+         tokens = tokens.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
+         tokens = tokens[reverse_indices,:].reshape(seq_len, -1)
+
+         return tokens, hidden_states
+
+
+ class AIMv2PretrainedModel(PreTrainedModel):
+     config_class = Aimv2VisionConfig
+     base_model_prefix = "aimv2"
+     supports_gradient_checkpointing = True
+     main_input_name = "pixel_values"
+     _no_split_modules = ["AIMv2ViTPreprocessor", "AIMv2Block"]
+     _supports_sdpa = True
+     _supports_flash_attn_2 = True
+
+
+ class Aimv2VisionModel(AIMv2PretrainedModel):
+     def __init__(self, config: Aimv2VisionConfig):
+         super().__init__(config)
+         self.preprocessor = AIMv2ViTPreprocessor(config)
+         self.trunk = AIMv2Transformer(config)
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         grid_hws: torch.Tensor,
+     ):
+         # NOTE: this is our in-house ViT input interface.
+         # Transform flattened pixel values to include temporal dimension
+         pixel_values = torch.cat([hidden_states for _ in range(self.config.temporal_patch_size)], dim=1)
+
+         # Add temporal dimension (t=1) to the grid info
+         grid_t = torch.ones(grid_hws.shape[0], 1, device=grid_hws.device, dtype=grid_hws.dtype)
+         grid_thws = torch.cat([grid_t, grid_hws], dim=1)
+
+         # Process through the model
+         x = self.preprocessor(pixel_values, grid_thws=grid_thws)
+         x, _ = self.trunk(x, grid_thws=grid_thws, output_hidden_states=False)
+
+         return x
+
+ __all__ = ["Aimv2VisionModel"]
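Note: the vision tower's forward takes patches already flattened to (num_patches, channels * patch_size * patch_size) plus a per-image (h_patches, w_patches) grid, and returns one 1024-d feature per 14x14 patch. A rough shape-checking sketch (placeholder path; requires a CUDA device with flash_attn installed, since attention uses flash_attn_varlen_func):

    import torch
    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(
        "path/to/AndesVL", torch_dtype=torch.bfloat16, trust_remote_code=True  # placeholder path
    ).cuda().eval()
    vision = model.vision_encoder                       # Aimv2VisionModel defined above

    patch = model.config.vision_config.patch_size       # 14
    h, w = 32, 32                                       # patch grid, divisible by hidden_stride=2
    pixels = torch.randn(h * w, 3 * patch * patch, device="cuda", dtype=torch.bfloat16)
    grid_hws = torch.tensor([[h, w]], device="cuda")

    with torch.no_grad():
        feats = vision(pixels, grid_hws)                # -> (h*w, 1024): one feature per patch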
modeling_andesvl.py ADDED
@@ -0,0 +1,287 @@
+ from torch import nn
+ import torch.utils.checkpoint
+ from transformers import Qwen3ForCausalLM
+ from transformers.modeling_utils import PreTrainedModel
+ from transformers.utils import logging
+ from .configuration_andesvl import AndesVLConfig
+ from .modeling_aimv2_navit_rope import Aimv2VisionModel
+
+ logger = logging.get_logger(__name__)
+
+ class AndesVLForConditionalGeneration(PreTrainedModel):
+     config_class = AndesVLConfig
+     main_input_name = 'pixel_values'
+     _supports_flash_attn_2 = True
+     _no_split_modules = ['Aimv2VisionModel','Qwen3DecoderLayer']
+
+
+     def __init__(self, config: AndesVLConfig):
+         super().__init__(config)
+
+         self.config = config
+         self.vision_encoder = Aimv2VisionModel(config.vision_config)
+         self.language_model = Qwen3ForCausalLM(config.text_config)
+
+         vit_hidden_size = self.vision_encoder.config.hidden_size
+         llm_hidden_size = self.language_model.config.hidden_size
+         self.patch_size = self.vision_encoder.config.patch_size
+         self.mlp = nn.Sequential(
+             nn.Linear(vit_hidden_size * 4, vit_hidden_size * 4),
+             nn.GELU(),
+             nn.Linear(vit_hidden_size * 4, llm_hidden_size),
+         )
+
+     def get_input_embeddings(self):
+         return self.language_model.model.embed_tokens
+
+     def set_input_embeddings(self, value):
+         self.language_model.model.embed_tokens = value
+
+     def get_output_embeddings(self):
+         return self.language_model.lm_head
+
+     def set_output_embeddings(self, new_embeddings):
+         self.language_model.lm_head = new_embeddings
+
+     def get_flated_pixel_values(self, pixel_values):
+         flated_pixel_values = []
+         image_grid_hw = []
+         for pv in pixel_values:
+             c, h, w = pv.shape
+             assert c==3 and h%self.patch_size==0 and w%self.patch_size==0, f"{c}, {w}, {h}, {self.patch_size}"
+             image_grid_hw.append((h//self.patch_size, w//self.patch_size))
+             fpv = pv.reshape(c, h//(2*self.patch_size), 2, self.patch_size, w//(2*self.patch_size), 2, self.patch_size)
+             flated_pixel_values.append(fpv.permute(1, 4, 2, 5, 0, 3, 6).reshape(-1, c*self.patch_size*self.patch_size))
+         flated_pixel_values = torch.cat(flated_pixel_values, dim=0)  # (Len_img, c*patch_size*patch_size)
+         image_grid_hw = torch.tensor(image_grid_hw, device=flated_pixel_values.device)  # (N_img, 2)
+         return flated_pixel_values, image_grid_hw
+
+
+     def get_vit_embeds_and_merge(self, pixel_values, image_grid_hw, input_embeds, image_flags):
+         """
+         Args:
+             pixel_values: (Len_img, H_vit0), flattened initial patch features, concatenated along the sequence dimension
+             image_grid_hw: (N_img, 2), patch-grid height and width of each image
+             input_embeds: (Bt, Lt, Ht), embedding of each token
+             image_flags: (Bt, Lt), whether each token is an image token
+         """
+         vit_embeds = self.vision_encoder(pixel_values, image_grid_hw)  # (Len_img, H_vit)
+         vit_embeds = vit_embeds.view(-1, vit_embeds.shape[-1]*4)  # (Len_img//4, H_vit*4)
+         vit_embeds = self.mlp(vit_embeds)  # (Len_img//4, H_llm)
+         vit_embeds = vit_embeds[:image_flags.sum()]
+         Bt, Lt, Ht = input_embeds.shape
+         input_embeds = input_embeds.reshape(-1, Ht)
+         image_flags = image_flags.view(-1)
+         input_embeds[image_flags == 1] = vit_embeds
+         input_embeds = input_embeds.view(Bt, Lt, Ht)
+         return input_embeds
+
+     @torch.inference_mode()
+     @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
+     def generate(
+         self,
+         pixel_values=None,
+         input_ids=None,
+         attention_mask=None,
+         image_flags=None,  # (Bt, Lt)
+         generation_config=None,
+         **generate_kwargs,
+     ) -> torch.LongTensor:
+
+         input_embeds = self.language_model.get_input_embeddings()(input_ids)  # (Bt, Lt, Ht)
+         if image_flags is not None and (image_flags == 1).sum() > 0:
+             flated_pixel_values, image_grid_hw = self.get_flated_pixel_values(pixel_values)
+             input_embeds = self.get_vit_embeds_and_merge(flated_pixel_values, image_grid_hw, input_embeds, image_flags)
+         outputs = self.language_model.generate(
+             input_ids=input_ids,
+             inputs_embeds=input_embeds,
+             attention_mask=attention_mask,
+             generation_config=generation_config,
+             use_cache=True,
+             **generate_kwargs,
+         )
+         return outputs
+
+     # NOTE: the completion and chat interfaces do not support batched inference yet; build the inputs of self.generate manually if batching is needed.
+     def completion(self, prompt, images, tokenizer, image_processor, **kwargs):
+         """Given a piece of text and a list of images (image positions in the text are marked with the <image> placeholder), return the completed text."""
+         assert prompt.count("<image>") == len(images), "the number of images does not match the number of <image> placeholders"
+         def replacement(m):
+             token_count = image_tokens.pop(0)
+             return f"<img>{'<|vision_pad|>' * token_count}</img>"
+         # First preprocess all images to obtain their sizes.
+         max_size = kwargs.get("max_size", 733)  # max_size**2 is the maximum supported area
+         base = self.patch_size*2
+         image_token_id = tokenizer.vocab['<|vision_pad|>']  # placeholder id for image tokens
+         background_color = tuple(int(x*255) for x in image_processor.image_mean)
+         transform = T.Compose([T.ToTensor(),T.Normalize(mean=image_processor.image_mean, std=image_processor.image_std)])
+         pixel_values = []
+         image_tokens = []
+         for image in images:
+             if isinstance(image, (tuple, list)):
+                 image, detail = image
+             else:
+                 detail = "low"
+             image = load_image(image)
+             if detail=="low":
+                 image = native_preprocess(image, max_size, base, background_color, min_tokens=4)
+                 pixel_values.append(transform(image))
+                 image_tokens.append(image.size[0]*image.size[1]//(base*base))
+             else:
+                 raise NotImplementedError("not implemented yet")
+         new_prompt = re.sub(r"<image>", replacement, prompt)
+         input_ids = tokenizer(new_prompt, return_tensors="pt", add_special_tokens=False).input_ids.to(self.device)
+         image_flags = (input_ids == image_token_id).int()
+         input_ids = input_ids.to(self.vision_encoder.device)
+         pixel_values = [pv.to(self.vision_encoder.device) for pv in pixel_values]
+         image_flags = image_flags.to(self.vision_encoder.device)
+         output_ids = self.generate(pixel_values=pixel_values, input_ids=input_ids, image_flags=image_flags, **kwargs)[0][input_ids.shape[1]:]
+         return tokenizer.decode(output_ids, skip_special_tokens=True)
+
+     def chat(self, messages, tokenizer, image_processor, **kwargs):
+         """Take a list of chat messages (OpenAI format) and return the model's reply."""
+         prompt = ""
+         images = []
+         for message in messages:
+             role = message["role"]
+             assert role in ["user", "assistant", "system"], f"invalid role {role}"
+             content = message['content']
+             if isinstance(content, str):
+                 prompt += f"<|im_start|>{role}\n{content}{tokenizer.eos_token}\n"
+             elif isinstance(content, list):
+                 temp = ""
+                 for sub_content in content:
+                     if sub_content['type']=='text':
+                         temp += f"{sub_content['text']}"
+                     elif sub_content['type']=='image_url':
+                         temp += "<image>"
+                         images.append([load_image(sub_content['image_url']['url']), sub_content['image_url'].get("detail",'low')])
+                 prompt += f"<|im_start|>{role}\n{temp}{tokenizer.eos_token}\n"
+             else:
+                 raise ValueError(f"invalid content {content}")
+         if 'thinking' in kwargs:
+             kwargs.pop('thinking')
+         prompt += f"<|im_start|>assistant\n"
+         return self.completion(prompt, images, tokenizer, image_processor, **kwargs)
+
+ ########################
+ ### Image processing code below ###
+ ########################
+
+ import os
+ import math
+ import re
+ from typing import Union
+ import requests
+ import base64
+ from io import BytesIO
+ from PIL import Image
+ import torchvision.transforms as T
+
+ def load_image(source: Union[str, Image.Image]) -> Image.Image:
+     """Load an image."""
+     if isinstance(source, Image.Image):
+         img = source
+     elif isinstance(source, str):
+         if source.startswith('http'):
+             response = requests.get(source)
+             response.raise_for_status()
+             img = Image.open(BytesIO(response.content))
+         elif os.path.exists(source):
+             img = Image.open(source)
+         elif source.startswith('data:image'):
+             img = Image.open(BytesIO(base64.b64decode(source.split(',')[1])))
+         else:
+             raise ValueError("Unsupported image source")
+     else:
+         raise ValueError("Unsupported image source")
+     return img.convert('RGB')
+
+ def get_scaled_img_size(image_size, max_area, base, max_resolution=4172, upper=True):
+     """Compute the scaled image size and the size of the bounding rectangle."""
+     # Compute the aspect ratio of the original image
+     aspect_ratio = image_size[0] / image_size[1]
+     # Compute the largest possible width and height of the bounding rectangle
+     max_width = math.floor(math.sqrt(max_area * aspect_ratio))
+     max_height = math.floor(math.sqrt(max_area / aspect_ratio))
+     max_width, max_height = min(max_width, max_resolution), min(
+         max_height, max_resolution
+     )
+     max_width, max_height = max(max_width, base), max(max_height, base)
+     # Make sure the width and height of the bounding rectangle are both multiples of base
+     if not upper:
+         # Round down so the area never exceeds max_area
+         max_width = max_width - max_width % base
+         max_height = max_height - max_height % base
+     else:
+         # Round up, while staying within max_resolution (maximum length per side)
+         max_width = min(max_width + (base - max_width % base), max_resolution)
+         max_height = min(max_height + (base - max_height % base), max_resolution)
+     # Compute the scale factor
+     scale_factor = min(max_width / image_size[0], max_height / image_size[1])
+     # Compute the scaled image size
+     new_image_size = (
+         round(image_size[0] * scale_factor),
+         round(image_size[1] * scale_factor),
+     )
+     # Compute the size of the bounding rectangle
+     bounding_box_size = (max_width, max_height)
+     return new_image_size, bounding_box_size
+
+
+ def max_preprocess(
+     img, max_size, base, background_color, max_resolution=4172, upper=True, force_resize=False
+ ):
+     """Preprocess the image so that its area is close to max_size**2."""
+     # First resize the image so that both its width and height are below max_resolution
+     w, h = img.size
+     if max(w, h) > max_resolution:
+         scale = max_resolution / max(w, h)
+         w, h = int(w * scale), int(h * scale)
+     # Get the scaled image size and the bounding rectangle size
+     new_image_size, bounding_box_size = get_scaled_img_size(
+         (w, h), max_size**2, base, max_resolution, upper
+     )
+     if force_resize:
+         return img.resize(bounding_box_size)
+     # Create a new canvas
+     canvas = Image.new("RGB", bounding_box_size, background_color)
+     # Compute where to paste the image on the canvas
+     paste_width = (bounding_box_size[0] - new_image_size[0]) // 2
+     paste_height = (bounding_box_size[1] - new_image_size[1]) // 2
+     # Paste the image onto the canvas
+     canvas.paste(img.resize(new_image_size), (paste_width, paste_height))
+     return canvas
+
+ def native_preprocess(
+     img, max_size, base, background_color, max_resolution=4172, min_tokens=64
+ ):
+     # Process the image so that both its width and height are multiples of base.
+     # If the longest side exceeds max_resolution, resize the image down to max_resolution.
+     w, h = img.size
+     # First make sure the longest side does not exceed max_resolution (the ViT's length limit)
+     if max(w, h) > max_resolution:
+         scale = max_resolution / max(w, h)
+         w, h = int(w * scale), int(h * scale)
+         img = img.resize((w, h))
+     if w * h > max_size**2:
+         return max_preprocess(img, max_size, base, background_color, max_resolution)
+     if w * h < (base * base * min_tokens):
+         return max_preprocess(
+             img,
+             int(base * (min_tokens**0.5)),
+             base,
+             background_color,
+             max_resolution,
+         )
+     w1, h1 = w + base - w % base, h + base - h % base
+     if w1 == w and h1 == h:
+         return img
+     else:
+         # Create a new (w1, h1) canvas and resize the image so that at most one side has padding
+         scale = min(w1 / w, h1 / h)
+         new_w, new_h = int(w * scale), int(h * scale)
+         img = img.resize((new_w, new_h))
+         canvas = Image.new("RGB", (w1, h1), background_color)
+         canvas.paste(img, ((w1 - new_w) // 2, (h1 - new_h) // 2))
+         return canvas
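Note: the chat() helper above expects OpenAI-style messages, loads images itself through load_image, and only supports single-sample inference. A minimal usage sketch with a placeholder path and image URL:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, CLIPImageProcessor

    path = "path/to/AndesVL"  # placeholder
    model = AutoModelForCausalLM.from_pretrained(
        path, torch_dtype=torch.bfloat16, trust_remote_code=True
    ).cuda().eval()
    tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
    image_processor = CLIPImageProcessor.from_pretrained(path)

    messages = [
        {"role": "user", "content": [
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.jpg", "detail": "low"}},
            {"type": "text", "text": "What is in this picture?"},
        ]},
    ]
    print(model.chat(messages, tokenizer, image_processor, max_new_tokens=256))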
preprocessor_config.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "crop_size": {
+     "height": -1,
+     "width": -1
+   },
+   "do_center_crop": false,
+   "do_convert_rgb": true,
+   "do_normalize": true,
+   "do_rescale": true,
+   "do_resize": false,
+   "hidden_stride": 2,
+   "image_mean": [
+     0.48145466,
+     0.4578275,
+     0.40821073
+   ],
+   "image_processor_type": "CLIPImageProcessor",
+   "image_std": [
+     0.26862954,
+     0.26130258,
+     0.27577711
+   ],
+   "max_pixels": 2408448,
+   "min_pixels": 200704,
+   "patch_size": 14,
+   "resample": 3,
+   "rescale_factor": 0.00392156862745098,
+   "size": {
+     "shortest_edge": -1
+   },
+   "temporal_patch_size": 1
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:337d5a8162e654cb38d9c1a85f9e73d4719efb55b6278e705b5927e9a2ab035f
+ size 8719640490
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<img>",
+     "</img>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0e0d3ee707b399f44f189e1abfb2b3cd844b96407e9b2a5a21cb3e0b5f57bb05
+ size 11422629
tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
+ {
+   "add_bos_token": false,
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "151643": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151644": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151645": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151646": {
+       "content": "<|object_ref_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151647": {
+       "content": "<|object_ref_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151648": {
+       "content": "<|box_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151649": {
+       "content": "<|box_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151650": {
+       "content": "<|quad_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151651": {
+       "content": "<|quad_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151652": {
+       "content": "<img>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151653": {
+       "content": "</img>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151654": {
+       "content": "<|vision_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151655": {
+       "content": "<|image_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151656": {
+       "content": "<|video_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151657": {
+       "content": "<tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151658": {
+       "content": "</tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151659": {
+       "content": "<|fim_prefix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151660": {
+       "content": "<|fim_middle|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151661": {
+       "content": "<|fim_suffix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151662": {
+       "content": "<|fim_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151663": {
+       "content": "<|repo_name|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151664": {
+       "content": "<|file_sep|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151665": {
+       "content": "<tool_response>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151666": {
+       "content": "</tool_response>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151667": {
+       "content": "<think>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151668": {
+       "content": "</think>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<img>",
+     "</img>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "bos_token": null,
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "errors": "replace",
+   "extra_special_tokens": {},
+   "model_max_length": 262144,
+   "pad_token": "<|endoftext|>",
+   "split_special_tokens": false,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "unk_token": null
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff