WinstonWmj0512 committed
Commit d7c557d · verified · 1 Parent(s): 534d283

Upload 17 files

config.json ADDED
@@ -0,0 +1,325 @@
+ {
+ "_name_or_path": "RLinf/RLinf-OpenVLAOFT-GRPO-LIBERO-90",
+ "action_dim": 7,
+ "add_bias_linear": false,
+ "add_qkv_bias": true,
+ "arch_specifier": "no-align+fused-gelu-mlp",
+ "architectures": [
+ "OpenVLAOFTForRLActionPrediction"
+ ],
+ "attn_implementation": "flash_attention_2",
+ "auto_map": {
+ "AutoConfig": "configuration_prismatic.OpenVLAConfig",
+ "AutoModelForVision2Seq": "modeling_prismatic.OpenVLAForActionPrediction"
+ },
+ "center_crop": true,
+ "ckpt_path": "/mnt/public/mjwei/repo/RLinf-0915/logs/20250930-17:48:12/checkpoints/global_step_50/actor/model.pt",
+ "hf_llm_id": "meta-llama/Llama-2-7b-hf",
+ "hidden_size": 4096,
+ "image_resize_strategy": "resize-naive",
+ "image_size": [
+ 224,
+ 224
+ ],
+ "image_sizes": [
+ 224,
+ 224
+ ],
+ "is_lora": false,
+ "llm_backbone_id": "llama2-7b-pure",
+ "llm_max_length": 2048,
+ "lora_rank": 32,
+ "low_cpu_mem_usage": true,
+ "model_name": "openvla_oft",
+ "model_type": "openvla",
+ "n_action_bins": 256,
+ "norm_stats": {
+ "libero_10": {
+ "action": {
+ "mask": [
+ true,
+ true,
+ true,
+ true,
+ true,
+ true,
+ false
+ ],
+ "max": [
+ 0.9375,
+ 0.9375,
+ 0.9375,
+ 0.30000001192092896,
+ 0.29357144236564636,
+ 0.375,
+ 1.0
+ ],
+ "mean": [
+ 0.01820324920117855,
+ 0.05858374014496803,
+ -0.05592384561896324,
+ 0.004626928828656673,
+ 0.00289608770981431,
+ -0.007673131301999092,
+ 0.5457824468612671
+ ],
+ "min": [
+ -0.9375,
+ -0.9375,
+ -0.9375,
+ -0.23642857372760773,
+ -0.3053571283817291,
+ -0.3675000071525574,
+ 0.0
+ ],
+ "q01": [
+ -0.6348214149475098,
+ -0.7741071581840515,
+ -0.7633928656578064,
+ -0.09749999642372131,
+ -0.14819999992847435,
+ -0.2742857038974762,
+ 0.0
+ ],
+ "q99": [
+ 0.7714285850524902,
+ 0.8464285731315613,
+ 0.9375,
+ 0.13928571343421936,
+ 0.15964286029338837,
+ 0.3246428668498993,
+ 1.0
+ ],
+ "std": [
+ 0.2825464606285095,
+ 0.35904666781425476,
+ 0.3673802614212036,
+ 0.03770702704787254,
+ 0.05429719388484955,
+ 0.08725254982709885,
+ 0.49815231561660767
+ ]
+ },
+ "num_trajectories": 379,
+ "num_transitions": 101469,
+ "proprio": {
+ "max": [
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0
+ ],
+ "mean": [
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0
+ ],
+ "min": [
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0
+ ],
+ "q01": [
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0
+ ],
+ "q99": [
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0
+ ],
+ "std": [
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0
+ ]
+ }
+ },
+ "libero_90_no_noops_trajall": {
+ "action": {
+ "mask": [
+ true,
+ true,
+ true,
+ true,
+ true,
+ true,
+ false
+ ],
+ "max": [
+ 0.9375,
+ 0.9375,
+ 0.9375,
+ 0.375,
+ 0.375,
+ 0.375,
+ 1.0
+ ],
+ "mean": [
+ 0.04552333056926727,
+ 0.037328675389289856,
+ -0.09673094749450684,
+ 0.00501923356205225,
+ 0.0022719360422343016,
+ -0.006229684222489595,
+ 0.5282046794891357
+ ],
+ "min": [
+ -0.9375,
+ -0.9375,
+ -0.9375,
+ -0.3257142901420593,
+ -0.375,
+ -0.375,
+ 0.0
+ ],
+ "q01": [
+ -0.6294642686843872,
+ -0.8705357313156128,
+ -0.8946428298950195,
+ -0.12321428209543228,
+ -0.1574999988079071,
+ -0.2775000035762787,
+ 0.0
+ ],
+ "q99": [
+ 0.8517857193946838,
+ 0.8464285731315613,
+ 0.9375,
+ 0.1875,
+ 0.1778571456670761,
+ 0.3471428453922272,
+ 1.0
+ ],
+ "std": [
+ 0.2984432280063629,
+ 0.3612210154533386,
+ 0.40674421191215515,
+ 0.04839186742901802,
+ 0.05818791687488556,
+ 0.08691468834877014,
+ 0.4985364079475403
+ ]
+ },
+ "num_trajectories": 3954,
+ "num_transitions": 573965,
+ "proprio": {
+ "max": [
+ 0.20274034142494202,
+ 0.4884968400001526,
+ 1.3584461212158203,
+ 4.8432722091674805,
+ 3.966320753097534,
+ 2.4007365703582764,
+ 0.04637677222490311,
+ 0.0017036759527400136
+ ],
+ "mean": [
+ -0.08226173371076584,
+ 0.010916306637227535,
+ 0.9453046917915344,
+ 2.9744503498077393,
+ -0.11405276507139206,
+ -0.09964624792337418,
+ 0.026635831221938133,
+ -0.027010178193449974
+ ],
+ "min": [
+ -0.48259806632995605,
+ -0.3968846797943115,
+ 0.4455491006374359,
+ -0.7501075863838196,
+ -4.363162040710449,
+ -3.2127554416656494,
+ -0.002592125441879034,
+ -0.04256961867213249
+ ],
+ "q01": [
+ -0.4019535529613495,
+ -0.2819894528388977,
+ 0.458499813079834,
+ 1.229066481590271,
+ -2.779330949783325,
+ -1.3500228834152221,
+ 0.0016688233194872737,
+ -0.04004087835550308
+ ],
+ "q99": [
+ 0.12681280374526968,
+ 0.3188697147369384,
+ 1.2563055849075317,
+ 3.8263492584228516,
+ 2.3427903938293455,
+ 0.6062234616279595,
+ 0.04022635221481323,
+ -0.0016752025950700054
+ ],
+ "std": [
+ 0.11324016749858856,
+ 0.14199486374855042,
+ 0.23618334531784058,
+ 0.43265825510025024,
+ 0.9901652336120605,
+ 0.32449689507484436,
+ 0.014563564211130142,
+ 0.01443739328533411
+ ]
+ }
+ }
+ },
+ "num_action_chunks": 8,
+ "num_images_in_input": 1,
+ "output_projector_states": false,
+ "pad_to_multiple_of": 64,
+ "pad_token_id": 32000,
+ "policy_setup": "widowx_bridge",
+ "precision": "bf16",
+ "text_config": {
+ "model_type": "llama",
+ "pad_token_id": 32000,
+ "torch_dtype": "bfloat16",
+ "vocab_size": 32064
+ },
+ "timm_model_ids": [
+ "vit_large_patch14_reg4_dinov2.lvd142m",
+ "vit_so400m_patch14_siglip_224"
+ ],
+ "timm_override_act_layers": [
+ null,
+ null
+ ],
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.40.1",
+ "trust_remote_code": true,
+ "unnorm_key": "libero_90_no_noops_trajall",
+ "use_fused_vision_backbone": true,
+ "use_proprio": false,
+ "value_type": "step_level",
+ "vh_mode": "a0",
+ "vision_backbone_id": "dinosiglip-vit-so-224px",
+ "vocab_size": 32000
+ }
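
The config above wires the custom classes through `auto_map`, so loading has to go through `trust_remote_code`. A minimal loading sketch follows; the repo id is taken from `_name_or_path` (swap in a local path if you cloned the files), and the dtype mirrors the `torch_dtype` field:

import torch
from transformers import AutoConfig, AutoModelForVision2Seq

MODEL_ID = "RLinf/RLinf-OpenVLAOFT-GRPO-LIBERO-90"  # from "_name_or_path"; a local directory also works

# trust_remote_code is needed so auto_map resolves
# configuration_prismatic.OpenVLAConfig / modeling_prismatic.OpenVLAForActionPrediction.
config = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16"
    trust_remote_code=True,
)
print(config.unnorm_key, config.n_action_bins)  # libero_90_no_noops_trajall 256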
configuration_prismatic.py ADDED
@@ -0,0 +1,140 @@
+ """
+ configuration_prismatic.py
+
+ HuggingFace-style configuration definition for Prismatic VLMs, inheriting from `transformers.PretrainedConfig`.
+ Default configuration specifies `siglip-224px+7b`.
+ """
+
+ from typing import Any, Dict, List, Optional
+
+ from transformers import PretrainedConfig
+ from transformers.models.auto import CONFIG_MAPPING
+
+ # === Utilities for Mapping Prismatic names to HF names ===
+ # fmt: off
+ VISION_BACKBONE_TO_RESOLUTION: Dict[str, List[int]] = {
+     "clip-vit-l": [224], "siglip-vit-so400m": [224], "dinov2-vit-l": [224], "in1k-vit-l": [224],
+
+     "clip-vit-l-336px": [336],
+     "siglip-vit-so400m-384px": [384],
+
+     "dinoclip-vit-l-336px": [336, 336],
+     "dinosiglip-vit-so-224px": [224, 224],
+     "dinosiglip-vit-so-384px": [384, 384],
+ }
+ VISION_BACKBONE_TO_TIMM_ID: Dict[str, List[str]] = {
+     "clip-vit-l": ["vit_large_patch14_clip_224.openai"],
+     "clip-vit-l-336px": ["vit_large_patch14_clip_336.openai"],
+
+     "dinov2-vit-l": ["vit_large_patch14_reg4_dinov2.lvd142m"],
+     "in1k-vit-l": ["vit_large_patch16_224.augreg_in21k_ft_in1k"],
+
+     "siglip-vit-so400m": ["vit_so400m_patch14_siglip_224"],
+     "siglip-vit-so400m-384px": ["vit_so400m_patch14_siglip_384"],
+
+     "dinoclip-vit-l-336px": ["vit_large_patch14_reg4_dinov2.lvd142m", "vit_large_patch14_clip_336.openai"],
+     "dinosiglip-vit-so-224px": ["vit_large_patch14_reg4_dinov2.lvd142m", "vit_so400m_patch14_siglip_224"],
+     "dinosiglip-vit-so-384px": ["vit_large_patch14_reg4_dinov2.lvd142m", "vit_so400m_patch14_siglip_384"],
+ }
+ TIMM_OVERRIDE_ACT_LAYER: Dict[str, List[Optional[str]]] = {
+     "clip-vit-l": ["quick_gelu"], "clip-vit-l-336px": ["quick_gelu"],
+     "dinov2-vit-l": [None], "in1k-vit-l": [None],
+     "siglip-vit-so400m": [None], "siglip-vit-so400m-384px": [None],
+     "dinoclip-vit-l-336px": [None, "quick_gelu"],
+     "dinosiglip-vit-so-224px": [None, None], "dinosiglip-vit-so-384px": [None, None]
+ }
+
+ LLM_BACKBONE_TO_HF_PATH = {
+     "llama2-7b-pure": "meta-llama/Llama-2-7b-hf", "llama2-13b-pure": "meta-llama/Llama-2-13b-hf",
+     "llama2-7b-chat": "meta-llama/Llama-2-7b-chat-hf", "llama2-13b-chat": "meta-llama/Llama-2-13b-chat-hf",
+
+     "vicuna-v15-7b": "lmsys/vicuna-7b-v1.5", "vicuna-v15-13b": "lmsys/vicuna-13b-v1.5",
+
+     "mistral-v0.1-7b-pure": "mistralai/Mistral-7B-v0.1",
+     "mistral-v0.1-7b-instruct": "mistralai/Mistral-7B-Instruct-v0.1",
+
+     "phi-2-3b": "microsoft/phi-2",
+ }
+ LLM_BACKBONE_TO_HF_METACLASS = {
+     "llama2-7b-pure": "llama", "llama2-13b-pure": "llama", "llama2-7b-chat": "llama", "llama2-13b-chat": "llama",
+     "vicuna-v15-7b": "llama", "vicuna-v15-13b": "llama",
+
+     "mistral-v0.1-7b-pure": "mistral", "mistral-v0.1-7b-instruct": "mistral",
+
+     "phi-2-3b": "phi",
+ }
+
+ VALID_VISION_BACKBONES = set(VISION_BACKBONE_TO_RESOLUTION.keys())
+ VALID_LLM_BACKBONES = set(LLM_BACKBONE_TO_HF_PATH)
+ # fmt: on
+
+
+ class PrismaticConfig(PretrainedConfig):
+     model_type: str = "prismatic"
+     is_composition: bool = False
+
+     def __init__(
+         self,
+         vision_backbone_id: str = "siglip-vit-so400m",
+         llm_backbone_id: str = "vicuna-v15-7b",
+         arch_specifier: str = "no-align+gelu-mlp",
+         use_fused_vision_backbone: Optional[bool] = None,
+         image_resize_strategy: str = "letterbox",
+         text_config: Optional[Dict[str, Any]] = None,
+         llm_max_length: int = 2048,
+         pad_token_id: int = 32000,
+         pad_to_multiple_of: int = 64,
+         output_projector_states: bool = False,
+         **kwargs: str,
+     ) -> None:
+         if vision_backbone_id not in VALID_VISION_BACKBONES:
+             raise ValueError(f"Vision backbone `{vision_backbone_id}` not in {VALID_VISION_BACKBONES = }")
+
+         if llm_backbone_id not in VALID_LLM_BACKBONES:
+             raise ValueError(f"LLM backbone `{llm_backbone_id}` not in {VALID_LLM_BACKBONES = }")
+
+         # Set Prismatic Configuration Fields
+         self.vision_backbone_id = vision_backbone_id
+         self.llm_backbone_id = llm_backbone_id
+         self.arch_specifier = arch_specifier
+         self.output_projector_states = output_projector_states
+
+         # [Contract] All vision backbone parameters are lists =>> supports fused backbones with different preprocessing
+         self.use_fused_vision_backbone = (
+             use_fused_vision_backbone
+             if use_fused_vision_backbone is not None
+             else any(self.vision_backbone_id.startswith(v) for v in ["dinoclip", "dinosiglip"])
+         )
+
+         self.timm_model_ids = VISION_BACKBONE_TO_TIMM_ID[self.vision_backbone_id]
+         self.timm_override_act_layers = TIMM_OVERRIDE_ACT_LAYER[self.vision_backbone_id]
+         self.image_sizes = VISION_BACKBONE_TO_RESOLUTION[self.vision_backbone_id]
+         self.image_resize_strategy = image_resize_strategy
+
+         self.hf_llm_id = LLM_BACKBONE_TO_HF_PATH[self.llm_backbone_id]
+         self.llm_max_length = llm_max_length
+         self.pad_token_id, self.pad_to_multiple_of = pad_token_id, pad_to_multiple_of
+
+         # [IMPORTANT] HF Utilities actually look for a `text_config` field... we need to use that specific naming!
+         self.text_config = (
+             CONFIG_MAPPING[LLM_BACKBONE_TO_HF_METACLASS[self.llm_backbone_id]](**text_config)
+             if text_config is not None
+             else CONFIG_MAPPING[LLM_BACKBONE_TO_HF_METACLASS[self.llm_backbone_id]]()
+         )
+
+         # Dispatch **kwargs to super() =>> note that `pad_token_id` collides, so we pass it in here as well...
+         super().__init__(pad_token_id=pad_token_id, **kwargs)
+
+
+ class OpenVLAConfig(PrismaticConfig):
+     model_type: str = "openvla"
+
+     def __init__(
+         self,
+         norm_stats: Optional[Dict[str, Dict[str, Dict[str, Dict[str, List[float]]]]]] = None,
+         n_action_bins: int = 256,
+         **kwargs: str,
+     ) -> None:
+         self.norm_stats, self.n_action_bins = norm_stats, n_action_bins
+
+         super().__init__(**kwargs)
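
As a quick illustration of how the lookup tables above fan out into derived fields, here is a small, hypothetical usage sketch (it assumes `configuration_prismatic.py` is importable and `transformers` is installed); the argument values mirror the ones in config.json:

from configuration_prismatic import OpenVLAConfig

cfg = OpenVLAConfig(
    vision_backbone_id="dinosiglip-vit-so-224px",
    llm_backbone_id="llama2-7b-pure",
    arch_specifier="no-align+fused-gelu-mlp",
    image_resize_strategy="resize-naive",
    n_action_bins=256,
)
print(cfg.timm_model_ids)             # ["vit_large_patch14_reg4_dinov2.lvd142m", "vit_so400m_patch14_siglip_224"]
print(cfg.image_sizes)                # [224, 224]
print(cfg.hf_llm_id)                  # meta-llama/Llama-2-7b-hf
print(cfg.use_fused_vision_backbone)  # True, inferred from the "dinosiglip" prefix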
dataset_statistics.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "libero_90_no_noops_trajall": {
+ "action": {
+ "mean": [
+ 0.04552333056926727,
+ 0.037328675389289856,
+ -0.09673094749450684,
+ 0.00501923356205225,
+ 0.0022719360422343016,
+ -0.006229684222489595,
+ 0.5282046794891357
+ ],
+ "std": [
+ 0.2984432280063629,
+ 0.3612210154533386,
+ 0.40674421191215515,
+ 0.04839186742901802,
+ 0.05818791687488556,
+ 0.08691468834877014,
+ 0.4985364079475403
+ ],
+ "max": [
+ 0.9375,
+ 0.9375,
+ 0.9375,
+ 0.375,
+ 0.375,
+ 0.375,
+ 1.0
+ ],
+ "min": [
+ -0.9375,
+ -0.9375,
+ -0.9375,
+ -0.3257142901420593,
+ -0.375,
+ -0.375,
+ 0.0
+ ],
+ "q01": [
+ -0.6294642686843872,
+ -0.8705357313156128,
+ -0.8946428298950195,
+ -0.12321428209543228,
+ -0.1574999988079071,
+ -0.2775000035762787,
+ 0.0
+ ],
+ "q99": [
+ 0.8517857193946838,
+ 0.8464285731315613,
+ 0.9375,
+ 0.1875,
+ 0.1778571456670761,
+ 0.3471428453922272,
+ 1.0
+ ],
+ "mask": [
+ true,
+ true,
+ true,
+ true,
+ true,
+ true,
+ false
+ ]
+ },
+ "proprio": {
+ "mean": [
+ -0.08226173371076584,
+ 0.010916306637227535,
+ 0.9453046917915344,
+ 2.9744503498077393,
+ -0.11405276507139206,
+ -0.09964624792337418,
+ 0.026635831221938133,
+ -0.027010178193449974
+ ],
+ "std": [
+ 0.11324016749858856,
+ 0.14199486374855042,
+ 0.23618334531784058,
+ 0.43265825510025024,
+ 0.9901652336120605,
+ 0.32449689507484436,
+ 0.014563564211130142,
+ 0.01443739328533411
+ ],
+ "max": [
+ 0.20274034142494202,
+ 0.4884968400001526,
+ 1.3584461212158203,
+ 4.8432722091674805,
+ 3.966320753097534,
+ 2.4007365703582764,
+ 0.04637677222490311,
+ 0.0017036759527400136
+ ],
+ "min": [
+ -0.48259806632995605,
+ -0.3968846797943115,
+ 0.4455491006374359,
+ -0.7501075863838196,
+ -4.363162040710449,
+ -3.2127554416656494,
+ -0.002592125441879034,
+ -0.04256961867213249
+ ],
+ "q01": [
+ -0.4019535529613495,
+ -0.2819894528388977,
+ 0.458499813079834,
+ 1.229066481590271,
+ -2.779330949783325,
+ -1.3500228834152221,
+ 0.0016688233194872737,
+ -0.04004087835550308
+ ],
+ "q99": [
+ 0.12681280374526968,
+ 0.3188697147369384,
+ 1.2563055849075317,
+ 3.8263492584228516,
+ 2.3427903938293455,
+ 0.6062234616279595,
+ 0.04022635221481323,
+ -0.0016752025950700054
+ ]
+ },
+ "num_transitions": 573965,
+ "num_trajectories": 3954
+ }
+ }
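
These statistics are what `unnorm_key` in config.json points at. As a sketch of the usual OpenVLA-style convention (not necessarily this repository's exact implementation), predicted actions in [-1, 1] are mapped back to the dataset's range using the 1st/99th percentiles, while dimensions with mask set to false (the gripper) are passed through unchanged:

import numpy as np

def unnormalize_action(normalized, stats):
    """Map a [-1, 1] action back to dataset units with q01/q99; leave unmasked dims as-is."""
    q01, q99 = np.asarray(stats["q01"]), np.asarray(stats["q99"])
    mask = np.asarray(stats["mask"], dtype=bool)
    normalized = np.asarray(normalized, dtype=np.float64)
    raw = 0.5 * (normalized + 1.0) * (q99 - q01) + q01
    return np.where(mask, raw, normalized)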
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "pad_token_id": 32000,
+ "transformers_version": "4.40.1"
+ }
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7f3e26bb9c109bf8f909daf13f0b08a662184036164b0cecd7b8b20812ec90b5
+ size 4925122448
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:49c51285a710d9dac42759fbce1a4cfebd5fbf6e226d0893614edb12e6fd43db
+ size 4947392496
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4e35784711a9ccf5b12164757d3466114726064ba208241d03f4fa3ed6465fbe
+ size 4947417456
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e21b4736ecd3c0993b7614a54085d47588ceb876044e7537c3a2795169694981
+ size 266997624
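
The four three-line files above are Git LFS pointers rather than the weights themselves: `oid sha256:` records the SHA-256 of the actual shard and `size` its byte count. A small sketch for checking a downloaded shard against its pointer (the file names are the ones listed above):

import hashlib

def sha256_of(path, chunk_size=1 << 20):
    """Stream a file and return its hex SHA-256, as recorded in the LFS pointer's oid field."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        while chunk := f.read(chunk_size):
            digest.update(chunk)
    return digest.hexdigest()

# e.g. for the first shard:
# assert sha256_of("model-00001-of-00004.safetensors") == "7f3e26bb9c109bf8f909daf13f0b08a662184036164b0cecd7b8b20812ec90b5"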
model.safetensors.index.json ADDED
@@ -0,0 +1,994 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 15086803072
4
+ },
5
+ "weight_map": {
6
+ "language_model.lm_head.weight": "model-00004-of-00004.safetensors",
7
+ "language_model.model.embed_tokens.weight": "model-00001-of-00004.safetensors",
8
+ "language_model.model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
9
+ "language_model.model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
10
+ "language_model.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
11
+ "language_model.model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
12
+ "language_model.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
13
+ "language_model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
14
+ "language_model.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
15
+ "language_model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
16
+ "language_model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
17
+ "language_model.model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
18
+ "language_model.model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
19
+ "language_model.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
20
+ "language_model.model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
21
+ "language_model.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
22
+ "language_model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
23
+ "language_model.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
24
+ "language_model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
25
+ "language_model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
26
+ "language_model.model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
27
+ "language_model.model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
28
+ "language_model.model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
29
+ "language_model.model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
30
+ "language_model.model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
31
+ "language_model.model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
32
+ "language_model.model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
33
+ "language_model.model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
34
+ "language_model.model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
35
+ "language_model.model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
36
+ "language_model.model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
37
+ "language_model.model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
38
+ "language_model.model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
39
+ "language_model.model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
40
+ "language_model.model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
41
+ "language_model.model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
42
+ "language_model.model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
43
+ "language_model.model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
44
+ "language_model.model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
45
+ "language_model.model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
46
+ "language_model.model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
47
+ "language_model.model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
48
+ "language_model.model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
49
+ "language_model.model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
50
+ "language_model.model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
51
+ "language_model.model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
52
+ "language_model.model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
53
+ "language_model.model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
54
+ "language_model.model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
55
+ "language_model.model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
56
+ "language_model.model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
57
+ "language_model.model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
58
+ "language_model.model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
59
+ "language_model.model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
60
+ "language_model.model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
61
+ "language_model.model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
62
+ "language_model.model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
63
+ "language_model.model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
64
+ "language_model.model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
65
+ "language_model.model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
66
+ "language_model.model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
67
+ "language_model.model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
68
+ "language_model.model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
69
+ "language_model.model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
70
+ "language_model.model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
71
+ "language_model.model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
72
+ "language_model.model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
73
+ "language_model.model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
74
+ "language_model.model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
75
+ "language_model.model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
76
+ "language_model.model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
77
+ "language_model.model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
78
+ "language_model.model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
79
+ "language_model.model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
80
+ "language_model.model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
81
+ "language_model.model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
82
+ "language_model.model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
83
+ "language_model.model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
84
+ "language_model.model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
85
+ "language_model.model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
86
+ "language_model.model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
87
+ "language_model.model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
88
+ "language_model.model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
89
+ "language_model.model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
90
+ "language_model.model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
91
+ "language_model.model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
92
+ "language_model.model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
93
+ "language_model.model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
94
+ "language_model.model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
95
+ "language_model.model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
96
+ "language_model.model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
97
+ "language_model.model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
98
+ "language_model.model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
99
+ "language_model.model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
100
+ "language_model.model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
101
+ "language_model.model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
102
+ "language_model.model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
103
+ "language_model.model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
104
+ "language_model.model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
105
+ "language_model.model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
106
+ "language_model.model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
107
+ "language_model.model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
108
+ "language_model.model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
109
+ "language_model.model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
110
+ "language_model.model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
111
+ "language_model.model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
112
+ "language_model.model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
113
+ "language_model.model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
114
+ "language_model.model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
115
+ "language_model.model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
116
+ "language_model.model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
117
+ "language_model.model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
118
+ "language_model.model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
119
+ "language_model.model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
120
+ "language_model.model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
121
+ "language_model.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
122
+ "language_model.model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
123
+ "language_model.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
124
+ "language_model.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
125
+ "language_model.model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
126
+ "language_model.model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
127
+ "language_model.model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
128
+ "language_model.model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
129
+ "language_model.model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
130
+ "language_model.model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
131
+ "language_model.model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
132
+ "language_model.model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
133
+ "language_model.model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
134
+ "language_model.model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
135
+ "language_model.model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
136
+ "language_model.model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
137
+ "language_model.model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
138
+ "language_model.model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
139
+ "language_model.model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
140
+ "language_model.model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
141
+ "language_model.model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
142
+ "language_model.model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
143
+ "language_model.model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
144
+ "language_model.model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
145
+ "language_model.model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
146
+ "language_model.model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
147
+ "language_model.model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
148
+ "language_model.model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
149
+ "language_model.model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
150
+ "language_model.model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
151
+ "language_model.model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
152
+ "language_model.model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
153
+ "language_model.model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
154
+ "language_model.model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
155
+ "language_model.model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
156
+ "language_model.model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
157
+ "language_model.model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
158
+ "language_model.model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
159
+ "language_model.model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
160
+ "language_model.model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
161
+ "language_model.model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
162
+ "language_model.model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
163
+ "language_model.model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
164
+ "language_model.model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
165
+ "language_model.model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
166
+ "language_model.model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
167
+ "language_model.model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
168
+ "language_model.model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
169
+ "language_model.model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
170
+ "language_model.model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
171
+ "language_model.model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
172
+ "language_model.model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
173
+ "language_model.model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
174
+ "language_model.model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
175
+ "language_model.model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
176
+ "language_model.model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
177
+ "language_model.model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
178
+ "language_model.model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
179
+ "language_model.model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
180
+ "language_model.model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
181
+ "language_model.model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
182
+ "language_model.model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
183
+ "language_model.model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
184
+ "language_model.model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
185
+ "language_model.model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
186
+ "language_model.model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
187
+ "language_model.model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
188
+ "language_model.model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
189
+ "language_model.model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
190
+ "language_model.model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
191
+ "language_model.model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
192
+ "language_model.model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
193
+ "language_model.model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
194
+ "language_model.model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
195
+ "language_model.model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
196
+ "language_model.model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
197
+ "language_model.model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
198
+ "language_model.model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
199
+ "language_model.model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
200
+ "language_model.model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
201
+ "language_model.model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
202
+ "language_model.model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
203
+ "language_model.model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
204
+ "language_model.model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
205
+ "language_model.model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
206
+ "language_model.model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
207
+ "language_model.model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
208
+ "language_model.model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
209
+ "language_model.model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
210
+ "language_model.model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
211
+ "language_model.model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
212
+ "language_model.model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
213
+ "language_model.model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
214
+ "language_model.model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
215
+ "language_model.model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
216
+ "language_model.model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
217
+ "language_model.model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
218
+ "language_model.model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
219
+ "language_model.model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
220
+ "language_model.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
221
+ "language_model.model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
222
+ "language_model.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
223
+ "language_model.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
224
+ "language_model.model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
225
+ "language_model.model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
226
+ "language_model.model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
227
+ "language_model.model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
228
+ "language_model.model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
229
+ "language_model.model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
230
+ "language_model.model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
231
+ "language_model.model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
232
+ "language_model.model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
233
+ "language_model.model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors",
234
+ "language_model.model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
235
+ "language_model.model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
236
+ "language_model.model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
237
+ "language_model.model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
238
+ "language_model.model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
239
+ "language_model.model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
240
+ "language_model.model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
241
+ "language_model.model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
242
+ "language_model.model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
243
+ "language_model.model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
244
+ "language_model.model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
245
+ "language_model.model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
246
+ "language_model.model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
247
+ "language_model.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
248
+ "language_model.model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
249
+ "language_model.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
250
+ "language_model.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
251
+ "language_model.model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
252
+ "language_model.model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
253
+ "language_model.model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
254
+ "language_model.model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
255
+ "language_model.model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
256
+ "language_model.model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
257
+ "language_model.model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
258
+ "language_model.model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
259
+ "language_model.model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
260
+ "language_model.model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
261
+ "language_model.model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
262
+ "language_model.model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
263
+ "language_model.model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
264
+ "language_model.model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
265
+ "language_model.model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
266
+ "language_model.model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
267
+ "language_model.model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
268
+ "language_model.model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
269
+ "language_model.model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
270
+ "language_model.model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
271
+ "language_model.model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
272
+ "language_model.model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
273
+ "language_model.model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
274
+ "language_model.model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
275
+ "language_model.model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
276
+ "language_model.model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
277
+ "language_model.model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
278
+ "language_model.model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
279
+ "language_model.model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
280
+ "language_model.model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
281
+ "language_model.model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
282
+ "language_model.model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
283
+ "language_model.model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
284
+ "language_model.model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
285
+ "language_model.model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
286
+ "language_model.model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
287
+ "language_model.model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
288
+ "language_model.model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
289
+ "language_model.model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
290
+ "language_model.model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
291
+ "language_model.model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
292
+ "language_model.model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
293
+ "language_model.model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
294
+ "language_model.model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
295
+ "language_model.model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
296
+ "language_model.model.norm.weight": "model-00003-of-00004.safetensors",
297
+ "projector.fc1.bias": "model-00001-of-00004.safetensors",
298
+ "projector.fc1.weight": "model-00001-of-00004.safetensors",
299
+ "projector.fc2.bias": "model-00001-of-00004.safetensors",
300
+ "projector.fc2.weight": "model-00001-of-00004.safetensors",
301
+ "projector.fc3.bias": "model-00001-of-00004.safetensors",
302
+ "projector.fc3.weight": "model-00001-of-00004.safetensors",
303
+ "value_head.head_l1.bias": "model-00004-of-00004.safetensors",
304
+ "value_head.head_l1.weight": "model-00004-of-00004.safetensors",
305
+ "value_head.head_l2.bias": "model-00004-of-00004.safetensors",
306
+ "value_head.head_l2.weight": "model-00004-of-00004.safetensors",
307
+ "value_head.head_l3.weight": "model-00004-of-00004.safetensors",
308
+ "vision_backbone.featurizer.blocks.0.attn.proj.bias": "model-00001-of-00004.safetensors",
309
+ "vision_backbone.featurizer.blocks.0.attn.proj.weight": "model-00001-of-00004.safetensors",
310
+ "vision_backbone.featurizer.blocks.0.attn.qkv.bias": "model-00001-of-00004.safetensors",
311
+ "vision_backbone.featurizer.blocks.0.attn.qkv.weight": "model-00001-of-00004.safetensors",
312
+ "vision_backbone.featurizer.blocks.0.ls1.scale_factor": "model-00001-of-00004.safetensors",
313
+ "vision_backbone.featurizer.blocks.0.ls2.scale_factor": "model-00001-of-00004.safetensors",
314
+ "vision_backbone.featurizer.blocks.0.mlp.fc1.bias": "model-00001-of-00004.safetensors",
315
+ "vision_backbone.featurizer.blocks.0.mlp.fc1.weight": "model-00001-of-00004.safetensors",
316
+ "vision_backbone.featurizer.blocks.0.mlp.fc2.bias": "model-00001-of-00004.safetensors",
317
+ "vision_backbone.featurizer.blocks.0.mlp.fc2.weight": "model-00001-of-00004.safetensors",
318
+ "vision_backbone.featurizer.blocks.0.norm1.bias": "model-00001-of-00004.safetensors",
319
+ "vision_backbone.featurizer.blocks.0.norm1.weight": "model-00001-of-00004.safetensors",
320
+ "vision_backbone.featurizer.blocks.0.norm2.bias": "model-00001-of-00004.safetensors",
321
+ "vision_backbone.featurizer.blocks.0.norm2.weight": "model-00001-of-00004.safetensors",
322
+ "vision_backbone.featurizer.blocks.1.attn.proj.bias": "model-00001-of-00004.safetensors",
323
+ "vision_backbone.featurizer.blocks.1.attn.proj.weight": "model-00001-of-00004.safetensors",
324
+ "vision_backbone.featurizer.blocks.1.attn.qkv.bias": "model-00001-of-00004.safetensors",
325
+ "vision_backbone.featurizer.blocks.1.attn.qkv.weight": "model-00001-of-00004.safetensors",
326
+ "vision_backbone.featurizer.blocks.1.ls1.scale_factor": "model-00001-of-00004.safetensors",
327
+ "vision_backbone.featurizer.blocks.1.ls2.scale_factor": "model-00001-of-00004.safetensors",
328
+ "vision_backbone.featurizer.blocks.1.mlp.fc1.bias": "model-00001-of-00004.safetensors",
329
+ "vision_backbone.featurizer.blocks.1.mlp.fc1.weight": "model-00001-of-00004.safetensors",
330
+ "vision_backbone.featurizer.blocks.1.mlp.fc2.bias": "model-00001-of-00004.safetensors",
331
+ "vision_backbone.featurizer.blocks.1.mlp.fc2.weight": "model-00001-of-00004.safetensors",
332
+ "vision_backbone.featurizer.blocks.1.norm1.bias": "model-00001-of-00004.safetensors",
333
+ "vision_backbone.featurizer.blocks.1.norm1.weight": "model-00001-of-00004.safetensors",
334
+ "vision_backbone.featurizer.blocks.1.norm2.bias": "model-00001-of-00004.safetensors",
335
+ "vision_backbone.featurizer.blocks.1.norm2.weight": "model-00001-of-00004.safetensors",
336
+ "vision_backbone.featurizer.blocks.10.attn.proj.bias": "model-00001-of-00004.safetensors",
337
+ "vision_backbone.featurizer.blocks.10.attn.proj.weight": "model-00001-of-00004.safetensors",
338
+ "vision_backbone.featurizer.blocks.10.attn.qkv.bias": "model-00001-of-00004.safetensors",
339
+ "vision_backbone.featurizer.blocks.10.attn.qkv.weight": "model-00001-of-00004.safetensors",
340
+ "vision_backbone.featurizer.blocks.10.ls1.scale_factor": "model-00001-of-00004.safetensors",
341
+ "vision_backbone.featurizer.blocks.10.ls2.scale_factor": "model-00001-of-00004.safetensors",
342
+ "vision_backbone.featurizer.blocks.10.mlp.fc1.bias": "model-00001-of-00004.safetensors",
343
+ "vision_backbone.featurizer.blocks.10.mlp.fc1.weight": "model-00001-of-00004.safetensors",
344
+ "vision_backbone.featurizer.blocks.10.mlp.fc2.bias": "model-00001-of-00004.safetensors",
345
+ "vision_backbone.featurizer.blocks.10.mlp.fc2.weight": "model-00001-of-00004.safetensors",
346
+ "vision_backbone.featurizer.blocks.10.norm1.bias": "model-00001-of-00004.safetensors",
347
+ "vision_backbone.featurizer.blocks.10.norm1.weight": "model-00001-of-00004.safetensors",
348
+ "vision_backbone.featurizer.blocks.10.norm2.bias": "model-00001-of-00004.safetensors",
349
+ "vision_backbone.featurizer.blocks.10.norm2.weight": "model-00001-of-00004.safetensors",
350
+ "vision_backbone.featurizer.blocks.11.attn.proj.bias": "model-00001-of-00004.safetensors",
351
+ "vision_backbone.featurizer.blocks.11.attn.proj.weight": "model-00001-of-00004.safetensors",
352
+ "vision_backbone.featurizer.blocks.11.attn.qkv.bias": "model-00001-of-00004.safetensors",
353
+ "vision_backbone.featurizer.blocks.11.attn.qkv.weight": "model-00001-of-00004.safetensors",
354
+ "vision_backbone.featurizer.blocks.11.ls1.scale_factor": "model-00001-of-00004.safetensors",
355
+ "vision_backbone.featurizer.blocks.11.ls2.scale_factor": "model-00001-of-00004.safetensors",
356
+ "vision_backbone.featurizer.blocks.11.mlp.fc1.bias": "model-00001-of-00004.safetensors",
357
+ "vision_backbone.featurizer.blocks.11.mlp.fc1.weight": "model-00001-of-00004.safetensors",
358
+ "vision_backbone.featurizer.blocks.11.mlp.fc2.bias": "model-00001-of-00004.safetensors",
359
+ "vision_backbone.featurizer.blocks.11.mlp.fc2.weight": "model-00001-of-00004.safetensors",
360
+ "vision_backbone.featurizer.blocks.11.norm1.bias": "model-00001-of-00004.safetensors",
361
+ "vision_backbone.featurizer.blocks.11.norm1.weight": "model-00001-of-00004.safetensors",
362
+ "vision_backbone.featurizer.blocks.11.norm2.bias": "model-00001-of-00004.safetensors",
363
+ "vision_backbone.featurizer.blocks.11.norm2.weight": "model-00001-of-00004.safetensors",
364
+ "vision_backbone.featurizer.blocks.12.attn.proj.bias": "model-00001-of-00004.safetensors",
365
+ "vision_backbone.featurizer.blocks.12.attn.proj.weight": "model-00001-of-00004.safetensors",
366
+ "vision_backbone.featurizer.blocks.12.attn.qkv.bias": "model-00001-of-00004.safetensors",
367
+ "vision_backbone.featurizer.blocks.12.attn.qkv.weight": "model-00001-of-00004.safetensors",
368
+ "vision_backbone.featurizer.blocks.12.ls1.scale_factor": "model-00001-of-00004.safetensors",
369
+ "vision_backbone.featurizer.blocks.12.ls2.scale_factor": "model-00001-of-00004.safetensors",
370
+ "vision_backbone.featurizer.blocks.12.mlp.fc1.bias": "model-00001-of-00004.safetensors",
371
+ "vision_backbone.featurizer.blocks.12.mlp.fc1.weight": "model-00001-of-00004.safetensors",
372
+ "vision_backbone.featurizer.blocks.12.mlp.fc2.bias": "model-00001-of-00004.safetensors",
373
+ "vision_backbone.featurizer.blocks.12.mlp.fc2.weight": "model-00001-of-00004.safetensors",
374
+ "vision_backbone.featurizer.blocks.12.norm1.bias": "model-00001-of-00004.safetensors",
375
+ "vision_backbone.featurizer.blocks.12.norm1.weight": "model-00001-of-00004.safetensors",
376
+ "vision_backbone.featurizer.blocks.12.norm2.bias": "model-00001-of-00004.safetensors",
377
+ "vision_backbone.featurizer.blocks.12.norm2.weight": "model-00001-of-00004.safetensors",
378
+ "vision_backbone.featurizer.blocks.13.attn.proj.bias": "model-00001-of-00004.safetensors",
379
+ "vision_backbone.featurizer.blocks.13.attn.proj.weight": "model-00001-of-00004.safetensors",
380
+ "vision_backbone.featurizer.blocks.13.attn.qkv.bias": "model-00001-of-00004.safetensors",
381
+ "vision_backbone.featurizer.blocks.13.attn.qkv.weight": "model-00001-of-00004.safetensors",
382
+ "vision_backbone.featurizer.blocks.13.ls1.scale_factor": "model-00001-of-00004.safetensors",
383
+ "vision_backbone.featurizer.blocks.13.ls2.scale_factor": "model-00001-of-00004.safetensors",
384
+ "vision_backbone.featurizer.blocks.13.mlp.fc1.bias": "model-00001-of-00004.safetensors",
385
+ "vision_backbone.featurizer.blocks.13.mlp.fc1.weight": "model-00001-of-00004.safetensors",
386
+ "vision_backbone.featurizer.blocks.13.mlp.fc2.bias": "model-00001-of-00004.safetensors",
387
+ "vision_backbone.featurizer.blocks.13.mlp.fc2.weight": "model-00001-of-00004.safetensors",
388
+ "vision_backbone.featurizer.blocks.13.norm1.bias": "model-00001-of-00004.safetensors",
389
+ "vision_backbone.featurizer.blocks.13.norm1.weight": "model-00001-of-00004.safetensors",
390
+ "vision_backbone.featurizer.blocks.13.norm2.bias": "model-00001-of-00004.safetensors",
391
+ "vision_backbone.featurizer.blocks.13.norm2.weight": "model-00001-of-00004.safetensors",
392
+ "vision_backbone.featurizer.blocks.14.attn.proj.bias": "model-00001-of-00004.safetensors",
393
+ "vision_backbone.featurizer.blocks.14.attn.proj.weight": "model-00001-of-00004.safetensors",
394
+ "vision_backbone.featurizer.blocks.14.attn.qkv.bias": "model-00001-of-00004.safetensors",
395
+ "vision_backbone.featurizer.blocks.14.attn.qkv.weight": "model-00001-of-00004.safetensors",
396
+ "vision_backbone.featurizer.blocks.14.ls1.scale_factor": "model-00001-of-00004.safetensors",
397
+ "vision_backbone.featurizer.blocks.14.ls2.scale_factor": "model-00001-of-00004.safetensors",
398
+ "vision_backbone.featurizer.blocks.14.mlp.fc1.bias": "model-00001-of-00004.safetensors",
399
+ "vision_backbone.featurizer.blocks.14.mlp.fc1.weight": "model-00001-of-00004.safetensors",
400
+ "vision_backbone.featurizer.blocks.14.mlp.fc2.bias": "model-00001-of-00004.safetensors",
401
+ "vision_backbone.featurizer.blocks.14.mlp.fc2.weight": "model-00001-of-00004.safetensors",
402
+ "vision_backbone.featurizer.blocks.14.norm1.bias": "model-00001-of-00004.safetensors",
403
+ "vision_backbone.featurizer.blocks.14.norm1.weight": "model-00001-of-00004.safetensors",
404
+ "vision_backbone.featurizer.blocks.14.norm2.bias": "model-00001-of-00004.safetensors",
405
+ "vision_backbone.featurizer.blocks.14.norm2.weight": "model-00001-of-00004.safetensors",
406
+ "vision_backbone.featurizer.blocks.15.attn.proj.bias": "model-00001-of-00004.safetensors",
407
+ "vision_backbone.featurizer.blocks.15.attn.proj.weight": "model-00001-of-00004.safetensors",
408
+ "vision_backbone.featurizer.blocks.15.attn.qkv.bias": "model-00001-of-00004.safetensors",
409
+ "vision_backbone.featurizer.blocks.15.attn.qkv.weight": "model-00001-of-00004.safetensors",
410
+ "vision_backbone.featurizer.blocks.15.ls1.scale_factor": "model-00001-of-00004.safetensors",
411
+ "vision_backbone.featurizer.blocks.15.ls2.scale_factor": "model-00001-of-00004.safetensors",
412
+ "vision_backbone.featurizer.blocks.15.mlp.fc1.bias": "model-00001-of-00004.safetensors",
413
+ "vision_backbone.featurizer.blocks.15.mlp.fc1.weight": "model-00001-of-00004.safetensors",
414
+ "vision_backbone.featurizer.blocks.15.mlp.fc2.bias": "model-00001-of-00004.safetensors",
415
+ "vision_backbone.featurizer.blocks.15.mlp.fc2.weight": "model-00001-of-00004.safetensors",
416
+ "vision_backbone.featurizer.blocks.15.norm1.bias": "model-00001-of-00004.safetensors",
417
+ "vision_backbone.featurizer.blocks.15.norm1.weight": "model-00001-of-00004.safetensors",
418
+ "vision_backbone.featurizer.blocks.15.norm2.bias": "model-00001-of-00004.safetensors",
419
+ "vision_backbone.featurizer.blocks.15.norm2.weight": "model-00001-of-00004.safetensors",
420
+ "vision_backbone.featurizer.blocks.16.attn.proj.bias": "model-00001-of-00004.safetensors",
421
+ "vision_backbone.featurizer.blocks.16.attn.proj.weight": "model-00001-of-00004.safetensors",
422
+ "vision_backbone.featurizer.blocks.16.attn.qkv.bias": "model-00001-of-00004.safetensors",
423
+ "vision_backbone.featurizer.blocks.16.attn.qkv.weight": "model-00001-of-00004.safetensors",
424
+ "vision_backbone.featurizer.blocks.16.ls1.scale_factor": "model-00001-of-00004.safetensors",
425
+ "vision_backbone.featurizer.blocks.16.ls2.scale_factor": "model-00001-of-00004.safetensors",
426
+ "vision_backbone.featurizer.blocks.16.mlp.fc1.bias": "model-00001-of-00004.safetensors",
427
+ "vision_backbone.featurizer.blocks.16.mlp.fc1.weight": "model-00001-of-00004.safetensors",
428
+ "vision_backbone.featurizer.blocks.16.mlp.fc2.bias": "model-00001-of-00004.safetensors",
429
+ "vision_backbone.featurizer.blocks.16.mlp.fc2.weight": "model-00001-of-00004.safetensors",
430
+ "vision_backbone.featurizer.blocks.16.norm1.bias": "model-00001-of-00004.safetensors",
431
+ "vision_backbone.featurizer.blocks.16.norm1.weight": "model-00001-of-00004.safetensors",
432
+ "vision_backbone.featurizer.blocks.16.norm2.bias": "model-00001-of-00004.safetensors",
433
+ "vision_backbone.featurizer.blocks.16.norm2.weight": "model-00001-of-00004.safetensors",
434
+ "vision_backbone.featurizer.blocks.17.attn.proj.bias": "model-00001-of-00004.safetensors",
435
+ "vision_backbone.featurizer.blocks.17.attn.proj.weight": "model-00001-of-00004.safetensors",
436
+ "vision_backbone.featurizer.blocks.17.attn.qkv.bias": "model-00001-of-00004.safetensors",
437
+ "vision_backbone.featurizer.blocks.17.attn.qkv.weight": "model-00001-of-00004.safetensors",
438
+ "vision_backbone.featurizer.blocks.17.ls1.scale_factor": "model-00001-of-00004.safetensors",
439
+ "vision_backbone.featurizer.blocks.17.ls2.scale_factor": "model-00001-of-00004.safetensors",
440
+ "vision_backbone.featurizer.blocks.17.mlp.fc1.bias": "model-00001-of-00004.safetensors",
441
+ "vision_backbone.featurizer.blocks.17.mlp.fc1.weight": "model-00001-of-00004.safetensors",
442
+ "vision_backbone.featurizer.blocks.17.mlp.fc2.bias": "model-00001-of-00004.safetensors",
443
+ "vision_backbone.featurizer.blocks.17.mlp.fc2.weight": "model-00001-of-00004.safetensors",
444
+ "vision_backbone.featurizer.blocks.17.norm1.bias": "model-00001-of-00004.safetensors",
445
+ "vision_backbone.featurizer.blocks.17.norm1.weight": "model-00001-of-00004.safetensors",
446
+ "vision_backbone.featurizer.blocks.17.norm2.bias": "model-00001-of-00004.safetensors",
447
+ "vision_backbone.featurizer.blocks.17.norm2.weight": "model-00001-of-00004.safetensors",
448
+ "vision_backbone.featurizer.blocks.18.attn.proj.bias": "model-00001-of-00004.safetensors",
449
+ "vision_backbone.featurizer.blocks.18.attn.proj.weight": "model-00001-of-00004.safetensors",
450
+ "vision_backbone.featurizer.blocks.18.attn.qkv.bias": "model-00001-of-00004.safetensors",
451
+ "vision_backbone.featurizer.blocks.18.attn.qkv.weight": "model-00001-of-00004.safetensors",
452
+ "vision_backbone.featurizer.blocks.18.ls1.scale_factor": "model-00001-of-00004.safetensors",
453
+ "vision_backbone.featurizer.blocks.18.ls2.scale_factor": "model-00001-of-00004.safetensors",
454
+ "vision_backbone.featurizer.blocks.18.mlp.fc1.bias": "model-00001-of-00004.safetensors",
455
+ "vision_backbone.featurizer.blocks.18.mlp.fc1.weight": "model-00001-of-00004.safetensors",
456
+ "vision_backbone.featurizer.blocks.18.mlp.fc2.bias": "model-00001-of-00004.safetensors",
457
+ "vision_backbone.featurizer.blocks.18.mlp.fc2.weight": "model-00001-of-00004.safetensors",
458
+ "vision_backbone.featurizer.blocks.18.norm1.bias": "model-00001-of-00004.safetensors",
459
+ "vision_backbone.featurizer.blocks.18.norm1.weight": "model-00001-of-00004.safetensors",
460
+ "vision_backbone.featurizer.blocks.18.norm2.bias": "model-00001-of-00004.safetensors",
461
+ "vision_backbone.featurizer.blocks.18.norm2.weight": "model-00001-of-00004.safetensors",
462
+ "vision_backbone.featurizer.blocks.19.attn.proj.bias": "model-00001-of-00004.safetensors",
463
+ "vision_backbone.featurizer.blocks.19.attn.proj.weight": "model-00001-of-00004.safetensors",
464
+ "vision_backbone.featurizer.blocks.19.attn.qkv.bias": "model-00001-of-00004.safetensors",
465
+ "vision_backbone.featurizer.blocks.19.attn.qkv.weight": "model-00001-of-00004.safetensors",
466
+ "vision_backbone.featurizer.blocks.19.ls1.scale_factor": "model-00001-of-00004.safetensors",
467
+ "vision_backbone.featurizer.blocks.19.ls2.scale_factor": "model-00001-of-00004.safetensors",
468
+ "vision_backbone.featurizer.blocks.19.mlp.fc1.bias": "model-00001-of-00004.safetensors",
469
+ "vision_backbone.featurizer.blocks.19.mlp.fc1.weight": "model-00001-of-00004.safetensors",
470
+ "vision_backbone.featurizer.blocks.19.mlp.fc2.bias": "model-00001-of-00004.safetensors",
471
+ "vision_backbone.featurizer.blocks.19.mlp.fc2.weight": "model-00001-of-00004.safetensors",
472
+ "vision_backbone.featurizer.blocks.19.norm1.bias": "model-00001-of-00004.safetensors",
473
+ "vision_backbone.featurizer.blocks.19.norm1.weight": "model-00001-of-00004.safetensors",
474
+ "vision_backbone.featurizer.blocks.19.norm2.bias": "model-00001-of-00004.safetensors",
475
+ "vision_backbone.featurizer.blocks.19.norm2.weight": "model-00001-of-00004.safetensors",
476
+ "vision_backbone.featurizer.blocks.2.attn.proj.bias": "model-00001-of-00004.safetensors",
477
+ "vision_backbone.featurizer.blocks.2.attn.proj.weight": "model-00001-of-00004.safetensors",
478
+ "vision_backbone.featurizer.blocks.2.attn.qkv.bias": "model-00001-of-00004.safetensors",
479
+ "vision_backbone.featurizer.blocks.2.attn.qkv.weight": "model-00001-of-00004.safetensors",
480
+ "vision_backbone.featurizer.blocks.2.ls1.scale_factor": "model-00001-of-00004.safetensors",
481
+ "vision_backbone.featurizer.blocks.2.ls2.scale_factor": "model-00001-of-00004.safetensors",
482
+ "vision_backbone.featurizer.blocks.2.mlp.fc1.bias": "model-00001-of-00004.safetensors",
483
+ "vision_backbone.featurizer.blocks.2.mlp.fc1.weight": "model-00001-of-00004.safetensors",
484
+ "vision_backbone.featurizer.blocks.2.mlp.fc2.bias": "model-00001-of-00004.safetensors",
485
+ "vision_backbone.featurizer.blocks.2.mlp.fc2.weight": "model-00001-of-00004.safetensors",
486
+ "vision_backbone.featurizer.blocks.2.norm1.bias": "model-00001-of-00004.safetensors",
487
+ "vision_backbone.featurizer.blocks.2.norm1.weight": "model-00001-of-00004.safetensors",
488
+ "vision_backbone.featurizer.blocks.2.norm2.bias": "model-00001-of-00004.safetensors",
489
+ "vision_backbone.featurizer.blocks.2.norm2.weight": "model-00001-of-00004.safetensors",
490
+ "vision_backbone.featurizer.blocks.20.attn.proj.bias": "model-00001-of-00004.safetensors",
491
+ "vision_backbone.featurizer.blocks.20.attn.proj.weight": "model-00001-of-00004.safetensors",
492
+ "vision_backbone.featurizer.blocks.20.attn.qkv.bias": "model-00001-of-00004.safetensors",
493
+ "vision_backbone.featurizer.blocks.20.attn.qkv.weight": "model-00001-of-00004.safetensors",
494
+ "vision_backbone.featurizer.blocks.20.ls1.scale_factor": "model-00001-of-00004.safetensors",
495
+ "vision_backbone.featurizer.blocks.20.ls2.scale_factor": "model-00001-of-00004.safetensors",
496
+ "vision_backbone.featurizer.blocks.20.mlp.fc1.bias": "model-00001-of-00004.safetensors",
497
+ "vision_backbone.featurizer.blocks.20.mlp.fc1.weight": "model-00001-of-00004.safetensors",
498
+ "vision_backbone.featurizer.blocks.20.mlp.fc2.bias": "model-00001-of-00004.safetensors",
499
+ "vision_backbone.featurizer.blocks.20.mlp.fc2.weight": "model-00001-of-00004.safetensors",
500
+ "vision_backbone.featurizer.blocks.20.norm1.bias": "model-00001-of-00004.safetensors",
501
+ "vision_backbone.featurizer.blocks.20.norm1.weight": "model-00001-of-00004.safetensors",
502
+ "vision_backbone.featurizer.blocks.20.norm2.bias": "model-00001-of-00004.safetensors",
503
+ "vision_backbone.featurizer.blocks.20.norm2.weight": "model-00001-of-00004.safetensors",
504
+ "vision_backbone.featurizer.blocks.21.attn.proj.bias": "model-00001-of-00004.safetensors",
505
+ "vision_backbone.featurizer.blocks.21.attn.proj.weight": "model-00001-of-00004.safetensors",
506
+ "vision_backbone.featurizer.blocks.21.attn.qkv.bias": "model-00001-of-00004.safetensors",
507
+ "vision_backbone.featurizer.blocks.21.attn.qkv.weight": "model-00001-of-00004.safetensors",
508
+ "vision_backbone.featurizer.blocks.21.ls1.scale_factor": "model-00001-of-00004.safetensors",
509
+ "vision_backbone.featurizer.blocks.21.ls2.scale_factor": "model-00001-of-00004.safetensors",
510
+ "vision_backbone.featurizer.blocks.21.mlp.fc1.bias": "model-00001-of-00004.safetensors",
511
+ "vision_backbone.featurizer.blocks.21.mlp.fc1.weight": "model-00001-of-00004.safetensors",
512
+ "vision_backbone.featurizer.blocks.21.mlp.fc2.bias": "model-00001-of-00004.safetensors",
513
+ "vision_backbone.featurizer.blocks.21.mlp.fc2.weight": "model-00001-of-00004.safetensors",
514
+ "vision_backbone.featurizer.blocks.21.norm1.bias": "model-00001-of-00004.safetensors",
515
+ "vision_backbone.featurizer.blocks.21.norm1.weight": "model-00001-of-00004.safetensors",
516
+ "vision_backbone.featurizer.blocks.21.norm2.bias": "model-00001-of-00004.safetensors",
517
+ "vision_backbone.featurizer.blocks.21.norm2.weight": "model-00001-of-00004.safetensors",
518
+ "vision_backbone.featurizer.blocks.22.attn.proj.bias": "model-00001-of-00004.safetensors",
519
+ "vision_backbone.featurizer.blocks.22.attn.proj.weight": "model-00001-of-00004.safetensors",
520
+ "vision_backbone.featurizer.blocks.22.attn.qkv.bias": "model-00001-of-00004.safetensors",
521
+ "vision_backbone.featurizer.blocks.22.attn.qkv.weight": "model-00001-of-00004.safetensors",
522
+ "vision_backbone.featurizer.blocks.22.ls1.scale_factor": "model-00001-of-00004.safetensors",
523
+ "vision_backbone.featurizer.blocks.22.ls2.scale_factor": "model-00001-of-00004.safetensors",
524
+ "vision_backbone.featurizer.blocks.22.mlp.fc1.bias": "model-00001-of-00004.safetensors",
525
+ "vision_backbone.featurizer.blocks.22.mlp.fc1.weight": "model-00001-of-00004.safetensors",
526
+ "vision_backbone.featurizer.blocks.22.mlp.fc2.bias": "model-00001-of-00004.safetensors",
527
+ "vision_backbone.featurizer.blocks.22.mlp.fc2.weight": "model-00001-of-00004.safetensors",
528
+ "vision_backbone.featurizer.blocks.22.norm1.bias": "model-00001-of-00004.safetensors",
529
+ "vision_backbone.featurizer.blocks.22.norm1.weight": "model-00001-of-00004.safetensors",
530
+ "vision_backbone.featurizer.blocks.22.norm2.bias": "model-00001-of-00004.safetensors",
531
+ "vision_backbone.featurizer.blocks.22.norm2.weight": "model-00001-of-00004.safetensors",
532
+ "vision_backbone.featurizer.blocks.23.attn.proj.bias": "model-00001-of-00004.safetensors",
533
+ "vision_backbone.featurizer.blocks.23.attn.proj.weight": "model-00001-of-00004.safetensors",
534
+ "vision_backbone.featurizer.blocks.23.attn.qkv.bias": "model-00001-of-00004.safetensors",
535
+ "vision_backbone.featurizer.blocks.23.attn.qkv.weight": "model-00001-of-00004.safetensors",
536
+ "vision_backbone.featurizer.blocks.23.ls1.scale_factor": "model-00001-of-00004.safetensors",
537
+ "vision_backbone.featurizer.blocks.23.ls2.scale_factor": "model-00001-of-00004.safetensors",
538
+ "vision_backbone.featurizer.blocks.23.mlp.fc1.bias": "model-00001-of-00004.safetensors",
539
+ "vision_backbone.featurizer.blocks.23.mlp.fc1.weight": "model-00001-of-00004.safetensors",
540
+ "vision_backbone.featurizer.blocks.23.mlp.fc2.bias": "model-00001-of-00004.safetensors",
541
+ "vision_backbone.featurizer.blocks.23.mlp.fc2.weight": "model-00001-of-00004.safetensors",
542
+ "vision_backbone.featurizer.blocks.23.norm1.bias": "model-00001-of-00004.safetensors",
543
+ "vision_backbone.featurizer.blocks.23.norm1.weight": "model-00001-of-00004.safetensors",
544
+ "vision_backbone.featurizer.blocks.23.norm2.bias": "model-00001-of-00004.safetensors",
545
+ "vision_backbone.featurizer.blocks.23.norm2.weight": "model-00001-of-00004.safetensors",
546
+ "vision_backbone.featurizer.blocks.3.attn.proj.bias": "model-00001-of-00004.safetensors",
547
+ "vision_backbone.featurizer.blocks.3.attn.proj.weight": "model-00001-of-00004.safetensors",
548
+ "vision_backbone.featurizer.blocks.3.attn.qkv.bias": "model-00001-of-00004.safetensors",
549
+ "vision_backbone.featurizer.blocks.3.attn.qkv.weight": "model-00001-of-00004.safetensors",
550
+ "vision_backbone.featurizer.blocks.3.ls1.scale_factor": "model-00001-of-00004.safetensors",
551
+ "vision_backbone.featurizer.blocks.3.ls2.scale_factor": "model-00001-of-00004.safetensors",
552
+ "vision_backbone.featurizer.blocks.3.mlp.fc1.bias": "model-00001-of-00004.safetensors",
553
+ "vision_backbone.featurizer.blocks.3.mlp.fc1.weight": "model-00001-of-00004.safetensors",
554
+ "vision_backbone.featurizer.blocks.3.mlp.fc2.bias": "model-00001-of-00004.safetensors",
555
+ "vision_backbone.featurizer.blocks.3.mlp.fc2.weight": "model-00001-of-00004.safetensors",
556
+ "vision_backbone.featurizer.blocks.3.norm1.bias": "model-00001-of-00004.safetensors",
557
+ "vision_backbone.featurizer.blocks.3.norm1.weight": "model-00001-of-00004.safetensors",
558
+ "vision_backbone.featurizer.blocks.3.norm2.bias": "model-00001-of-00004.safetensors",
559
+ "vision_backbone.featurizer.blocks.3.norm2.weight": "model-00001-of-00004.safetensors",
560
+ "vision_backbone.featurizer.blocks.4.attn.proj.bias": "model-00001-of-00004.safetensors",
561
+ "vision_backbone.featurizer.blocks.4.attn.proj.weight": "model-00001-of-00004.safetensors",
562
+ "vision_backbone.featurizer.blocks.4.attn.qkv.bias": "model-00001-of-00004.safetensors",
563
+ "vision_backbone.featurizer.blocks.4.attn.qkv.weight": "model-00001-of-00004.safetensors",
564
+ "vision_backbone.featurizer.blocks.4.ls1.scale_factor": "model-00001-of-00004.safetensors",
565
+ "vision_backbone.featurizer.blocks.4.ls2.scale_factor": "model-00001-of-00004.safetensors",
566
+ "vision_backbone.featurizer.blocks.4.mlp.fc1.bias": "model-00001-of-00004.safetensors",
567
+ "vision_backbone.featurizer.blocks.4.mlp.fc1.weight": "model-00001-of-00004.safetensors",
568
+ "vision_backbone.featurizer.blocks.4.mlp.fc2.bias": "model-00001-of-00004.safetensors",
569
+ "vision_backbone.featurizer.blocks.4.mlp.fc2.weight": "model-00001-of-00004.safetensors",
570
+ "vision_backbone.featurizer.blocks.4.norm1.bias": "model-00001-of-00004.safetensors",
571
+ "vision_backbone.featurizer.blocks.4.norm1.weight": "model-00001-of-00004.safetensors",
572
+ "vision_backbone.featurizer.blocks.4.norm2.bias": "model-00001-of-00004.safetensors",
573
+ "vision_backbone.featurizer.blocks.4.norm2.weight": "model-00001-of-00004.safetensors",
574
+ "vision_backbone.featurizer.blocks.5.attn.proj.bias": "model-00001-of-00004.safetensors",
575
+ "vision_backbone.featurizer.blocks.5.attn.proj.weight": "model-00001-of-00004.safetensors",
576
+ "vision_backbone.featurizer.blocks.5.attn.qkv.bias": "model-00001-of-00004.safetensors",
577
+ "vision_backbone.featurizer.blocks.5.attn.qkv.weight": "model-00001-of-00004.safetensors",
578
+ "vision_backbone.featurizer.blocks.5.ls1.scale_factor": "model-00001-of-00004.safetensors",
579
+ "vision_backbone.featurizer.blocks.5.ls2.scale_factor": "model-00001-of-00004.safetensors",
580
+ "vision_backbone.featurizer.blocks.5.mlp.fc1.bias": "model-00001-of-00004.safetensors",
581
+ "vision_backbone.featurizer.blocks.5.mlp.fc1.weight": "model-00001-of-00004.safetensors",
582
+ "vision_backbone.featurizer.blocks.5.mlp.fc2.bias": "model-00001-of-00004.safetensors",
583
+ "vision_backbone.featurizer.blocks.5.mlp.fc2.weight": "model-00001-of-00004.safetensors",
584
+ "vision_backbone.featurizer.blocks.5.norm1.bias": "model-00001-of-00004.safetensors",
585
+ "vision_backbone.featurizer.blocks.5.norm1.weight": "model-00001-of-00004.safetensors",
586
+ "vision_backbone.featurizer.blocks.5.norm2.bias": "model-00001-of-00004.safetensors",
587
+ "vision_backbone.featurizer.blocks.5.norm2.weight": "model-00001-of-00004.safetensors",
588
+ "vision_backbone.featurizer.blocks.6.attn.proj.bias": "model-00001-of-00004.safetensors",
589
+ "vision_backbone.featurizer.blocks.6.attn.proj.weight": "model-00001-of-00004.safetensors",
590
+ "vision_backbone.featurizer.blocks.6.attn.qkv.bias": "model-00001-of-00004.safetensors",
591
+ "vision_backbone.featurizer.blocks.6.attn.qkv.weight": "model-00001-of-00004.safetensors",
592
+ "vision_backbone.featurizer.blocks.6.ls1.scale_factor": "model-00001-of-00004.safetensors",
593
+ "vision_backbone.featurizer.blocks.6.ls2.scale_factor": "model-00001-of-00004.safetensors",
594
+ "vision_backbone.featurizer.blocks.6.mlp.fc1.bias": "model-00001-of-00004.safetensors",
595
+ "vision_backbone.featurizer.blocks.6.mlp.fc1.weight": "model-00001-of-00004.safetensors",
596
+ "vision_backbone.featurizer.blocks.6.mlp.fc2.bias": "model-00001-of-00004.safetensors",
597
+ "vision_backbone.featurizer.blocks.6.mlp.fc2.weight": "model-00001-of-00004.safetensors",
598
+ "vision_backbone.featurizer.blocks.6.norm1.bias": "model-00001-of-00004.safetensors",
599
+ "vision_backbone.featurizer.blocks.6.norm1.weight": "model-00001-of-00004.safetensors",
600
+ "vision_backbone.featurizer.blocks.6.norm2.bias": "model-00001-of-00004.safetensors",
601
+ "vision_backbone.featurizer.blocks.6.norm2.weight": "model-00001-of-00004.safetensors",
602
+ "vision_backbone.featurizer.blocks.7.attn.proj.bias": "model-00001-of-00004.safetensors",
603
+ "vision_backbone.featurizer.blocks.7.attn.proj.weight": "model-00001-of-00004.safetensors",
604
+ "vision_backbone.featurizer.blocks.7.attn.qkv.bias": "model-00001-of-00004.safetensors",
605
+ "vision_backbone.featurizer.blocks.7.attn.qkv.weight": "model-00001-of-00004.safetensors",
606
+ "vision_backbone.featurizer.blocks.7.ls1.scale_factor": "model-00001-of-00004.safetensors",
607
+ "vision_backbone.featurizer.blocks.7.ls2.scale_factor": "model-00001-of-00004.safetensors",
608
+ "vision_backbone.featurizer.blocks.7.mlp.fc1.bias": "model-00001-of-00004.safetensors",
609
+ "vision_backbone.featurizer.blocks.7.mlp.fc1.weight": "model-00001-of-00004.safetensors",
610
+ "vision_backbone.featurizer.blocks.7.mlp.fc2.bias": "model-00001-of-00004.safetensors",
611
+ "vision_backbone.featurizer.blocks.7.mlp.fc2.weight": "model-00001-of-00004.safetensors",
612
+ "vision_backbone.featurizer.blocks.7.norm1.bias": "model-00001-of-00004.safetensors",
613
+ "vision_backbone.featurizer.blocks.7.norm1.weight": "model-00001-of-00004.safetensors",
614
+ "vision_backbone.featurizer.blocks.7.norm2.bias": "model-00001-of-00004.safetensors",
615
+ "vision_backbone.featurizer.blocks.7.norm2.weight": "model-00001-of-00004.safetensors",
616
+ "vision_backbone.featurizer.blocks.8.attn.proj.bias": "model-00001-of-00004.safetensors",
617
+ "vision_backbone.featurizer.blocks.8.attn.proj.weight": "model-00001-of-00004.safetensors",
618
+ "vision_backbone.featurizer.blocks.8.attn.qkv.bias": "model-00001-of-00004.safetensors",
619
+ "vision_backbone.featurizer.blocks.8.attn.qkv.weight": "model-00001-of-00004.safetensors",
620
+ "vision_backbone.featurizer.blocks.8.ls1.scale_factor": "model-00001-of-00004.safetensors",
621
+ "vision_backbone.featurizer.blocks.8.ls2.scale_factor": "model-00001-of-00004.safetensors",
622
+ "vision_backbone.featurizer.blocks.8.mlp.fc1.bias": "model-00001-of-00004.safetensors",
623
+ "vision_backbone.featurizer.blocks.8.mlp.fc1.weight": "model-00001-of-00004.safetensors",
624
+ "vision_backbone.featurizer.blocks.8.mlp.fc2.bias": "model-00001-of-00004.safetensors",
625
+ "vision_backbone.featurizer.blocks.8.mlp.fc2.weight": "model-00001-of-00004.safetensors",
626
+ "vision_backbone.featurizer.blocks.8.norm1.bias": "model-00001-of-00004.safetensors",
627
+ "vision_backbone.featurizer.blocks.8.norm1.weight": "model-00001-of-00004.safetensors",
628
+ "vision_backbone.featurizer.blocks.8.norm2.bias": "model-00001-of-00004.safetensors",
629
+ "vision_backbone.featurizer.blocks.8.norm2.weight": "model-00001-of-00004.safetensors",
630
+ "vision_backbone.featurizer.blocks.9.attn.proj.bias": "model-00001-of-00004.safetensors",
631
+ "vision_backbone.featurizer.blocks.9.attn.proj.weight": "model-00001-of-00004.safetensors",
632
+ "vision_backbone.featurizer.blocks.9.attn.qkv.bias": "model-00001-of-00004.safetensors",
633
+ "vision_backbone.featurizer.blocks.9.attn.qkv.weight": "model-00001-of-00004.safetensors",
634
+ "vision_backbone.featurizer.blocks.9.ls1.scale_factor": "model-00001-of-00004.safetensors",
635
+ "vision_backbone.featurizer.blocks.9.ls2.scale_factor": "model-00001-of-00004.safetensors",
636
+ "vision_backbone.featurizer.blocks.9.mlp.fc1.bias": "model-00001-of-00004.safetensors",
637
+ "vision_backbone.featurizer.blocks.9.mlp.fc1.weight": "model-00001-of-00004.safetensors",
638
+ "vision_backbone.featurizer.blocks.9.mlp.fc2.bias": "model-00001-of-00004.safetensors",
639
+ "vision_backbone.featurizer.blocks.9.mlp.fc2.weight": "model-00001-of-00004.safetensors",
640
+ "vision_backbone.featurizer.blocks.9.norm1.bias": "model-00001-of-00004.safetensors",
641
+ "vision_backbone.featurizer.blocks.9.norm1.weight": "model-00001-of-00004.safetensors",
642
+ "vision_backbone.featurizer.blocks.9.norm2.bias": "model-00001-of-00004.safetensors",
643
+ "vision_backbone.featurizer.blocks.9.norm2.weight": "model-00001-of-00004.safetensors",
644
+ "vision_backbone.featurizer.cls_token": "model-00001-of-00004.safetensors",
645
+ "vision_backbone.featurizer.norm.bias": "model-00001-of-00004.safetensors",
646
+ "vision_backbone.featurizer.norm.weight": "model-00001-of-00004.safetensors",
647
+ "vision_backbone.featurizer.patch_embed.proj.bias": "model-00001-of-00004.safetensors",
648
+ "vision_backbone.featurizer.patch_embed.proj.weight": "model-00001-of-00004.safetensors",
649
+ "vision_backbone.featurizer.pos_embed": "model-00001-of-00004.safetensors",
650
+ "vision_backbone.featurizer.reg_token": "model-00001-of-00004.safetensors",
651
+ "vision_backbone.fused_featurizer.attn_pool.kv.bias": "model-00001-of-00004.safetensors",
652
+ "vision_backbone.fused_featurizer.attn_pool.kv.weight": "model-00001-of-00004.safetensors",
653
+ "vision_backbone.fused_featurizer.attn_pool.latent": "model-00001-of-00004.safetensors",
654
+ "vision_backbone.fused_featurizer.attn_pool.mlp.fc1.bias": "model-00001-of-00004.safetensors",
655
+ "vision_backbone.fused_featurizer.attn_pool.mlp.fc1.weight": "model-00001-of-00004.safetensors",
656
+ "vision_backbone.fused_featurizer.attn_pool.mlp.fc2.bias": "model-00001-of-00004.safetensors",
657
+ "vision_backbone.fused_featurizer.attn_pool.mlp.fc2.weight": "model-00001-of-00004.safetensors",
658
+ "vision_backbone.fused_featurizer.attn_pool.norm.bias": "model-00001-of-00004.safetensors",
659
+ "vision_backbone.fused_featurizer.attn_pool.norm.weight": "model-00001-of-00004.safetensors",
660
+ "vision_backbone.fused_featurizer.attn_pool.proj.bias": "model-00001-of-00004.safetensors",
661
+ "vision_backbone.fused_featurizer.attn_pool.proj.weight": "model-00001-of-00004.safetensors",
662
+ "vision_backbone.fused_featurizer.attn_pool.q.bias": "model-00001-of-00004.safetensors",
663
+ "vision_backbone.fused_featurizer.attn_pool.q.weight": "model-00001-of-00004.safetensors",
664
+ "vision_backbone.fused_featurizer.blocks.0.attn.proj.bias": "model-00001-of-00004.safetensors",
665
+ "vision_backbone.fused_featurizer.blocks.0.attn.proj.weight": "model-00001-of-00004.safetensors",
666
+ "vision_backbone.fused_featurizer.blocks.0.attn.qkv.bias": "model-00001-of-00004.safetensors",
667
+ "vision_backbone.fused_featurizer.blocks.0.attn.qkv.weight": "model-00001-of-00004.safetensors",
668
+ "vision_backbone.fused_featurizer.blocks.0.mlp.fc1.bias": "model-00001-of-00004.safetensors",
669
+ "vision_backbone.fused_featurizer.blocks.0.mlp.fc1.weight": "model-00001-of-00004.safetensors",
670
+ "vision_backbone.fused_featurizer.blocks.0.mlp.fc2.bias": "model-00001-of-00004.safetensors",
671
+ "vision_backbone.fused_featurizer.blocks.0.mlp.fc2.weight": "model-00001-of-00004.safetensors",
672
+ "vision_backbone.fused_featurizer.blocks.0.norm1.bias": "model-00001-of-00004.safetensors",
673
+ "vision_backbone.fused_featurizer.blocks.0.norm1.weight": "model-00001-of-00004.safetensors",
674
+ "vision_backbone.fused_featurizer.blocks.0.norm2.bias": "model-00001-of-00004.safetensors",
675
+ "vision_backbone.fused_featurizer.blocks.0.norm2.weight": "model-00001-of-00004.safetensors",
676
+ "vision_backbone.fused_featurizer.blocks.1.attn.proj.bias": "model-00001-of-00004.safetensors",
677
+ "vision_backbone.fused_featurizer.blocks.1.attn.proj.weight": "model-00001-of-00004.safetensors",
678
+ "vision_backbone.fused_featurizer.blocks.1.attn.qkv.bias": "model-00001-of-00004.safetensors",
679
+ "vision_backbone.fused_featurizer.blocks.1.attn.qkv.weight": "model-00001-of-00004.safetensors",
680
+ "vision_backbone.fused_featurizer.blocks.1.mlp.fc1.bias": "model-00001-of-00004.safetensors",
681
+ "vision_backbone.fused_featurizer.blocks.1.mlp.fc1.weight": "model-00001-of-00004.safetensors",
682
+ "vision_backbone.fused_featurizer.blocks.1.mlp.fc2.bias": "model-00001-of-00004.safetensors",
683
+ "vision_backbone.fused_featurizer.blocks.1.mlp.fc2.weight": "model-00001-of-00004.safetensors",
684
+ "vision_backbone.fused_featurizer.blocks.1.norm1.bias": "model-00001-of-00004.safetensors",
685
+ "vision_backbone.fused_featurizer.blocks.1.norm1.weight": "model-00001-of-00004.safetensors",
686
+ "vision_backbone.fused_featurizer.blocks.1.norm2.bias": "model-00001-of-00004.safetensors",
687
+ "vision_backbone.fused_featurizer.blocks.1.norm2.weight": "model-00001-of-00004.safetensors",
688
+ "vision_backbone.fused_featurizer.blocks.10.attn.proj.bias": "model-00001-of-00004.safetensors",
689
+ "vision_backbone.fused_featurizer.blocks.10.attn.proj.weight": "model-00001-of-00004.safetensors",
690
+ "vision_backbone.fused_featurizer.blocks.10.attn.qkv.bias": "model-00001-of-00004.safetensors",
691
+ "vision_backbone.fused_featurizer.blocks.10.attn.qkv.weight": "model-00001-of-00004.safetensors",
692
+ "vision_backbone.fused_featurizer.blocks.10.mlp.fc1.bias": "model-00001-of-00004.safetensors",
693
+ "vision_backbone.fused_featurizer.blocks.10.mlp.fc1.weight": "model-00001-of-00004.safetensors",
694
+ "vision_backbone.fused_featurizer.blocks.10.mlp.fc2.bias": "model-00001-of-00004.safetensors",
695
+ "vision_backbone.fused_featurizer.blocks.10.mlp.fc2.weight": "model-00001-of-00004.safetensors",
696
+ "vision_backbone.fused_featurizer.blocks.10.norm1.bias": "model-00001-of-00004.safetensors",
697
+ "vision_backbone.fused_featurizer.blocks.10.norm1.weight": "model-00001-of-00004.safetensors",
698
+ "vision_backbone.fused_featurizer.blocks.10.norm2.bias": "model-00001-of-00004.safetensors",
699
+ "vision_backbone.fused_featurizer.blocks.10.norm2.weight": "model-00001-of-00004.safetensors",
700
+ "vision_backbone.fused_featurizer.blocks.11.attn.proj.bias": "model-00001-of-00004.safetensors",
701
+ "vision_backbone.fused_featurizer.blocks.11.attn.proj.weight": "model-00001-of-00004.safetensors",
702
+ "vision_backbone.fused_featurizer.blocks.11.attn.qkv.bias": "model-00001-of-00004.safetensors",
703
+ "vision_backbone.fused_featurizer.blocks.11.attn.qkv.weight": "model-00001-of-00004.safetensors",
704
+ "vision_backbone.fused_featurizer.blocks.11.mlp.fc1.bias": "model-00001-of-00004.safetensors",
705
+ "vision_backbone.fused_featurizer.blocks.11.mlp.fc1.weight": "model-00001-of-00004.safetensors",
706
+ "vision_backbone.fused_featurizer.blocks.11.mlp.fc2.bias": "model-00001-of-00004.safetensors",
707
+ "vision_backbone.fused_featurizer.blocks.11.mlp.fc2.weight": "model-00001-of-00004.safetensors",
708
+ "vision_backbone.fused_featurizer.blocks.11.norm1.bias": "model-00001-of-00004.safetensors",
709
+ "vision_backbone.fused_featurizer.blocks.11.norm1.weight": "model-00001-of-00004.safetensors",
710
+ "vision_backbone.fused_featurizer.blocks.11.norm2.bias": "model-00001-of-00004.safetensors",
711
+ "vision_backbone.fused_featurizer.blocks.11.norm2.weight": "model-00001-of-00004.safetensors",
712
+ "vision_backbone.fused_featurizer.blocks.12.attn.proj.bias": "model-00001-of-00004.safetensors",
713
+ "vision_backbone.fused_featurizer.blocks.12.attn.proj.weight": "model-00001-of-00004.safetensors",
714
+ "vision_backbone.fused_featurizer.blocks.12.attn.qkv.bias": "model-00001-of-00004.safetensors",
715
+ "vision_backbone.fused_featurizer.blocks.12.attn.qkv.weight": "model-00001-of-00004.safetensors",
716
+ "vision_backbone.fused_featurizer.blocks.12.mlp.fc1.bias": "model-00001-of-00004.safetensors",
717
+ "vision_backbone.fused_featurizer.blocks.12.mlp.fc1.weight": "model-00001-of-00004.safetensors",
718
+ "vision_backbone.fused_featurizer.blocks.12.mlp.fc2.bias": "model-00001-of-00004.safetensors",
719
+ "vision_backbone.fused_featurizer.blocks.12.mlp.fc2.weight": "model-00001-of-00004.safetensors",
720
+ "vision_backbone.fused_featurizer.blocks.12.norm1.bias": "model-00001-of-00004.safetensors",
721
+ "vision_backbone.fused_featurizer.blocks.12.norm1.weight": "model-00001-of-00004.safetensors",
722
+ "vision_backbone.fused_featurizer.blocks.12.norm2.bias": "model-00001-of-00004.safetensors",
723
+ "vision_backbone.fused_featurizer.blocks.12.norm2.weight": "model-00001-of-00004.safetensors",
724
+ "vision_backbone.fused_featurizer.blocks.13.attn.proj.bias": "model-00001-of-00004.safetensors",
725
+ "vision_backbone.fused_featurizer.blocks.13.attn.proj.weight": "model-00001-of-00004.safetensors",
726
+ "vision_backbone.fused_featurizer.blocks.13.attn.qkv.bias": "model-00001-of-00004.safetensors",
727
+ "vision_backbone.fused_featurizer.blocks.13.attn.qkv.weight": "model-00001-of-00004.safetensors",
728
+ "vision_backbone.fused_featurizer.blocks.13.mlp.fc1.bias": "model-00001-of-00004.safetensors",
729
+ "vision_backbone.fused_featurizer.blocks.13.mlp.fc1.weight": "model-00001-of-00004.safetensors",
730
+ "vision_backbone.fused_featurizer.blocks.13.mlp.fc2.bias": "model-00001-of-00004.safetensors",
731
+ "vision_backbone.fused_featurizer.blocks.13.mlp.fc2.weight": "model-00001-of-00004.safetensors",
732
+ "vision_backbone.fused_featurizer.blocks.13.norm1.bias": "model-00001-of-00004.safetensors",
733
+ "vision_backbone.fused_featurizer.blocks.13.norm1.weight": "model-00001-of-00004.safetensors",
734
+ "vision_backbone.fused_featurizer.blocks.13.norm2.bias": "model-00001-of-00004.safetensors",
735
+ "vision_backbone.fused_featurizer.blocks.13.norm2.weight": "model-00001-of-00004.safetensors",
736
+ "vision_backbone.fused_featurizer.blocks.14.attn.proj.bias": "model-00001-of-00004.safetensors",
737
+ "vision_backbone.fused_featurizer.blocks.14.attn.proj.weight": "model-00001-of-00004.safetensors",
738
+ "vision_backbone.fused_featurizer.blocks.14.attn.qkv.bias": "model-00001-of-00004.safetensors",
739
+ "vision_backbone.fused_featurizer.blocks.14.attn.qkv.weight": "model-00001-of-00004.safetensors",
740
+ "vision_backbone.fused_featurizer.blocks.14.mlp.fc1.bias": "model-00001-of-00004.safetensors",
741
+ "vision_backbone.fused_featurizer.blocks.14.mlp.fc1.weight": "model-00001-of-00004.safetensors",
742
+ "vision_backbone.fused_featurizer.blocks.14.mlp.fc2.bias": "model-00001-of-00004.safetensors",
743
+ "vision_backbone.fused_featurizer.blocks.14.mlp.fc2.weight": "model-00001-of-00004.safetensors",
744
+ "vision_backbone.fused_featurizer.blocks.14.norm1.bias": "model-00001-of-00004.safetensors",
745
+ "vision_backbone.fused_featurizer.blocks.14.norm1.weight": "model-00001-of-00004.safetensors",
746
+ "vision_backbone.fused_featurizer.blocks.14.norm2.bias": "model-00001-of-00004.safetensors",
747
+ "vision_backbone.fused_featurizer.blocks.14.norm2.weight": "model-00001-of-00004.safetensors",
748
+ "vision_backbone.fused_featurizer.blocks.15.attn.proj.bias": "model-00001-of-00004.safetensors",
749
+ "vision_backbone.fused_featurizer.blocks.15.attn.proj.weight": "model-00001-of-00004.safetensors",
750
+ "vision_backbone.fused_featurizer.blocks.15.attn.qkv.bias": "model-00001-of-00004.safetensors",
751
+ "vision_backbone.fused_featurizer.blocks.15.attn.qkv.weight": "model-00001-of-00004.safetensors",
752
+ "vision_backbone.fused_featurizer.blocks.15.mlp.fc1.bias": "model-00001-of-00004.safetensors",
753
+ "vision_backbone.fused_featurizer.blocks.15.mlp.fc1.weight": "model-00001-of-00004.safetensors",
754
+ "vision_backbone.fused_featurizer.blocks.15.mlp.fc2.bias": "model-00001-of-00004.safetensors",
755
+ "vision_backbone.fused_featurizer.blocks.15.mlp.fc2.weight": "model-00001-of-00004.safetensors",
756
+ "vision_backbone.fused_featurizer.blocks.15.norm1.bias": "model-00001-of-00004.safetensors",
757
+ "vision_backbone.fused_featurizer.blocks.15.norm1.weight": "model-00001-of-00004.safetensors",
758
+ "vision_backbone.fused_featurizer.blocks.15.norm2.bias": "model-00001-of-00004.safetensors",
759
+ "vision_backbone.fused_featurizer.blocks.15.norm2.weight": "model-00001-of-00004.safetensors",
760
+ "vision_backbone.fused_featurizer.blocks.16.attn.proj.bias": "model-00001-of-00004.safetensors",
761
+ "vision_backbone.fused_featurizer.blocks.16.attn.proj.weight": "model-00001-of-00004.safetensors",
762
+ "vision_backbone.fused_featurizer.blocks.16.attn.qkv.bias": "model-00001-of-00004.safetensors",
763
+ "vision_backbone.fused_featurizer.blocks.16.attn.qkv.weight": "model-00001-of-00004.safetensors",
764
+ "vision_backbone.fused_featurizer.blocks.16.mlp.fc1.bias": "model-00001-of-00004.safetensors",
765
+ "vision_backbone.fused_featurizer.blocks.16.mlp.fc1.weight": "model-00001-of-00004.safetensors",
766
+ "vision_backbone.fused_featurizer.blocks.16.mlp.fc2.bias": "model-00001-of-00004.safetensors",
767
+ "vision_backbone.fused_featurizer.blocks.16.mlp.fc2.weight": "model-00001-of-00004.safetensors",
768
+ "vision_backbone.fused_featurizer.blocks.16.norm1.bias": "model-00001-of-00004.safetensors",
769
+ "vision_backbone.fused_featurizer.blocks.16.norm1.weight": "model-00001-of-00004.safetensors",
770
+ "vision_backbone.fused_featurizer.blocks.16.norm2.bias": "model-00001-of-00004.safetensors",
771
+ "vision_backbone.fused_featurizer.blocks.16.norm2.weight": "model-00001-of-00004.safetensors",
772
+ "vision_backbone.fused_featurizer.blocks.17.attn.proj.bias": "model-00001-of-00004.safetensors",
773
+ "vision_backbone.fused_featurizer.blocks.17.attn.proj.weight": "model-00001-of-00004.safetensors",
774
+ "vision_backbone.fused_featurizer.blocks.17.attn.qkv.bias": "model-00001-of-00004.safetensors",
775
+ "vision_backbone.fused_featurizer.blocks.17.attn.qkv.weight": "model-00001-of-00004.safetensors",
776
+ "vision_backbone.fused_featurizer.blocks.17.mlp.fc1.bias": "model-00001-of-00004.safetensors",
777
+ "vision_backbone.fused_featurizer.blocks.17.mlp.fc1.weight": "model-00001-of-00004.safetensors",
778
+ "vision_backbone.fused_featurizer.blocks.17.mlp.fc2.bias": "model-00001-of-00004.safetensors",
779
+ "vision_backbone.fused_featurizer.blocks.17.mlp.fc2.weight": "model-00001-of-00004.safetensors",
780
+ "vision_backbone.fused_featurizer.blocks.17.norm1.bias": "model-00001-of-00004.safetensors",
781
+ "vision_backbone.fused_featurizer.blocks.17.norm1.weight": "model-00001-of-00004.safetensors",
782
+ "vision_backbone.fused_featurizer.blocks.17.norm2.bias": "model-00001-of-00004.safetensors",
783
+ "vision_backbone.fused_featurizer.blocks.17.norm2.weight": "model-00001-of-00004.safetensors",
784
+ "vision_backbone.fused_featurizer.blocks.18.attn.proj.bias": "model-00001-of-00004.safetensors",
785
+ "vision_backbone.fused_featurizer.blocks.18.attn.proj.weight": "model-00001-of-00004.safetensors",
786
+ "vision_backbone.fused_featurizer.blocks.18.attn.qkv.bias": "model-00001-of-00004.safetensors",
787
+ "vision_backbone.fused_featurizer.blocks.18.attn.qkv.weight": "model-00001-of-00004.safetensors",
788
+ "vision_backbone.fused_featurizer.blocks.18.mlp.fc1.bias": "model-00001-of-00004.safetensors",
789
+ "vision_backbone.fused_featurizer.blocks.18.mlp.fc1.weight": "model-00001-of-00004.safetensors",
790
+ "vision_backbone.fused_featurizer.blocks.18.mlp.fc2.bias": "model-00001-of-00004.safetensors",
791
+ "vision_backbone.fused_featurizer.blocks.18.mlp.fc2.weight": "model-00001-of-00004.safetensors",
792
+ "vision_backbone.fused_featurizer.blocks.18.norm1.bias": "model-00001-of-00004.safetensors",
793
+ "vision_backbone.fused_featurizer.blocks.18.norm1.weight": "model-00001-of-00004.safetensors",
794
+ "vision_backbone.fused_featurizer.blocks.18.norm2.bias": "model-00001-of-00004.safetensors",
795
+ "vision_backbone.fused_featurizer.blocks.18.norm2.weight": "model-00001-of-00004.safetensors",
796
+ "vision_backbone.fused_featurizer.blocks.19.attn.proj.bias": "model-00001-of-00004.safetensors",
797
+ "vision_backbone.fused_featurizer.blocks.19.attn.proj.weight": "model-00001-of-00004.safetensors",
798
+ "vision_backbone.fused_featurizer.blocks.19.attn.qkv.bias": "model-00001-of-00004.safetensors",
799
+ "vision_backbone.fused_featurizer.blocks.19.attn.qkv.weight": "model-00001-of-00004.safetensors",
800
+ "vision_backbone.fused_featurizer.blocks.19.mlp.fc1.bias": "model-00001-of-00004.safetensors",
801
+ "vision_backbone.fused_featurizer.blocks.19.mlp.fc1.weight": "model-00001-of-00004.safetensors",
802
+ "vision_backbone.fused_featurizer.blocks.19.mlp.fc2.bias": "model-00001-of-00004.safetensors",
803
+ "vision_backbone.fused_featurizer.blocks.19.mlp.fc2.weight": "model-00001-of-00004.safetensors",
804
+ "vision_backbone.fused_featurizer.blocks.19.norm1.bias": "model-00001-of-00004.safetensors",
805
+ "vision_backbone.fused_featurizer.blocks.19.norm1.weight": "model-00001-of-00004.safetensors",
806
+ "vision_backbone.fused_featurizer.blocks.19.norm2.bias": "model-00001-of-00004.safetensors",
807
+ "vision_backbone.fused_featurizer.blocks.19.norm2.weight": "model-00001-of-00004.safetensors",
808
+ "vision_backbone.fused_featurizer.blocks.2.attn.proj.bias": "model-00001-of-00004.safetensors",
809
+ "vision_backbone.fused_featurizer.blocks.2.attn.proj.weight": "model-00001-of-00004.safetensors",
810
+ "vision_backbone.fused_featurizer.blocks.2.attn.qkv.bias": "model-00001-of-00004.safetensors",
811
+ "vision_backbone.fused_featurizer.blocks.2.attn.qkv.weight": "model-00001-of-00004.safetensors",
812
+ "vision_backbone.fused_featurizer.blocks.2.mlp.fc1.bias": "model-00001-of-00004.safetensors",
813
+ "vision_backbone.fused_featurizer.blocks.2.mlp.fc1.weight": "model-00001-of-00004.safetensors",
814
+ "vision_backbone.fused_featurizer.blocks.2.mlp.fc2.bias": "model-00001-of-00004.safetensors",
815
+ "vision_backbone.fused_featurizer.blocks.2.mlp.fc2.weight": "model-00001-of-00004.safetensors",
816
+ "vision_backbone.fused_featurizer.blocks.2.norm1.bias": "model-00001-of-00004.safetensors",
817
+ "vision_backbone.fused_featurizer.blocks.2.norm1.weight": "model-00001-of-00004.safetensors",
818
+ "vision_backbone.fused_featurizer.blocks.2.norm2.bias": "model-00001-of-00004.safetensors",
819
+ "vision_backbone.fused_featurizer.blocks.2.norm2.weight": "model-00001-of-00004.safetensors",
820
+ "vision_backbone.fused_featurizer.blocks.20.attn.proj.bias": "model-00001-of-00004.safetensors",
821
+ "vision_backbone.fused_featurizer.blocks.20.attn.proj.weight": "model-00001-of-00004.safetensors",
822
+ "vision_backbone.fused_featurizer.blocks.20.attn.qkv.bias": "model-00001-of-00004.safetensors",
823
+ "vision_backbone.fused_featurizer.blocks.20.attn.qkv.weight": "model-00001-of-00004.safetensors",
824
+ "vision_backbone.fused_featurizer.blocks.20.mlp.fc1.bias": "model-00001-of-00004.safetensors",
825
+ "vision_backbone.fused_featurizer.blocks.20.mlp.fc1.weight": "model-00001-of-00004.safetensors",
826
+ "vision_backbone.fused_featurizer.blocks.20.mlp.fc2.bias": "model-00001-of-00004.safetensors",
827
+ "vision_backbone.fused_featurizer.blocks.20.mlp.fc2.weight": "model-00001-of-00004.safetensors",
828
+ "vision_backbone.fused_featurizer.blocks.20.norm1.bias": "model-00001-of-00004.safetensors",
829
+ "vision_backbone.fused_featurizer.blocks.20.norm1.weight": "model-00001-of-00004.safetensors",
830
+ "vision_backbone.fused_featurizer.blocks.20.norm2.bias": "model-00001-of-00004.safetensors",
831
+ "vision_backbone.fused_featurizer.blocks.20.norm2.weight": "model-00001-of-00004.safetensors",
832
+ "vision_backbone.fused_featurizer.blocks.21.attn.proj.bias": "model-00001-of-00004.safetensors",
833
+ "vision_backbone.fused_featurizer.blocks.21.attn.proj.weight": "model-00001-of-00004.safetensors",
834
+ "vision_backbone.fused_featurizer.blocks.21.attn.qkv.bias": "model-00001-of-00004.safetensors",
835
+ "vision_backbone.fused_featurizer.blocks.21.attn.qkv.weight": "model-00001-of-00004.safetensors",
836
+ "vision_backbone.fused_featurizer.blocks.21.mlp.fc1.bias": "model-00001-of-00004.safetensors",
837
+ "vision_backbone.fused_featurizer.blocks.21.mlp.fc1.weight": "model-00001-of-00004.safetensors",
838
+ "vision_backbone.fused_featurizer.blocks.21.mlp.fc2.bias": "model-00001-of-00004.safetensors",
839
+ "vision_backbone.fused_featurizer.blocks.21.mlp.fc2.weight": "model-00001-of-00004.safetensors",
840
+ "vision_backbone.fused_featurizer.blocks.21.norm1.bias": "model-00001-of-00004.safetensors",
841
+ "vision_backbone.fused_featurizer.blocks.21.norm1.weight": "model-00001-of-00004.safetensors",
842
+ "vision_backbone.fused_featurizer.blocks.21.norm2.bias": "model-00001-of-00004.safetensors",
843
+ "vision_backbone.fused_featurizer.blocks.21.norm2.weight": "model-00001-of-00004.safetensors",
844
+ "vision_backbone.fused_featurizer.blocks.22.attn.proj.bias": "model-00001-of-00004.safetensors",
845
+ "vision_backbone.fused_featurizer.blocks.22.attn.proj.weight": "model-00001-of-00004.safetensors",
846
+ "vision_backbone.fused_featurizer.blocks.22.attn.qkv.bias": "model-00001-of-00004.safetensors",
847
+ "vision_backbone.fused_featurizer.blocks.22.attn.qkv.weight": "model-00001-of-00004.safetensors",
848
+ "vision_backbone.fused_featurizer.blocks.22.mlp.fc1.bias": "model-00001-of-00004.safetensors",
849
+ "vision_backbone.fused_featurizer.blocks.22.mlp.fc1.weight": "model-00001-of-00004.safetensors",
850
+ "vision_backbone.fused_featurizer.blocks.22.mlp.fc2.bias": "model-00001-of-00004.safetensors",
851
+ "vision_backbone.fused_featurizer.blocks.22.mlp.fc2.weight": "model-00001-of-00004.safetensors",
852
+ "vision_backbone.fused_featurizer.blocks.22.norm1.bias": "model-00001-of-00004.safetensors",
853
+ "vision_backbone.fused_featurizer.blocks.22.norm1.weight": "model-00001-of-00004.safetensors",
854
+ "vision_backbone.fused_featurizer.blocks.22.norm2.bias": "model-00001-of-00004.safetensors",
855
+ "vision_backbone.fused_featurizer.blocks.22.norm2.weight": "model-00001-of-00004.safetensors",
856
+ "vision_backbone.fused_featurizer.blocks.23.attn.proj.bias": "model-00001-of-00004.safetensors",
857
+ "vision_backbone.fused_featurizer.blocks.23.attn.proj.weight": "model-00001-of-00004.safetensors",
858
+ "vision_backbone.fused_featurizer.blocks.23.attn.qkv.bias": "model-00001-of-00004.safetensors",
859
+ "vision_backbone.fused_featurizer.blocks.23.attn.qkv.weight": "model-00001-of-00004.safetensors",
860
+ "vision_backbone.fused_featurizer.blocks.23.mlp.fc1.bias": "model-00001-of-00004.safetensors",
861
+ "vision_backbone.fused_featurizer.blocks.23.mlp.fc1.weight": "model-00001-of-00004.safetensors",
862
+ "vision_backbone.fused_featurizer.blocks.23.mlp.fc2.bias": "model-00001-of-00004.safetensors",
863
+ "vision_backbone.fused_featurizer.blocks.23.mlp.fc2.weight": "model-00001-of-00004.safetensors",
864
+ "vision_backbone.fused_featurizer.blocks.23.norm1.bias": "model-00001-of-00004.safetensors",
865
+ "vision_backbone.fused_featurizer.blocks.23.norm1.weight": "model-00001-of-00004.safetensors",
866
+ "vision_backbone.fused_featurizer.blocks.23.norm2.bias": "model-00001-of-00004.safetensors",
867
+ "vision_backbone.fused_featurizer.blocks.23.norm2.weight": "model-00001-of-00004.safetensors",
868
+ "vision_backbone.fused_featurizer.blocks.24.attn.proj.bias": "model-00001-of-00004.safetensors",
869
+ "vision_backbone.fused_featurizer.blocks.24.attn.proj.weight": "model-00001-of-00004.safetensors",
870
+ "vision_backbone.fused_featurizer.blocks.24.attn.qkv.bias": "model-00001-of-00004.safetensors",
871
+ "vision_backbone.fused_featurizer.blocks.24.attn.qkv.weight": "model-00001-of-00004.safetensors",
872
+ "vision_backbone.fused_featurizer.blocks.24.mlp.fc1.bias": "model-00001-of-00004.safetensors",
873
+ "vision_backbone.fused_featurizer.blocks.24.mlp.fc1.weight": "model-00001-of-00004.safetensors",
874
+ "vision_backbone.fused_featurizer.blocks.24.mlp.fc2.bias": "model-00001-of-00004.safetensors",
875
+ "vision_backbone.fused_featurizer.blocks.24.mlp.fc2.weight": "model-00001-of-00004.safetensors",
876
+ "vision_backbone.fused_featurizer.blocks.24.norm1.bias": "model-00001-of-00004.safetensors",
877
+ "vision_backbone.fused_featurizer.blocks.24.norm1.weight": "model-00001-of-00004.safetensors",
878
+ "vision_backbone.fused_featurizer.blocks.24.norm2.bias": "model-00001-of-00004.safetensors",
879
+ "vision_backbone.fused_featurizer.blocks.24.norm2.weight": "model-00001-of-00004.safetensors",
880
+ "vision_backbone.fused_featurizer.blocks.25.attn.proj.bias": "model-00001-of-00004.safetensors",
881
+ "vision_backbone.fused_featurizer.blocks.25.attn.proj.weight": "model-00001-of-00004.safetensors",
882
+ "vision_backbone.fused_featurizer.blocks.25.attn.qkv.bias": "model-00001-of-00004.safetensors",
883
+ "vision_backbone.fused_featurizer.blocks.25.attn.qkv.weight": "model-00001-of-00004.safetensors",
884
+ "vision_backbone.fused_featurizer.blocks.25.mlp.fc1.bias": "model-00001-of-00004.safetensors",
885
+ "vision_backbone.fused_featurizer.blocks.25.mlp.fc1.weight": "model-00001-of-00004.safetensors",
886
+ "vision_backbone.fused_featurizer.blocks.25.mlp.fc2.bias": "model-00001-of-00004.safetensors",
887
+ "vision_backbone.fused_featurizer.blocks.25.mlp.fc2.weight": "model-00001-of-00004.safetensors",
888
+ "vision_backbone.fused_featurizer.blocks.25.norm1.bias": "model-00001-of-00004.safetensors",
889
+ "vision_backbone.fused_featurizer.blocks.25.norm1.weight": "model-00001-of-00004.safetensors",
890
+ "vision_backbone.fused_featurizer.blocks.25.norm2.bias": "model-00001-of-00004.safetensors",
891
+ "vision_backbone.fused_featurizer.blocks.25.norm2.weight": "model-00001-of-00004.safetensors",
892
+ "vision_backbone.fused_featurizer.blocks.26.attn.proj.bias": "model-00001-of-00004.safetensors",
893
+ "vision_backbone.fused_featurizer.blocks.26.attn.proj.weight": "model-00001-of-00004.safetensors",
894
+ "vision_backbone.fused_featurizer.blocks.26.attn.qkv.bias": "model-00001-of-00004.safetensors",
895
+ "vision_backbone.fused_featurizer.blocks.26.attn.qkv.weight": "model-00001-of-00004.safetensors",
896
+ "vision_backbone.fused_featurizer.blocks.26.mlp.fc1.bias": "model-00001-of-00004.safetensors",
897
+ "vision_backbone.fused_featurizer.blocks.26.mlp.fc1.weight": "model-00001-of-00004.safetensors",
898
+ "vision_backbone.fused_featurizer.blocks.26.mlp.fc2.bias": "model-00001-of-00004.safetensors",
899
+ "vision_backbone.fused_featurizer.blocks.26.mlp.fc2.weight": "model-00001-of-00004.safetensors",
900
+ "vision_backbone.fused_featurizer.blocks.26.norm1.bias": "model-00001-of-00004.safetensors",
901
+ "vision_backbone.fused_featurizer.blocks.26.norm1.weight": "model-00001-of-00004.safetensors",
902
+ "vision_backbone.fused_featurizer.blocks.26.norm2.bias": "model-00001-of-00004.safetensors",
903
+ "vision_backbone.fused_featurizer.blocks.26.norm2.weight": "model-00001-of-00004.safetensors",
904
+ "vision_backbone.fused_featurizer.blocks.3.attn.proj.bias": "model-00001-of-00004.safetensors",
905
+ "vision_backbone.fused_featurizer.blocks.3.attn.proj.weight": "model-00001-of-00004.safetensors",
906
+ "vision_backbone.fused_featurizer.blocks.3.attn.qkv.bias": "model-00001-of-00004.safetensors",
907
+ "vision_backbone.fused_featurizer.blocks.3.attn.qkv.weight": "model-00001-of-00004.safetensors",
908
+ "vision_backbone.fused_featurizer.blocks.3.mlp.fc1.bias": "model-00001-of-00004.safetensors",
909
+ "vision_backbone.fused_featurizer.blocks.3.mlp.fc1.weight": "model-00001-of-00004.safetensors",
910
+ "vision_backbone.fused_featurizer.blocks.3.mlp.fc2.bias": "model-00001-of-00004.safetensors",
911
+ "vision_backbone.fused_featurizer.blocks.3.mlp.fc2.weight": "model-00001-of-00004.safetensors",
912
+ "vision_backbone.fused_featurizer.blocks.3.norm1.bias": "model-00001-of-00004.safetensors",
913
+ "vision_backbone.fused_featurizer.blocks.3.norm1.weight": "model-00001-of-00004.safetensors",
914
+ "vision_backbone.fused_featurizer.blocks.3.norm2.bias": "model-00001-of-00004.safetensors",
915
+ "vision_backbone.fused_featurizer.blocks.3.norm2.weight": "model-00001-of-00004.safetensors",
916
+ "vision_backbone.fused_featurizer.blocks.4.attn.proj.bias": "model-00001-of-00004.safetensors",
917
+ "vision_backbone.fused_featurizer.blocks.4.attn.proj.weight": "model-00001-of-00004.safetensors",
918
+ "vision_backbone.fused_featurizer.blocks.4.attn.qkv.bias": "model-00001-of-00004.safetensors",
919
+ "vision_backbone.fused_featurizer.blocks.4.attn.qkv.weight": "model-00001-of-00004.safetensors",
920
+ "vision_backbone.fused_featurizer.blocks.4.mlp.fc1.bias": "model-00001-of-00004.safetensors",
921
+ "vision_backbone.fused_featurizer.blocks.4.mlp.fc1.weight": "model-00001-of-00004.safetensors",
922
+ "vision_backbone.fused_featurizer.blocks.4.mlp.fc2.bias": "model-00001-of-00004.safetensors",
923
+ "vision_backbone.fused_featurizer.blocks.4.mlp.fc2.weight": "model-00001-of-00004.safetensors",
924
+ "vision_backbone.fused_featurizer.blocks.4.norm1.bias": "model-00001-of-00004.safetensors",
925
+ "vision_backbone.fused_featurizer.blocks.4.norm1.weight": "model-00001-of-00004.safetensors",
926
+ "vision_backbone.fused_featurizer.blocks.4.norm2.bias": "model-00001-of-00004.safetensors",
927
+ "vision_backbone.fused_featurizer.blocks.4.norm2.weight": "model-00001-of-00004.safetensors",
928
+ "vision_backbone.fused_featurizer.blocks.5.attn.proj.bias": "model-00001-of-00004.safetensors",
929
+ "vision_backbone.fused_featurizer.blocks.5.attn.proj.weight": "model-00001-of-00004.safetensors",
930
+ "vision_backbone.fused_featurizer.blocks.5.attn.qkv.bias": "model-00001-of-00004.safetensors",
931
+ "vision_backbone.fused_featurizer.blocks.5.attn.qkv.weight": "model-00001-of-00004.safetensors",
932
+ "vision_backbone.fused_featurizer.blocks.5.mlp.fc1.bias": "model-00001-of-00004.safetensors",
933
+ "vision_backbone.fused_featurizer.blocks.5.mlp.fc1.weight": "model-00001-of-00004.safetensors",
934
+ "vision_backbone.fused_featurizer.blocks.5.mlp.fc2.bias": "model-00001-of-00004.safetensors",
935
+ "vision_backbone.fused_featurizer.blocks.5.mlp.fc2.weight": "model-00001-of-00004.safetensors",
936
+ "vision_backbone.fused_featurizer.blocks.5.norm1.bias": "model-00001-of-00004.safetensors",
937
+ "vision_backbone.fused_featurizer.blocks.5.norm1.weight": "model-00001-of-00004.safetensors",
938
+ "vision_backbone.fused_featurizer.blocks.5.norm2.bias": "model-00001-of-00004.safetensors",
939
+ "vision_backbone.fused_featurizer.blocks.5.norm2.weight": "model-00001-of-00004.safetensors",
940
+ "vision_backbone.fused_featurizer.blocks.6.attn.proj.bias": "model-00001-of-00004.safetensors",
941
+ "vision_backbone.fused_featurizer.blocks.6.attn.proj.weight": "model-00001-of-00004.safetensors",
942
+ "vision_backbone.fused_featurizer.blocks.6.attn.qkv.bias": "model-00001-of-00004.safetensors",
943
+ "vision_backbone.fused_featurizer.blocks.6.attn.qkv.weight": "model-00001-of-00004.safetensors",
944
+ "vision_backbone.fused_featurizer.blocks.6.mlp.fc1.bias": "model-00001-of-00004.safetensors",
945
+ "vision_backbone.fused_featurizer.blocks.6.mlp.fc1.weight": "model-00001-of-00004.safetensors",
946
+ "vision_backbone.fused_featurizer.blocks.6.mlp.fc2.bias": "model-00001-of-00004.safetensors",
947
+ "vision_backbone.fused_featurizer.blocks.6.mlp.fc2.weight": "model-00001-of-00004.safetensors",
948
+ "vision_backbone.fused_featurizer.blocks.6.norm1.bias": "model-00001-of-00004.safetensors",
949
+ "vision_backbone.fused_featurizer.blocks.6.norm1.weight": "model-00001-of-00004.safetensors",
950
+ "vision_backbone.fused_featurizer.blocks.6.norm2.bias": "model-00001-of-00004.safetensors",
951
+ "vision_backbone.fused_featurizer.blocks.6.norm2.weight": "model-00001-of-00004.safetensors",
952
+ "vision_backbone.fused_featurizer.blocks.7.attn.proj.bias": "model-00001-of-00004.safetensors",
953
+ "vision_backbone.fused_featurizer.blocks.7.attn.proj.weight": "model-00001-of-00004.safetensors",
954
+ "vision_backbone.fused_featurizer.blocks.7.attn.qkv.bias": "model-00001-of-00004.safetensors",
955
+ "vision_backbone.fused_featurizer.blocks.7.attn.qkv.weight": "model-00001-of-00004.safetensors",
956
+ "vision_backbone.fused_featurizer.blocks.7.mlp.fc1.bias": "model-00001-of-00004.safetensors",
957
+ "vision_backbone.fused_featurizer.blocks.7.mlp.fc1.weight": "model-00001-of-00004.safetensors",
958
+ "vision_backbone.fused_featurizer.blocks.7.mlp.fc2.bias": "model-00001-of-00004.safetensors",
959
+ "vision_backbone.fused_featurizer.blocks.7.mlp.fc2.weight": "model-00001-of-00004.safetensors",
960
+ "vision_backbone.fused_featurizer.blocks.7.norm1.bias": "model-00001-of-00004.safetensors",
961
+ "vision_backbone.fused_featurizer.blocks.7.norm1.weight": "model-00001-of-00004.safetensors",
962
+ "vision_backbone.fused_featurizer.blocks.7.norm2.bias": "model-00001-of-00004.safetensors",
963
+ "vision_backbone.fused_featurizer.blocks.7.norm2.weight": "model-00001-of-00004.safetensors",
964
+ "vision_backbone.fused_featurizer.blocks.8.attn.proj.bias": "model-00001-of-00004.safetensors",
965
+ "vision_backbone.fused_featurizer.blocks.8.attn.proj.weight": "model-00001-of-00004.safetensors",
966
+ "vision_backbone.fused_featurizer.blocks.8.attn.qkv.bias": "model-00001-of-00004.safetensors",
967
+ "vision_backbone.fused_featurizer.blocks.8.attn.qkv.weight": "model-00001-of-00004.safetensors",
968
+ "vision_backbone.fused_featurizer.blocks.8.mlp.fc1.bias": "model-00001-of-00004.safetensors",
969
+ "vision_backbone.fused_featurizer.blocks.8.mlp.fc1.weight": "model-00001-of-00004.safetensors",
970
+ "vision_backbone.fused_featurizer.blocks.8.mlp.fc2.bias": "model-00001-of-00004.safetensors",
971
+ "vision_backbone.fused_featurizer.blocks.8.mlp.fc2.weight": "model-00001-of-00004.safetensors",
972
+ "vision_backbone.fused_featurizer.blocks.8.norm1.bias": "model-00001-of-00004.safetensors",
973
+ "vision_backbone.fused_featurizer.blocks.8.norm1.weight": "model-00001-of-00004.safetensors",
974
+ "vision_backbone.fused_featurizer.blocks.8.norm2.bias": "model-00001-of-00004.safetensors",
975
+ "vision_backbone.fused_featurizer.blocks.8.norm2.weight": "model-00001-of-00004.safetensors",
976
+ "vision_backbone.fused_featurizer.blocks.9.attn.proj.bias": "model-00001-of-00004.safetensors",
977
+ "vision_backbone.fused_featurizer.blocks.9.attn.proj.weight": "model-00001-of-00004.safetensors",
978
+ "vision_backbone.fused_featurizer.blocks.9.attn.qkv.bias": "model-00001-of-00004.safetensors",
979
+ "vision_backbone.fused_featurizer.blocks.9.attn.qkv.weight": "model-00001-of-00004.safetensors",
980
+ "vision_backbone.fused_featurizer.blocks.9.mlp.fc1.bias": "model-00001-of-00004.safetensors",
981
+ "vision_backbone.fused_featurizer.blocks.9.mlp.fc1.weight": "model-00001-of-00004.safetensors",
982
+ "vision_backbone.fused_featurizer.blocks.9.mlp.fc2.bias": "model-00001-of-00004.safetensors",
983
+ "vision_backbone.fused_featurizer.blocks.9.mlp.fc2.weight": "model-00001-of-00004.safetensors",
984
+ "vision_backbone.fused_featurizer.blocks.9.norm1.bias": "model-00001-of-00004.safetensors",
985
+ "vision_backbone.fused_featurizer.blocks.9.norm1.weight": "model-00001-of-00004.safetensors",
986
+ "vision_backbone.fused_featurizer.blocks.9.norm2.bias": "model-00001-of-00004.safetensors",
987
+ "vision_backbone.fused_featurizer.blocks.9.norm2.weight": "model-00001-of-00004.safetensors",
988
+ "vision_backbone.fused_featurizer.norm.bias": "model-00001-of-00004.safetensors",
989
+ "vision_backbone.fused_featurizer.norm.weight": "model-00001-of-00004.safetensors",
990
+ "vision_backbone.fused_featurizer.patch_embed.proj.bias": "model-00001-of-00004.safetensors",
991
+ "vision_backbone.fused_featurizer.patch_embed.proj.weight": "model-00001-of-00004.safetensors",
992
+ "vision_backbone.fused_featurizer.pos_embed": "model-00001-of-00004.safetensors"
993
+ }
994
+ }
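The weight map above assigns every vision-backbone, projector, and language-model tensor to one of four safetensors shards, and `transformers` resolves the shards from this index automatically. A minimal loading sketch (repo id assumed; `trust_remote_code=True` is what pulls in the bundled `modeling_prismatic.py` / `processing_prismatic.py` below):

import torch
from transformers import AutoModelForVision2Seq, AutoProcessor

MODEL_ID = "RLinf/RLinf-OpenVLAOFT-GRPO-LIBERO-90"  # assumed; a local checkout of these files also works

processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True
).to("cuda")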
modeling_prismatic.py ADDED
@@ -0,0 +1,1085 @@
1
+ """
2
+ modeling_prismatic.py
3
+
4
+ Core HuggingFace-style PrismaticPreTrainedModel and PrismaticForConditionalGeneration class definitions.
5
+ These classes inherit from the default `transformers.PreTrainedModel`. They are meant to be standalone and self-contained,
6
+ while exactly replicating the logic in `prismatic.models.vlms.prismatic.py`.
7
+ """
8
+
9
+ import logging
10
+ from dataclasses import dataclass
11
+ from functools import partial
12
+ from typing import Any, Callable, ClassVar, Dict, List, Optional, Tuple, Union
13
+
14
+ import numpy as np
15
+ import timm
16
+ import tokenizers
17
+ import torch
18
+ import torch.nn as nn
19
+ import transformers
20
+ from timm.models.vision_transformer import LayerScale
21
+ from transformers import AutoModelForCausalLM, PretrainedConfig, PreTrainedModel
22
+ from transformers.modeling_outputs import ModelOutput
23
+
24
+ from prismatic.training.train_utils import (
25
+ get_current_action_mask,
26
+ get_next_actions_mask,
27
+ )
28
+ from prismatic.vla.constants import (
29
+ ACTION_DIM,
30
+ ACTION_PROPRIO_NORMALIZATION_TYPE,
31
+ ACTION_TOKEN_BEGIN_IDX,
32
+ IGNORE_INDEX,
33
+ NUM_ACTIONS_CHUNK,
34
+ STOP_INDEX,
35
+ NormalizationType,
36
+ )
37
+
38
+ from .configuration_prismatic import OpenVLAConfig, PrismaticConfig
39
+
40
+ # Set up logger
41
+ logger = logging.getLogger(__name__)
42
+
43
+
44
+ # === Utility Functions for Monkey-Patching ===
45
+ def unpack_tuple(fn: Callable[[Any], Tuple[Any]]) -> Callable[[Any], Any]:
46
+ def wrapper(*args: Any, **kwargs: Any) -> Any:
47
+ result = fn(*args, **kwargs)
48
+ return result[0] if isinstance(result, tuple) else result
49
+
50
+ return wrapper
51
+
52
+
53
+ # HF Transformers overwrites parameters with names containing `gamma`; we're going to patch VisionBackbone.LayerScale.
54
+ # =>> TIMM :: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L109
55
+ # =>> Transformers :: https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_utils.py#L3960
56
+ def _ls_new_forward(self, x: torch.Tensor) -> torch.Tensor:
57
+ return x.mul_(self.scale_factor) if self.inplace else x * self.scale_factor
58
+
59
+
60
+ def ls_apply_patch(ls_module: LayerScale):
61
+ ls_module.scale_factor = nn.Parameter(ls_module.gamma.clone())
62
+ ls_module.forward = _ls_new_forward.__get__(ls_module, LayerScale)
63
+ del ls_module.gamma
64
+
65
+
66
+ # === Prismatic Vision Backbone (nn.Module) Definitions (w/ Fused Backbone Support) ===
67
+ class PrismaticVisionBackbone(nn.Module):
68
+ """
69
+ Vision backbone for Prismatic models that handles image feature extraction.
70
+
71
+ Supports both single backbone (e.g., SigLIP) and fused backbone (e.g., SigLIP + DINOv2) configurations.
72
+ For fused backbones, features from both models are concatenated along the feature dimension.
73
+ """
74
+
75
+ def __init__(
76
+ self,
77
+ use_fused_vision_backbone: bool,
78
+ image_sizes: List[int],
79
+ timm_model_ids: List[str],
80
+ timm_override_act_layers: List[Optional[str]],
81
+ ) -> None:
82
+ """
83
+ Initialize the vision backbone.
84
+
85
+ Args:
86
+ use_fused_vision_backbone: Whether to use two backbones and fuse their features
87
+ image_sizes: List of image sizes for each backbone
88
+ timm_model_ids: List of TIMM model IDs to use for each backbone
89
+ timm_override_act_layers: List of activation layer overrides for each backbone
90
+ """
91
+ super().__init__()
92
+ self.use_fused_vision_backbone = use_fused_vision_backbone
93
+ self.num_images_in_input = 1 # Default value, can be overridden later
94
+
95
+ # Validate number of (fused) vision backbones
96
+ if len(timm_model_ids) > 2:
97
+ raise ValueError("Prismatic models only support up to 2 (fused) vision backbones!")
98
+
99
+ # Create primary featurizer
100
+ self.featurizer = self._create_featurizer(
101
+ model_id=timm_model_ids[0], img_size=image_sizes[0], act_layer=timm_override_act_layers[0]
102
+ )
103
+ self.embed_dim = self.featurizer.embed_dim
104
+
105
+ # Create secondary featurizer if using fused backbone
106
+ if self.use_fused_vision_backbone:
107
+ self.fused_featurizer = self._create_featurizer(
108
+ model_id=timm_model_ids[1], img_size=image_sizes[1], act_layer=timm_override_act_layers[1]
109
+ )
110
+ self.embed_dim += self.fused_featurizer.embed_dim
111
+
112
+ # Patch LayerScale modules for HF compatibility
113
+ self._patch_layer_scales()
114
+
115
+ def _create_featurizer(self, model_id: str, img_size: int, act_layer: Optional[str]) -> nn.Module:
116
+ """
117
+ Create a TIMM-based featurizer model with appropriate configurations.
118
+
119
+ Args:
120
+ model_id: The TIMM model ID to load
121
+ img_size: Input image size for the model
122
+ act_layer: Override for the activation layer type
123
+
124
+ Returns:
125
+ A configured featurizer model
126
+ """
127
+ featurizer = timm.create_model(
128
+ model_id,
129
+ pretrained=False,
130
+ num_classes=0,
131
+ img_size=img_size,
132
+ act_layer=act_layer,
133
+ )
134
+
135
+ # Monkey-patch the forward function to extract the second-to-last layer features
136
+ num_blocks = len(featurizer.blocks)
137
+ featurizer.forward = unpack_tuple(partial(featurizer.get_intermediate_layers, n={num_blocks - 2}))
138
+
139
+ return featurizer
140
+
141
+ def _patch_layer_scales(self) -> None:
142
+ """
143
+ Patch all LayerScale modules to be compatible with HF's parameter naming.
144
+
145
+ HF Transformers overwrites parameters with names containing 'gamma',
146
+ so we need to rename and modify the forward method.
147
+ """
148
+ # Patch primary featurizer
149
+ for module in self.featurizer.modules():
150
+ if isinstance(module, LayerScale):
151
+ ls_apply_patch(module)
152
+
153
+ # Patch secondary featurizer if it exists
154
+ if self.use_fused_vision_backbone:
155
+ for module in self.fused_featurizer.modules():
156
+ if isinstance(module, LayerScale):
157
+ ls_apply_patch(module)
158
+
159
+ def get_num_patches(self) -> int:
160
+ """
161
+ Returns the number of vision patches output by the vision backbone.
162
+
163
+ Returns:
164
+ Number of patches per image
165
+ """
166
+ return self.featurizer.patch_embed.num_patches
167
+
168
+ def get_num_images_in_input(self) -> int:
169
+ """
170
+ Returns the number of input images for the vision backbone.
171
+
172
+ Returns:
173
+ Number of images expected in the input
174
+ """
175
+ return self.num_images_in_input
176
+
177
+ def set_num_images_in_input(self, num_images_in_input: int) -> None:
178
+ """
179
+ Sets the number of input images for the vision backbone.
180
+
181
+ Args:
182
+ num_images_in_input: Number of images to expect in the input
183
+ """
184
+ self.num_images_in_input = num_images_in_input
185
+
186
+ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
187
+ """
188
+ Implements the forward pass for the vision backbone.
189
+
190
+ If `self.use_fused_vision_backbone == True`, uses both SigLIP and DINOv2 transformers to extract visual features
191
+ (otherwise uses SigLIP only). Allows multi-image inputs (but only for fused vision backbone).
192
+
193
+ Args:
194
+ pixel_values (torch.Tensor): Pixels for input image(s), (B, C, H, W).
195
+ """
196
+ if self.num_images_in_input == 1:
197
+ if not self.use_fused_vision_backbone:
198
+ return self.featurizer(pixel_values)
199
+
200
+ # Split `pixel_values :: [bsz, 2 * 3, resolution, resolution]` =>> featurize =>> channel stack
201
+ img, img_fused = torch.split(pixel_values, [3, 3], dim=1)
202
+ patches, patches_fused = self.featurizer(img), self.fused_featurizer(img_fused)
203
+
204
+ return torch.cat([patches, patches_fused], dim=2)
205
+
206
+ else:
207
+ assert self.use_fused_vision_backbone, "Multi-image inputs require using fused backbone!"
208
+
209
+ # Split `pixel_values` into individual images (each with 6 channels: 3 for SigLIP + 3 for DINOv2)
210
+ images = torch.split(pixel_values, [6] * self.num_images_in_input, dim=1)
211
+
212
+ # Process each image and collect patches
213
+ all_patches = []
214
+ for img in images:
215
+ # Split each image further into two stacks of channels (each with 3 channels)
216
+ img_regular, img_fused = torch.split(img, [3, 3], dim=1)
217
+
218
+ # Get patches from both SigLIP and DINOv2 vision transformers
219
+ patches = self.featurizer(img_regular)
220
+ patches_fused = self.fused_featurizer(img_fused)
221
+
222
+ # Concatenate SigLIP and DINOv2 patches along the hidden dimension
223
+ combined_patches = torch.cat([patches, patches_fused], dim=2)
224
+ all_patches.append(combined_patches)
225
+
226
+ # Concatenate all patches along the patch dimension
227
+ return torch.cat(all_patches, dim=1)
228
+
229
+
230
+ # === Prismatic Projector (nn.Module) Definitions ===
231
+ class PrismaticProjector(nn.Module):
232
+ def __init__(self, use_fused_vision_backbone: bool, vision_dim: int, llm_dim: int) -> None:
233
+ super().__init__()
234
+ self.use_fused_vision_backbone = use_fused_vision_backbone
235
+ self.vision_dim, self.llm_dim = vision_dim, llm_dim
236
+
237
+ # Switch on `use_fused_vision_backbone` =>> use slightly different MLPs and projection factors!
238
+ if not self.use_fused_vision_backbone:
239
+ self.fc1 = nn.Linear(self.vision_dim, self.llm_dim, bias=True)
240
+ self.fc2 = nn.Linear(self.llm_dim, self.llm_dim, bias=True)
241
+ self.act_fn1 = nn.GELU()
242
+ else:
243
+ initial_projection_dim = 4 * vision_dim
244
+ self.fc1 = nn.Linear(self.vision_dim, initial_projection_dim, bias=True)
245
+ self.fc2 = nn.Linear(initial_projection_dim, self.llm_dim, bias=True)
246
+ self.fc3 = nn.Linear(self.llm_dim, self.llm_dim, bias=True)
247
+ self.act_fn1 = nn.GELU()
248
+ self.act_fn2 = nn.GELU()
249
+
250
+ def forward(self, img_patches: torch.Tensor) -> torch.Tensor:
251
+ if not self.use_fused_vision_backbone:
252
+ projected_features = self.fc1(img_patches)
253
+ projected_features = self.act_fn1(projected_features)
254
+ projected_features = self.fc2(projected_features)
255
+ else:
256
+ projected_features = self.fc1(img_patches)
257
+ projected_features = self.act_fn1(projected_features)
258
+ projected_features = self.fc2(projected_features)
259
+ projected_features = self.act_fn2(projected_features)
260
+ projected_features = self.fc3(projected_features)
261
+
262
+ return projected_features
263
+
264
+
265
+ # === Main HF Class Definitions ===
266
+ @dataclass
267
+ class PrismaticCausalLMOutputWithPast(ModelOutput):
268
+ """Base class for Prismatic causal (visually-conditioned) language model outputs; also exposes visual features."""
269
+
270
+ loss: Optional[torch.FloatTensor] = None
271
+ logits: torch.FloatTensor = None
272
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
273
+ hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
274
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
275
+
276
+ # Additions for VLMs
277
+ projector_features: Optional[torch.FloatTensor] = None
278
+
279
+
280
+ class PrismaticPreTrainedModel(PreTrainedModel):
281
+ config_class: PretrainedConfig = PrismaticConfig
282
+ base_model_prefix: str = "model"
283
+ supports_gradient_checkpointing: bool = True
284
+
285
+ _no_split_modules: ClassVar[List[str]] = ["PrismaticProjector"]
286
+ _skip_keys_device_placement: str = "past_key_values"
287
+ _supports_flash_attn_2: bool = True
288
+
289
+ def _init_weights(self, module: nn.Module) -> None:
290
+ # Important :: this HF ported version is *not* meant for training from scratch; only inference and fine-tuning!
291
+ # => As such, this init_weights code is not correct; if training VLMs from scratch, use the main codebase at
292
+ # https://github.com/TRI-ML/prismatic-vlms
293
+ std = (
294
+ self.config.initializer_range
295
+ if hasattr(self.config, "initializer_range")
296
+ else self.config.text_config.initializer_range
297
+ )
298
+
299
+ if hasattr(module, "class_embedding"):
300
+ module.class_embedding.data.normal_(mean=0.0, std=std)
301
+
302
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
303
+ module.weight.data.normal_(mean=0.0, std=std)
304
+ if module.bias is not None:
305
+ module.bias.data.zero_()
306
+ elif isinstance(module, nn.Embedding):
307
+ module.weight.data.normal_(mean=0.0, std=std)
308
+ if module.padding_idx is not None:
309
+ module.weight.data[module.padding_idx].zero_()
310
+
311
+ @property
312
+ def _supports_sdpa(self) -> bool:
313
+ """Check LLM supports SDPA Attention"""
314
+ return self.language_model._supports_sdpa
315
+
316
+
317
+ class PrismaticForConditionalGeneration(PrismaticPreTrainedModel):
318
+ def __init__(self, config: PrismaticConfig) -> None:
319
+ super().__init__(config)
320
+
321
+ # [Validation] Lightweight Validate on `config` Fields + Dependency Versions
322
+ if config.use_fused_vision_backbone is None:
323
+ raise ValueError("Missing config field `use_fused_vision_backbone`")
324
+
325
+ if timm.__version__ not in {"0.9.10", "0.9.11", "0.9.12", "0.9.16"}:
326
+ raise NotImplementedError(
327
+ "TIMM Version must be >= 0.9.10 and < 1.0.0 (breaking); please raise a GitHub Issue "
328
+ "if you urgently need support for latest TIMM versions."
329
+ )
330
+
331
+ if (transformers.__version__ != "4.40.1") or (tokenizers.__version__ != "0.19.1"):
332
+ logger.warning(
333
+ f"Expected `transformers==4.40.1` and `tokenizers==0.19.1` but got "
334
+ f"`transformers=={transformers.__version__}` and `tokenizers=={tokenizers.__version__}`; "
335
+ f"there might be inference-time regressions due to dependency changes. If in doubt, please "
336
+ f"use the above versions."
337
+ )
338
+
339
+ # Instantiate PrismaticVisionBackbone (w/ Potential Fused Backbone)
340
+ self.vision_backbone = PrismaticVisionBackbone(
341
+ config.use_fused_vision_backbone, config.image_sizes, config.timm_model_ids, config.timm_override_act_layers
342
+ )
343
+
344
+ # Create Multimodal Projector
345
+ self.projector = PrismaticProjector(
346
+ config.use_fused_vision_backbone,
347
+ vision_dim=self.vision_backbone.embed_dim,
348
+ llm_dim=config.text_config.hidden_size,
349
+ )
350
+
351
+ # Instantiate LLM Backbone
352
+ self.language_model = AutoModelForCausalLM.from_config(
353
+ config.text_config, attn_implementation=config._attn_implementation
354
+ )
355
+ self.vocab_size = config.text_config.vocab_size
356
+ self.pad_token_id = config.pad_token_id
357
+ self.llm_dim = config.text_config.hidden_size
358
+
359
+ # HF Boilerplate =>> initializes weights via `_init_weights()` and sets gradient checkpointing
360
+ self.post_init()
361
+
362
+ # === `PreTrainedModel` Boilerplate ===
363
+ def get_input_embeddings(self) -> nn.Module:
364
+ return self.language_model.get_input_embeddings()
365
+
366
+ def set_input_embeddings(self, value: nn.Module) -> None:
367
+ self.language_model.set_input_embeddings(value)
368
+
369
+ def get_output_embeddings(self) -> nn.Module:
370
+ return self.language_model.get_output_embeddings()
371
+
372
+ def set_output_embeddings(self, new_embeddings: nn.Module) -> None:
373
+ self.language_model.set_output_embeddings(new_embeddings)
374
+
375
+ def get_decoder(self) -> nn.Module:
376
+ return self.language_model.get_decoder()
377
+
378
+ def set_decoder(self, decoder: nn.Module) -> None:
379
+ self.language_model.set_decoder(decoder)
380
+
381
+ def tie_weights(self) -> None:
382
+ self.language_model.tie_weights() # Note: `Llama-2` and `Mistral` don't tie weights (no-op)
383
+
384
+ def resize_token_embeddings(
385
+ self, new_num_tokens: Optional[int] = None, pad_to_multiple_of: Optional[int] = None
386
+ ) -> nn.Embedding:
387
+ updated_embeddings = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
388
+
389
+ # Update config/instance variables
390
+ self.config.text_config.vocab_size = updated_embeddings.num_embeddings
391
+ self.vocab_size = updated_embeddings.num_embeddings
392
+
393
+ return updated_embeddings
394
+
395
+ def _replace_input_embeddings(self, input_embeddings, all_actions_mask, noisy_action_features):
396
+ """
397
+ Replace embeddings in input_embeddings at positions where all_actions_mask is True
398
+ with embeddings from noisy_action_features, using vectorized operations.
399
+
400
+ Args:
401
+ input_embeddings: Tensor of shape (B, S, D)
402
+ all_actions_mask: Boolean tensor of shape (B, S)
403
+ noisy_action_features: Tensor of shape (B, K, D) where K is the number of True values in mask per sample
404
+
405
+ Returns:
406
+ Modified input_embeddings tensor
407
+ """
408
+ # Clone input to avoid modifying the original tensor
409
+ new_input_embeddings = input_embeddings.clone()
410
+
411
+ # Create a tensor with the same shape of input_embeddings to hold the noisy action features
412
+ repositioned_noisy_action_features = torch.zeros_like(input_embeddings)
413
+
414
+ # Create batch indices for splicing
415
+ batch_indices = torch.arange(input_embeddings.shape[0], device=input_embeddings.device)
416
+ batch_indices = batch_indices.unsqueeze(1).expand(-1, noisy_action_features.shape[1])
417
+
418
+ # Get indices where mask is True for each sample
419
+ masked_indices = torch.stack([torch.where(mask)[0] for mask in all_actions_mask])
420
+
421
+ # Move the noisy action features into their correct positions
422
+ repositioned_noisy_action_features[batch_indices, masked_indices] = noisy_action_features
423
+
424
+ # Combine original input embeddings and noisy action embeddings using the mask
425
+ new_input_embeddings = torch.where(
426
+ all_actions_mask.unsqueeze(-1), repositioned_noisy_action_features, new_input_embeddings
427
+ )
428
+
429
+ return new_input_embeddings
430
+
431
+ def _process_action_masks(self, labels):
432
+ """Helper to get action masks from labels"""
433
+ current_action_mask = get_current_action_mask(labels)
434
+ next_actions_mask = get_next_actions_mask(labels)
435
+ all_actions_mask = current_action_mask | next_actions_mask # (B, seq_len)
436
+ return all_actions_mask
437
+
438
+ def _process_vision_features(self, pixel_values, language_embeddings=None, use_film=False):
439
+ """Process vision features with optional FiLM conditioning"""
440
+ if use_film:
441
+ # FiLM: Infuse language inputs into visual features
442
+ patch_features = self.vision_backbone(pixel_values, language_embeddings) # (bsz, 256 * num_images, D)
443
+ else:
444
+ patch_features = self.vision_backbone(pixel_values) # (bsz, 256 * num_images, D)
445
+
446
+ # Project patch embeddings into language embedding space
447
+ return self.projector(patch_features)
448
+
449
+ def _process_proprio_features(self, projected_patch_embeddings, proprio, proprio_projector):
450
+ """Process proprioceptive features and append to vision features"""
451
+ if proprio_projector is not None and proprio is not None:
452
+ # projected_patch_embeddings: (bsz, num_patches * num_images, llm_dim)
453
+ # proprio: (bsz, proprio_dim) or (proprio_dim,)
454
+ proprio = proprio.reshape(projected_patch_embeddings.shape[0], -1) # (bsz, proprio_dim)
455
+ proprio_features = proprio_projector(proprio) # (bsz, llm_dim)
456
+ proprio_features = proprio_features.unsqueeze(dim=1) # (bsz, 1, llm_dim)
457
+ # For simplicity, just append proprio token to the end of projected vision patch tokens
458
+ return torch.cat((projected_patch_embeddings, proprio_features), dim=1)
459
+ return projected_patch_embeddings
460
+
461
+ def _build_multimodal_attention(self, input_embeddings, projected_patch_embeddings, attention_mask):
462
+ """Build multimodal embeddings and attention mask"""
463
+ # Update attention mask
464
+ projected_patch_attention_mask = None
465
+ if attention_mask is not None:
466
+ projected_patch_attention_mask = torch.full(
467
+ (projected_patch_embeddings.shape[0], projected_patch_embeddings.shape[1]),
468
+ fill_value=True,
469
+ dtype=attention_mask.dtype,
470
+ device=attention_mask.device,
471
+ )
472
+
473
+ # Build multimodal embeddings & attention mask; insert embeddings after <BOS> token (1:)
474
+ multimodal_embeddings = torch.cat(
475
+ [input_embeddings[:, :1, :], projected_patch_embeddings, input_embeddings[:, 1:, :]], dim=1
476
+ )
477
+
478
+ multimodal_attention_mask = None
479
+ if attention_mask is not None:
480
+ multimodal_attention_mask = torch.cat(
481
+ [attention_mask[:, :1], projected_patch_attention_mask, attention_mask[:, 1:]], dim=1
482
+ )
483
+
484
+ return multimodal_embeddings, multimodal_attention_mask
485
+
486
+ def _build_multimodal_labels(self, labels, projected_patch_embeddings):
487
+ """Build multimodal labels with IGNORE_INDEX for patch embeddings"""
488
+ if labels is not None:
489
+ projected_patch_labels = torch.full(
490
+ (projected_patch_embeddings.shape[0], projected_patch_embeddings.shape[1]),
491
+ fill_value=IGNORE_INDEX,
492
+ dtype=labels.dtype,
493
+ device=labels.device,
494
+ )
495
+ return torch.cat([labels[:, :1], projected_patch_labels, labels[:, 1:]], dim=1)
496
+ return None
497
+
498
+ # === Core Prismatic VLM `forward()` Logic ===
499
+ def forward(
500
+ self,
501
+ input_ids: Optional[torch.LongTensor] = None,
502
+ attention_mask: Optional[torch.Tensor] = None,
503
+ pixel_values: Optional[torch.FloatTensor] = None,
504
+ labels: Optional[torch.LongTensor] = None,
505
+ inputs_embeds: Optional[torch.FloatTensor] = None,
506
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
507
+ use_cache: Optional[bool] = None,
508
+ output_attentions: Optional[bool] = None,
509
+ output_hidden_states: Optional[bool] = None,
510
+ output_projector_features: Optional[bool] = None,
511
+ return_dict: Optional[bool] = None,
512
+ proprio=None,
513
+ proprio_projector=None,
514
+ noisy_actions=None,
515
+ noisy_action_projector=None,
516
+ diffusion_timestep_embeddings=None,
517
+ use_film: bool = False,
518
+ ) -> Union[Tuple, PrismaticCausalLMOutputWithPast]:
519
+ """Run a forward pass through the VLM, returning a PrismaticCausalLMOutputWithPast instance."""
520
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
521
+ output_hidden_states = (
522
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
523
+ )
524
+ output_projector_features = output_projector_features if output_projector_features is not None else False
525
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
526
+
527
+ # Respect `use_cache` only if not training (even if `gradient_checkpointing` is off)
528
+ use_cache = use_cache and not self.training
529
+
530
+ # Instantiate Placeholder for Projector Features
531
+ projected_patch_embeddings = None
532
+
533
+ # === Handle Generation with Cache (`input_ids.shape[1] == 1`) =>> requires `past_keys_values` ===
534
+ if input_ids.shape[1] == 1:
535
+ assert input_ids.shape[0] == 1, "Generation is only currently supported for batch size of 1!"
536
+ assert past_key_values is not None, "You must provide `past_key_values` during cached generation!"
537
+ assert labels is None, "Unexpected key `labels` provided during cached generation!"
538
+
539
+ language_model_output = self.language_model(
540
+ input_ids=input_ids,
541
+ attention_mask=None,
542
+ position_ids=None,
543
+ past_key_values=past_key_values,
544
+ inputs_embeds=None,
545
+ labels=None,
546
+ use_cache=use_cache,
547
+ output_attentions=output_attentions,
548
+ output_hidden_states=output_hidden_states,
549
+ return_dict=return_dict,
550
+ )
551
+
552
+ # === Handle Unimodal Forward ===
553
+ elif pixel_values is None:
554
+ assert (input_ids is not None) and (inputs_embeds is None), "Missing `input_ids` in language-only forward!"
555
+ assert past_key_values is None, "Unexpected key `past_key_values` provided during language-only forward!"
556
+
557
+ language_model_output = self.language_model(
558
+ input_ids=input_ids,
559
+ attention_mask=attention_mask,
560
+ position_ids=None,
561
+ past_key_values=None,
562
+ inputs_embeds=None,
563
+ labels=labels,
564
+ use_cache=use_cache,
565
+ output_attentions=output_attentions,
566
+ output_hidden_states=output_hidden_states,
567
+ return_dict=return_dict,
568
+ )
569
+
570
+ # === Handle Multimodal Forward ===
571
+ elif (input_ids.shape[0] == pixel_values.shape[0]) or (inputs_embeds.shape[0] == pixel_values.shape[0]):
572
+ assert past_key_values is None, "Unexpected key `past_key_values` provided during multimodal forward!"
573
+
574
+ # Get input embeddings (from language model embeddings)
575
+ input_embeddings = self.get_input_embeddings()(input_ids) # (B, seq_len, D)
576
+
577
+ # Extract action masks
578
+ all_actions_mask = self._process_action_masks(labels)
579
+
580
+ # Extract the language portion of the input embeddings (i.e. remove the action tokens portion)
581
+ language_embeddings = input_embeddings[~all_actions_mask].reshape(
582
+ input_embeddings.shape[0], -1, input_embeddings.shape[2]
583
+ ) # (B, lang_seq_len, llm_dim)
584
+
585
+ # Get visual features
586
+ projected_patch_embeddings = self._process_vision_features(pixel_values, language_embeddings, use_film)
587
+
588
+ # Add proprioceptive state if provided
589
+ projected_patch_embeddings = self._process_proprio_features(
590
+ projected_patch_embeddings, proprio, proprio_projector
591
+ )
592
+
593
+ # [Diffusion] Add diffusion timestep embedding if provided
594
+ if diffusion_timestep_embeddings is not None:
595
+ # For simplicity, just append diffusion timestep embedding to the end of projected vision patch tokens
596
+ projected_patch_embeddings = torch.cat(
597
+ (projected_patch_embeddings, diffusion_timestep_embeddings), dim=1
598
+ )
599
+
600
+ # Process action embeddings
601
+ if noisy_actions is not None:
602
+ # Get mask corresponding to all action tokens
603
+ all_actions_mask = self._process_action_masks(labels)
604
+
605
+ # Reshape noisy actions into individual action tokens
606
+ # noisy_actions: (B, chunk_len, action_dim) -> (B, chunk_len * action_dim, 1)
607
+ B = noisy_actions.shape[0]
608
+ noisy_actions = noisy_actions.reshape(B, -1).unsqueeze(-1)
609
+
610
+ # Project noisy action tokens into language model embedding space
611
+ noisy_action_features = noisy_action_projector(noisy_actions) # (B, chunk_len * action_dim, llm_dim)
612
+
613
+ # Replace embeddings of the action tokens with noisy action embeddings
614
+ input_embeddings = self._replace_input_embeddings(
615
+ input_embeddings, all_actions_mask, noisy_action_features
616
+ )
617
+ else:
618
+ # Replace the embeddings of the action tokens with zeros
619
+ # (Later on, the positional embeddings will be added to them)
620
+ all_actions_mask = all_actions_mask.unsqueeze(-1) # (B, seq_len, 1)
621
+ input_embeddings = input_embeddings * ~all_actions_mask
622
+
623
+ # Build multimodal embeddings & attention mask
624
+ multimodal_embeddings, multimodal_attention_mask = self._build_multimodal_attention(
625
+ input_embeddings, projected_patch_embeddings, attention_mask
626
+ )
627
+
628
+ # Build labels for multimodal sequence if needed
629
+ multimodal_labels = self._build_multimodal_labels(labels, projected_patch_embeddings)
630
+
631
+ # Dispatch to language model
632
+ language_model_output = self.language_model(
633
+ input_ids=None,
634
+ attention_mask=multimodal_attention_mask,
635
+ position_ids=None,
636
+ past_key_values=None,
637
+ inputs_embeds=multimodal_embeddings,
638
+ labels=multimodal_labels,
639
+ use_cache=use_cache,
640
+ output_attentions=output_attentions,
641
+ output_hidden_states=output_hidden_states,
642
+ return_dict=return_dict,
643
+ )
644
+
645
+ # === Otherwise =>> Assume Invalid! ===
646
+ elif (input_ids.shape[0] != pixel_values.shape[0]) or (inputs_embeds.shape[0] != pixel_values.shape[0]):
647
+ raise ValueError("Non-homogenous batch of (text, image) input -- forward() does not support mixed batches!")
648
+
649
+ else:
650
+ raise ValueError(
651
+ "Invalid PrismaticForConditionalGeneration `forward()` call with provided arguments:\n"
652
+ f"=> `input_ids` = {input_ids is not None}\n"
653
+ f"=> `attention_mask` = {attention_mask is not None}\n"
654
+ f"=> `pixel_values` = {pixel_values is not None}\n"
655
+ f"=> `labels` = {labels is not None}\n"
656
+ f"=> `inputs_embeds` = {inputs_embeds is not None}\n"
657
+ f"=> `past_key_values` = {past_key_values is not None}\n"
658
+ f"=> `use_cache` = {use_cache}"
659
+ )
660
+
661
+ # Unpack `language_model_output` and return PrismaticCausalLMOutputWithPast (or tuple if not `return_dict`)
662
+ if not return_dict:
663
+ if output_projector_features and (projected_patch_embeddings is not None):
664
+ return *language_model_output, projected_patch_embeddings
665
+
666
+ return language_model_output
667
+
668
+ return PrismaticCausalLMOutputWithPast(
669
+ loss=language_model_output.loss,
670
+ logits=language_model_output.logits,
671
+ past_key_values=language_model_output.past_key_values,
672
+ hidden_states=language_model_output.hidden_states,
673
+ attentions=language_model_output.attentions,
674
+ projector_features=projected_patch_embeddings,
675
+ )
676
+
677
+ # === GenerationMixin Methods ===
678
+ def prepare_inputs_for_generation(
679
+ self,
680
+ input_ids: Optional[torch.Tensor] = None,
681
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
682
+ inputs_embeds: Optional[torch.FloatTensor] = None,
683
+ pixel_values: Optional[torch.FloatTensor] = None,
684
+ attention_mask: Optional[torch.Tensor] = None,
685
+ **kwargs: str,
686
+ ) -> Dict[str, torch.Tensor]:
687
+ """Borrowed from `LlamaForCausalLM` and simplified for batch size = 1; mirrors original PrismaticVLM logic."""
688
+ if ((input_ids is not None) and (input_ids.shape[0] > 1)) or (
689
+ (inputs_embeds is not None) and (inputs_embeds.shape[0] > 1)
690
+ ):
691
+ raise ValueError("Generation with batch size > 1 is not currently supported!")
692
+
693
+ # Handle `past_key_values` (cache) =>> assume `input_ids` just has unprocessed tokens
694
+ if past_key_values is not None:
695
+ input_ids = input_ids[:, -1:]
696
+
697
+ # If `inputs_embeds` are passed, we only want to use them in the 1st generation step
698
+ if inputs_embeds is not None and past_key_values is None:
699
+ model_inputs = {"inputs_embeds": inputs_embeds}
700
+ else:
701
+ model_inputs = {"input_ids": input_ids}
702
+
703
+ # Make sure `pixel_values` are preserved in `model_inputs`
704
+ model_inputs.update(
705
+ {
706
+ "attention_mask": attention_mask,
707
+ "pixel_values": pixel_values,
708
+ "past_key_values": past_key_values,
709
+ "use_cache": kwargs.get("use_cache"),
710
+ }
711
+ )
712
+
713
+ return model_inputs
714
+
715
+ # Defer to Language Model (all handle this differently, with different return types)
716
+ def _reorder_cache(self, *args, **kwargs) -> Any:
717
+ return self.language_model._reorder_cache(*args, **kwargs)
718
+
719
+
720
+ class OpenVLAForActionPrediction(PrismaticForConditionalGeneration):
721
+ config_class: PretrainedConfig = OpenVLAConfig
722
+
723
+ def __init__(self, config: OpenVLAConfig) -> None:
724
+ super().__init__(config)
725
+ self.norm_stats = config.norm_stats
726
+
727
+ # Compute action bins
728
+ self.bins = np.linspace(-1, 1, config.n_action_bins)
729
+ self.bin_centers = (self.bins[:-1] + self.bins[1:]) / 2.0
730
+
731
+ # Compute vocab size for de-tokenization -- revert added "multiple of"
732
+ self.vocab_size = self.config.text_config.vocab_size - self.config.pad_to_multiple_of
733
+
734
+ def _prepare_input_for_action_prediction(self, input_ids, attention_mask):
735
+ """Prepares input for action prediction by adding necessary tokens"""
736
+ # Add (ACTION_DIM * NUM_ACTIONS_CHUNK) placeholder tokens to input_ids to simulate action tokens
737
+ placeholder_action_token_ids = (
738
+ torch.ones((input_ids.shape[0], ACTION_DIM * NUM_ACTIONS_CHUNK)).to(input_ids.device).to(input_ids.dtype)
739
+ )
740
+ input_ids = torch.cat([input_ids, placeholder_action_token_ids], dim=-1)
741
+
742
+ # Add stop token to sequence (needed in non-causal bi-directional self-attention, as it appears at train time)
743
+ stop_token_id = torch.ones((input_ids.shape[0], 1)).to(input_ids.device).to(input_ids.dtype) * STOP_INDEX
744
+ input_ids = torch.cat([input_ids, stop_token_id], dim=-1)
745
+
746
+ # Extend the attention mask to fit the new shape of input
747
+ # Note: Only batch size == 1 supported right now
748
+ mask_extension = (
749
+ torch.ones((attention_mask.shape[0], input_ids.shape[-1] - attention_mask.shape[-1]))
750
+ .to(attention_mask.device)
751
+ .to(attention_mask.dtype)
752
+ )
753
+ attention_mask = torch.cat([attention_mask, mask_extension], dim=-1)
754
+
755
+ return input_ids, attention_mask
756
+
757
+ def _prepare_labels_for_action_prediction(self, labels, input_ids):
758
+ """Creates labels tensor for action prediction if not provided"""
759
+ # Extend labels tensor with fake action labels
760
+ ARBITRARY_ACTION_TOKEN_IDX = ACTION_TOKEN_BEGIN_IDX + 1
761
+ labels_extension = (
762
+ torch.ones((labels.shape[0], input_ids.shape[-1] - labels.shape[-1])).to(labels.device).to(labels.dtype)
763
+ * ARBITRARY_ACTION_TOKEN_IDX
764
+ )
765
+ labels = torch.cat([labels, labels_extension], dim=-1)
766
+
767
+ # Replace last label token with stop token
768
+ labels[:, -1] = STOP_INDEX
769
+
770
+ return labels
771
+
772
+ def _unnormalize_actions(self, normalized_actions, unnorm_key=None):
773
+ """Unnormalize actions using dataset statistics"""
774
+ action_norm_stats = self.get_action_stats(unnorm_key)
775
+
776
+ if ACTION_PROPRIO_NORMALIZATION_TYPE == NormalizationType.BOUNDS:
777
+ mask = action_norm_stats.get("mask", np.ones_like(action_norm_stats["min"], dtype=bool))
778
+ action_high, action_low = np.array(action_norm_stats["max"]), np.array(action_norm_stats["min"])
779
+ elif ACTION_PROPRIO_NORMALIZATION_TYPE == NormalizationType.BOUNDS_Q99:
780
+ mask = action_norm_stats.get("mask", np.ones_like(action_norm_stats["q01"], dtype=bool))
781
+ action_high, action_low = np.array(action_norm_stats["q99"]), np.array(action_norm_stats["q01"])
782
+ else:
783
+ raise ValueError("Unsupported action/proprio normalization type detected!")
784
+
785
+ actions = np.where(
786
+ mask,
787
+ 0.5 * (normalized_actions + 1) * (action_high - action_low + 1e-8) + action_low,
788
+ normalized_actions,
789
+ )
790
+
791
+ return actions
792
+
793
+ def _run_diffusion_prediction(
794
+ self,
795
+ input_embeddings,
796
+ all_actions_mask,
797
+ noise,
798
+ action_head,
799
+ projected_patch_embeddings,
800
+ labels,
801
+ attention_mask,
802
+ NUM_PATCHES,
803
+ NUM_PROMPT_TOKENS,
804
+ noisy_action_projector,
805
+ ):
806
+ """Run diffusion-based action prediction"""
807
+ # Clone embedding for reuse in each timestep
808
+ orig_projected_patch_embeddings = projected_patch_embeddings.clone()
809
+ curr_noisy_actions = noise
810
+
811
+ # Reverse diffusion: Iteratively denoise to generate action prediction
812
+ for t in action_head.noise_scheduler.timesteps:
813
+ # Get diffusion model's noise prediction (conditioned on VLA latent embedding, current noisy action
814
+ # embedding, and diffusion timestep embedding)
815
+ timesteps = torch.Tensor([t]).to(labels.device)
816
+ diffusion_timestep_embeddings = (
817
+ action_head.time_encoder(timesteps).to(curr_noisy_actions.dtype).to(curr_noisy_actions.device)
818
+ ) # (B, llm_dim)
819
+ diffusion_timestep_embeddings = diffusion_timestep_embeddings.unsqueeze(1) # (B, 1, llm_dim)
820
+
821
+ # [Diffusion] Replace the embeddings of the action tokens with noisy actions
822
+ # (Later on, the positional embeddings will be added to them)
823
+
824
+ # For simplicity, append diffusion timestep embedding to the end of projected vision tokens
825
+ projected_patch_embeddings = torch.cat(
826
+ (orig_projected_patch_embeddings, diffusion_timestep_embeddings), dim=1
827
+ )
828
+
829
+ # Reshape and project noisy actions into language embedding space
830
+ B = curr_noisy_actions.shape[0]
831
+ orig_curr_noisy_actions_shape = curr_noisy_actions.shape
832
+ curr_noisy_actions = curr_noisy_actions.reshape(B, -1).unsqueeze(-1)
833
+ noisy_action_features = noisy_action_projector(curr_noisy_actions)
834
+ curr_noisy_actions = curr_noisy_actions.reshape(orig_curr_noisy_actions_shape)
835
+
836
+ # Replace action token embeddings with noisy action embeddings
837
+ input_embeddings = self._replace_input_embeddings(
838
+ input_embeddings.clone(), all_actions_mask, noisy_action_features
839
+ )
840
+
841
+ # Build multimodal embeddings and attention mask
842
+ multimodal_embeddings, multimodal_attention_mask = self._build_multimodal_attention(
843
+ input_embeddings, projected_patch_embeddings, attention_mask
844
+ )
845
+
846
+ # Forward pass through language model
847
+ language_model_output = self.language_model(
848
+ input_ids=None,
849
+ attention_mask=multimodal_attention_mask,
850
+ position_ids=None,
851
+ past_key_values=None,
852
+ inputs_embeds=multimodal_embeddings,
853
+ labels=None,
854
+ use_cache=None,
855
+ output_attentions=False,
856
+ output_hidden_states=True,
857
+ return_dict=True,
858
+ )
859
+
860
+ # Extract hidden states for action portion of response
861
+ last_hidden_states = language_model_output.hidden_states[-1] # (B, seq_len, D)
862
+ actions_hidden_states = last_hidden_states[
863
+ :,
864
+ NUM_PATCHES + NUM_PROMPT_TOKENS : NUM_PATCHES + NUM_PROMPT_TOKENS + ACTION_DIM * NUM_ACTIONS_CHUNK,
865
+ :,
866
+ ] # (B, act_chunk_len, D)
867
+
868
+ # Predict noise and update noisy actions: x_t -> x_{t-1}
869
+ noise_pred = action_head.predict_noise(actions_hidden_states)
870
+ curr_noisy_actions = action_head.noise_scheduler.step(noise_pred, t, curr_noisy_actions).prev_sample
871
+
872
+ curr_noisy_actions = curr_noisy_actions.reshape(NUM_ACTIONS_CHUNK, ACTION_DIM)
873
+
874
+ # Return final actions
875
+ return curr_noisy_actions.float().cpu().detach().numpy(), actions_hidden_states
876
+
877
+ def _regression_or_discrete_prediction(
878
+ self,
879
+ input_embeddings,
880
+ all_actions_mask,
881
+ projected_patch_embeddings,
882
+ attention_mask,
883
+ labels,
884
+ NUM_PATCHES,
885
+ NUM_PROMPT_TOKENS,
886
+ action_head=None,
887
+ ):
888
+ """Run L1 regression-based continuous action prediction or discrete action tokens prediction."""
889
+ # Zero out action token embeddings
890
+ all_actions_mask = all_actions_mask.unsqueeze(-1) # (B, seq_len, 1)
891
+ input_embeddings = input_embeddings * ~all_actions_mask
892
+
893
+ # Build multimodal embeddings and attention mask
894
+ multimodal_embeddings, multimodal_attention_mask = self._build_multimodal_attention(
895
+ input_embeddings, projected_patch_embeddings, attention_mask
896
+ )
897
+
898
+ # Forward pass through language model
899
+ language_model_output = self.language_model(
900
+ input_ids=None,
901
+ attention_mask=multimodal_attention_mask,
902
+ position_ids=None,
903
+ past_key_values=None,
904
+ inputs_embeds=multimodal_embeddings,
905
+ labels=None,
906
+ use_cache=None,
907
+ output_attentions=False,
908
+ output_hidden_states=True,
909
+ return_dict=True,
910
+ )
911
+
912
+ # Extract hidden states for action tokens
913
+ last_hidden_states = language_model_output.hidden_states[-1] # (B, seq_len, D)
914
+ actions_hidden_states = last_hidden_states[
915
+ :,
916
+ NUM_PATCHES + NUM_PROMPT_TOKENS : NUM_PATCHES + NUM_PROMPT_TOKENS + ACTION_DIM * NUM_ACTIONS_CHUNK,
917
+ :,
918
+ ] # (B, act_chunk_len, D)
919
+
920
+ # Handle different prediction methods
921
+ if action_head is not None:
922
+ # L1 regression prediction
923
+ normalized_actions = action_head.predict_action(actions_hidden_states)
924
+ normalized_actions = normalized_actions.reshape(NUM_ACTIONS_CHUNK, ACTION_DIM)
925
+ normalized_actions = normalized_actions.float().cpu().detach().numpy()
926
+ else:
927
+ # Discrete token-based prediction
928
+ predicted_action_token_ids = (
929
+ language_model_output.logits[
930
+ :,
931
+ NUM_PATCHES + NUM_PROMPT_TOKENS : NUM_PATCHES + NUM_PROMPT_TOKENS + ACTION_DIM * NUM_ACTIONS_CHUNK,
932
+ ]
933
+ .argmax(dim=2)
934
+ .cpu()
935
+ .numpy()
936
+ )
937
+ discretized_actions = self.vocab_size - predicted_action_token_ids
938
+ discretized_actions = np.clip(discretized_actions - 1, a_min=0, a_max=self.bin_centers.shape[0] - 1)
939
+ normalized_actions = self.bin_centers[discretized_actions]
940
+ normalized_actions = normalized_actions.reshape(NUM_ACTIONS_CHUNK, ACTION_DIM)
941
+
942
+ return normalized_actions, actions_hidden_states
943
+
944
+ def predict_action(
945
+ self,
946
+ input_ids: Optional[torch.LongTensor] = None,
947
+ unnorm_key: Optional[str] = None,
948
+ proprio=None,
949
+ proprio_projector=None,
950
+ action_head=None,
951
+ noisy_action_projector=None,
952
+ use_film: bool = False,
953
+ **kwargs: str,
954
+ ) -> np.ndarray:
955
+ """Predict actions from input sequence, with options for different prediction methods.
956
+
957
+ Args:
958
+ input_ids: Input token ids
959
+ unnorm_key: Key for unnormalization statistics
960
+ proprio: Proprioceptive features
961
+ proprio_projector: Projector for proprioceptive features
962
+ action_head: Optional head for L1 regression or diffusion-based prediction
963
+ noisy_action_projector: Projector for noisy actions in diffusion-based prediction
964
+ use_film: Whether to use FiLM conditioning
965
+ **kwargs: Additional arguments including pixel_values and attention_mask
966
+
967
+ Returns:
968
+ Tuple of (unnormalized_actions, action_hidden_states)
969
+ """
970
+ # If the special empty token ('') does not already appear after the colon (':') token in the prompt
971
+ # (after "OUT:" or "ASSISTANT:"), insert it to match the inputs seen at training time
972
+ if not torch.all(input_ids[:, -1] == 29871):
973
+ input_ids = torch.cat(
974
+ (input_ids, torch.unsqueeze(torch.Tensor([29871]).long(), dim=0).to(input_ids.device)), dim=1
975
+ )
976
+
977
+ pixel_values = kwargs["pixel_values"]
978
+ attention_mask = kwargs["attention_mask"]
979
+
980
+ # Create fake labels tensor (needed for action mask)
981
+ labels = input_ids.clone()
982
+ labels[:] = IGNORE_INDEX
983
+
984
+ # Get number of tokens in prompt (excluding the start token)
985
+ NUM_PROMPT_TOKENS = input_ids.shape[-1] - 1 # Subtract 1 for the <BOS>/start token (action and stop tokens are appended below)
986
+
987
+ # Prepare inputs by adding necessary tokens
988
+ input_ids, attention_mask = self._prepare_input_for_action_prediction(input_ids, attention_mask)
989
+
990
+ # Update labels tensor for action mask computation later
991
+ labels = self._prepare_labels_for_action_prediction(labels, input_ids)
992
+
993
+ # Get input embeddings and action masks
994
+ input_embeddings = self.get_input_embeddings()(input_ids)
995
+ all_actions_mask = self._process_action_masks(labels)
996
+
997
+ # Extract language embeddings
998
+ language_embeddings = input_embeddings[~all_actions_mask].reshape(
999
+ input_embeddings.shape[0], -1, input_embeddings.shape[2]
1000
+ )
1001
+
1002
+ # Process vision features
1003
+ projected_patch_embeddings = self._process_vision_features(pixel_values, language_embeddings, use_film)
1004
+
1005
+ # Add proprioceptive features if provided
1006
+ use_proprio = proprio_projector is not None and proprio is not None
1007
+ if use_proprio:
1008
+ proprio = torch.Tensor(proprio).to(projected_patch_embeddings.device, dtype=projected_patch_embeddings.dtype)
1009
+ projected_patch_embeddings = self._process_proprio_features(
1010
+ projected_patch_embeddings, proprio, proprio_projector
1011
+ )
1012
+
1013
+ # Use diffusion if provided, otherwise use regression or discrete prediction
1014
+ use_diffusion = noisy_action_projector is not None and hasattr(action_head, "noise_scheduler")
1015
+
1016
+ # Calculate number of patches (including proprio token and/or diffusion timestep embedding if present)
1017
+ NUM_PATCHES = self.vision_backbone.get_num_patches() * self.vision_backbone.get_num_images_in_input()
1018
+ if use_proprio:
1019
+ NUM_PATCHES += 1
1020
+ if use_diffusion:
1021
+ NUM_PATCHES += 1
1022
+
1023
+ if use_diffusion:
1024
+ # Sample random noise with shape equal to output action, used as the starting state for reverse diffusion
1025
+ noise = torch.randn(
1026
+ size=(1, NUM_ACTIONS_CHUNK, ACTION_DIM), device=input_embeddings.device, dtype=input_embeddings.dtype
1027
+ )
1028
+
1029
+ # Run diffusion-based prediction
1030
+ normalized_actions, actions_hidden_states = self._run_diffusion_prediction(
1031
+ input_embeddings,
1032
+ all_actions_mask,
1033
+ noise,
1034
+ action_head,
1035
+ projected_patch_embeddings,
1036
+ labels,
1037
+ attention_mask,
1038
+ NUM_PATCHES,
1039
+ NUM_PROMPT_TOKENS,
1040
+ noisy_action_projector,
1041
+ )
1042
+ else:
1043
+ # Run regression or discrete token-based prediction
1044
+ normalized_actions, actions_hidden_states = self._regression_or_discrete_prediction(
1045
+ input_embeddings,
1046
+ all_actions_mask,
1047
+ projected_patch_embeddings,
1048
+ attention_mask,
1049
+ labels,
1050
+ NUM_PATCHES,
1051
+ NUM_PROMPT_TOKENS,
1052
+ action_head,
1053
+ )
1054
+
1055
+ # Unnormalize predicted actions
1056
+ actions = self._unnormalize_actions(normalized_actions, unnorm_key)
1057
+
1058
+ return actions, actions_hidden_states
1059
+
1060
+ @staticmethod
1061
+ def _check_unnorm_key(norm_stats: Dict[str, Dict[str, Any]], unnorm_key: Optional[str]) -> str:
1062
+ """Validate and resolve the unnormalization key for action statistics"""
1063
+ if unnorm_key is None:
1064
+ assert len(norm_stats) == 1, (
1065
+ f"Your model was trained on more than one dataset, "
1066
+ f"please pass a `unnorm_key` from the following options to choose the statistics "
1067
+ f"used for un-normalizing actions: {norm_stats.keys()}"
1068
+ )
1069
+ unnorm_key = next(iter(norm_stats.keys()))
1070
+
1071
+ assert unnorm_key in norm_stats, (
1072
+ f"The `unnorm_key` you chose is not in the set of available dataset statistics, "
1073
+ f"please choose from: {norm_stats.keys()}"
1074
+ )
1075
+ return unnorm_key
1076
+
1077
+ def get_action_dim(self, unnorm_key: Optional[str] = None) -> int:
1078
+ """Get the dimensionality of the policy's action space."""
1079
+ unnorm_key = self._check_unnorm_key(self.norm_stats, unnorm_key)
1080
+ return len(self.norm_stats[unnorm_key]["action"]["min"])
1081
+
1082
+ def get_action_stats(self, unnorm_key: Optional[str] = None) -> Dict[str, Any]:
1083
+ """Get all the logged statistics for the given dataset."""
1084
+ unnorm_key = self._check_unnorm_key(self.norm_stats, unnorm_key)
1085
+ return self.norm_stats[unnorm_key]["action"]
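A hedged usage sketch for `OpenVLAForActionPrediction.predict_action` defined above; the repo id, prompt wording, image file, and `unnorm_key` value are assumptions, and `unnorm_key` must name one of the datasets in the model's `norm_stats`. With no `action_head` or `proprio_projector` passed, the discrete-token branch of `_regression_or_discrete_prediction` is the one that runs:

import torch
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoProcessor

MODEL_ID = "RLinf/RLinf-OpenVLAOFT-GRPO-LIBERO-90"  # assumed repo id
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True
).to("cuda")

image = Image.open("frame.png")  # assumed observation frame
prompt = "In: What action should the robot take to pick up the black bowl?\nOut:"  # assumed prompt format
inputs = processor(prompt, image).to("cuda", dtype=torch.bfloat16)

# Returns (unnormalized_actions, actions_hidden_states); the actions come back as a
# (NUM_ACTIONS_CHUNK, ACTION_DIM) numpy array after un-normalization with the chosen statistics.
actions, _ = model.predict_action(**inputs, unnorm_key="libero_10")  # unnorm_key assumed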
preprocessor_config.json ADDED
@@ -0,0 +1,114 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoImageProcessor": "processing_prismatic.PrismaticImageProcessor",
4
+ "AutoProcessor": "processing_prismatic.PrismaticProcessor"
5
+ },
6
+ "image_processor_type": "PrismaticImageProcessor",
7
+ "image_resize_strategy": "resize-naive",
8
+ "input_sizes": [
9
+ [
10
+ 3,
11
+ 224,
12
+ 224
13
+ ],
14
+ [
15
+ 3,
16
+ 224,
17
+ 224
18
+ ]
19
+ ],
20
+ "interpolations": [
21
+ "bicubic",
22
+ "bicubic"
23
+ ],
24
+ "means": [
25
+ [
26
+ 0.485,
27
+ 0.456,
28
+ 0.406
29
+ ],
30
+ [
31
+ 0.5,
32
+ 0.5,
33
+ 0.5
34
+ ]
35
+ ],
36
+ "processor_class": "PrismaticProcessor",
37
+ "stds": [
38
+ [
39
+ 0.229,
40
+ 0.224,
41
+ 0.225
42
+ ],
43
+ [
44
+ 0.5,
45
+ 0.5,
46
+ 0.5
47
+ ]
48
+ ],
49
+ "tvf_crop_params": [
50
+ {
51
+ "output_size": [
52
+ 224,
53
+ 224
54
+ ]
55
+ },
56
+ {
57
+ "output_size": [
58
+ 224,
59
+ 224
60
+ ]
61
+ }
62
+ ],
63
+ "tvf_do_letterbox": false,
64
+ "tvf_letterbox_fill": null,
65
+ "tvf_normalize_params": [
66
+ {
67
+ "inplace": false,
68
+ "mean": [
69
+ 0.484375,
70
+ 0.455078125,
71
+ 0.40625
72
+ ],
73
+ "std": [
74
+ 0.228515625,
75
+ 0.2236328125,
76
+ 0.224609375
77
+ ]
78
+ },
79
+ {
80
+ "inplace": false,
81
+ "mean": [
82
+ 0.5,
83
+ 0.5,
84
+ 0.5
85
+ ],
86
+ "std": [
87
+ 0.5,
88
+ 0.5,
89
+ 0.5
90
+ ]
91
+ }
92
+ ],
93
+ "tvf_resize_params": [
94
+ {
95
+ "antialias": true,
96
+ "interpolation": 3,
97
+ "max_size": null,
98
+ "size": [
99
+ 224,
100
+ 224
101
+ ]
102
+ },
103
+ {
104
+ "antialias": true,
105
+ "interpolation": 3,
106
+ "max_size": null,
107
+ "size": [
108
+ 224,
109
+ 224
110
+ ]
111
+ }
112
+ ],
113
+ "use_fused_vision_backbone": true
114
+ }
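The two entries in each `tvf_*` list above are the per-backbone pipelines of the fused vision stack: every frame is resized, center-cropped, converted to a tensor, and normalized once per backbone, and the two 3-channel views are stacked into the 6-channel `pixel_values` consumed by the fused forward pass in modeling_prismatic.py. A rough stand-alone sketch of that per-frame pipeline using the parameters above (image path assumed; interpolation `3` is the PIL code for bicubic):

import torch
import torchvision.transforms.functional as TVF
from PIL import Image

img = Image.open("frame.png").convert("RGB")  # assumed input frame

views = []
for mean, std in [
    ([0.484375, 0.455078125, 0.40625], [0.228515625, 0.2236328125, 0.224609375]),
    ([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
]:
    x = TVF.resize(img, [224, 224], interpolation=TVF.InterpolationMode.BICUBIC)  # "resize-naive"
    x = TVF.center_crop(x, [224, 224])  # no-op here: the naive resize already yields 224x224
    x = TVF.normalize(TVF.to_tensor(x), mean=mean, std=std)
    views.append(x)

pixel_values = torch.cat(views, dim=0)  # (6, 224, 224): one 3-channel view per backbone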
processing_prismatic.py ADDED
@@ -0,0 +1,257 @@
1
+ """
2
+ processing_prismatic.py
3
+
4
+ HuggingFace-style preprocessor definitions for Prismatic VLMs, inheriting from `ProcessorMixin`. Default configuration
5
+ specifies `siglip-224px+7b`.
6
+ """
7
+
8
+ from typing import Any, ClassVar, List, Optional, Tuple, Union
9
+
10
+ import timm.data
11
+ import torch
12
+ import torchvision.transforms.functional as TVF
13
+ from PIL import Image
14
+ from torchvision.transforms import CenterCrop, Compose, Normalize, Resize, ToTensor
15
+ from transformers import PreTrainedTokenizerBase
16
+ from transformers.image_processing_utils import BatchFeature, ImageProcessingMixin
17
+ from transformers.processing_utils import ProcessorMixin
18
+ from transformers.tokenization_utils import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
19
+ from transformers.utils import TensorType
20
+
21
+
22
+ # === Image Processing ===
23
+ def letterbox_pad_transform(image: Image.Image, padding_fill_value: Tuple[int, int, int]) -> Image.Image:
24
+ """Given a PIL.Image, pad to square by adding a symmetric border around the height/width."""
25
+ (w, h), max_wh = image.size, max(image.size)
26
+ horizontal_pad, vertical_pad = int((max_wh - w) / 2), int((max_wh - h) / 2)
27
+ padding = (horizontal_pad, vertical_pad, horizontal_pad, vertical_pad)
28
+
29
+ return TVF.pad(image, padding, fill=padding_fill_value, padding_mode="constant")
30
+
31
+
32
+ class PrismaticImageProcessor(ImageProcessingMixin):
33
+ model_input_names: ClassVar[List[str]] = ["pixel_values"]
34
+
35
+ def __init__(
36
+ self,
37
+ use_fused_vision_backbone: bool = False,
38
+ image_resize_strategy: str = "letterbox",
39
+ input_sizes: Optional[List[Tuple[int, int, int]]] = None,
40
+ interpolations: Optional[List[str]] = None,
41
+ means: Optional[List[Tuple[float, float, float]]] = None,
42
+ stds: Optional[List[Tuple[float, float, float]]] = None,
43
+ **kwargs: str,
44
+ ) -> None:
45
+ """
46
+ Initialize a PrismaticImageProcessor as a wrapper around a torchvision transform; this transform will be
47
+ created by TIMM, and edited to follow our custom `image_resize_strategy` logic.
48
+
49
+ @param use_fused_vision_backbone: Boolean indicating single or fused (dual) vision backbone
50
+ @param image_resize_strategy: Prismatic image resize strategy in < resize-naive | resize-crop | letterbox >
51
+ @param input_sizes: [TIMM :: `data_cfg`] Input image sizes as (channels, width, height) tuples (one per backbone)
52
+ @param interpolations: [TIMM :: `data_cfg`] Interpolation modes as strings (default: "bicubic")
53
+ @param means: [TIMM :: `data_cfg`] Normalization means as float tuples (one per backbone when `use_fused_vision_backbone`)
54
+ @param stds: [TIMM :: `data_cfg`] Normalization stds as float tuples (one per backbone when `use_fused_vision_backbone`)
55
+ """
56
+ self.use_fused_vision_backbone = use_fused_vision_backbone
57
+ self.image_resize_strategy = image_resize_strategy
58
+
59
+ # Handle `None` default values
60
+ input_sizes = [(3, 224, 224)] if input_sizes is None else input_sizes
61
+ means = [(0.5, 0.5, 0.5)] if means is None else means
62
+ stds = [(0.5, 0.5, 0.5)] if stds is None else stds
63
+
64
+ # TIMM `data_cfg` Parameters
65
+ self.input_sizes, self.interpolations, self.means, self.stds = input_sizes, interpolations, means, stds
66
+
67
+ # Grab torchvision transforms via TIMM =>> need to parse for specific "functional" transform values!
68
+ self.tvf_resize_params, self.tvf_crop_params, self.tvf_normalize_params = [], [], []
69
+ self.tvf_do_letterbox, self.tvf_letterbox_fill = False, None
70
+
71
+ for idx in range(len(input_sizes)):
72
+ transform = timm.data.create_transform(
73
+ input_size=self.input_sizes[idx],
74
+ interpolation=self.interpolations[idx],
75
+ mean=self.means[idx],
76
+ std=self.stds[idx],
77
+ crop_pct=1.0, # Set to 1.0 to ignore cropping (initial Resize sets `input_size`)
78
+ crop_mode="center", # Default crop mode -- no-op when `crop_pct == 1.0`
79
+ is_training=False, # No image augmentations when loading the transform!
80
+ )
81
+
82
+ # [Validation] Ensure appropriate transform structure, expected sizes
83
+ if not (
84
+ isinstance(transform, Compose)
85
+ and (len(transform.transforms) == 4)
86
+ and isinstance(transform.transforms[0], Resize)
87
+ and isinstance(transform.transforms[1], CenterCrop)
88
+ and isinstance(transform.transforms[2], ToTensor)
89
+ and isinstance(transform.transforms[3], Normalize)
90
+ and (transform.transforms[0].size == self.input_sizes[idx][-1])
91
+ and (transform.transforms[1].size == self.input_sizes[idx][-2:])
92
+ ):
93
+ raise ValueError(f"Unexpected TIMM image transformation structure/sizes: `{transform}`")
94
+
95
+ # HF Image Processors *must* be JSON-serializable; as such, cannot have torchvision. as an attribute.
96
+ # => Instead, we're going to parse the transform and call "torchvision.transforms.functional" (`tvf`)
97
+ resize_t, crop_t, norm_t = transform.transforms[0], transform.transforms[1], transform.transforms[3]
98
+ self.tvf_resize_params.append(
99
+ {
100
+ "size": resize_t.size,
101
+ "interpolation": TVF.pil_modes_mapping[resize_t.interpolation],
102
+ "max_size": None,
103
+ "antialias": True,
104
+ }
105
+ )
106
+ self.tvf_crop_params.append({"output_size": crop_t.size})
107
+ self.tvf_normalize_params.append(
108
+ {
109
+ "mean": norm_t.mean.float().numpy().tolist(),
110
+ "std": norm_t.std.float().numpy().tolist(),
111
+ "inplace": False,
112
+ }
113
+ )
114
+ self.tvf_do_letterbox, self.tvf_letterbox_fill = False, None
115
+
116
+ # Handle Prismatic `image_resize_strategy`
117
+ if self.image_resize_strategy == "resize-naive":
118
+ self.tvf_resize_params[idx]["size"] = (resize_t.size, resize_t.size)
119
+ elif self.image_resize_strategy == "letterbox":
120
+ self.tvf_do_letterbox, self.tvf_letterbox_fill = True, tuple([int(x * 255) for x in self.means[idx]])
121
+ elif self.image_resize_strategy == "resize-crop":
122
+ pass
123
+ else:
124
+ raise ValueError(f"Image resize strategy `{self.image_resize_strategy}` is not supported!")
125
+
126
+ # Dispatch **kwargs to super()
127
+ super().__init__(**kwargs)
128
+
129
+ def apply_transform(self, img: Image.Image) -> torch.Tensor:
130
+ """Apply `functional` variant of TIMM's Transform = Compose([Resize -> CenterCrop -> ToTensor -> Normalize])"""
131
+ if self.tvf_do_letterbox:
132
+ img = letterbox_pad_transform(img, self.tvf_letterbox_fill)
133
+
134
+ # [Contract] Fused Backbones expect "channel-stacked" inputs; we'll unpack on the model side!
135
+ imgs_t = []
136
+ for idx in range(len(self.input_sizes)):
137
+ img_idx = TVF.resize(img, **self.tvf_resize_params[idx])
138
+ img_idx = TVF.center_crop(img_idx, **self.tvf_crop_params[idx])
139
+ img_idx_t = TVF.to_tensor(img_idx)
140
+ img_idx_t = TVF.normalize(img_idx_t, **self.tvf_normalize_params[idx])
141
+ imgs_t.append(img_idx_t)
142
+
143
+ # [Contract] `imgs_t` is a list of Tensors of shape [3, input_size, input_size]; stack along dim = 0
144
+ img_t = torch.vstack(imgs_t)
145
+
146
+ return img_t
147
+
148
+ def preprocess(
149
+ self,
150
+ images: Union[Image.Image, List[Image.Image]],
151
+ return_tensors: Optional[Union[str, TensorType]] = None,
152
+ **_: str,
153
+ ) -> BatchFeature:
154
+ """
155
+ Preprocess an image (or batch of images); note that unlike the `transformers :: BaseImageProcessor` we
156
+ explicitly only handle PIL.Image.Image instances for simplicity.
157
+
158
+ @param images: A (batch of) PIL.Image.Image instance(s) to preprocess.
159
+ @param return_tensors: BatchFeature default Tensor format (e.g., "pt" for torch); if None, returns np.ndarray
160
+
161
+ @return: Instance of `transformers :: BatchFeature` with a single key "pixel_values"
162
+ """
163
+ if not isinstance(images, list):
164
+ images = [images]
165
+
166
+ # Apply `self.img_transform` to each image (will return list of torch.Tensors); stack into "batched" Tensor
167
+ pixel_values = torch.stack([self.apply_transform(img.convert("RGB")) for img in images])
168
+
169
+ # Return BatchFeature =>> note that for compatibility, constructor expects Dict[str, np.ndarray], so we convert
170
+ return BatchFeature(data={"pixel_values": pixel_values.float().numpy()}, tensor_type=return_tensors)
171
+
172
+ def __call__(self, images: Union[Image.Image, List[Image.Image]], **kwargs) -> BatchFeature:
173
+ return self.preprocess(images, **kwargs)
174
+
175
+
176
+ # === PrismaticProcessor =>> Wraps both ImageProcessor and Tokenizer ===
177
+ # =>> https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava/processing_llava.py
178
+ class PrismaticProcessor(ProcessorMixin):
179
+ attributes: ClassVar[List[str]] = ["image_processor", "tokenizer"]
180
+ image_processor_class: str = "AutoImageProcessor"
181
+ tokenizer_class: str = "AutoTokenizer"
182
+
183
+ def __init__(
184
+ self,
185
+ image_processor: Optional[ImageProcessingMixin] = None,
186
+ tokenizer: Optional[PreTrainedTokenizerBase] = None,
187
+ ) -> None:
188
+ super().__init__(image_processor, tokenizer)
189
+
190
+ def __call__(
191
+ self,
192
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
193
+ images: Union[Image.Image, List[Image.Image]],
194
+ padding: Union[bool, str, PaddingStrategy] = False,
195
+ truncation: Optional[Union[bool, str, TruncationStrategy]] = None,
196
+ max_length: Optional[int] = None,
197
+ return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
198
+ ) -> BatchFeature:
199
+ """
200
+ Preprocess a given (batch) of text/images for a Prismatic VLM; forwards text to the underlying LLM's tokenizer,
201
+ forwards images to PrismaticImageProcessor.
202
+
203
+ @param text: The (batch) of text to encode; must be a string or list of strings.
204
+ @param images: A (batch of) PIL.Image.Image instance(s) to preprocess.
205
+ @param padding: Sequence padding strategy (if multiple specified) in < True = "longest" | "max_length" | False >
206
+ @param truncation: Truncation strategy for the output sequences; requires `max_length` to be specified
207
+ @param max_length: Maximum length (in tokens) to truncate
208
+ @param return_tensors: Type of return tensors (usually "pt" or TensorType.PYTORCH)
209
+
210
+ @return: BatchFeature with keys for `input_ids`, `attention_mask` and `pixel_values`.
211
+ """
212
+ pixel_values = self.image_processor(images, return_tensors=return_tensors)["pixel_values"]
213
+ text_inputs = self.tokenizer(
214
+ text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length
215
+ )
216
+
217
+ # [Validate] Need same number of images and text inputs!
218
+ if pixel_values.shape[0] != text_inputs.input_ids.shape[0]:
219
+ raise ValueError("Batch is malformed; expected same number of images and text inputs!")
220
+
221
+ return BatchFeature(data={**text_inputs, "pixel_values": pixel_values})
222
+
223
+ # === Tokenizer Dispatch Utilities =>> check `PreTrainedTokenizerBase` for documentation ===
224
+ def batch_decode(
225
+ self,
226
+ sequences: Union[List[int], List[List[int]], torch.Tensor, Any], # `Any` = np.ndarray | tf.Tensor
227
+ skip_special_tokens: bool = False,
228
+ clean_up_tokenization_spaces: Optional[bool] = None,
229
+ **kwargs: str,
230
+ ) -> List[str]:
231
+ return self.tokenizer.batch_decode(
232
+ sequences=sequences,
233
+ skip_special_tokens=skip_special_tokens,
234
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
235
+ **kwargs,
236
+ )
237
+
238
+ def decode(
239
+ self,
240
+ token_ids: Union[int, List[int], torch.Tensor, Any], # `Any` = np.ndarray | tf.Tensor
241
+ skip_special_tokens: bool = False,
242
+ clean_up_tokenization_spaces: Optional[bool] = None,
243
+ **kwargs: str,
244
+ ) -> str:
245
+ return self.tokenizer.decode(
246
+ token_ids=token_ids,
247
+ skip_special_tokens=skip_special_tokens,
248
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
249
+ **kwargs,
250
+ )
251
+
252
+ @property
253
+ def model_input_names(self) -> List[str]:
254
+ tokenizer_input_names = self.tokenizer.model_input_names
255
+ image_processor_input_names = self.image_processor.model_input_names
256
+
257
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
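
A quick way to see the fused-backbone contract in `apply_transform`: with two transform pipelines configured (as in the serialized parameters above), each image becomes two 3-channel tensors stacked along the channel axis. The sketch below assumes `processing_prismatic.py` is importable from the working directory; the normalization means/stds are illustrative placeholders, not the values this checkpoint actually uses:

    from PIL import Image
    from processing_prismatic import PrismaticImageProcessor  # assumes the file above is on the import path

    image_processor = PrismaticImageProcessor(
        use_fused_vision_backbone=True,
        image_resize_strategy="resize-naive",
        input_sizes=[(3, 224, 224), (3, 224, 224)],
        interpolations=["bicubic", "bicubic"],
        means=[(0.5, 0.5, 0.5), (0.5, 0.5, 0.5)],  # illustrative only
        stds=[(0.5, 0.5, 0.5), (0.5, 0.5, 0.5)],   # illustrative only
    )

    # preprocess() returns a BatchFeature holding np.ndarray pixel values unless return_tensors is set
    pixel_values = image_processor(Image.new("RGB", (640, 480)))["pixel_values"]
    print(pixel_values.shape)  # (1, 6, 224, 224) -- two 3-channel views, channel-stacked per image
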
processor_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "auto_map": {
+ "AutoProcessor": "processing_prismatic.PrismaticProcessor"
+ },
+ "processor_class": "PrismaticProcessor"
+ }
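
The `auto_map` above is what lets `transformers` resolve the custom processor class shipped with this upload: loading with `trust_remote_code=True` imports `PrismaticProcessor` from `processing_prismatic.py`. A minimal loading sketch, with the repo id left as a placeholder for this repository or a local clone of it:

    from PIL import Image
    from transformers import AutoProcessor

    # "<repo-or-local-path>" is a placeholder; trust_remote_code is required because of the auto_map above.
    processor = AutoProcessor.from_pretrained("<repo-or-local-path>", trust_remote_code=True)

    inputs = processor("What action should the robot take?", Image.new("RGB", (640, 480)))
    print(sorted(inputs.keys()))  # expected: ['attention_mask', 'input_ids', 'pixel_values']
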
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<PAD>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
tokenizer_config.json ADDED
@@ -0,0 +1,53 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "32000": {
+ "content": "<PAD>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "auto_map": {
+ "AutoProcessor": "processing_prismatic.PrismaticProcessor"
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "legacy": false,
+ "model_max_length": 2048,
+ "pad_token": "<PAD>",
+ "padding_side": "right",
+ "processor_class": "PrismaticProcessor",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
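
Taken together, the tokenizer files above configure a `LlamaTokenizer` that prepends `<s>` but never appends `</s>`, pads on the right with the added `<PAD>` token (id 32000), and caps inputs at 2048 tokens. A short sketch of what that implies in practice; the path is a placeholder and the behavior simply follows the configuration above:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("<repo-or-local-path>")  # placeholder path

    ids = tokenizer("pick up the black bowl").input_ids
    # add_bos_token=true / add_eos_token=false: the sequence starts with <s> and has no trailing </s>.
    assert ids[0] == tokenizer.bos_token_id
    assert ids[-1] != tokenizer.eos_token_id

    # <PAD> extends the base Llama-2 vocabulary at index 32000; padding is applied on the right.
    assert tokenizer.pad_token_id == 32000
    assert tokenizer.padding_side == "right"
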