hqfang committed
Commit f8fdce7 · verified · 1 parent: 8682edd

Upload model.yaml

Files changed (1)
model.yaml +255 -0
model.yaml ADDED
@@ -0,0 +1,255 @@
+ model_name: molmo
+ llm:
+   d_model: 3584
+   n_heads: 28
+   n_kv_heads: 4
+   head_dim: null
+   qkv_bias: true
+   clip_qkv: null
+   n_layers: 28
+   mlp_ratio: 4
+   mlp_hidden_size: 37888
+   activation_type: swiglu
+   block_type: sequential
+   rope: true
+   rope_full_precision: true
+   rope_theta: 1000000.0
+   rope_type: default
+   rope_factor: null
+   rope_high_freq_factor: null
+   rope_low_freq_factor: null
+   rope_original_max_position_embeddings: null
+   attention_type: sdpa
+   float32_attention: true
+   attention_dropout: 0.0
+   attention_layer_norm: false
+   attention_layer_norm_type: olmo
+   residual_dropout: 0.1
+   response_residual_dropout: 0.0
+   layer_norm_type: rms
+   layer_norm_with_affine: true
+   layer_norm_eps: 1.0e-06
+   attention_layer_norm_with_affine: true
+   max_sequence_length: 4096
+   max_position_embeddings: null
+   include_bias: false
+   bias_for_layer_norm: null
+   norm_after: false
+   moe_num_experts: 8
+   moe_top_k: 2
+   moe_mlp_impl: sparse
+   moe_log_expert_assignment: false
+   moe_shared_expert: false
+   moe_lbl_in_fp32: false
+   moe_interleave: false
+   moe_loss_weight: 0.1
+   moe_zloss_weight: null
+   moe_dropless: true
+   moe_capacity_factor: 1.25
+   embedding_dropout: 0.0
+   scale_logits: false
+   vocab_size: 152064
+   additional_vocab_size: 128
+   weight_tying: false
+   embedding_size: 152064
+   use_position_ids: true
+   tokenizer:
+     identifier: Qwen/Qwen2.5-7B
+     tokenizer_dir: null
+     depth_tokens: true
+   init_path: gs://mm-olmo/pretrained_llms/qwen2.5-7b.pt
+   init_incremental: null
+   new_embedding_init_range: 0.02
+   initializer_range: 0.02
+   normalize_input_embeds: false
+   activation_checkpoint: whole_layer
+   compile: blocks
+   fix_pad_tokenizer: false
+   init_std: 0.02
+   init_fn: normal
+   init_cutoff_factor: null
+ vision_backbone:
+   vit:
+     image_model_type: siglip
+     image_default_input_size:
+     - 378
+     - 378
+     image_patch_size: 14
+     image_pos_patch_size: 14
+     image_emb_dim: 1152
+     image_num_heads: 16
+     image_num_key_value_heads: 16
+     image_num_layers: 27
+     image_head_dim: 72
+     image_mlp_dim: 4304
+     image_mlp_activations: gelu_pytorch_tanh
+     image_dropout_rate: 0.0
+     image_num_pos: 729
+     image_norm_eps: 1.0e-06
+     attention_dropout: 0.0
+     residual_dropout: 0.0
+     initializer_range: 0.02
+     float32_attention: true
+     attention_type: sdpa
+     activation_checkpointing: true
+     init_path: gs://mm-olmo/pretrained_image_encoders/siglip2-so400m-14-384.pt
+     resize_mode: siglip
+     pad_value: 0.0
+     normalize: siglip
+   image_pooling_2d: attention_meanq
+   pooling_attention_mask: false
+   image_projector: mlp
+   image_padding_embed: null
+   vit_layers:
+   - -3
+   - -9
+   skip_unused_layers: true
+   image_feature_dropout: 0.0
+   connector_activation_checkpointing: true
+   compile_vit: blocks
+ data_formatter:
+   prompt_templates: uber_model
+   message_format: role
+   system_prompt: demo_or_style
+   always_start_with_space: false
+   default_inference_len: 65
+   select_answer: best
+   debug: false
+   image_last: false
+   format_message_list: null
+   p_one_message: 0.0
+ mm_preprocessor:
+   crop_mode: overlap-and-resize-c2
+   max_crops: 8
+   max_images: 2
+   max_multi_image_crops: 8
+   pooling_w: 2
+   pooling_h: 2
+   overlap_margins:
+   - 4
+   - 4
+   use_col_tokens: true
+   loss_token_weighting: root_subsegments
+   legacy_image_mask: false
+   max_answer_len: null
+   img_aug: true
+ bi_directional_attn: null
+ lora_enable: true
+ lora_rank: 32
+ lora_alpha: 16
+ lora_dropout: 0.0
+ lora_bias: none
+ norm_stats:
+   libero_10_no_noops_modified:
+     action:
+       mean:
+       - 0.01820324920117855
+       - 0.05858374014496803
+       - -0.05592384561896324
+       - 0.004626928828656673
+       - 0.00289608770981431
+       - -0.007673131301999092
+       - 0.5457824468612671
+       std:
+       - 0.2825464606285095
+       - 0.35904666781425476
+       - 0.3673802614212036
+       - 0.03770702704787254
+       - 0.05429719388484955
+       - 0.08725254982709885
+       - 0.49815231561660767
+       max:
+       - 0.9375
+       - 0.9375
+       - 0.9375
+       - 0.30000001192092896
+       - 0.29357144236564636
+       - 0.375
+       - 1.0
+       min:
+       - -0.9375
+       - -0.9375
+       - -0.9375
+       - -0.23642857372760773
+       - -0.3053571283817291
+       - -0.3675000071525574
+       - 0.0
+       q01:
+       - -0.6348214149475098
+       - -0.7741071581840515
+       - -0.7633928656578064
+       - -0.09749999642372131
+       - -0.14819999992847435
+       - -0.2742857038974762
+       - 0.0
+       q99:
+       - 0.7714285850524902
+       - 0.8464285731315613
+       - 0.9375
+       - 0.13928571343421936
+       - 0.15964286029338837
+       - 0.3246428668498993
+       - 1.0
+     proprio:
+       mean:
+       - -0.04190658777952194
+       - 0.03539430722594261
+       - 0.8257141709327698
+       - 2.908308267593384
+       - -0.5562185049057007
+       - -0.16649018228054047
+       - 0.0
+       - 0.028316624462604523
+       - -0.028561657294631004
+       std:
+       - 0.10743364691734314
+       - 0.14424669742584229
+       - 0.2572328448295593
+       - 0.3441362977027893
+       - 1.234421730041504
+       - 0.3579835891723633
+       - 0.0
+       - 0.013308707624673843
+       - 0.013174631632864475
+       max:
+       - 0.21031762659549713
+       - 0.39128610491752625
+       - 1.3332009315490723
+       - 3.6714255809783936
+       - 3.560650587081909
+       - 1.386339545249939
+       - 0.0
+       - 0.04160946607589722
+       - 0.0013633022317662835
+       min:
+       - -0.4828203022480011
+       - -0.3255046010017395
+       - 0.445506751537323
+       - 1.1321442127227783
+       - -3.641430377960205
+       - -1.842738389968872
+       - 0.0
+       - -0.0010040868073701859
+       - -0.04111652821302414
+       q01:
+       - -0.3899900782108307
+       - -0.2838300323486328
+       - 0.44795057058334353
+       - 1.8810229921340942
+       - -2.886677579879761
+       - -1.1599004411697387
+       - 0.0
+       - 0.002066459748893976
+       - -0.04001387819647789
+       q99:
+       - 0.1530261474847791
+       - 0.32915401458740223
+       - 1.2546923208236693
+       - 3.303542451858519
+       - 2.7496529006957933
+       - 0.6893712210655194
+       - 0.0
+       - 0.040048558115959164
+       - -0.0017598449345678235
+     num_transitions: 101469
+     num_trajectories: 379
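
The `norm_stats` block carries per-dimension statistics for the LIBERO-10 action and proprio streams, which downstream policy code uses to map the model's normalized action outputs back to robot-executable values. Below is a minimal sketch of that mapping, assuming the q01/q99 rescaling scheme common in OpenVLA-style pipelines (an assumption; this file does not itself specify the scheme) and PyYAML for parsing:

```python
import numpy as np
import yaml

# Load the uploaded config (path assumed relative to the repo root).
with open("model.yaml") as f:
    cfg = yaml.safe_load(f)

stats = cfg["norm_stats"]["libero_10_no_noops_modified"]["action"]
q01 = np.array(stats["q01"])
q99 = np.array(stats["q99"])

def unnormalize(action: np.ndarray) -> np.ndarray:
    """Map a normalized action in [-1, 1] back to the dataset's action range.

    Assumed scheme: linear rescale from [-1, 1] to [q01, q99] per dimension.
    """
    return 0.5 * (action + 1.0) * (q99 - q01) + q01

# A zero vector in normalized space maps to the midpoint of [q01, q99].
print(unnormalize(np.zeros(7)))
```

The gripper dimension (last entry) spans [0, 1] in `q01`/`q99`, so a normalized 0 maps to 0.5 there; policies typically threshold this value into discrete open/close commands.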