Text Generation
Transformers
Safetensors
zaya
conversational
yury-zyphra commited on
Commit
ea39e47
·
verified ·
1 Parent(s): 28d1e23

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,452 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_func": "swiglu",
3
+ "activation_func_fp8_input_store": false,
4
+ "add_bias_linear": false,
5
+ "apply_rope_fusion": true,
6
+ "architectures": [
7
+ "ZayaForCausalLM"
8
+ ],
9
+ "attention_bias": false,
10
+ "attention_dropout": 0.0,
11
+ "bias_activation_fusion": true,
12
+ "bos_token_id": 2,
13
+ "cca": true,
14
+ "cca_num_q_heads": [
15
+ 8,
16
+ 0,
17
+ 8,
18
+ 0,
19
+ 8,
20
+ 0,
21
+ 8,
22
+ 0,
23
+ 8,
24
+ 0,
25
+ 8,
26
+ 0,
27
+ 8,
28
+ 0,
29
+ 8,
30
+ 0,
31
+ 8,
32
+ 0,
33
+ 8,
34
+ 0,
35
+ 8,
36
+ 0,
37
+ 8,
38
+ 0,
39
+ 8,
40
+ 0,
41
+ 8,
42
+ 0,
43
+ 8,
44
+ 0,
45
+ 8,
46
+ 0,
47
+ 8,
48
+ 0,
49
+ 8,
50
+ 0,
51
+ 8,
52
+ 0,
53
+ 8,
54
+ 0,
55
+ 8,
56
+ 0,
57
+ 8,
58
+ 0,
59
+ 8,
60
+ 0,
61
+ 8,
62
+ 0,
63
+ 8,
64
+ 0,
65
+ 8,
66
+ 0,
67
+ 8,
68
+ 0,
69
+ 8,
70
+ 0,
71
+ 8,
72
+ 0,
73
+ 8,
74
+ 0,
75
+ 8,
76
+ 0,
77
+ 8,
78
+ 0,
79
+ 8,
80
+ 0,
81
+ 8,
82
+ 0,
83
+ 8,
84
+ 0,
85
+ 8,
86
+ 0,
87
+ 8,
88
+ 0,
89
+ 8,
90
+ 0,
91
+ 8,
92
+ 0,
93
+ 8,
94
+ 0
95
+ ],
96
+ "eos_token_id": 1,
97
+ "ffn_hidden_size_list": [
98
+ 0,
99
+ 4096,
100
+ 0,
101
+ 4096,
102
+ 0,
103
+ 4096,
104
+ 0,
105
+ 4096,
106
+ 0,
107
+ 4096,
108
+ 0,
109
+ 4096,
110
+ 0,
111
+ 4096,
112
+ 0,
113
+ 4096,
114
+ 0,
115
+ 4096,
116
+ 0,
117
+ 4096,
118
+ 0,
119
+ 4096,
120
+ 0,
121
+ 4096,
122
+ 0,
123
+ 4096,
124
+ 0,
125
+ 4096,
126
+ 0,
127
+ 4096,
128
+ 0,
129
+ 4096,
130
+ 0,
131
+ 4096,
132
+ 0,
133
+ 4096,
134
+ 0,
135
+ 4096,
136
+ 0,
137
+ 4096,
138
+ 0,
139
+ 4096,
140
+ 0,
141
+ 4096,
142
+ 0,
143
+ 4096,
144
+ 0,
145
+ 4096,
146
+ 0,
147
+ 4096,
148
+ 0,
149
+ 4096,
150
+ 0,
151
+ 4096,
152
+ 0,
153
+ 4096,
154
+ 0,
155
+ 4096,
156
+ 0,
157
+ 4096,
158
+ 0,
159
+ 4096,
160
+ 0,
161
+ 4096,
162
+ 0,
163
+ 4096,
164
+ 0,
165
+ 4096,
166
+ 0,
167
+ 4096,
168
+ 0,
169
+ 4096,
170
+ 0,
171
+ 4096,
172
+ 0,
173
+ 4096,
174
+ 0,
175
+ 4096,
176
+ 0,
177
+ 4096
178
+ ],
179
+ "fused_add_norm": false,
180
+ "gated_linear_unit": true,
181
+ "hidden_size": 2048,
182
+ "kv_channels": 128,
183
+ "lm_head_bias": false,
184
+ "max_position_embeddings": 32768,
185
+ "model_type": "zaya",
186
+ "moe_router_topk": 1,
187
+ "norm_epsilon": 1e-05,
188
+ "normalization": "RMSNorm",
189
+ "num_attention_heads": 16,
190
+ "num_hidden_layers": 120,
191
+ "num_key_value_heads": 2,
192
+ "num_query_groups_list": [
193
+ 2,
194
+ 0,
195
+ 2,
196
+ 0,
197
+ 2,
198
+ 0,
199
+ 2,
200
+ 0,
201
+ 2,
202
+ 0,
203
+ 2,
204
+ 0,
205
+ 2,
206
+ 0,
207
+ 2,
208
+ 0,
209
+ 2,
210
+ 0,
211
+ 2,
212
+ 0,
213
+ 2,
214
+ 0,
215
+ 2,
216
+ 0,
217
+ 2,
218
+ 0,
219
+ 2,
220
+ 0,
221
+ 2,
222
+ 0,
223
+ 2,
224
+ 0,
225
+ 2,
226
+ 0,
227
+ 2,
228
+ 0,
229
+ 2,
230
+ 0,
231
+ 2,
232
+ 0,
233
+ 2,
234
+ 0,
235
+ 2,
236
+ 0,
237
+ 2,
238
+ 0,
239
+ 2,
240
+ 0,
241
+ 2,
242
+ 0,
243
+ 2,
244
+ 0,
245
+ 2,
246
+ 0,
247
+ 2,
248
+ 0,
249
+ 2,
250
+ 0,
251
+ 2,
252
+ 0,
253
+ 2,
254
+ 0,
255
+ 2,
256
+ 0,
257
+ 2,
258
+ 0,
259
+ 2,
260
+ 0,
261
+ 2,
262
+ 0,
263
+ 2,
264
+ 0,
265
+ 2,
266
+ 0,
267
+ 2,
268
+ 0,
269
+ 2,
270
+ 0,
271
+ 2,
272
+ 0
273
+ ],
274
+ "pad_token_id": 0,
275
+ "residual_in_fp32": false,
276
+ "partial_rotary_factor": 0.5,
277
+ "rope_scaling": false,
278
+ "rope_theta": 1000000,
279
+ "scale_residual_merge": true,
280
+ "sliding_window": null,
281
+ "zaya_high_prec": true,
282
+ "zaya_layers": [
283
+ "a",
284
+ 16,
285
+ "a",
286
+ 16,
287
+ "a",
288
+ 16,
289
+ "a",
290
+ 16,
291
+ "a",
292
+ 16,
293
+ "a",
294
+ 16,
295
+ "a",
296
+ 16,
297
+ "a",
298
+ 16,
299
+ "a",
300
+ 16,
301
+ "a",
302
+ 16,
303
+ "a",
304
+ 16,
305
+ "a",
306
+ 16,
307
+ "a",
308
+ 16,
309
+ "a",
310
+ 16,
311
+ "a",
312
+ 16,
313
+ "a",
314
+ 16,
315
+ "a",
316
+ 16,
317
+ "a",
318
+ 16,
319
+ "a",
320
+ 16,
321
+ "a",
322
+ 16,
323
+ "a",
324
+ 16,
325
+ "a",
326
+ 16,
327
+ "a",
328
+ 16,
329
+ "a",
330
+ 16,
331
+ "a",
332
+ 16,
333
+ "a",
334
+ 16,
335
+ "a",
336
+ 16,
337
+ "a",
338
+ 16,
339
+ "a",
340
+ 16,
341
+ "a",
342
+ 16,
343
+ "a",
344
+ 16,
345
+ "a",
346
+ 16,
347
+ "a",
348
+ 16,
349
+ "a",
350
+ 16,
351
+ "a",
352
+ 16,
353
+ "a",
354
+ 16,
355
+ "a",
356
+ 16,
357
+ "a",
358
+ 16,
359
+ "a",
360
+ 16,
361
+ "a",
362
+ 16
363
+ ],
364
+ "zaya_mlp_expansion": [
365
+ 0,
366
+ 256,
367
+ 0,
368
+ 256,
369
+ 0,
370
+ 256,
371
+ 0,
372
+ 256,
373
+ 0,
374
+ 256,
375
+ 0,
376
+ 256,
377
+ 0,
378
+ 256,
379
+ 0,
380
+ 256,
381
+ 0,
382
+ 256,
383
+ 0,
384
+ 256,
385
+ 0,
386
+ 256,
387
+ 0,
388
+ 256,
389
+ 0,
390
+ 256,
391
+ 0,
392
+ 256,
393
+ 0,
394
+ 256,
395
+ 0,
396
+ 256,
397
+ 0,
398
+ 256,
399
+ 0,
400
+ 256,
401
+ 0,
402
+ 256,
403
+ 0,
404
+ 256,
405
+ 0,
406
+ 256,
407
+ 0,
408
+ 256,
409
+ 0,
410
+ 256,
411
+ 0,
412
+ 256,
413
+ 0,
414
+ 256,
415
+ 0,
416
+ 256,
417
+ 0,
418
+ 256,
419
+ 0,
420
+ 256,
421
+ 0,
422
+ 256,
423
+ 0,
424
+ 256,
425
+ 0,
426
+ 256,
427
+ 0,
428
+ 256,
429
+ 0,
430
+ 256,
431
+ 0,
432
+ 256,
433
+ 0,
434
+ 256,
435
+ 0,
436
+ 256,
437
+ 0,
438
+ 256,
439
+ 0,
440
+ 256,
441
+ 0,
442
+ 256,
443
+ 0,
444
+ 256
445
+ ],
446
+ "zaya_use_eda": true,
447
+ "zaya_use_mod": true,
448
+ "torch_dtype": "bfloat16",
449
+ "transformers_version": "4.57.1",
450
+ "use_cache": true,
451
+ "vocab_size": 262272
452
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 2,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.50.0.dev0"
7
+ }
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45437635a55f0b697c7c5384ca00dcaab27f21f1663cb4c2aa5fc6e0fba75d82
3
+ size 4999499392
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d7976142e4fa55660b87f04ac916cee2749c0ef31e25522a13858cea29c94bd
3
+ size 4998887440
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28a25052cc3c4c961719c56dd077c7fbe9b1bf7f0f88fe5f646f9d3057898cd4
3
+ size 4990498832
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:845d272ce6a11322703675c349cee9cc1c75c02d97a60d62f1a9089cec1da04d
3
+ size 2692425872
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<eos>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "unk_token": {
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b244434a1e668213b5494c816f8077d7b5b64bdac094ea09e7aaf6281b77f00
3
+ size 33384937
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff