sensefvg committed
Commit b3f3294 · verified · 1 Parent(s): 120295b

upload initial model

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ taozi.wav filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,3 @@
+ ---
+ license: mit
+ ---
added_tokens.json ADDED
@@ -0,0 +1,327 @@
1
+ {
2
+ "</audio>": 151937,
3
+ "</box>": 151677,
4
+ "</img>": 151671,
5
+ "</quad>": 151673,
6
+ "</ref>": 151675,
7
+ "</think>": 151668,
8
+ "</tool_call>": 151658,
9
+ "</tool_response>": 151666,
10
+ "<AUDIO_CONTEXT>": 151938,
11
+ "<FAKE_PAD_0>": 151682,
12
+ "<FAKE_PAD_100>": 151782,
13
+ "<FAKE_PAD_101>": 151783,
14
+ "<FAKE_PAD_102>": 151784,
15
+ "<FAKE_PAD_103>": 151785,
16
+ "<FAKE_PAD_104>": 151786,
17
+ "<FAKE_PAD_105>": 151787,
18
+ "<FAKE_PAD_106>": 151788,
19
+ "<FAKE_PAD_107>": 151789,
20
+ "<FAKE_PAD_108>": 151790,
21
+ "<FAKE_PAD_109>": 151791,
22
+ "<FAKE_PAD_10>": 151692,
23
+ "<FAKE_PAD_110>": 151792,
24
+ "<FAKE_PAD_111>": 151793,
25
+ "<FAKE_PAD_112>": 151794,
26
+ "<FAKE_PAD_113>": 151795,
27
+ "<FAKE_PAD_114>": 151796,
28
+ "<FAKE_PAD_115>": 151797,
29
+ "<FAKE_PAD_116>": 151798,
30
+ "<FAKE_PAD_117>": 151799,
31
+ "<FAKE_PAD_118>": 151800,
32
+ "<FAKE_PAD_119>": 151801,
33
+ "<FAKE_PAD_11>": 151693,
34
+ "<FAKE_PAD_120>": 151802,
35
+ "<FAKE_PAD_121>": 151803,
36
+ "<FAKE_PAD_122>": 151804,
37
+ "<FAKE_PAD_123>": 151805,
38
+ "<FAKE_PAD_124>": 151806,
39
+ "<FAKE_PAD_125>": 151807,
40
+ "<FAKE_PAD_126>": 151808,
41
+ "<FAKE_PAD_127>": 151809,
42
+ "<FAKE_PAD_128>": 151810,
43
+ "<FAKE_PAD_129>": 151811,
44
+ "<FAKE_PAD_12>": 151694,
45
+ "<FAKE_PAD_130>": 151812,
46
+ "<FAKE_PAD_131>": 151813,
47
+ "<FAKE_PAD_132>": 151814,
48
+ "<FAKE_PAD_133>": 151815,
49
+ "<FAKE_PAD_134>": 151816,
50
+ "<FAKE_PAD_135>": 151817,
51
+ "<FAKE_PAD_136>": 151818,
52
+ "<FAKE_PAD_137>": 151819,
53
+ "<FAKE_PAD_138>": 151820,
54
+ "<FAKE_PAD_139>": 151821,
55
+ "<FAKE_PAD_13>": 151695,
56
+ "<FAKE_PAD_140>": 151822,
57
+ "<FAKE_PAD_141>": 151823,
58
+ "<FAKE_PAD_142>": 151824,
59
+ "<FAKE_PAD_143>": 151825,
60
+ "<FAKE_PAD_144>": 151826,
61
+ "<FAKE_PAD_145>": 151827,
62
+ "<FAKE_PAD_146>": 151828,
63
+ "<FAKE_PAD_147>": 151829,
64
+ "<FAKE_PAD_148>": 151830,
65
+ "<FAKE_PAD_149>": 151831,
66
+ "<FAKE_PAD_14>": 151696,
67
+ "<FAKE_PAD_150>": 151832,
68
+ "<FAKE_PAD_151>": 151833,
69
+ "<FAKE_PAD_152>": 151834,
70
+ "<FAKE_PAD_153>": 151835,
71
+ "<FAKE_PAD_154>": 151836,
72
+ "<FAKE_PAD_155>": 151837,
73
+ "<FAKE_PAD_156>": 151838,
74
+ "<FAKE_PAD_157>": 151839,
75
+ "<FAKE_PAD_158>": 151840,
76
+ "<FAKE_PAD_159>": 151841,
77
+ "<FAKE_PAD_15>": 151697,
78
+ "<FAKE_PAD_160>": 151842,
79
+ "<FAKE_PAD_161>": 151843,
80
+ "<FAKE_PAD_162>": 151844,
81
+ "<FAKE_PAD_163>": 151845,
82
+ "<FAKE_PAD_164>": 151846,
83
+ "<FAKE_PAD_165>": 151847,
84
+ "<FAKE_PAD_166>": 151848,
85
+ "<FAKE_PAD_167>": 151849,
86
+ "<FAKE_PAD_168>": 151850,
87
+ "<FAKE_PAD_169>": 151851,
88
+ "<FAKE_PAD_16>": 151698,
89
+ "<FAKE_PAD_170>": 151852,
90
+ "<FAKE_PAD_171>": 151853,
91
+ "<FAKE_PAD_172>": 151854,
92
+ "<FAKE_PAD_173>": 151855,
93
+ "<FAKE_PAD_174>": 151856,
94
+ "<FAKE_PAD_175>": 151857,
95
+ "<FAKE_PAD_176>": 151858,
96
+ "<FAKE_PAD_177>": 151859,
97
+ "<FAKE_PAD_178>": 151860,
98
+ "<FAKE_PAD_179>": 151861,
99
+ "<FAKE_PAD_17>": 151699,
100
+ "<FAKE_PAD_180>": 151862,
101
+ "<FAKE_PAD_181>": 151863,
102
+ "<FAKE_PAD_182>": 151864,
103
+ "<FAKE_PAD_183>": 151865,
104
+ "<FAKE_PAD_184>": 151866,
105
+ "<FAKE_PAD_185>": 151867,
106
+ "<FAKE_PAD_186>": 151868,
107
+ "<FAKE_PAD_187>": 151869,
108
+ "<FAKE_PAD_188>": 151870,
109
+ "<FAKE_PAD_189>": 151871,
110
+ "<FAKE_PAD_18>": 151700,
111
+ "<FAKE_PAD_190>": 151872,
112
+ "<FAKE_PAD_191>": 151873,
113
+ "<FAKE_PAD_192>": 151874,
114
+ "<FAKE_PAD_193>": 151875,
115
+ "<FAKE_PAD_194>": 151876,
116
+ "<FAKE_PAD_195>": 151877,
117
+ "<FAKE_PAD_196>": 151878,
118
+ "<FAKE_PAD_197>": 151879,
119
+ "<FAKE_PAD_198>": 151880,
120
+ "<FAKE_PAD_199>": 151881,
121
+ "<FAKE_PAD_19>": 151701,
122
+ "<FAKE_PAD_1>": 151683,
123
+ "<FAKE_PAD_200>": 151882,
124
+ "<FAKE_PAD_201>": 151883,
125
+ "<FAKE_PAD_202>": 151884,
126
+ "<FAKE_PAD_203>": 151885,
127
+ "<FAKE_PAD_204>": 151886,
128
+ "<FAKE_PAD_205>": 151887,
129
+ "<FAKE_PAD_206>": 151888,
130
+ "<FAKE_PAD_207>": 151889,
131
+ "<FAKE_PAD_208>": 151890,
132
+ "<FAKE_PAD_209>": 151891,
133
+ "<FAKE_PAD_20>": 151702,
134
+ "<FAKE_PAD_210>": 151892,
135
+ "<FAKE_PAD_211>": 151893,
136
+ "<FAKE_PAD_212>": 151894,
137
+ "<FAKE_PAD_213>": 151895,
138
+ "<FAKE_PAD_214>": 151896,
139
+ "<FAKE_PAD_215>": 151897,
140
+ "<FAKE_PAD_216>": 151898,
141
+ "<FAKE_PAD_217>": 151899,
142
+ "<FAKE_PAD_218>": 151900,
143
+ "<FAKE_PAD_219>": 151901,
144
+ "<FAKE_PAD_21>": 151703,
145
+ "<FAKE_PAD_220>": 151902,
146
+ "<FAKE_PAD_221>": 151903,
147
+ "<FAKE_PAD_222>": 151904,
148
+ "<FAKE_PAD_223>": 151905,
149
+ "<FAKE_PAD_224>": 151906,
150
+ "<FAKE_PAD_225>": 151907,
151
+ "<FAKE_PAD_226>": 151908,
152
+ "<FAKE_PAD_227>": 151909,
153
+ "<FAKE_PAD_228>": 151910,
154
+ "<FAKE_PAD_229>": 151911,
155
+ "<FAKE_PAD_22>": 151704,
156
+ "<FAKE_PAD_230>": 151912,
157
+ "<FAKE_PAD_231>": 151913,
158
+ "<FAKE_PAD_232>": 151914,
159
+ "<FAKE_PAD_233>": 151915,
160
+ "<FAKE_PAD_234>": 151916,
161
+ "<FAKE_PAD_235>": 151917,
162
+ "<FAKE_PAD_236>": 151918,
163
+ "<FAKE_PAD_237>": 151919,
164
+ "<FAKE_PAD_238>": 151920,
165
+ "<FAKE_PAD_239>": 151921,
166
+ "<FAKE_PAD_23>": 151705,
167
+ "<FAKE_PAD_240>": 151922,
168
+ "<FAKE_PAD_241>": 151923,
169
+ "<FAKE_PAD_242>": 151924,
170
+ "<FAKE_PAD_243>": 151925,
171
+ "<FAKE_PAD_244>": 151926,
172
+ "<FAKE_PAD_245>": 151927,
173
+ "<FAKE_PAD_246>": 151928,
174
+ "<FAKE_PAD_247>": 151929,
175
+ "<FAKE_PAD_248>": 151930,
176
+ "<FAKE_PAD_249>": 151931,
177
+ "<FAKE_PAD_24>": 151706,
178
+ "<FAKE_PAD_250>": 151932,
179
+ "<FAKE_PAD_251>": 151933,
180
+ "<FAKE_PAD_252>": 151934,
181
+ "<FAKE_PAD_253>": 151935,
182
+ "<FAKE_PAD_25>": 151707,
183
+ "<FAKE_PAD_26>": 151708,
184
+ "<FAKE_PAD_27>": 151709,
185
+ "<FAKE_PAD_28>": 151710,
186
+ "<FAKE_PAD_29>": 151711,
187
+ "<FAKE_PAD_2>": 151684,
188
+ "<FAKE_PAD_30>": 151712,
189
+ "<FAKE_PAD_31>": 151713,
190
+ "<FAKE_PAD_32>": 151714,
191
+ "<FAKE_PAD_33>": 151715,
192
+ "<FAKE_PAD_34>": 151716,
193
+ "<FAKE_PAD_35>": 151717,
194
+ "<FAKE_PAD_36>": 151718,
195
+ "<FAKE_PAD_37>": 151719,
196
+ "<FAKE_PAD_38>": 151720,
197
+ "<FAKE_PAD_39>": 151721,
198
+ "<FAKE_PAD_3>": 151685,
199
+ "<FAKE_PAD_40>": 151722,
200
+ "<FAKE_PAD_41>": 151723,
201
+ "<FAKE_PAD_42>": 151724,
202
+ "<FAKE_PAD_43>": 151725,
203
+ "<FAKE_PAD_44>": 151726,
204
+ "<FAKE_PAD_45>": 151727,
205
+ "<FAKE_PAD_46>": 151728,
206
+ "<FAKE_PAD_47>": 151729,
207
+ "<FAKE_PAD_48>": 151730,
208
+ "<FAKE_PAD_49>": 151731,
209
+ "<FAKE_PAD_4>": 151686,
210
+ "<FAKE_PAD_50>": 151732,
211
+ "<FAKE_PAD_51>": 151733,
212
+ "<FAKE_PAD_52>": 151734,
213
+ "<FAKE_PAD_53>": 151735,
214
+ "<FAKE_PAD_54>": 151736,
215
+ "<FAKE_PAD_55>": 151737,
216
+ "<FAKE_PAD_56>": 151738,
217
+ "<FAKE_PAD_57>": 151739,
218
+ "<FAKE_PAD_58>": 151740,
219
+ "<FAKE_PAD_59>": 151741,
220
+ "<FAKE_PAD_5>": 151687,
221
+ "<FAKE_PAD_60>": 151742,
222
+ "<FAKE_PAD_61>": 151743,
223
+ "<FAKE_PAD_62>": 151744,
224
+ "<FAKE_PAD_63>": 151745,
225
+ "<FAKE_PAD_64>": 151746,
226
+ "<FAKE_PAD_65>": 151747,
227
+ "<FAKE_PAD_66>": 151748,
228
+ "<FAKE_PAD_67>": 151749,
229
+ "<FAKE_PAD_68>": 151750,
230
+ "<FAKE_PAD_69>": 151751,
231
+ "<FAKE_PAD_6>": 151688,
232
+ "<FAKE_PAD_70>": 151752,
233
+ "<FAKE_PAD_71>": 151753,
234
+ "<FAKE_PAD_72>": 151754,
235
+ "<FAKE_PAD_73>": 151755,
236
+ "<FAKE_PAD_74>": 151756,
237
+ "<FAKE_PAD_75>": 151757,
238
+ "<FAKE_PAD_76>": 151758,
239
+ "<FAKE_PAD_77>": 151759,
240
+ "<FAKE_PAD_78>": 151760,
241
+ "<FAKE_PAD_79>": 151761,
242
+ "<FAKE_PAD_7>": 151689,
243
+ "<FAKE_PAD_80>": 151762,
244
+ "<FAKE_PAD_81>": 151763,
245
+ "<FAKE_PAD_82>": 151764,
246
+ "<FAKE_PAD_83>": 151765,
247
+ "<FAKE_PAD_84>": 151766,
248
+ "<FAKE_PAD_85>": 151767,
249
+ "<FAKE_PAD_86>": 151768,
250
+ "<FAKE_PAD_87>": 151769,
251
+ "<FAKE_PAD_88>": 151770,
252
+ "<FAKE_PAD_89>": 151771,
253
+ "<FAKE_PAD_8>": 151690,
254
+ "<FAKE_PAD_90>": 151772,
255
+ "<FAKE_PAD_91>": 151773,
256
+ "<FAKE_PAD_92>": 151774,
257
+ "<FAKE_PAD_93>": 151775,
258
+ "<FAKE_PAD_94>": 151776,
259
+ "<FAKE_PAD_95>": 151777,
260
+ "<FAKE_PAD_96>": 151778,
261
+ "<FAKE_PAD_97>": 151779,
262
+ "<FAKE_PAD_98>": 151780,
263
+ "<FAKE_PAD_99>": 151781,
264
+ "<FAKE_PAD_9>": 151691,
265
+ "<FAKE_PAD_PAD_0>": 151940,
266
+ "<FAKE_PAD_PAD_10>": 151950,
267
+ "<FAKE_PAD_PAD_11>": 151951,
268
+ "<FAKE_PAD_PAD_12>": 151952,
269
+ "<FAKE_PAD_PAD_13>": 151953,
270
+ "<FAKE_PAD_PAD_14>": 151954,
271
+ "<FAKE_PAD_PAD_15>": 151955,
272
+ "<FAKE_PAD_PAD_16>": 151956,
273
+ "<FAKE_PAD_PAD_17>": 151957,
274
+ "<FAKE_PAD_PAD_18>": 151958,
275
+ "<FAKE_PAD_PAD_19>": 151959,
276
+ "<FAKE_PAD_PAD_1>": 151941,
277
+ "<FAKE_PAD_PAD_20>": 151960,
278
+ "<FAKE_PAD_PAD_21>": 151961,
279
+ "<FAKE_PAD_PAD_22>": 151962,
280
+ "<FAKE_PAD_PAD_23>": 151963,
281
+ "<FAKE_PAD_PAD_24>": 151964,
282
+ "<FAKE_PAD_PAD_25>": 151965,
283
+ "<FAKE_PAD_PAD_26>": 151966,
284
+ "<FAKE_PAD_PAD_27>": 151967,
285
+ "<FAKE_PAD_PAD_2>": 151942,
286
+ "<FAKE_PAD_PAD_3>": 151943,
287
+ "<FAKE_PAD_PAD_4>": 151944,
288
+ "<FAKE_PAD_PAD_5>": 151945,
289
+ "<FAKE_PAD_PAD_6>": 151946,
290
+ "<FAKE_PAD_PAD_7>": 151947,
291
+ "<FAKE_PAD_PAD_8>": 151948,
292
+ "<FAKE_PAD_PAD_9>": 151949,
293
+ "<IMG_CONTEXT>": 151669,
294
+ "<audio>": 151936,
295
+ "<box>": 151676,
296
+ "<img>": 151670,
297
+ "<interrupt>": 151939,
298
+ "<quad>": 151672,
299
+ "<ref>": 151674,
300
+ "<think>": 151667,
301
+ "<tool_call>": 151657,
302
+ "<tool_response>": 151665,
303
+ "<|action_end|>": 151679,
304
+ "<|action_start|>": 151678,
305
+ "<|box_end|>": 151649,
306
+ "<|box_start|>": 151648,
307
+ "<|endoftext|>": 151643,
308
+ "<|file_sep|>": 151664,
309
+ "<|fim_middle|>": 151660,
310
+ "<|fim_pad|>": 151662,
311
+ "<|fim_prefix|>": 151659,
312
+ "<|fim_suffix|>": 151661,
313
+ "<|im_end|>": 151645,
314
+ "<|im_start|>": 151644,
315
+ "<|image_pad|>": 151655,
316
+ "<|interpreter|>": 151681,
317
+ "<|object_ref_end|>": 151647,
318
+ "<|object_ref_start|>": 151646,
319
+ "<|plugin|>": 151680,
320
+ "<|quad_end|>": 151651,
321
+ "<|quad_start|>": 151650,
322
+ "<|repo_name|>": 151663,
323
+ "<|video_pad|>": 151656,
324
+ "<|vision_end|>": 151653,
325
+ "<|vision_pad|>": 151654,
326
+ "<|vision_start|>": 151652
327
+ }
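added_tokens.json extends the base Qwen vocabulary with the multimodal control tokens listed above (audio and image delimiters, `<interrupt>`, and the `<FAKE_PAD_*>` fillers that pad the vocabulary out to 151968). Below is a minimal sketch of how one might sanity-check the mapping after loading the tokenizer; the model path is a placeholder, not a path confirmed by this commit.

```python
# Hypothetical spot-check of the added-token ids against added_tokens.json.
# "path/to/InteractiveOmni" is a placeholder; substitute the real repo id or a
# local checkout of this commit.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/InteractiveOmni", trust_remote_code=True)

assert tokenizer.convert_tokens_to_ids("<audio>") == 151936
assert tokenizer.convert_tokens_to_ids("</audio>") == 151937
assert tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>") == 151669
assert tokenizer.convert_tokens_to_ids("<|im_end|>") == 151645
```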
campplus.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73
+ size 28303423
config.json ADDED
@@ -0,0 +1,628 @@
1
+ {
2
+ "_commit_hash": null,
3
+ "_name_or_path": "InteractiveOmni",
4
+ "architectures": [
5
+ "InteractiveOmniModel"
6
+ ],
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_interactiveomni.InteractiveOmniConfig",
9
+ "AutoModel": "modeling_interactiveomni.InteractiveOmniModel",
10
+ "AutoModelForCausalLM": "modeling_interactiveomni.InteractiveOmniModel"
11
+ },
12
+ "audio_config": {
13
+ "_name_or_path": "openai/whisper-large-v3",
14
+ "activation_dropout": 0.0,
15
+ "activation_function": "gelu",
16
+ "apply_spec_augment": false,
17
+ "architectures": [
18
+ "WhisperForConditionalGeneration"
19
+ ],
20
+ "attention_dropout": 0.0,
21
+ "begin_suppress_tokens": [
22
+ 220,
23
+ 50257
24
+ ],
25
+ "bos_token_id": 50257,
26
+ "classifier_proj_size": 256,
27
+ "d_model": 1280,
28
+ "decoder_attention_heads": 20,
29
+ "decoder_ffn_dim": 5120,
30
+ "decoder_layerdrop": 0.0,
31
+ "decoder_layers": 32,
32
+ "decoder_start_token_id": 50258,
33
+ "dropout": 0.0,
34
+ "encoder_attention_heads": 20,
35
+ "encoder_ffn_dim": 5120,
36
+ "encoder_layerdrop": 0.0,
37
+ "encoder_layers": 32,
38
+ "eos_token_id": 50257,
39
+ "init_std": 0.02,
40
+ "is_encoder_decoder": true,
41
+ "mask_feature_length": 10,
42
+ "mask_feature_min_masks": 0,
43
+ "mask_feature_prob": 0.0,
44
+ "mask_time_length": 10,
45
+ "mask_time_min_masks": 2,
46
+ "mask_time_prob": 0.05,
47
+ "max_length": 448,
48
+ "max_source_positions": 1500,
49
+ "max_target_positions": 448,
50
+ "median_filter_width": 7,
51
+ "model_type": "whisper",
52
+ "num_hidden_layers": 32,
53
+ "num_mel_bins": 128,
54
+ "pad_token_id": 50256,
55
+ "scale_embedding": false,
56
+ "torch_dtype": "float16",
57
+ "transformers_version": "4.36.0.dev0",
58
+ "use_cache": true,
59
+ "use_weighted_layer_sum": false,
60
+ "vocab_size": 51866
61
+ },
62
+ "audio_preprocessor_config": {
63
+ "chunk_length": 30,
64
+ "feature_extractor_type": "WhisperFeatureExtractor",
65
+ "feature_size": 128,
66
+ "hop_length": 160,
67
+ "n_fft": 400,
68
+ "n_samples": 480000,
69
+ "nb_max_frames": 3000,
70
+ "padding_side": "right",
71
+ "padding_value": 0.0,
72
+ "processor_class": "WhisperProcessor",
73
+ "return_attention_mask": false,
74
+ "sampling_rate": 16000
75
+ },
76
+ "downsample_ratio": 0.25,
77
+ "dynamic_image_size": true,
78
+ "force_image_size": 448,
79
+ "llm_config": {
80
+ "_name_or_path": "Qwen/Qwen3-8B",
81
+ "add_cross_attention": false,
82
+ "architectures": [
83
+ "Qwen3ForCausalLM"
84
+ ],
85
+ "attention_bias": false,
86
+ "attention_dropout": 0.0,
87
+ "attn_implementation": "flash_attention_2",
88
+ "bad_words_ids": null,
89
+ "begin_suppress_tokens": null,
90
+ "bias": false,
91
+ "bos_token_id": 151643,
92
+ "chunk_size_feed_forward": 0,
93
+ "cross_attention_hidden_size": null,
94
+ "decoder_start_token_id": null,
95
+ "diversity_penalty": 0.0,
96
+ "do_sample": false,
97
+ "early_stopping": false,
98
+ "encoder_no_repeat_ngram_size": 0,
99
+ "eos_token_id": 151645,
100
+ "exponential_decay_length_penalty": null,
101
+ "finetuning_task": null,
102
+ "forced_bos_token_id": null,
103
+ "forced_eos_token_id": null,
104
+ "head_dim": 128,
105
+ "hidden_act": "silu",
106
+ "hidden_size": 4096,
107
+ "id2label": {
108
+ "0": "LABEL_0",
109
+ "1": "LABEL_1"
110
+ },
111
+ "initializer_range": 0.02,
112
+ "intermediate_size": 12288,
113
+ "is_decoder": false,
114
+ "is_encoder_decoder": false,
115
+ "label2id": {
116
+ "LABEL_0": 0,
117
+ "LABEL_1": 1
118
+ },
119
+ "length_penalty": 1.0,
120
+ "max_length": 20,
121
+ "max_position_embeddings": 40960,
122
+ "max_window_layers": 40,
123
+ "min_length": 0,
124
+ "model_type": "qwen3",
125
+ "no_repeat_ngram_size": 0,
126
+ "num_attention_heads": 32,
127
+ "num_beam_groups": 1,
128
+ "num_beams": 1,
129
+ "num_hidden_layers": 36,
130
+ "num_key_value_heads": 8,
131
+ "num_return_sequences": 1,
132
+ "output_attentions": false,
133
+ "output_hidden_states": false,
134
+ "output_scores": false,
135
+ "pad_token_id": null,
136
+ "prefix": null,
137
+ "problem_type": null,
138
+ "pruned_heads": {},
139
+ "remove_invalid_values": false,
140
+ "repetition_penalty": 1.0,
141
+ "return_dict": true,
142
+ "return_dict_in_generate": false,
143
+ "rms_norm_eps": 1e-06,
144
+ "rope_scaling": {
145
+ "factor": 2.0,
146
+ "type": "dynamic"
147
+ },
148
+ "rope_theta": 1000000.0,
149
+ "sep_token_id": null,
150
+ "sliding_window": null,
151
+ "suppress_tokens": null,
152
+ "task_specific_params": null,
153
+ "temperature": 1.0,
154
+ "tf_legacy_loss": false,
155
+ "tie_encoder_decoder": false,
156
+ "tie_word_embeddings": false,
157
+ "tokenizer_class": null,
158
+ "top_k": 50,
159
+ "top_p": 1.0,
160
+ "torch_dtype": "bfloat16",
161
+ "torchscript": false,
162
+ "transformers_version": "4.51.0",
163
+ "typical_p": 1.0,
164
+ "use_bfloat16": false,
165
+ "use_cache": false,
166
+ "use_sliding_window": false,
167
+ "vocab_size": 151968
168
+ },
169
+ "max_dynamic_patch": 12,
170
+ "min_dynamic_patch": 1,
171
+ "model_type": "interactiveomni",
172
+ "pad2square": false,
173
+ "ps_version": "v2",
174
+ "select_layer": -1,
175
+ "template": "interactiveomni_template",
176
+ "torch_dtype": "bfloat16",
177
+ "transformers_version": null,
178
+ "use_backbone_lora": 0,
179
+ "use_llm_lora": 0,
180
+ "use_thumbnail": true,
181
+ "vision_config": {
182
+ "_name_or_path": "OpenGVLab/InternViT-300M-448px",
183
+ "add_cross_attention": false,
184
+ "architectures": [
185
+ "InternVisionModel"
186
+ ],
187
+ "auto_map": {
188
+ "AutoConfig": "configuration_intern_vit.InternVisionConfig",
189
+ "AutoModel": "modeling_intern_vit.InternVisionModel"
190
+ },
191
+ "attention_dropout": 0.0,
192
+ "drop_path_rate": 0.1,
193
+ "dropout": 0.0,
194
+ "hidden_act": "gelu",
195
+ "hidden_size": 1024,
196
+ "image_size": 448,
197
+ "initializer_factor": 1.0,
198
+ "initializer_range": 0.02,
199
+ "intermediate_size": 4096,
200
+ "layer_norm_eps": 1e-06,
201
+ "model_type": "intern_vit_6b",
202
+ "norm_type": "layer_norm",
203
+ "num_attention_heads": 16,
204
+ "num_channels": 3,
205
+ "num_hidden_layers": 24,
206
+ "qk_normalization": false,
207
+ "qkv_bias": true,
208
+ "torch_dtype": "bfloat16",
209
+ "transformers_version": "4.37.2",
210
+ "use_flash_attn": true
211
+ },
212
+ "flow_config": {
213
+ "_attn_implementation_internal": null,
214
+ "_commit_hash": null,
215
+ "_name_or_path": "",
216
+ "add_cross_attention": false,
217
+ "architectures": [
218
+ "CausalMaskedDiffWithXvec"
219
+ ],
220
+ "bad_words_ids": null,
221
+ "begin_suppress_tokens": null,
222
+ "bos_token_id": null,
223
+ "chunk_size_feed_forward": 0,
224
+ "cross_attention_hidden_size": null,
225
+ "decoder_config": {
226
+ "cfm_params": {
227
+ "inference_cfg_rate": 0.7,
228
+ "reg_loss_type": "l1",
229
+ "sigma_min": 1e-06,
230
+ "solver": "euler",
231
+ "t_scheduler": "cosine",
232
+ "training_cfg_rate": 0.2
233
+ },
234
+ "estimator_config": {
235
+ "act_fn": "gelu",
236
+ "attention_head_dim": 64,
237
+ "causal": true,
238
+ "channels": [
239
+ 256
240
+ ],
241
+ "dropout": 0.0,
242
+ "in_channels": 320,
243
+ "n_blocks": 4,
244
+ "num_heads": 8,
245
+ "num_mid_blocks": 12,
246
+ "out_channels": 80
247
+ },
248
+ "in_channels": 240,
249
+ "n_spks": 1,
250
+ "spk_emb_dim": 80
251
+ },
252
+ "decoder_start_token_id": null,
253
+ "diversity_penalty": 0.0,
254
+ "do_sample": false,
255
+ "early_stopping": false,
256
+ "encoder_config": {
257
+ "attention_dropout_rate": 0.1,
258
+ "attention_heads": 8,
259
+ "dropout_rate": 0.1,
260
+ "input_layer": "linear",
261
+ "input_size": 512,
262
+ "linear_units": 2048,
263
+ "macaron_style": false,
264
+ "normalize_before": true,
265
+ "num_blocks": 6,
266
+ "output_size": 512,
267
+ "pos_enc_layer_type": "rel_pos_espnet",
268
+ "positional_dropout_rate": 0.1,
269
+ "selfattention_layer_type": "rel_selfattn",
270
+ "use_cnn_module": false
271
+ },
272
+ "encoder_no_repeat_ngram_size": 0,
273
+ "eos_token_id": null,
274
+ "exponential_decay_length_penalty": null,
275
+ "finetuning_task": null,
276
+ "forced_bos_token_id": null,
277
+ "forced_eos_token_id": null,
278
+ "id2label": {
279
+ "0": "LABEL_0",
280
+ "1": "LABEL_1"
281
+ },
282
+ "input_frame_rate": 25,
283
+ "input_size": 512,
284
+ "is_decoder": false,
285
+ "is_encoder_decoder": false,
286
+ "label2id": {
287
+ "LABEL_0": 0,
288
+ "LABEL_1": 1
289
+ },
290
+ "length_penalty": 1.0,
291
+ "max_length": 20,
292
+ "min_length": 0,
293
+ "no_repeat_ngram_size": 0,
294
+ "num_beam_groups": 1,
295
+ "num_beams": 1,
296
+ "num_return_sequences": 1,
297
+ "only_mask_loss": true,
298
+ "output_attentions": false,
299
+ "output_hidden_states": false,
300
+ "output_scores": false,
301
+ "output_size": 80,
302
+ "output_type": "mel",
303
+ "pad_token_id": null,
304
+ "pre_lookahead_len": 3,
305
+ "prefix": null,
306
+ "problem_type": null,
307
+ "pruned_heads": {},
308
+ "remove_invalid_values": false,
309
+ "repetition_penalty": 1.0,
310
+ "return_dict": true,
311
+ "return_dict_in_generate": false,
312
+ "sep_token_id": null,
313
+ "spk_embed_dim": 192,
314
+ "suppress_tokens": null,
315
+ "task_specific_params": null,
316
+ "temperature": 1.0,
317
+ "tf_legacy_loss": false,
318
+ "tie_encoder_decoder": false,
319
+ "tie_word_embeddings": true,
320
+ "token_mel_ratio": 2,
321
+ "tokenizer_class": null,
322
+ "top_k": 50,
323
+ "top_p": 1.0,
324
+ "torch_dtype": "float32",
325
+ "torchscript": false,
326
+ "transformers_version": null,
327
+ "typical_p": 1.0,
328
+ "use_bfloat16": false,
329
+ "vocab_size": 6561
330
+ },
331
+ "hifigan_config": {
332
+ "_attn_implementation_internal": null,
333
+ "_commit_hash": null,
334
+ "_name_or_path": "",
335
+ "add_cross_attention": false,
336
+ "architectures": [
337
+ "HiFTGenerator"
338
+ ],
339
+ "audio_limit": 0.99,
340
+ "bad_words_ids": null,
341
+ "base_channels": 512,
342
+ "begin_suppress_tokens": null,
343
+ "bos_token_id": null,
344
+ "chunk_size_feed_forward": 0,
345
+ "cross_attention_hidden_size": null,
346
+ "decoder_start_token_id": null,
347
+ "diversity_penalty": 0.0,
348
+ "do_sample": false,
349
+ "early_stopping": false,
350
+ "encoder_no_repeat_ngram_size": 0,
351
+ "eos_token_id": null,
352
+ "exponential_decay_length_penalty": null,
353
+ "f0_predictor_config": {
354
+ "cond_channels": 512,
355
+ "in_channels": 80,
356
+ "num_class": 1
357
+ },
358
+ "finetuning_task": null,
359
+ "forced_bos_token_id": null,
360
+ "forced_eos_token_id": null,
361
+ "id2label": {
362
+ "0": "LABEL_0",
363
+ "1": "LABEL_1"
364
+ },
365
+ "in_channels": 80,
366
+ "is_decoder": false,
367
+ "is_encoder_decoder": false,
368
+ "istft_params": {
369
+ "hop_len": 4,
370
+ "n_fft": 16
371
+ },
372
+ "label2id": {
373
+ "LABEL_0": 0,
374
+ "LABEL_1": 1
375
+ },
376
+ "length_penalty": 1.0,
377
+ "lrelu_slope": 0.1,
378
+ "max_length": 20,
379
+ "min_length": 0,
380
+ "nb_harmonics": 8,
381
+ "no_repeat_ngram_size": 0,
382
+ "nsf_alpha": 0.1,
383
+ "nsf_sigma": 0.003,
384
+ "nsf_voiced_threshold": 10,
385
+ "num_beam_groups": 1,
386
+ "num_beams": 1,
387
+ "num_return_sequences": 1,
388
+ "output_attentions": false,
389
+ "output_hidden_states": false,
390
+ "output_scores": false,
391
+ "pad_token_id": null,
392
+ "prefix": null,
393
+ "problem_type": null,
394
+ "pruned_heads": {},
395
+ "remove_invalid_values": false,
396
+ "repetition_penalty": 1.0,
397
+ "resblock_dilation_sizes": [
398
+ [
399
+ 1,
400
+ 3,
401
+ 5
402
+ ],
403
+ [
404
+ 1,
405
+ 3,
406
+ 5
407
+ ],
408
+ [
409
+ 1,
410
+ 3,
411
+ 5
412
+ ]
413
+ ],
414
+ "resblock_kernel_sizes": [
415
+ 3,
416
+ 7,
417
+ 11
418
+ ],
419
+ "return_dict": true,
420
+ "return_dict_in_generate": false,
421
+ "sampling_rate": 24000,
422
+ "sep_token_id": null,
423
+ "source_resblock_dilation_sizes": [
424
+ [
425
+ 1,
426
+ 3,
427
+ 5
428
+ ],
429
+ [
430
+ 1,
431
+ 3,
432
+ 5
433
+ ],
434
+ [
435
+ 1,
436
+ 3,
437
+ 5
438
+ ]
439
+ ],
440
+ "source_resblock_kernel_sizes": [
441
+ 7,
442
+ 7,
443
+ 11
444
+ ],
445
+ "suppress_tokens": null,
446
+ "task_specific_params": null,
447
+ "temperature": 1.0,
448
+ "tf_legacy_loss": false,
449
+ "tie_encoder_decoder": false,
450
+ "tie_word_embeddings": true,
451
+ "tokenizer_class": null,
452
+ "top_k": 50,
453
+ "top_p": 1.0,
454
+ "torch_dtype": "float32",
455
+ "torchscript": false,
456
+ "transformers_version": null,
457
+ "typical_p": 1.0,
458
+ "upsample_kernel_sizes": [
459
+ 16,
460
+ 11,
461
+ 7
462
+ ],
463
+ "upsample_rates": [
464
+ 8,
465
+ 5,
466
+ 3
467
+ ],
468
+ "use_bfloat16": false
469
+ },
470
+ "voicelm_config": {
471
+ "_attn_implementation_internal": null,
472
+ "_commit_hash": null,
473
+ "_name_or_path": "",
474
+ "add_cross_attention": false,
475
+ "architectures": null,
476
+ "bad_words_ids": null,
477
+ "begin_suppress_tokens": null,
478
+ "bos_token_id": null,
479
+ "chunk_size_feed_forward": 0,
480
+ "cross_attention_hidden_size": null,
481
+ "decoder_start_token_id": null,
482
+ "diversity_penalty": 0.0,
483
+ "do_sample": false,
484
+ "early_stopping": false,
485
+ "encoder_no_repeat_ngram_size": 0,
486
+ "eos_token_id": null,
487
+ "exponential_decay_length_penalty": null,
488
+ "finetuning_task": null,
489
+ "forced_bos_token_id": null,
490
+ "forced_eos_token_id": null,
491
+ "id2label": {
492
+ "0": "LABEL_0",
493
+ "1": "LABEL_1"
494
+ },
495
+ "is_decoder": false,
496
+ "is_encoder_decoder": false,
497
+ "label2id": {
498
+ "LABEL_0": 0,
499
+ "LABEL_1": 1
500
+ },
501
+ "length_normalized_loss": true,
502
+ "length_penalty": 1.0,
503
+ "llm_config": {
504
+ "add_cross_attention": false,
505
+ "architectures": [
506
+ "Qwen2ForCausalLM"
507
+ ],
508
+ "attention_dropout": 0.0,
509
+ "bad_words_ids": null,
510
+ "begin_suppress_tokens": null,
511
+ "bos_token_id": 151643,
512
+ "chunk_size_feed_forward": 0,
513
+ "cross_attention_hidden_size": null,
514
+ "decoder_start_token_id": null,
515
+ "diversity_penalty": 0.0,
516
+ "do_sample": false,
517
+ "early_stopping": false,
518
+ "encoder_no_repeat_ngram_size": 0,
519
+ "eos_token_id": 151643,
520
+ "exponential_decay_length_penalty": null,
521
+ "finetuning_task": null,
522
+ "forced_bos_token_id": null,
523
+ "forced_eos_token_id": null,
524
+ "hidden_act": "silu",
525
+ "hidden_size": 896,
526
+ "id2label": {
527
+ "0": "LABEL_0",
528
+ "1": "LABEL_1"
529
+ },
530
+ "initializer_range": 0.02,
531
+ "intermediate_size": 4864,
532
+ "is_decoder": false,
533
+ "is_encoder_decoder": false,
534
+ "label2id": {
535
+ "LABEL_0": 0,
536
+ "LABEL_1": 1
537
+ },
538
+ "length_penalty": 1.0,
539
+ "max_length": 20,
540
+ "max_position_embeddings": 32768,
541
+ "max_window_layers": 24,
542
+ "min_length": 0,
543
+ "model_type": "qwen2",
544
+ "no_repeat_ngram_size": 0,
545
+ "num_attention_heads": 14,
546
+ "num_beam_groups": 1,
547
+ "num_beams": 1,
548
+ "num_hidden_layers": 24,
549
+ "num_key_value_heads": 2,
550
+ "num_return_sequences": 1,
551
+ "output_attentions": false,
552
+ "output_hidden_states": false,
553
+ "output_scores": false,
554
+ "pad_token_id": null,
555
+ "prefix": null,
556
+ "problem_type": null,
557
+ "pruned_heads": {},
558
+ "remove_invalid_values": false,
559
+ "repetition_penalty": 1.0,
560
+ "return_dict": true,
561
+ "return_dict_in_generate": false,
562
+ "rms_norm_eps": 1e-06,
563
+ "rope_theta": 1000000.0,
564
+ "sep_token_id": null,
565
+ "sliding_window": 32768,
566
+ "suppress_tokens": null,
567
+ "task_specific_params": null,
568
+ "temperature": 1.0,
569
+ "tf_legacy_loss": false,
570
+ "tie_encoder_decoder": false,
571
+ "tie_word_embeddings": true,
572
+ "tokenizer_class": null,
573
+ "top_k": 50,
574
+ "top_p": 1.0,
575
+ "torch_dtype": "bfloat16",
576
+ "torchscript": false,
577
+ "transformers_version": "4.37.2",
578
+ "typical_p": 1.0,
579
+ "use_bfloat16": false,
580
+ "use_cache": false,
581
+ "use_mrope": false,
582
+ "use_sliding_window": false,
583
+ "vocab_size": 151936
584
+ },
585
+ "llm_input_size": 896,
586
+ "llm_output_size": 896,
587
+ "lsm_weight": 0,
588
+ "max_length": 20,
589
+ "min_length": 0,
590
+ "no_repeat_ngram_size": 0,
591
+ "num_beam_groups": 1,
592
+ "num_beams": 1,
593
+ "num_return_sequences": 1,
594
+ "output_attentions": false,
595
+ "output_hidden_states": false,
596
+ "output_scores": false,
597
+ "pad_token_id": null,
598
+ "prefix": null,
599
+ "problem_type": null,
600
+ "pruned_heads": {},
601
+ "remove_invalid_values": false,
602
+ "repetition_penalty": 1.0,
603
+ "return_dict": true,
604
+ "return_dict_in_generate": false,
605
+ "sampling_config": {
606
+ "tau_r": 0.1,
607
+ "top_k": 15,
608
+ "top_p": 0.7,
609
+ "win_size": 10
610
+ },
611
+ "sep_token_id": null,
612
+ "speech_token_size": 6561,
613
+ "suppress_tokens": null,
614
+ "task_specific_params": null,
615
+ "temperature": 1.0,
616
+ "tf_legacy_loss": false,
617
+ "tie_encoder_decoder": false,
618
+ "tie_word_embeddings": true,
619
+ "tokenizer_class": null,
620
+ "top_k": 50,
621
+ "top_p": 1.0,
622
+ "torch_dtype": null,
623
+ "torchscript": false,
624
+ "transformers_version": null,
625
+ "typical_p": 1.0,
626
+ "use_bfloat16": false
627
+ }
628
+ }
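config.json wires the custom classes in through `auto_map`, so the stock `transformers` Auto classes can load this model once `trust_remote_code=True` is passed. A minimal loading sketch under that assumption; the path below is a placeholder for a local checkout or repo id.

```python
# Minimal loading sketch based on the "auto_map" entries above; the path is a
# placeholder, not a repository id confirmed by this commit.
import torch
from transformers import AutoConfig, AutoModel

model_path = "path/to/InteractiveOmni"  # placeholder

config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
print(config.model_type)             # "interactiveomni"
print(config.llm_config.model_type)  # "qwen3" backbone, per llm_config above

model = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,      # matches "torch_dtype": "bfloat16" above
    trust_remote_code=True,
)
```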
configuration_flow.py ADDED
@@ -0,0 +1,102 @@
1
+ # --------------------------------------------------------
2
+ # SenseTime
3
+ # Copyright (c) 2025 SenseTime
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ import copy
7
+
8
+ from transformers.configuration_utils import PretrainedConfig
9
+ from transformers.utils import logging
10
+
11
+ logger = logging.get_logger(__name__)
12
+
13
+ class FlowConfig(PretrainedConfig):
14
+ def __init__(
15
+ self,
16
+ input_size = 512,
17
+ output_size= 80,
18
+ spk_embed_dim = 192,
19
+ output_type = 'mel',
20
+ vocab_size = 6561,
21
+ input_frame_rate = 25,
22
+ only_mask_loss = True,
23
+ token_mel_ratio=2,
24
+ pre_lookahead_len=3,
25
+ encoder_config={'output_size': 512,
26
+ 'attention_heads': 8,
27
+ 'linear_units': 2048,
28
+ 'num_blocks': 6,
29
+ 'dropout_rate': 0.1,
30
+ 'positional_dropout_rate': 0.1,
31
+ 'attention_dropout_rate': 0.1,
32
+ 'normalize_before': True,
33
+ 'input_layer': 'linear',
34
+ 'pos_enc_layer_type': 'rel_pos_espnet',
35
+ 'selfattention_layer_type': 'rel_selfattn',
36
+ 'input_size': 512,
37
+ 'use_cnn_module': False,
38
+ 'macaron_style': False,
39
+ },
40
+ decoder_config={'in_channels': 240,
41
+ 'n_spks': 1,
42
+ 'spk_emb_dim': 80,
43
+ 'cfm_params': {
44
+ 'sigma_min': 1e-06,
45
+ 'solver': 'euler',
46
+ 't_scheduler': 'cosine',
47
+ 'training_cfg_rate': 0.2,
48
+ 'inference_cfg_rate': 0.7,
49
+ 'reg_loss_type': 'l1',
50
+ },
51
+ 'estimator_config':{
52
+ 'in_channels': 320,
53
+ 'out_channels': 80,
54
+ 'causal': True,
55
+ 'channels': [256],
56
+ 'dropout': 0.0,
57
+ 'attention_head_dim': 64,
58
+ 'n_blocks': 4,
59
+ 'num_mid_blocks': 12,
60
+ 'num_heads': 8,
61
+ 'act_fn': 'gelu'
62
+ }
63
+ },
64
+ **kwargs):
65
+ super().__init__(**kwargs)
66
+
67
+ self.encoder_config = encoder_config
68
+ self.decoder_config = decoder_config
69
+
70
+ self.input_size = input_size
71
+ self.output_size = output_size
72
+ self.spk_embed_dim = spk_embed_dim
73
+ self.output_type = output_type
74
+ self.vocab_size = vocab_size
75
+ self.input_frame_rate = input_frame_rate
76
+ self.only_mask_loss = only_mask_loss
77
+ self.token_mel_ratio = token_mel_ratio
78
+ self.pre_lookahead_len = pre_lookahead_len
79
+ pass
80
+
81
+ def to_dict(self):
82
+ """
83
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
84
+
85
+ Returns:
86
+ `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
87
+ """
88
+ output = copy.deepcopy(self.__dict__)
89
+ output['encoder_config'] = self.encoder_config
90
+ output['decoder_config'] = self.decoder_config
91
+
92
+ output['input_size'] = self.input_size
93
+ output['output_size'] = self.output_size
94
+ output['spk_embed_dim'] = self.spk_embed_dim
95
+ output['output_type'] = self.output_type
96
+ output['vocab_size'] = self.vocab_size
97
+ output['input_frame_rate'] = self.input_frame_rate
98
+ output['only_mask_loss'] = self.only_mask_loss
99
+ output['token_mel_ratio'] = self.token_mel_ratio
100
+ output['pre_lookahead_len'] = self.pre_lookahead_len
101
+
102
+ return output
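FlowConfig carries the same defaults that appear under `flow_config` in config.json, so it can be instantiated standalone and round-tripped through the overridden `to_dict()`. A small sketch, assuming the module is importable from a checkout of this commit:

```python
# Sketch: build FlowConfig from its defaults and inspect the serialized dict.
from configuration_flow import FlowConfig  # local module from this commit

cfg = FlowConfig()
d = cfg.to_dict()
assert d["vocab_size"] == 6561
assert d["token_mel_ratio"] == 2
assert d["decoder_config"]["estimator_config"]["num_mid_blocks"] == 12
```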
configuration_hifigan.py ADDED
@@ -0,0 +1,87 @@
1
+ # --------------------------------------------------------
2
+ # SenseTime
3
+ # Copyright (c) 2025 SenseTime
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ import copy
7
+
8
+ from transformers.configuration_utils import PretrainedConfig
9
+ from transformers.utils import logging
10
+
11
+ logger = logging.get_logger(__name__)
12
+
13
+ class HiFiGanConfig(PretrainedConfig):
14
+ def __init__(
15
+ self,
16
+ in_channels = 80,
17
+ base_channels = 512,
18
+ nb_harmonics = 8,
19
+ sampling_rate =24000,
20
+ nsf_alpha= 0.1,
21
+ nsf_sigma= 0.003,
22
+ nsf_voiced_threshold = 10,
23
+ upsample_rates = [8, 5, 3],
24
+ upsample_kernel_sizes = [16, 11, 7],
25
+ istft_params ={'n_fft': 16,
26
+ 'hop_len': 4,
27
+ },
28
+ resblock_kernel_sizes = [3, 7, 11],
29
+ resblock_dilation_sizes = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
30
+ source_resblock_kernel_sizes = [7, 7, 11],
31
+ source_resblock_dilation_sizes = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
32
+ lrelu_slope = 0.1,
33
+ audio_limit =0.99,
34
+ f0_predictor_config={
35
+ 'num_class': 1,
36
+ 'in_channels': 80,
37
+ 'cond_channels': 512
38
+ },
39
+ **kwargs):
40
+ super().__init__(**kwargs)
41
+
42
+ self.in_channels = in_channels
43
+ self.base_channels = base_channels
44
+ self.nb_harmonics = nb_harmonics
45
+ self.sampling_rate = sampling_rate
46
+ self.nsf_alpha = nsf_alpha
47
+ self.nsf_sigma = nsf_sigma
48
+ self.nsf_voiced_threshold = nsf_voiced_threshold
49
+ self.upsample_rates = upsample_rates
50
+ self.upsample_kernel_sizes = upsample_kernel_sizes
51
+ self.istft_params = istft_params
52
+ self.resblock_kernel_sizes = resblock_kernel_sizes
53
+ self.resblock_dilation_sizes= resblock_dilation_sizes
54
+ self.source_resblock_kernel_sizes = source_resblock_kernel_sizes
55
+ self.source_resblock_dilation_sizes = source_resblock_dilation_sizes
56
+ self.lrelu_slope = lrelu_slope
57
+ self.audio_limit = audio_limit
58
+ self.f0_predictor_config = f0_predictor_config
59
+ pass
60
+
61
+
62
+ def to_dict(self):
63
+ """
64
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
65
+
66
+ Returns:
67
+ `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
68
+ """
69
+ output = copy.deepcopy(self.__dict__)
70
+ output['in_channels'] = self.in_channels
71
+ output['base_channels'] = self.base_channels
72
+ output['nb_harmonics'] = self.nb_harmonics
73
+ output['sampling_rate'] = self.sampling_rate
74
+ output['nsf_alpha'] = self.nsf_alpha
75
+ output['nsf_sigma'] = self.nsf_sigma
76
+ output['nsf_voiced_threshold'] = self.nsf_voiced_threshold
77
+ output['upsample_rates'] = self.upsample_rates
78
+ output['upsample_kernel_sizes'] = self.upsample_kernel_sizes
79
+ output['istft_params'] = self.istft_params
80
+ output['resblock_kernel_sizes'] = self.resblock_kernel_sizes
81
+ output['resblock_dilation_sizes'] = self.resblock_dilation_sizes
82
+ output['source_resblock_dilation_sizes'] = self.source_resblock_dilation_sizes
83
+ output['lrelu_slope'] = self.lrelu_slope
84
+ output['audio_limit'] = self.audio_limit
85
+ output['f0_predictor_config'] = self.f0_predictor_config
86
+
87
+ return output
configuration_interactiveomni.py ADDED
@@ -0,0 +1,125 @@
1
+ # --------------------------------------------------------
2
+ # SenseTime
3
+ # Copyright (c) 2025 SenseTime
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ import copy
8
+
9
+ from transformers.configuration_utils import PretrainedConfig
10
+ from transformers.utils import logging
11
+ from transformers import LlamaConfig, Qwen2Config, Qwen3Config
12
+
13
+ from .configuration_intern_vit import InternVisionConfig
14
+ from .configuration_whisper import WhisperConfig
15
+ from .configuration_voicelm import VoiceLMConfig
16
+ from .configuration_flow import FlowConfig
17
+ from .configuration_hifigan import HiFiGanConfig
18
+
19
+ logger = logging.get_logger(__name__)
20
+
21
+ class InteractiveOmniConfig(PretrainedConfig):
22
+ model_type = 'interactiveomni'
23
+ is_composition = True
24
+
25
+ def __init__(
26
+ self,
27
+ vision_config=None,
28
+ llm_config=None,
29
+ audio_config=None,
30
+ voicelm_config=None,
31
+ flow_config=None,
32
+ hifigan_config=None,
33
+ use_backbone_lora=0,
34
+ use_llm_lora=0,
35
+ pad2square=False,
36
+ select_layer=-4,
37
+ force_image_size=None,
38
+ downsample_ratio=0.5,
39
+ template=None,
40
+ dynamic_image_size=False,
41
+ use_thumbnail=False,
42
+ ps_version='v1',
43
+ min_dynamic_patch=1,
44
+ max_dynamic_patch=6,
45
+ **kwargs):
46
+ super().__init__(**kwargs)
47
+
48
+ if vision_config is None:
49
+ vision_config = {}
50
+ logger.info('vision_config is None. Initializing the InternVisionConfig with default values.')
51
+
52
+ if llm_config is None:
53
+ llm_config = {}
54
+ logger.info('llm_config is None. Initializing the Qwen3Config as default values.')
55
+
56
+ if audio_config is None:
57
+ audio_config = {}
58
+ logger.info('audio_config is None. Initializing the WhisperConfig as default values.')
59
+
60
+ if voicelm_config is None:
61
+ voicelm_config = {}
62
+ logger.info('voicelm_config is None. Initializing the VoiceLMConfig as default values')
63
+
64
+ if flow_config is None:
65
+ flow_config = {}
66
+ logger.info('flow_config is None. Initializing the FlowConfig as default values')
67
+
68
+ if hifigan_config is None:
69
+ hifigan_config = {}
70
+ logger.info('hifigan_config is None. Initializing the HiFiGanConfig as default values')
71
+
72
+ self.vision_config = InternVisionConfig(**vision_config)
73
+ self.audio_config = WhisperConfig(**audio_config)
74
+ self.llm_config = Qwen3Config(**llm_config)
75
+ self.voicelm_config = VoiceLMConfig(**voicelm_config)
76
+ self.flow_config = FlowConfig(**flow_config)
77
+ self.hifigan_config = HiFiGanConfig(**hifigan_config)
78
+ self.use_backbone_lora = use_backbone_lora
79
+ self.use_llm_lora = use_llm_lora
80
+ self.pad2square = pad2square
81
+ self.select_layer = select_layer
82
+ self.force_image_size = force_image_size
83
+ self.downsample_ratio = downsample_ratio
84
+ self.template = template
85
+ self.dynamic_image_size = dynamic_image_size
86
+ self.use_thumbnail = use_thumbnail
87
+ self.ps_version = ps_version # pixel shuffle version
88
+ self.min_dynamic_patch = min_dynamic_patch
89
+ self.max_dynamic_patch = max_dynamic_patch
90
+
91
+ logger.info(f'vision_select_layer: {self.select_layer}')
92
+ logger.info(f'ps_version: {self.ps_version}')
93
+ logger.info(f'min_dynamic_patch: {self.min_dynamic_patch}')
94
+ logger.info(f'max_dynamic_patch: {self.max_dynamic_patch}')
95
+ pass
96
+
97
+ def to_dict(self):
98
+ """
99
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
100
+
101
+ Returns:
102
+ `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
103
+ """
104
+ output = copy.deepcopy(self.__dict__)
105
+ output['vision_config'] = self.vision_config.to_dict()
106
+ output['audio_config'] = self.audio_config.to_dict()
107
+ output['llm_config'] = self.llm_config.to_dict()
108
+ output['voicelm_config'] = self.voicelm_config.to_dict()
109
+ output['flow_config'] = self.flow_config.to_dict()
110
+ output['hifigan_config'] = self.hifigan_config.to_dict()
111
+ output['model_type'] = self.__class__.model_type
112
+ output['use_backbone_lora'] = self.use_backbone_lora
113
+ output['use_llm_lora'] = self.use_llm_lora
114
+ output['pad2square'] = self.pad2square
115
+ output['select_layer'] = self.select_layer
116
+ output['force_image_size'] = self.force_image_size
117
+ output['downsample_ratio'] = self.downsample_ratio
118
+ output['template'] = self.template
119
+ output['dynamic_image_size'] = self.dynamic_image_size
120
+ output['use_thumbnail'] = self.use_thumbnail
121
+ output['ps_version'] = self.ps_version
122
+ output['min_dynamic_patch'] = self.min_dynamic_patch
123
+ output['max_dynamic_patch'] = self.max_dynamic_patch
124
+
125
+ return output
configuration_intern_vit.py ADDED
@@ -0,0 +1,119 @@
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2023 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ import os
7
+ from typing import Union
8
+
9
+ from transformers.configuration_utils import PretrainedConfig
10
+ from transformers.utils import logging
11
+
12
+ logger = logging.get_logger(__name__)
13
+
14
+
15
+ class InternVisionConfig(PretrainedConfig):
16
+ r"""
17
+ This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to
18
+ instantiate a vision encoder according to the specified arguments, defining the model architecture.
19
+
20
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
21
+ documentation from [`PretrainedConfig`] for more information.
22
+
23
+ Args:
24
+ num_channels (`int`, *optional*, defaults to 3):
25
+ Number of color channels in the input images (e.g., 3 for RGB).
26
+ patch_size (`int`, *optional*, defaults to 14):
27
+ The size (resolution) of each patch.
28
+ image_size (`int`, *optional*, defaults to 224):
29
+ The size (resolution) of each image.
30
+ qkv_bias (`bool`, *optional*, defaults to `False`):
31
+ Whether to add a bias to the queries and values in the self-attention layers.
32
+ hidden_size (`int`, *optional*, defaults to 3200):
33
+ Dimensionality of the encoder layers and the pooler layer.
34
+ num_attention_heads (`int`, *optional*, defaults to 25):
35
+ Number of attention heads for each attention layer in the Transformer encoder.
36
+ intermediate_size (`int`, *optional*, defaults to 12800):
37
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
38
+ qk_normalization (`bool`, *optional*, defaults to `True`):
39
+ Whether to normalize the queries and keys in the self-attention layers.
40
+ num_hidden_layers (`int`, *optional*, defaults to 48):
41
+ Number of hidden layers in the Transformer encoder.
42
+ use_flash_attn (`bool`, *optional*, defaults to `True`):
43
+ Whether to use flash attention mechanism.
44
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
45
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
46
+ `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported.
47
+ layer_norm_eps (`float`, *optional*, defaults to 1e-6):
48
+ The epsilon used by the layer normalization layers.
49
+ dropout (`float`, *optional*, defaults to 0.0):
50
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
51
+ drop_path_rate (`float`, *optional*, defaults to 0.0):
52
+ Dropout rate for stochastic depth.
53
+ attention_dropout (`float`, *optional*, defaults to 0.0):
54
+ The dropout ratio for the attention probabilities.
55
+ initializer_range (`float`, *optional*, defaults to 0.02):
56
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
57
+ initializer_factor (`float`, *optional*, defaults to 0.1):
58
+ A factor for layer scale.
59
+ """
60
+
61
+ model_type = 'intern_vit_6b'
62
+
63
+ def __init__(
64
+ self,
65
+ num_channels=3,
66
+ patch_size=14,
67
+ image_size=224,
68
+ qkv_bias=False,
69
+ hidden_size=3200,
70
+ num_attention_heads=25,
71
+ intermediate_size=12800,
72
+ qk_normalization=True,
73
+ num_hidden_layers=48,
74
+ use_flash_attn=True,
75
+ hidden_act='gelu',
76
+ norm_type='rms_norm',
77
+ layer_norm_eps=1e-6,
78
+ dropout=0.0,
79
+ drop_path_rate=0.0,
80
+ attention_dropout=0.0,
81
+ initializer_range=0.02,
82
+ initializer_factor=0.1,
83
+ **kwargs,
84
+ ):
85
+ super().__init__(**kwargs)
86
+
87
+ self.hidden_size = hidden_size
88
+ self.intermediate_size = intermediate_size
89
+ self.dropout = dropout
90
+ self.drop_path_rate = drop_path_rate
91
+ self.num_hidden_layers = num_hidden_layers
92
+ self.num_attention_heads = num_attention_heads
93
+ self.num_channels = num_channels
94
+ self.patch_size = patch_size
95
+ self.image_size = image_size
96
+ self.initializer_range = initializer_range
97
+ self.initializer_factor = initializer_factor
98
+ self.attention_dropout = attention_dropout
99
+ self.layer_norm_eps = layer_norm_eps
100
+ self.hidden_act = hidden_act
101
+ self.norm_type = norm_type
102
+ self.qkv_bias = qkv_bias
103
+ self.qk_normalization = qk_normalization
104
+ self.use_flash_attn = use_flash_attn
105
+
106
+ @classmethod
107
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig':
108
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
109
+
110
+ if 'vision_config' in config_dict:
111
+ config_dict = config_dict['vision_config']
112
+
113
+ if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type:
114
+ logger.warning(
115
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
116
+ f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
117
+ )
118
+
119
+ return cls.from_dict(config_dict, **kwargs)
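The `from_pretrained` override above unwraps a nested `vision_config` key, so InternVisionConfig can be pointed either at a standalone InternViT checkpoint or at this model's composite config.json. A brief sketch under the same placeholder-path assumption used earlier:

```python
# Sketch: pull only the vision-tower configuration out of the composite config.
from configuration_intern_vit import InternVisionConfig  # local module from this commit

vision_cfg = InternVisionConfig.from_pretrained("path/to/InteractiveOmni")  # placeholder path
print(vision_cfg.image_size, vision_cfg.num_hidden_layers)  # 448, 24 per config.json
```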
configuration_voicelm.py ADDED
@@ -0,0 +1,63 @@
1
+ # --------------------------------------------------------
2
+ # SenseTime
3
+ # Copyright (c) 2025 SenseTime
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ import copy
7
+
8
+ from transformers.configuration_utils import PretrainedConfig
9
+ from transformers.utils import logging
10
+ from transformers import LlamaConfig, Qwen2Config
11
+
12
+ logger = logging.get_logger(__name__)
13
+
14
+ class VoiceLMConfig(PretrainedConfig):
15
+ def __init__(
16
+ self,
17
+ llm_input_size = 896,
18
+ llm_output_size = 896,
19
+ speech_token_size = 6561,
20
+ length_normalized_loss = True,
21
+ lsm_weight = 0,
22
+ llm_config=None,
23
+ sampling_config={
24
+ 'top_p': 0.8,
25
+ 'top_k': 25,
26
+ 'win_size': 10,
27
+ 'tau_r': 0.1,
28
+ },
29
+ **kwargs):
30
+ super().__init__(**kwargs)
31
+
32
+ self.llm_input_size = llm_input_size
33
+ self.llm_output_size = llm_output_size
34
+ self.speech_token_size = speech_token_size
35
+ self.length_normalized_loss = length_normalized_loss
36
+ self.lsm_weight = lsm_weight
37
+ self.sampling_config = sampling_config
38
+
39
+ if llm_config is None:
40
+ llm_config = {}
41
+ logger.info('llm_config is None. Initializing the llm config with default values (`Qwen2Config`).')
42
+
43
+ self.llm_config = Qwen2Config(**llm_config)
44
+ pass
45
+
46
+ def to_dict(self):
47
+ """
48
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
49
+
50
+ Returns:
51
+ `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
52
+ """
53
+ output = copy.deepcopy(self.__dict__)
54
+ output['llm_input_size'] = self.llm_input_size
55
+ output['llm_output_size'] = self.llm_output_size
56
+ output['speech_token_size'] = self.speech_token_size
57
+ output['length_normalized_loss'] = self.length_normalized_loss
58
+ output['lsm_weight'] = self.lsm_weight
59
+ output['sampling_config'] = self.sampling_config
60
+ output['llm_config'] = self.llm_config.to_dict()
61
+
62
+ return output
63
+
configuration_whisper.py ADDED
@@ -0,0 +1,340 @@
1
+ # coding=utf-8
2
+ # Copyright 2022 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Whisper model configuration"""
16
+
17
+ from collections import OrderedDict
18
+ from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
19
+
20
+ from transformers.configuration_utils import PretrainedConfig
21
+ from transformers.onnx import OnnxConfig, OnnxSeq2SeqConfigWithPast
22
+ from transformers.utils import logging
23
+
24
+
25
+ if TYPE_CHECKING:
26
+ from transformers.feature_extraction_utils import FeatureExtractionMixin
27
+ from transformers.tokenization_utils_base import PreTrainedTokenizerBase
28
+ from transformers.utils import TensorType
29
+
30
+ logger = logging.get_logger(__name__)
31
+
32
+
33
+ # fmt: off
34
+ NON_SPEECH_TOKENS = [
35
+ 1, 2, 7, 8, 9, 10, 14, 25,
36
+ 26, 27, 28, 29, 31, 58, 59, 60, 61, 62,
37
+ 63, 90, 91, 92, 93, 357, 366, 438, 532, 685,
38
+ 705, 796, 930, 1058, 1220, 1267, 1279, 1303, 1343, 1377,
39
+ 1391, 1635, 1782, 1875, 2162, 2361, 2488, 3467, 4008, 4211,
40
+ 4600, 4808, 5299, 5855, 6329, 7203, 9609, 9959, 10563, 10786,
41
+ 11420, 11709, 11907, 13163, 13697, 13700, 14808, 15306, 16410, 16791,
42
+ 17992, 19203, 19510, 20724, 22305, 22935, 27007, 30109, 30420, 33409,
43
+ 34949, 40283, 40493, 40549, 47282, 49146, 50257, 50359, 50360, 50361
44
+ ]
45
+ NON_SPEECH_TOKENS_MULTI = [
46
+ 1, 2, 7, 8, 9, 10, 14, 25,
47
+ 26, 27, 28, 29, 31, 58, 59, 60, 61, 62,
48
+ 63, 90, 91, 92, 93, 359, 503, 522, 542, 873,
49
+ 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627,
50
+ 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647,
51
+ 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793,
52
+ 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675,
53
+ 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865,
54
+ 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362
55
+ ]
56
+ # fmt: on
57
+
58
+
59
+ class WhisperConfig(PretrainedConfig):
60
+ r"""
61
+ This is the configuration class to store the configuration of a [`WhisperModel`]. It is used to instantiate a
62
+ Whisper model according to the specified arguments, defining the model architecture. Instantiating a configuration
63
+ with the defaults will yield a similar configuration to that of the Whisper
64
+ [openai/whisper-tiny](https://huggingface.co/openai/whisper-tiny) architecture.
65
+
66
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
67
+ documentation from [`PretrainedConfig`] for more information.
68
+
69
+
70
+ Args:
71
+ vocab_size (`int`, *optional*, defaults to 51865):
72
+ Vocabulary size of the Whisper model. Defines the number of different tokens that can be represented by the
73
+ `decoder_input_ids` passed when calling [`WhisperModel`]
74
+ num_mel_bins (`int`, *optional*, defaults to 80):
75
+ Number of mel features used per input features. Should correspond to the value used in the
76
+ `WhisperProcessor` class.
77
+ encoder_layers (`int`, *optional*, defaults to 4):
78
+ Number of encoder layers.
79
+ decoder_layers (`int`, *optional*, defaults to 4):
80
+ Number of decoder layers.
81
+ encoder_attention_heads (`int`, *optional*, defaults to 6):
82
+ Number of attention heads for each attention layer in the Transformer encoder.
83
+ decoder_attention_heads (`int`, *optional*, defaults to 6):
84
+ Number of attention heads for each attention layer in the Transformer decoder.
85
+ encoder_ffn_dim (`int`, *optional*, defaults to 1536):
86
+ Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
87
+ decoder_ffn_dim (`int`, *optional*, defaults to 1536):
88
+ Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
89
+ encoder_layerdrop (`float`, *optional*, defaults to 0.0):
90
+ The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
91
+ for more details.
92
+ decoder_layerdrop (`float`, *optional*, defaults to 0.0):
93
+ The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
94
+ for more details.
95
+ decoder_start_token_id (`int`, *optional*, defaults to 50257):
96
+ Corresponds to the "<|startoftranscript|>" token, which is automatically used when no `decoder_input_ids`
97
+ are provided to the `generate` function. It is used to guide the model`s generation process depending on
98
+ the task.
99
+ use_cache (`bool`, *optional*, defaults to `True`):
100
+ Whether or not the model should return the last key/values attentions (not used by all models).
101
+ is_encoder_decoder (`bool`, *optional*, defaults to `True`):
102
+ Whether the model is used as an encoder/decoder or not.
103
+ activation_function (`str`, *optional*, defaults to `"gelu"`):
104
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
105
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
106
+ d_model (`int`, *optional*, defaults to 384):
107
+ Dimensionality of the layers.
108
+ dropout (`float`, *optional*, defaults to 0.1):
109
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
110
+ attention_dropout (`float`, *optional*, defaults to 0.0):
111
+ The dropout ratio for the attention probabilities.
112
+ activation_dropout (`float`, *optional*, defaults to 0.0):
113
+ The dropout ratio for activations inside the fully connected layer.
114
+ init_std (`float`, *optional*, defaults to 0.02):
115
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
116
+ scale_embedding (`bool`, *optional*, defaults to `False`):
117
+ Scale embeddings by dividing by sqrt(d_model).
118
+ max_source_positions (`int`, *optional*, defaults to 1500):
119
+ The maximum sequence length of log-mel filter-bank features that this model might ever be used with.
120
+ max_target_positions (`int`, *optional*, defaults to 448):
121
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
122
+ just in case (e.g., 512 or 1024 or 2048).
123
+ pad_token_id (`int`, *optional*, defaults to 50256):
124
+ Padding token id.
125
+ bos_token_id (`int`, *optional*, defaults to 50256):
126
+ Beginning-of-stream token id.
127
+ eos_token_id (`int`, *optional*, defaults to 50256):
128
+ End-of-stream token id.
129
+ suppress_tokens (`List[int]`, *optional*):
130
+ A list containing the non-speech tokens that will be used by the logit processor in the `generate`
131
+ function. NON_SPEECH_TOKENS and NON_SPEECH_TOKENS_MULTI each correspond to the `english-only` and the
132
+ `multilingual` model.
133
+ begin_suppress_tokens (`List[int]`, *optional*, defaults to `[220,50256]`):
134
+ A list containing tokens that will be suppressed at the beginning of the sampling process. Initialized as
135
+ the token for `" "` (`blank_token_id`) and the `eos_token_id`
136
+ use_weighted_layer_sum (`bool`, *optional*, defaults to `False`):
137
+ Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
138
+ instance of [`WhisperForAudioClassification`].
139
+ classifier_proj_size (`int`, *optional*, defaults to 256):
140
+ Dimensionality of the projection before token mean-pooling for classification. Only relevant when using an
141
+ instance of [`WhisperForAudioClassification`].
142
+ apply_spec_augment (`bool`, *optional*, defaults to `False`):
143
+ Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
144
+ [SpecAugment: A Simple Data Augmentation Method for Automatic Speech
145
+ Recognition](https://arxiv.org/abs/1904.08779).
146
+ mask_time_prob (`float`, *optional*, defaults to 0.05):
147
+ Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
148
+ procedure generates `mask_time_prob*len(time_axis)/mask_time_length` independent masks over the axis. If
149
+ reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
150
+ masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
151
+ actual percentage of masked vectors. This is only relevant if `apply_spec_augment == True`.
152
+ mask_time_length (`int`, *optional*, defaults to 10):
153
+ Length of vector span along the time axis.
154
+ mask_time_min_masks (`int`, *optional*, defaults to 2):
155
+ The minimum number of masks of length `mask_time_length` generated along the time axis, each time step,
156
+ irrespective of `mask_time_prob`. Only relevant if `mask_time_prob*len(time_axis)/mask_time_length <
157
+ mask_time_min_masks`.
158
+ mask_feature_prob (`float`, *optional*, defaults to 0.0):
159
+ Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
160
+ masking procedure generates `mask_feature_prob*len(feature_axis)/mask_feature_length` independent masks over
161
+ the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
162
+ span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
163
+ may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
164
+ True`.
165
+ mask_feature_length (`int`, *optional*, defaults to 10):
166
+ Length of vector span along the feature axis.
167
+ mask_feature_min_masks (`int`, *optional*, defaults to 0):
168
+ The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
169
+ step, irrespective of `mask_feature_prob`. Only relevant if
170
+ `mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks`.
171
+ median_filter_width (`int`, *optional*, defaults to 7):
172
+ Width of the median filter used to smooth the cross-attention outputs when computing token timestamps.
173
+ Should be an odd number.
174
+
175
+ Example:
176
+
177
+ ```python
178
+ >>> from transformers import WhisperConfig, WhisperModel
179
+
180
+ >>> # Initializing a Whisper tiny style configuration
181
+ >>> configuration = WhisperConfig()
182
+
183
+ >>> # Initializing a model (with random weights) from the tiny style configuration
184
+ >>> model = WhisperModel(configuration)
185
+
186
+ >>> # Accessing the model configuration
187
+ >>> configuration = model.config
188
+ ```"""
189
+
190
+ model_type = "whisper"
191
+ keys_to_ignore_at_inference = ["past_key_values"]
192
+ attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
193
+
194
+ def __init__(
195
+ self,
196
+ vocab_size=51865,
197
+ num_mel_bins=80,
198
+ encoder_layers=4,
199
+ encoder_attention_heads=6,
200
+ decoder_layers=4,
201
+ decoder_attention_heads=6,
202
+ decoder_ffn_dim=1536,
203
+ encoder_ffn_dim=1536,
204
+ encoder_layerdrop=0.0,
205
+ decoder_layerdrop=0.0,
206
+ decoder_start_token_id=50257,
207
+ use_cache=True,
208
+ is_encoder_decoder=True,
209
+ activation_function="gelu",
210
+ d_model=384,
211
+ dropout=0.0,
212
+ attention_dropout=0.0,
213
+ activation_dropout=0.0,
214
+ init_std=0.02,
215
+ scale_embedding=False,
216
+ max_source_positions=1500,
217
+ max_target_positions=448,
218
+ pad_token_id=50256,
219
+ bos_token_id=50256,
220
+ eos_token_id=50256,
221
+ suppress_tokens=None,
222
+ begin_suppress_tokens=[220, 50256],
223
+ use_weighted_layer_sum=False,
224
+ classifier_proj_size=256,
225
+ apply_spec_augment=False,
226
+ mask_time_prob=0.05,
227
+ mask_time_length=10,
228
+ mask_time_min_masks=2,
229
+ mask_feature_prob=0.0,
230
+ mask_feature_length=10,
231
+ mask_feature_min_masks=0,
232
+ median_filter_width=7,
233
+ **kwargs,
234
+ ):
235
+ self.vocab_size = vocab_size
236
+ self.num_mel_bins = num_mel_bins
237
+ self.d_model = d_model
238
+ self.encoder_layers = encoder_layers
239
+ self.encoder_attention_heads = encoder_attention_heads
240
+ self.decoder_layers = decoder_layers
241
+ self.decoder_attention_heads = decoder_attention_heads
242
+ self.decoder_ffn_dim = decoder_ffn_dim
243
+ self.encoder_ffn_dim = encoder_ffn_dim
244
+ self.dropout = dropout
245
+ self.attention_dropout = attention_dropout
246
+ self.activation_dropout = activation_dropout
247
+ self.activation_function = activation_function
248
+ self.init_std = init_std
249
+ self.encoder_layerdrop = encoder_layerdrop
250
+ self.decoder_layerdrop = decoder_layerdrop
251
+ self.use_cache = use_cache
252
+ self.num_hidden_layers = encoder_layers
253
+ self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
254
+ self.max_source_positions = max_source_positions
255
+ self.max_target_positions = max_target_positions
256
+
257
+ # Audio Classification-specific parameters. Feel free to ignore for other classes.
258
+ self.classifier_proj_size = classifier_proj_size
259
+ self.use_weighted_layer_sum = use_weighted_layer_sum
260
+
261
+ # fine-tuning config parameters for SpecAugment: https://arxiv.org/abs/1904.08779
262
+ self.apply_spec_augment = apply_spec_augment
263
+ self.mask_time_prob = mask_time_prob
264
+ self.mask_time_length = mask_time_length
265
+ self.mask_time_min_masks = mask_time_min_masks
266
+ self.mask_feature_prob = mask_feature_prob
267
+ self.mask_feature_length = mask_feature_length
268
+ self.mask_feature_min_masks = mask_feature_min_masks
269
+
270
+ self.median_filter_width = median_filter_width
271
+
272
+ super().__init__(
273
+ pad_token_id=pad_token_id,
274
+ bos_token_id=bos_token_id,
275
+ eos_token_id=eos_token_id,
276
+ is_encoder_decoder=is_encoder_decoder,
277
+ decoder_start_token_id=decoder_start_token_id,
278
+ suppress_tokens=suppress_tokens,
279
+ begin_suppress_tokens=begin_suppress_tokens,
280
+ **kwargs,
281
+ )
282
+
283
+
284
+ class WhisperOnnxConfig(OnnxSeq2SeqConfigWithPast):
285
+ @property
286
+ def inputs(self) -> Mapping[str, Mapping[int, str]]:
287
+ common_inputs = OrderedDict(
288
+ [
289
+ ("input_features", {0: "batch", 1: "feature_size", 2: "encoder_sequence"}),
290
+ ]
291
+ )
292
+ if self.use_past:
293
+ common_inputs["decoder_input_ids"] = {0: "batch"}
294
+ else:
295
+ common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
296
+
297
+ if self.use_past:
298
+ self.fill_with_past_key_values_(common_inputs, direction="inputs")
299
+
300
+ return common_inputs
301
+
302
+ def generate_dummy_inputs(
303
+ self,
304
+ preprocessor: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin"],
305
+ batch_size: int = -1,
306
+ seq_length: int = -1,
307
+ is_pair: bool = False,
308
+ framework: Optional["TensorType"] = None,
309
+ sampling_rate: int = 22050,
310
+ time_duration: float = 5.0,
311
+ frequency: int = 220,
312
+ ) -> Mapping[str, Any]:
313
+ dummy_inputs = OrderedDict()
314
+ encoder_inputs = OnnxConfig.generate_dummy_inputs(
315
+ self,
316
+ preprocessor=preprocessor.feature_extractor,
317
+ batch_size=batch_size,
318
+ framework=framework,
319
+ sampling_rate=sampling_rate,
320
+ time_duration=time_duration,
321
+ frequency=frequency,
322
+ )
323
+ encoder_sequence_length = encoder_inputs["input_features"].shape[2]
324
+ seq_length = encoder_sequence_length // 2 if self.use_past else seq_length
325
+
326
+ decoder_inputs = super().generate_dummy_inputs(
327
+ preprocessor.tokenizer, batch_size, seq_length, is_pair, framework
328
+ )
329
+
330
+ dummy_inputs["input_features"] = encoder_inputs.pop("input_features")
331
+ dummy_inputs["decoder_input_ids"] = decoder_inputs.pop("decoder_input_ids")
332
+
333
+ if "past_key_values" in decoder_inputs:
334
+ dummy_inputs["past_key_values"] = decoder_inputs.pop("past_key_values")
335
+
336
+ return dummy_inputs
337
+
338
+ @property
339
+ def atol_for_validation(self) -> float:
340
+ return 1e-3
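The SpecAugment bookkeeping spelled out in the `mask_time_prob` docstring above can be illustrated with a short sketch. This is illustrative arithmetic only, using the defaults from this file; it is not part of the uploaded configuration module:

```python
# Hedged sketch: how many time masks the defaults above imply per example.
mask_time_prob = 0.05      # default above
mask_time_length = 10      # default above
mask_time_min_masks = 2    # default above
time_axis_len = 1500       # max_source_positions, i.e. the encoder frame count

expected = mask_time_prob * time_axis_len / mask_time_length  # 7.5 independent masks on average
n_masks = max(int(expected), mask_time_min_masks)             # never fewer than mask_time_min_masks
print(expected, n_masks)   # 7.5 7
```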
conversation.py ADDED
@@ -0,0 +1,340 @@
1
+ """
2
+ Conversation prompt templates.
3
+
4
+ We kindly request that you import fastchat instead of copying this file if you wish to use it.
5
+ If you have any changes in mind, please contribute back so the community can benefit collectively and continue to maintain these valuable templates.
6
+ """
7
+
8
+ import dataclasses
9
+ from enum import IntEnum, auto
10
+ from typing import Any, Dict, List, Tuple, Union
11
+
12
+
13
+ class SeparatorStyle(IntEnum):
14
+ """Separator styles."""
15
+
16
+ ADD_COLON_SINGLE = auto()
17
+ ADD_COLON_TWO = auto()
18
+ ADD_COLON_SPACE_SINGLE = auto()
19
+ NO_COLON_SINGLE = auto()
20
+ NO_COLON_TWO = auto()
21
+ ADD_NEW_LINE_SINGLE = auto()
22
+ LLAMA2 = auto()
23
+ CHATGLM = auto()
24
+ CHATML = auto()
25
+ CHATINTERN = auto()
26
+ DOLLY = auto()
27
+ RWKV = auto()
28
+ PHOENIX = auto()
29
+ ROBIN = auto()
30
+ FALCON_CHAT = auto()
31
+ CHATGLM3 = auto()
32
+ MPT = auto()
33
+
34
+
35
+ @dataclasses.dataclass
36
+ class Conversation:
37
+ """A class that manages prompt templates and keeps all conversation history."""
38
+
39
+ # The name of this template
40
+ name: str
41
+ # The template of the system prompt
42
+ system_template: str = '{system_message}'
43
+ # The system message
44
+ system_message: str = ''
45
+ # The names of two roles
46
+ roles: Tuple[str] = ('USER', 'ASSISTANT')
47
+ # All messages. Each item is (role, message).
48
+ messages: List[List[str]] = ()
49
+ # The number of few shot examples
50
+ offset: int = 0
51
+ # The separator style and configurations
52
+ sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE
53
+ sep: str = '\n'
54
+ sep2: str = None
55
+ # Stop criteria (the default one is EOS token)
56
+ stop_str: Union[str, List[str]] = None
57
+ # Stops generation if meeting any token in this list
58
+ stop_token_ids: List[int] = None
59
+
60
+ def get_prompt(self) -> str:
61
+ """Get the prompt for generation."""
62
+ system_prompt = self.system_template.format(system_message=self.system_message)
63
+ if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE:
64
+ ret = system_prompt + self.sep
65
+ for role, message in self.messages:
66
+ if message:
67
+ ret += role + ': ' + message + self.sep
68
+ else:
69
+ ret += role + ':'
70
+ return ret
71
+ elif self.sep_style == SeparatorStyle.ADD_COLON_TWO:
72
+ seps = [self.sep, self.sep2]
73
+ ret = system_prompt + seps[0]
74
+ for i, (role, message) in enumerate(self.messages):
75
+ if message:
76
+ ret += role + ': ' + message + seps[i % 2]
77
+ else:
78
+ ret += role + ':'
79
+ return ret
80
+ elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE:
81
+ ret = system_prompt + self.sep
82
+ for role, message in self.messages:
83
+ if message:
84
+ ret += role + ': ' + message + self.sep
85
+ else:
86
+ ret += role + ': ' # must end with a space
87
+ return ret
88
+ elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE:
89
+ ret = '' if system_prompt == '' else system_prompt + self.sep
90
+ for role, message in self.messages:
91
+ if message:
92
+ ret += role + '\n' + message + self.sep
93
+ else:
94
+ ret += role + '\n'
95
+ return ret
96
+ elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
97
+ ret = system_prompt
98
+ for role, message in self.messages:
99
+ if message:
100
+ ret += role + message + self.sep
101
+ else:
102
+ ret += role
103
+ return ret
104
+ elif self.sep_style == SeparatorStyle.NO_COLON_TWO:
105
+ seps = [self.sep, self.sep2]
106
+ ret = system_prompt
107
+ for i, (role, message) in enumerate(self.messages):
108
+ if message:
109
+ ret += role + message + seps[i % 2]
110
+ else:
111
+ ret += role
112
+ return ret
113
+ elif self.sep_style == SeparatorStyle.RWKV:
114
+ ret = system_prompt
115
+ for i, (role, message) in enumerate(self.messages):
116
+ if message:
117
+ ret += (
118
+ role
119
+ + ': '
120
+ + message.replace('\r\n', '\n').replace('\n\n', '\n')
121
+ )
122
+ ret += '\n\n'
123
+ else:
124
+ ret += role + ':'
125
+ return ret
126
+ elif self.sep_style == SeparatorStyle.LLAMA2:
127
+ seps = [self.sep, self.sep2]
128
+ if self.system_message:
129
+ ret = system_prompt
130
+ else:
131
+ ret = '[INST] '
132
+ for i, (role, message) in enumerate(self.messages):
133
+ tag = self.roles[i % 2]
134
+ if message:
135
+ if i == 0:
136
+ ret += message + ' '
137
+ else:
138
+ ret += tag + ' ' + message + seps[i % 2]
139
+ else:
140
+ ret += tag
141
+ return ret
142
+ elif self.sep_style == SeparatorStyle.CHATGLM:
143
+ # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308
144
+ # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926
145
+ round_add_n = 1 if self.name == 'chatglm2' else 0
146
+ if system_prompt:
147
+ ret = system_prompt + self.sep
148
+ else:
149
+ ret = ''
150
+
151
+ for i, (role, message) in enumerate(self.messages):
152
+ if i % 2 == 0:
153
+ ret += f'[Round {i//2 + round_add_n}]{self.sep}'
154
+
155
+ if message:
156
+ ret += f'{role}:{message}{self.sep}'
157
+ else:
158
+ ret += f'{role}:'
159
+ return ret
160
+ elif self.sep_style == SeparatorStyle.CHATML:
161
+ ret = '' if system_prompt == '' else system_prompt + self.sep + '\n'
162
+ for role, message in self.messages:
163
+ if message:
164
+ ret += role + '\n' + message + self.sep + '\n'
165
+ else:
166
+ ret += role + '\n'
167
+ return ret
168
+ elif self.sep_style == SeparatorStyle.CHATGLM3:
169
+ ret = ''
170
+ if self.system_message:
171
+ ret += system_prompt
172
+ for role, message in self.messages:
173
+ if message:
174
+ ret += role + '\n' + ' ' + message
175
+ else:
176
+ ret += role
177
+ return ret
178
+ elif self.sep_style == SeparatorStyle.CHATINTERN:
179
+ # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771
180
+ seps = [self.sep, self.sep2]
181
+ ret = system_prompt
182
+ for i, (role, message) in enumerate(self.messages):
183
+ # if i % 2 == 0:
184
+ # ret += "<s>"
185
+ if message:
186
+ ret += role + ':' + message + seps[i % 2] + '\n'
187
+ else:
188
+ ret += role + ':'
189
+ return ret
190
+ elif self.sep_style == SeparatorStyle.DOLLY:
191
+ seps = [self.sep, self.sep2]
192
+ ret = system_prompt
193
+ for i, (role, message) in enumerate(self.messages):
194
+ if message:
195
+ ret += role + ':\n' + message + seps[i % 2]
196
+ if i % 2 == 1:
197
+ ret += '\n\n'
198
+ else:
199
+ ret += role + ':\n'
200
+ return ret
201
+ elif self.sep_style == SeparatorStyle.PHOENIX:
202
+ ret = system_prompt
203
+ for role, message in self.messages:
204
+ if message:
205
+ ret += role + ': ' + '<s>' + message + '</s>'
206
+ else:
207
+ ret += role + ': ' + '<s>'
208
+ return ret
209
+ elif self.sep_style == SeparatorStyle.ROBIN:
210
+ ret = system_prompt + self.sep
211
+ for role, message in self.messages:
212
+ if message:
213
+ ret += role + ':\n' + message + self.sep
214
+ else:
215
+ ret += role + ':\n'
216
+ return ret
217
+ elif self.sep_style == SeparatorStyle.FALCON_CHAT:
218
+ ret = ''
219
+ if self.system_message:
220
+ ret += system_prompt + self.sep
221
+ for role, message in self.messages:
222
+ if message:
223
+ ret += role + ': ' + message + self.sep
224
+ else:
225
+ ret += role + ':'
226
+
227
+ return ret
228
+ elif self.sep_style == SeparatorStyle.MPT:
229
+ if self.system_message == '':
230
+ ret = ''
231
+ else:
232
+ ret = system_prompt + self.sep
233
+ for role, message in self.messages:
234
+ if message:
235
+ if type(message) is tuple:
236
+ message, _, _ = message
237
+ ret += role + message + self.sep
238
+ else:
239
+ ret += role
240
+ return ret
241
+ else:
242
+ raise ValueError(f'Invalid style: {self.sep_style}')
243
+
244
+ def set_system_message(self, system_message: str):
245
+ """Set the system message."""
246
+ self.system_message = system_message
247
+
248
+ def append_message(self, role: str, message: str):
249
+ """Append a new message."""
250
+ self.messages.append([role, message])
251
+
252
+ def update_last_message(self, message: str):
253
+ """Update the last output.
254
+
255
+ The last message is typically set to be None when constructing the prompt,
256
+ so we need to update it in-place after getting the response from a model.
257
+ """
258
+ self.messages[-1][1] = message
259
+
260
+ def to_gradio_chatbot(self):
261
+ """Convert the conversation to gradio chatbot format."""
262
+ ret = []
263
+ for i, (role, msg) in enumerate(self.messages[self.offset :]):
264
+ if i % 2 == 0:
265
+ ret.append([msg, None])
266
+ else:
267
+ ret[-1][-1] = msg
268
+ return ret
269
+
270
+ def to_openai_api_messages(self):
271
+ """Convert the conversation to OpenAI chat completion format."""
272
+ ret = [{'role': 'system', 'content': self.system_message}]
273
+
274
+ for i, (_, msg) in enumerate(self.messages[self.offset :]):
275
+ if i % 2 == 0:
276
+ ret.append({'role': 'user', 'content': msg})
277
+ else:
278
+ if msg is not None:
279
+ ret.append({'role': 'assistant', 'content': msg})
280
+ return ret
281
+
282
+ def copy(self):
283
+ return Conversation(
284
+ name=self.name,
285
+ system_template=self.system_template,
286
+ system_message=self.system_message,
287
+ roles=self.roles,
288
+ messages=[[x, y] for x, y in self.messages],
289
+ offset=self.offset,
290
+ sep_style=self.sep_style,
291
+ sep=self.sep,
292
+ sep2=self.sep2,
293
+ stop_str=self.stop_str,
294
+ stop_token_ids=self.stop_token_ids,
295
+ )
296
+
297
+ def dict(self):
298
+ return {
299
+ 'template_name': self.name,
300
+ 'system_message': self.system_message,
301
+ 'roles': self.roles,
302
+ 'messages': self.messages,
303
+ 'offset': self.offset,
304
+ }
305
+
306
+
307
+ # A global registry for all conversation templates
308
+ conv_templates: Dict[str, Conversation] = {}
309
+
310
+
311
+ def register_conv_template(template: Conversation, override: bool = False):
312
+ """Register a new conversation template."""
313
+ if not override:
314
+ assert (
315
+ template.name not in conv_templates
316
+ ), f'{template.name} has been registered.'
317
+
318
+ conv_templates[template.name] = template
319
+
320
+
321
+ def get_conv_template(name: str) -> Conversation:
322
+ """Get a conversation template."""
323
+ return conv_templates[name].copy()
324
+
325
+
326
+ register_conv_template(
327
+ Conversation(
328
+ name='interactiveomni_template',
329
+ system_template='<|im_start|>system\n{system_message}',
330
+ system_message='You are a highly advanced multimodal conversational AI designed for human-like interaction. You can perceive auditory, visual, speech, and textual inputs, and generate text and speech.',
331
+ roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
332
+ sep_style=SeparatorStyle.MPT,
333
+ sep='<|im_end|>\n',
334
+ stop_token_ids=[
335
+ 2,
336
+ 92543,
337
+ 92542
338
+ ]
339
+ )
340
+ )
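A hedged usage sketch for the template registered above (not part of the uploaded file; the import path `conversation` assumes the module is loaded straight from the repository root):

```python
from conversation import get_conv_template

conv = get_conv_template('interactiveomni_template')
conv.append_message(conv.roles[0], 'Please transcribe the attached audio.')
conv.append_message(conv.roles[1], None)   # placeholder, filled in after generation
prompt = conv.get_prompt()                 # MPT-style: system prompt + '<|im_end|>\n' separators
print(prompt)
conv.update_last_message('Here is the transcription ...')
```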
generation_config.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "transformers_version": "4.51.3"
4
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5b2da752eea0e481167b8203c4b792c8cd7b5f4dfe44490a577b8ed5db6ee15
3
+ size 4990472920
model-00002-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fb6caa54bb12b742ba39f1d44963057aa2cdc177206f39ccabb4a61a5922d27
3
+ size 4999848424
model-00003-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:849eeeb4f6b5233a4d4749eabacd79375f3ac4340c0057fdc85d93af65e4c45d
3
+ size 4983071360
model-00004-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10149f10dbd934bc38e316409cd12432aeb21061e35bbc754c8d70c387c2d6ee
3
+ size 4999999724
model-00005-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c57621a543541dc6e0fd8aa9f7bfcae153ddfd549a570435f106467d37654b0
3
+ size 129569282
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_flow.py ADDED
@@ -0,0 +1,2318 @@
1
+ # --------------------------------------------------------
2
+ # SenseTime
3
+ # Copyright (c) 2025 SenseTime
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ from transformers.modeling_utils import PreTrainedModel
7
+ from typing import Dict, Tuple, Optional, Union, Any
8
+ from torch import nn
9
+ from torch.nn import functional as F
10
+ import torch
11
+ import copy
12
+ from omegaconf import DictConfig
13
+ import threading
14
+ import math
15
+ from abc import ABC
16
+
17
+ from diffusers.models.activations import get_activation
18
+ from einops import pack, rearrange, repeat
19
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
20
+ from diffusers.models.attention import (
21
+ GEGLU,
22
+ GELU,
23
+ AdaLayerNorm,
24
+ AdaLayerNormZero,
25
+ ApproximateGELU,
26
+ )
27
+ from diffusers.models.attention_processor import Attention
28
+ from diffusers.models.lora import LoRACompatibleLinear
29
+
30
+ from .configuration_flow import FlowConfig
31
+
32
+ def subsequent_chunk_mask(
33
+ size: int,
34
+ chunk_size: int,
35
+ num_left_chunks: int = -1,
36
+ device: torch.device = torch.device("cpu"),
37
+ ) -> torch.Tensor:
38
+ """Create mask for subsequent steps (size, size) with chunk size,
39
+ this is for streaming encoder
40
+
41
+ Args:
42
+ size (int): size of mask
43
+ chunk_size (int): size of chunk
44
+ num_left_chunks (int): number of left chunks
45
+ <0: use full chunk
46
+ >=0: use num_left_chunks
47
+ device (torch.device): "cpu" or "cuda" or torch.Tensor.device
48
+
49
+ Returns:
50
+ torch.Tensor: mask
51
+
52
+ Examples:
53
+ >>> subsequent_chunk_mask(4, 2)
54
+ [[1, 1, 0, 0],
55
+ [1, 1, 0, 0],
56
+ [1, 1, 1, 1],
57
+ [1, 1, 1, 1]]
58
+ """
59
+ # NOTE this modified implementation meets onnx export requirements, but it doesn't support num_left_chunks
60
+ # actually this is not needed after we have inference cache implemented, will remove it later
61
+ pos_idx = torch.arange(size, device=device)
62
+ block_value = (torch.div(pos_idx, chunk_size, rounding_mode='trunc') + 1) * chunk_size
63
+ ret = pos_idx.unsqueeze(0) < block_value.unsqueeze(1)
64
+ return ret
65
+
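# Illustrative note (not part of the uploaded module): the docstring example above
# renders as follows; each query position (row) may attend to every position up to
# the end of its own chunk (columns), which is what makes streaming inference possible.
# >>> subsequent_chunk_mask(4, 2).int()
# tensor([[1, 1, 0, 0],
#         [1, 1, 0, 0],
#         [1, 1, 1, 1],
#         [1, 1, 1, 1]])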
66
+ def add_optional_chunk_mask(xs: torch.Tensor,
67
+ masks: torch.Tensor,
68
+ use_dynamic_chunk: bool,
69
+ use_dynamic_left_chunk: bool,
70
+ decoding_chunk_size: int,
71
+ static_chunk_size: int,
72
+ num_decoding_left_chunks: int,
73
+ enable_full_context: bool = True):
74
+ """ Apply optional mask for encoder.
75
+
76
+ Args:
77
+ xs (torch.Tensor): padded input, (B, L, D), L for max length
78
+ mask (torch.Tensor): mask for xs, (B, 1, L)
79
+ use_dynamic_chunk (bool): whether to use dynamic chunk or not
80
+ use_dynamic_left_chunk (bool): whether to use dynamic left chunk for
81
+ training.
82
+ decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's
83
+ 0: default for training, use random dynamic chunk.
84
+ <0: for decoding, use full chunk.
85
+ >0: for decoding, use fixed chunk size as set.
86
+ static_chunk_size (int): chunk size for static chunk training/decoding
87
+ if it's greater than 0, if use_dynamic_chunk is true,
88
+ this parameter will be ignored
89
+ num_decoding_left_chunks: number of left chunks, this is for decoding,
90
+ the chunk size is decoding_chunk_size.
91
+ >=0: use num_decoding_left_chunks
92
+ <0: use all left chunks
93
+ enable_full_context (bool):
94
+ True: chunk size is either [1, 25] or full context(max_len)
95
+ False: chunk size ~ U[1, 25]
96
+
97
+ Returns:
98
+ torch.Tensor: chunk mask of the input xs.
99
+ """
100
+ # Whether to use chunk mask or not
101
+ if use_dynamic_chunk:
102
+ max_len = xs.size(1)
103
+ if decoding_chunk_size < 0:
104
+ chunk_size = max_len
105
+ num_left_chunks = -1
106
+ elif decoding_chunk_size > 0:
107
+ chunk_size = decoding_chunk_size
108
+ num_left_chunks = num_decoding_left_chunks
109
+ else:
110
+ # chunk size is either [1, 25] or full context(max_len).
111
+ # Since we use 4 times subsampling and allow up to 1s(100 frames)
112
+ # delay, the maximum frame is 100 / 4 = 25.
113
+ chunk_size = torch.randint(1, max_len, (1, )).item()
114
+ num_left_chunks = -1
115
+ if chunk_size > max_len // 2 and enable_full_context:
116
+ chunk_size = max_len
117
+ else:
118
+ chunk_size = chunk_size % 25 + 1
119
+ if use_dynamic_left_chunk:
120
+ max_left_chunks = (max_len - 1) // chunk_size
121
+ num_left_chunks = torch.randint(0, max_left_chunks,
122
+ (1, )).item()
123
+ chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size,
124
+ num_left_chunks,
125
+ xs.device) # (L, L)
126
+ chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L)
127
+ chunk_masks = masks & chunk_masks # (B, L, L)
128
+ elif static_chunk_size > 0:
129
+ num_left_chunks = num_decoding_left_chunks
130
+ chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size,
131
+ num_left_chunks,
132
+ xs.device) # (L, L)
133
+ chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L)
134
+ chunk_masks = masks & chunk_masks # (B, L, L)
135
+ else:
136
+ chunk_masks = masks
137
+ return chunk_masks
138
+
139
+ def mask_to_bias(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
140
+ assert mask.dtype == torch.bool
141
+ assert dtype in [torch.float32, torch.bfloat16, torch.float16]
142
+ mask = mask.to(dtype)
143
+ # attention mask bias
144
+ # NOTE(Mddct): torch.finfo jit issues
145
+ # chunk_masks = (1.0 - chunk_masks) * torch.finfo(dtype).min
146
+ mask = (1.0 - mask) * torch.finfo(dtype).min
147
+ return mask
148
+
149
+ def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
150
+ """Make mask tensor containing indices of padded part.
151
+
152
+ See description of make_non_pad_mask.
153
+
154
+ Args:
155
+ lengths (torch.Tensor): Batch of lengths (B,).
156
+ Returns:
157
+ torch.Tensor: Mask tensor containing indices of padded part.
158
+
159
+ Examples:
160
+ >>> lengths = [5, 3, 2]
161
+ >>> make_pad_mask(lengths)
162
+ masks = [[0, 0, 0, 0 ,0],
163
+ [0, 0, 0, 1, 1],
164
+ [0, 0, 1, 1, 1]]
165
+ """
166
+ batch_size = lengths.size(0)
167
+ max_len = max_len if max_len > 0 else lengths.max().item()
168
+ seq_range = torch.arange(0,
169
+ max_len,
170
+ dtype=torch.int64,
171
+ device=lengths.device)
172
+ seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
173
+ seq_length_expand = lengths.unsqueeze(-1)
174
+ mask = seq_range_expand >= seq_length_expand
175
+ return mask
176
+
177
+ class Swish(torch.nn.Module):
178
+ """Construct an Swish object."""
179
+
180
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
181
+ """Return Swish activation function."""
182
+ return x * torch.sigmoid(x)
183
+
184
+ class BASECFM(torch.nn.Module, ABC):
185
+ def __init__(
186
+ self,
187
+ n_feats,
188
+ cfm_params,
189
+ n_spks=1,
190
+ spk_emb_dim=128,
191
+ ):
192
+ super().__init__()
193
+ self.n_feats = n_feats
194
+ self.n_spks = n_spks
195
+ self.spk_emb_dim = spk_emb_dim
196
+ self.solver = cfm_params.solver
197
+ if hasattr(cfm_params, "sigma_min"):
198
+ self.sigma_min = cfm_params.sigma_min
199
+ else:
200
+ self.sigma_min = 1e-4
201
+
202
+ self.estimator = None
203
+
204
+ @torch.inference_mode()
205
+ def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None):
206
+ """Forward diffusion
207
+
208
+ Args:
209
+ mu (torch.Tensor): output of encoder
210
+ shape: (batch_size, n_feats, mel_timesteps)
211
+ mask (torch.Tensor): output_mask
212
+ shape: (batch_size, 1, mel_timesteps)
213
+ n_timesteps (int): number of diffusion steps
214
+ temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
215
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
216
+ shape: (batch_size, spk_emb_dim)
217
+ cond: Not used but kept for future purposes
218
+
219
+ Returns:
220
+ sample: generated mel-spectrogram
221
+ shape: (batch_size, n_feats, mel_timesteps)
222
+ """
223
+ z = torch.randn_like(mu) * temperature
224
+ t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device)
225
+ return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond)
226
+
227
+ def solve_euler(self, x, t_span, mu, mask, spks, cond):
228
+ """
229
+ Fixed euler solver for ODEs.
230
+ Args:
231
+ x (torch.Tensor): random noise
232
+ t_span (torch.Tensor): n_timesteps interpolated
233
+ shape: (n_timesteps + 1,)
234
+ mu (torch.Tensor): output of encoder
235
+ shape: (batch_size, n_feats, mel_timesteps)
236
+ mask (torch.Tensor): output_mask
237
+ shape: (batch_size, 1, mel_timesteps)
238
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
239
+ shape: (batch_size, spk_emb_dim)
240
+ cond: Not used but kept for future purposes
241
+ """
242
+ t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
243
+
244
+ # I am storing this because I can later plot it by putting a debugger here and saving it to a file
245
+ # Or in future might add like a return_all_steps flag
246
+ sol = []
247
+
248
+ for step in range(1, len(t_span)):
249
+ dphi_dt = self.estimator(x, mask, mu, t, spks, cond)
250
+
251
+ x = x + dt * dphi_dt
252
+ t = t + dt
253
+ sol.append(x)
254
+ if step < len(t_span) - 1:
255
+ dt = t_span[step + 1] - t
256
+
257
+ return sol[-1]
258
+
259
+ def compute_loss(self, x1, mask, mu, spks=None, cond=None):
260
+ """Computes diffusion loss
261
+
262
+ Args:
263
+ x1 (torch.Tensor): Target
264
+ shape: (batch_size, n_feats, mel_timesteps)
265
+ mask (torch.Tensor): target mask
266
+ shape: (batch_size, 1, mel_timesteps)
267
+ mu (torch.Tensor): output of encoder
268
+ shape: (batch_size, n_feats, mel_timesteps)
269
+ spks (torch.Tensor, optional): speaker embedding. Defaults to None.
270
+ shape: (batch_size, spk_emb_dim)
271
+
272
+ Returns:
273
+ loss: conditional flow matching loss
274
+ y: conditional flow
275
+ shape: (batch_size, n_feats, mel_timesteps)
276
+ """
277
+ b, _, t = mu.shape
278
+
279
+ # random timestep
280
+ t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
281
+ # sample noise p(x_0)
282
+ z = torch.randn_like(x1)
283
+
284
+ y = (1 - (1 - self.sigma_min) * t) * z + t * x1
285
+ u = x1 - (1 - self.sigma_min) * z
286
+
287
+ loss = F.mse_loss(self.estimator(y, mask, mu, t.squeeze(), spks), u, reduction="sum") / (
288
+ torch.sum(mask) * u.shape[1]
289
+ )
290
+ return loss, y
291
+
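# Worked note (illustrative, not part of the uploaded module): compute_loss builds
#   y(t) = (1 - (1 - sigma_min) * t) * z + t * x1
# whose time derivative is
#   dy/dt = x1 - (1 - sigma_min) * z = u,
# so the estimator is regressed onto the constant velocity of the straight path from
# noise z (t = 0) to (almost) the data x1 (t = 1), which is exactly the vector field
# that solve_euler integrates at inference time.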
292
+ class Transpose(torch.nn.Module):
293
+ def __init__(self, dim0: int, dim1: int):
294
+ super().__init__()
295
+ self.dim0 = dim0
296
+ self.dim1 = dim1
297
+
298
+ def forward(self, x: torch.Tensor):
299
+ x = torch.transpose(x, self.dim0, self.dim1)
300
+ return x
301
+
302
+
303
+ class Block1D(torch.nn.Module):
304
+ def __init__(self, dim, dim_out, groups=8):
305
+ super().__init__()
306
+ self.block = torch.nn.Sequential(
307
+ torch.nn.Conv1d(dim, dim_out, 3, padding=1),
308
+ torch.nn.GroupNorm(groups, dim_out),
309
+ nn.Mish(),
310
+ )
311
+
312
+ def forward(self, x, mask):
313
+ output = self.block(x * mask)
314
+ return output * mask
315
+
316
+ class CausalBlock1D(Block1D):
317
+ def __init__(self, dim: int, dim_out: int):
318
+ super(CausalBlock1D, self).__init__(dim, dim_out)
319
+ self.block = torch.nn.Sequential(
320
+ CausalConv1d(dim, dim_out, 3),
321
+ Transpose(1, 2),
322
+ nn.LayerNorm(dim_out),
323
+ Transpose(1, 2),
324
+ nn.Mish(),
325
+ )
326
+
327
+ def forward(self, x: torch.Tensor, mask: torch.Tensor):
328
+ output = self.block(x * mask)
329
+ return output * mask
330
+
331
+ class ResnetBlock1D(torch.nn.Module):
332
+ def __init__(self, dim, dim_out, time_emb_dim, groups=8):
333
+ super().__init__()
334
+ self.mlp = torch.nn.Sequential(nn.Mish(), torch.nn.Linear(time_emb_dim, dim_out))
335
+
336
+ self.block1 = Block1D(dim, dim_out, groups=groups)
337
+ self.block2 = Block1D(dim_out, dim_out, groups=groups)
338
+
339
+ self.res_conv = torch.nn.Conv1d(dim, dim_out, 1)
340
+
341
+ def forward(self, x, mask, time_emb):
342
+ h = self.block1(x, mask)
343
+ h += self.mlp(time_emb).unsqueeze(-1)
344
+ h = self.block2(h, mask)
345
+ output = h + self.res_conv(x * mask)
346
+ return output
347
+
348
+
349
+ class CausalResnetBlock1D(ResnetBlock1D):
350
+ def __init__(self, dim: int, dim_out: int, time_emb_dim: int, groups: int = 8):
351
+ super(CausalResnetBlock1D, self).__init__(dim, dim_out, time_emb_dim, groups)
352
+ self.block1 = CausalBlock1D(dim, dim_out)
353
+ self.block2 = CausalBlock1D(dim_out, dim_out)
354
+
355
+
356
+ class CausalConv1d(torch.nn.Conv1d):
357
+ def __init__(
358
+ self,
359
+ in_channels: int,
360
+ out_channels: int,
361
+ kernel_size: int,
362
+ stride: int = 1,
363
+ dilation: int = 1,
364
+ groups: int = 1,
365
+ bias: bool = True,
366
+ padding_mode: str = 'zeros',
367
+ device=None,
368
+ dtype=None
369
+ ) -> None:
370
+ super(CausalConv1d, self).__init__(in_channels, out_channels,
371
+ kernel_size, stride,
372
+ padding=0, dilation=dilation,
373
+ groups=groups, bias=bias,
374
+ padding_mode=padding_mode,
375
+ device=device, dtype=dtype)
376
+ assert stride == 1
377
+ self.causal_padding = (kernel_size - 1, 0)
378
+
379
+ def forward(self, x: torch.Tensor):
380
+ x = F.pad(x, self.causal_padding)
381
+ x = super(CausalConv1d, self).forward(x)
382
+ return x
383
+
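# Illustrative check (not part of the uploaded module): the left-only padding of
# (kernel_size - 1) frames keeps the sequence length unchanged while making every
# output frame depend only on current and past inputs.
# >>> conv = CausalConv1d(80, 80, kernel_size=3)
# >>> conv(torch.randn(1, 80, 100)).shape
# torch.Size([1, 80, 100])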
384
+ class ResnetBlock1D(torch.nn.Module):
385
+ def __init__(self, dim, dim_out, time_emb_dim, groups=8):
386
+ super().__init__()
387
+ self.mlp = torch.nn.Sequential(nn.Mish(), torch.nn.Linear(time_emb_dim, dim_out))
388
+
389
+ self.block1 = Block1D(dim, dim_out, groups=groups)
390
+ self.block2 = Block1D(dim_out, dim_out, groups=groups)
391
+
392
+ self.res_conv = torch.nn.Conv1d(dim, dim_out, 1)
393
+
394
+ def forward(self, x, mask, time_emb):
395
+ h = self.block1(x, mask)
396
+ h += self.mlp(time_emb).unsqueeze(-1)
397
+ h = self.block2(h, mask)
398
+ output = h + self.res_conv(x * mask)
399
+ return output
400
+
401
+ class SinusoidalPosEmb(torch.nn.Module):
402
+ def __init__(self, dim):
403
+ super().__init__()
404
+ self.dim = dim
405
+ assert self.dim % 2 == 0, "SinusoidalPosEmb requires dim to be even"
406
+
407
+ def forward(self, x, scale=1000):
408
+ if x.ndim < 1:
409
+ x = x.unsqueeze(0)
410
+ device = x.device
411
+ half_dim = self.dim // 2
412
+ emb = math.log(10000) / (half_dim - 1)
413
+ emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb)
414
+ emb = scale * x.unsqueeze(1) * emb.unsqueeze(0)
415
+ emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
416
+ return emb
417
+
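# Illustrative check (not part of the uploaded module): a batch of scalar timesteps
# is lifted to a (batch, dim) embedding, sin features in the first half and cos
# features in the second half.
# >>> emb = SinusoidalPosEmb(320)
# >>> emb(torch.tensor([0.1, 0.9])).shape
# torch.Size([2, 320])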
418
+ class SnakeBeta(nn.Module):
419
+ """
420
+ A modified Snake function which uses separate parameters for the magnitude of the periodic components
421
+ Shape:
422
+ - Input: (B, C, T)
423
+ - Output: (B, C, T), same shape as the input
424
+ Parameters:
425
+ - alpha - trainable parameter that controls frequency
426
+ - beta - trainable parameter that controls magnitude
427
+ References:
428
+ - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
429
+ https://arxiv.org/abs/2006.08195
430
+ Examples:
431
+ >>> a1 = snakebeta(256)
432
+ >>> x = torch.randn(256)
433
+ >>> x = a1(x)
434
+ """
435
+
436
+ def __init__(self, in_features, out_features, alpha=1.0, alpha_trainable=True, alpha_logscale=True):
437
+ """
438
+ Initialization.
439
+ INPUT:
440
+ - in_features: shape of the input
441
+ - alpha - trainable parameter that controls frequency
442
+ - beta - trainable parameter that controls magnitude
443
+ alpha is initialized to 1 by default, higher values = higher-frequency.
444
+ beta is initialized to 1 by default, higher values = higher-magnitude.
445
+ alpha will be trained along with the rest of your model.
446
+ """
447
+ super().__init__()
448
+ self.in_features = out_features if isinstance(out_features, list) else [out_features]
449
+ self.proj = LoRACompatibleLinear(in_features, out_features)
450
+
451
+ # initialize alpha
452
+ self.alpha_logscale = alpha_logscale
453
+ if self.alpha_logscale: # log scale alphas initialized to zeros
454
+ self.alpha = nn.Parameter(torch.zeros(self.in_features) * alpha)
455
+ self.beta = nn.Parameter(torch.zeros(self.in_features) * alpha)
456
+ else: # linear scale alphas initialized to ones
457
+ self.alpha = nn.Parameter(torch.ones(self.in_features) * alpha)
458
+ self.beta = nn.Parameter(torch.ones(self.in_features) * alpha)
459
+
460
+ self.alpha.requires_grad = alpha_trainable
461
+ self.beta.requires_grad = alpha_trainable
462
+
463
+ self.no_div_by_zero = 0.000000001
464
+
465
+ def forward(self, x):
466
+ """
467
+ Forward pass of the function.
468
+ Applies the function to the input elementwise.
469
+ SnakeBeta(x) := x + (1 / beta) * sin^2(alpha * x)
470
+ """
471
+ x = self.proj(x)
472
+ if self.alpha_logscale:
473
+ alpha = torch.exp(self.alpha)
474
+ beta = torch.exp(self.beta)
475
+ else:
476
+ alpha = self.alpha
477
+ beta = self.beta
478
+
479
+ x = x + (1.0 / (beta + self.no_div_by_zero)) * torch.pow(torch.sin(x * alpha), 2)
480
+
481
+ return x
482
+
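# Illustrative check (not part of the uploaded module): this variant first projects
# in_features -> out_features and then applies x + (1/beta) * sin^2(alpha * x)
# channel-wise, so only the last dimension changes.
# >>> act = SnakeBeta(256, 1024)
# >>> act(torch.randn(2, 50, 256)).shape
# torch.Size([2, 50, 1024])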
483
+ class FeedForward(nn.Module):
484
+ r"""
485
+ A feed-forward layer.
486
+
487
+ Parameters:
488
+ dim (`int`): The number of channels in the input.
489
+ dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
490
+ mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
491
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
492
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
493
+ final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
494
+ """
495
+
496
+ def __init__(
497
+ self,
498
+ dim: int,
499
+ dim_out: Optional[int] = None,
500
+ mult: int = 4,
501
+ dropout: float = 0.0,
502
+ activation_fn: str = "geglu",
503
+ final_dropout: bool = False,
504
+ ):
505
+ super().__init__()
506
+ inner_dim = int(dim * mult)
507
+ dim_out = dim_out if dim_out is not None else dim
508
+
509
+ if activation_fn == "gelu":
510
+ act_fn = GELU(dim, inner_dim)
511
+ if activation_fn == "gelu-approximate":
512
+ act_fn = GELU(dim, inner_dim, approximate="tanh")
513
+ elif activation_fn == "geglu":
514
+ act_fn = GEGLU(dim, inner_dim)
515
+ elif activation_fn == "geglu-approximate":
516
+ act_fn = ApproximateGELU(dim, inner_dim)
517
+ elif activation_fn == "snakebeta":
518
+ act_fn = SnakeBeta(dim, inner_dim)
519
+
520
+ self.net = nn.ModuleList([])
521
+ # project in
522
+ self.net.append(act_fn)
523
+ # project dropout
524
+ self.net.append(nn.Dropout(dropout))
525
+ # project out
526
+ self.net.append(LoRACompatibleLinear(inner_dim, dim_out))
527
+ # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
528
+ if final_dropout:
529
+ self.net.append(nn.Dropout(dropout))
530
+
531
+ def forward(self, hidden_states):
532
+ for module in self.net:
533
+ hidden_states = module(hidden_states)
534
+ return hidden_states
535
+
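# Illustrative check (not part of the uploaded module): with activation_fn="snakebeta"
# the block is SnakeBeta(dim -> 4 * dim) -> Dropout -> Linear(4 * dim -> dim),
# applied position-wise, so the hidden shape is preserved.
# >>> ff = FeedForward(256, activation_fn="snakebeta")
# >>> ff(torch.randn(2, 50, 256)).shape
# torch.Size([2, 50, 256])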
536
+ @maybe_allow_in_graph
537
+ class BasicTransformerBlock(nn.Module):
538
+ r"""
539
+ A basic Transformer block.
540
+
541
+ Parameters:
542
+ dim (`int`): The number of channels in the input and output.
543
+ num_attention_heads (`int`): The number of heads to use for multi-head attention.
544
+ attention_head_dim (`int`): The number of channels in each head.
545
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
546
+ cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
547
+ only_cross_attention (`bool`, *optional*):
548
+ Whether to use only cross-attention layers. In this case two cross attention layers are used.
549
+ double_self_attention (`bool`, *optional*):
550
+ Whether to use two self-attention layers. In this case no cross attention layers are used.
551
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
552
+ num_embeds_ada_norm (:
553
+ obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
554
+ attention_bias (:
555
+ obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
556
+ """
557
+
558
+ def __init__(
559
+ self,
560
+ dim: int,
561
+ num_attention_heads: int,
562
+ attention_head_dim: int,
563
+ dropout=0.0,
564
+ cross_attention_dim: Optional[int] = None,
565
+ activation_fn: str = "geglu",
566
+ num_embeds_ada_norm: Optional[int] = None,
567
+ attention_bias: bool = False,
568
+ only_cross_attention: bool = False,
569
+ double_self_attention: bool = False,
570
+ upcast_attention: bool = False,
571
+ norm_elementwise_affine: bool = True,
572
+ norm_type: str = "layer_norm",
573
+ final_dropout: bool = False,
574
+ ):
575
+ super().__init__()
576
+ self.only_cross_attention = only_cross_attention
577
+
578
+ self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
579
+ self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
580
+
581
+ if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
582
+ raise ValueError(
583
+ f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
584
+ f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
585
+ )
586
+
587
+ # Define 3 blocks. Each block has its own normalization layer.
588
+ # 1. Self-Attn
589
+ if self.use_ada_layer_norm:
590
+ self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
591
+ elif self.use_ada_layer_norm_zero:
592
+ self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
593
+ else:
594
+ self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
595
+ self.attn1 = Attention(
596
+ query_dim=dim,
597
+ heads=num_attention_heads,
598
+ dim_head=attention_head_dim,
599
+ dropout=dropout,
600
+ bias=attention_bias,
601
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
602
+ upcast_attention=upcast_attention,
603
+ )
604
+
605
+ # 2. Cross-Attn
606
+ if cross_attention_dim is not None or double_self_attention:
607
+ # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
608
+ # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
609
+ # the second cross attention block.
610
+ self.norm2 = (
611
+ AdaLayerNorm(dim, num_embeds_ada_norm)
612
+ if self.use_ada_layer_norm
613
+ else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
614
+ )
615
+ self.attn2 = Attention(
616
+ query_dim=dim,
617
+ cross_attention_dim=cross_attention_dim if not double_self_attention else None,
618
+ heads=num_attention_heads,
619
+ dim_head=attention_head_dim,
620
+ dropout=dropout,
621
+ bias=attention_bias,
622
+ upcast_attention=upcast_attention,
623
+ # scale_qk=False, # uncomment this to not to use flash attention
624
+ ) # is self-attn if encoder_hidden_states is none
625
+ else:
626
+ self.norm2 = None
627
+ self.attn2 = None
628
+
629
+ # 3. Feed-forward
630
+ self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
631
+ self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)
632
+
633
+ # let chunk size default to None
634
+ self._chunk_size = None
635
+ self._chunk_dim = 0
636
+
637
+ def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int):
638
+ # Sets chunk feed-forward
639
+ self._chunk_size = chunk_size
640
+ self._chunk_dim = dim
641
+
642
+ def forward(
643
+ self,
644
+ hidden_states: torch.FloatTensor,
645
+ attention_mask: Optional[torch.FloatTensor] = None,
646
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
647
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
648
+ timestep: Optional[torch.LongTensor] = None,
649
+ cross_attention_kwargs: Dict[str, Any] = None,
650
+ class_labels: Optional[torch.LongTensor] = None,
651
+ ):
652
+ # Notice that normalization is always applied before the real computation in the following blocks.
653
+ # 1. Self-Attention
654
+ if self.use_ada_layer_norm:
655
+ norm_hidden_states = self.norm1(hidden_states, timestep)
656
+ elif self.use_ada_layer_norm_zero:
657
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
658
+ hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
659
+ )
660
+ else:
661
+ norm_hidden_states = self.norm1(hidden_states)
662
+
663
+ cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
664
+
665
+ attn_output = self.attn1(
666
+ norm_hidden_states,
667
+ encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
668
+ attention_mask=encoder_attention_mask if self.only_cross_attention else attention_mask,
669
+ **cross_attention_kwargs,
670
+ )
671
+ if self.use_ada_layer_norm_zero:
672
+ attn_output = gate_msa.unsqueeze(1) * attn_output
673
+ hidden_states = attn_output + hidden_states
674
+
675
+ # 2. Cross-Attention
676
+ if self.attn2 is not None:
677
+ norm_hidden_states = (
678
+ self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
679
+ )
680
+
681
+ attn_output = self.attn2(
682
+ norm_hidden_states,
683
+ encoder_hidden_states=encoder_hidden_states,
684
+ attention_mask=encoder_attention_mask,
685
+ **cross_attention_kwargs,
686
+ )
687
+ hidden_states = attn_output + hidden_states
688
+
689
+ # 3. Feed-forward
690
+ norm_hidden_states = self.norm3(hidden_states)
691
+
692
+ if self.use_ada_layer_norm_zero:
693
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
694
+
695
+ if self._chunk_size is not None:
696
+ # "feed_forward_chunk_size" can be used to save memory
697
+ if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0:
698
+ raise ValueError(
699
+ f"`hidden_states` dimension to be chunked: {norm_hidden_states.shape[self._chunk_dim]} has to be divisible by chunk size: {self._chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
700
+ )
701
+
702
+ num_chunks = norm_hidden_states.shape[self._chunk_dim] // self._chunk_size
703
+ ff_output = torch.cat(
704
+ [self.ff(hid_slice) for hid_slice in norm_hidden_states.chunk(num_chunks, dim=self._chunk_dim)],
705
+ dim=self._chunk_dim,
706
+ )
707
+ else:
708
+ ff_output = self.ff(norm_hidden_states)
709
+
710
+ if self.use_ada_layer_norm_zero:
711
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
712
+
713
+ hidden_states = ff_output + hidden_states
714
+
715
+ return hidden_states
716
+
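# Illustrative check (not part of the uploaded module): with no cross_attention_dim
# this is a plain pre-norm self-attention block and the hidden shape is preserved.
# >>> blk = BasicTransformerBlock(dim=256, num_attention_heads=4,
# ...                             attention_head_dim=64, activation_fn="snakebeta")
# >>> blk(torch.randn(2, 50, 256)).shape
# torch.Size([2, 50, 256])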
717
+ class Downsample1D(nn.Module):
718
+ def __init__(self, dim):
719
+ super().__init__()
720
+ self.conv = torch.nn.Conv1d(dim, dim, 3, 2, 1)
721
+
722
+ def forward(self, x):
723
+ return self.conv(x)
724
+
725
+
726
+ class TimestepEmbedding(nn.Module):
727
+ def __init__(
728
+ self,
729
+ in_channels: int,
730
+ time_embed_dim: int,
731
+ act_fn: str = "silu",
732
+ out_dim: int = None,
733
+ post_act_fn: Optional[str] = None,
734
+ cond_proj_dim=None,
735
+ ):
736
+ super().__init__()
737
+
738
+ self.linear_1 = nn.Linear(in_channels, time_embed_dim)
739
+
740
+ if cond_proj_dim is not None:
741
+ self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False)
742
+ else:
743
+ self.cond_proj = None
744
+
745
+ self.act = get_activation(act_fn)
746
+
747
+ if out_dim is not None:
748
+ time_embed_dim_out = out_dim
749
+ else:
750
+ time_embed_dim_out = time_embed_dim
751
+ self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out)
752
+
753
+ if post_act_fn is None:
754
+ self.post_act = None
755
+ else:
756
+ self.post_act = get_activation(post_act_fn)
757
+
758
+ def forward(self, sample, condition=None):
759
+ if condition is not None:
760
+ sample = sample + self.cond_proj(condition)
761
+ sample = self.linear_1(sample)
762
+
763
+ if self.act is not None:
764
+ sample = self.act(sample)
765
+
766
+ sample = self.linear_2(sample)
767
+
768
+ if self.post_act is not None:
769
+ sample = self.post_act(sample)
770
+ return sample
771
+
772
+ class ConditionalDecoder(nn.Module):
773
+ def __init__(
774
+ self,
775
+ in_channels,
776
+ out_channels,
777
+ causal=False,
778
+ channels=(256, 256),
779
+ dropout=0.05,
780
+ attention_head_dim=64,
781
+ n_blocks=1,
782
+ num_mid_blocks=2,
783
+ num_heads=4,
784
+ act_fn="snake",
785
+ ):
786
+ """
787
+ This decoder requires an input with the same shape as the target. So, if your text content
789
+ is shorter or longer than the outputs, please re-sample it before feeding it to the decoder.
789
+ """
790
+ super().__init__()
791
+ channels = tuple(channels)
792
+ self.in_channels = in_channels
793
+ self.out_channels = out_channels
794
+ self.causal = causal
795
+ self.time_embeddings = SinusoidalPosEmb(in_channels)
796
+ time_embed_dim = channels[0] * 4
797
+ self.time_mlp = TimestepEmbedding(
798
+ in_channels=in_channels,
799
+ time_embed_dim=time_embed_dim,
800
+ act_fn="silu",
801
+ )
802
+ self.down_blocks = nn.ModuleList([])
803
+ self.mid_blocks = nn.ModuleList([])
804
+ self.up_blocks = nn.ModuleList([])
805
+
806
+ output_channel = in_channels
807
+ for i in range(len(channels)): # pylint: disable=consider-using-enumerate
808
+ input_channel = output_channel
809
+ output_channel = channels[i]
810
+ is_last = i == len(channels) - 1
811
+ resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal else \
812
+ ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
813
+ transformer_blocks = nn.ModuleList(
814
+ [
815
+ BasicTransformerBlock(
816
+ dim=output_channel,
817
+ num_attention_heads=num_heads,
818
+ attention_head_dim=attention_head_dim,
819
+ dropout=dropout,
820
+ activation_fn=act_fn,
821
+ )
822
+ for _ in range(n_blocks)
823
+ ]
824
+ )
825
+ downsample = (
826
+ Downsample1D(output_channel) if not is_last else
827
+ CausalConv1d(output_channel, output_channel, 3) if self.causal else nn.Conv1d(output_channel, output_channel, 3, padding=1)
828
+ )
829
+ self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample]))
830
+
831
+ for _ in range(num_mid_blocks):
832
+ input_channel = channels[-1]
833
+ out_channels = channels[-1]
834
+ resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal else \
835
+ ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
836
+
837
+ transformer_blocks = nn.ModuleList(
838
+ [
839
+ BasicTransformerBlock(
840
+ dim=output_channel,
841
+ num_attention_heads=num_heads,
842
+ attention_head_dim=attention_head_dim,
843
+ dropout=dropout,
844
+ activation_fn=act_fn,
845
+ )
846
+ for _ in range(n_blocks)
847
+ ]
848
+ )
849
+
850
+ self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks]))
851
+
852
+ channels = channels[::-1] + (channels[0],)
853
+ for i in range(len(channels) - 1):
854
+ input_channel = channels[i] * 2
855
+ output_channel = channels[i + 1]
856
+ is_last = i == len(channels) - 2
857
+ resnet = CausalResnetBlock1D(
858
+ dim=input_channel,
859
+ dim_out=output_channel,
860
+ time_emb_dim=time_embed_dim,
861
+ ) if self.causal else ResnetBlock1D(
862
+ dim=input_channel,
863
+ dim_out=output_channel,
864
+ time_emb_dim=time_embed_dim,
865
+ )
866
+ transformer_blocks = nn.ModuleList(
867
+ [
868
+ BasicTransformerBlock(
869
+ dim=output_channel,
870
+ num_attention_heads=num_heads,
871
+ attention_head_dim=attention_head_dim,
872
+ dropout=dropout,
873
+ activation_fn=act_fn,
874
+ )
875
+ for _ in range(n_blocks)
876
+ ]
877
+ )
878
+ upsample = (
879
+ Upsample1D(output_channel, use_conv_transpose=True)
880
+ if not is_last
881
+ else CausalConv1d(output_channel, output_channel, 3) if self.causal else nn.Conv1d(output_channel, output_channel, 3, padding=1)
882
+ )
883
+ self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))
884
+ self.final_block = CausalBlock1D(channels[-1], channels[-1]) if self.causal else Block1D(channels[-1], channels[-1])
885
+ self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)
886
+ self.initialize_weights()
887
+
888
+ def initialize_weights(self):
889
+ for m in self.modules():
890
+ if isinstance(m, nn.Conv1d):
891
+ nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
892
+ if m.bias is not None:
893
+ nn.init.constant_(m.bias, 0)
894
+ elif isinstance(m, nn.GroupNorm):
895
+ nn.init.constant_(m.weight, 1)
896
+ nn.init.constant_(m.bias, 0)
897
+ elif isinstance(m, nn.Linear):
898
+ nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
899
+ if m.bias is not None:
900
+ nn.init.constant_(m.bias, 0)
901
+
902
+ def forward(self, x, mask, mu, t, spks=None, cond=None):
903
+ """Forward pass of the UNet1DConditional model.
904
+
905
+ Args:
906
+ x (torch.Tensor): shape (batch_size, in_channels, time)
907
+ mask (torch.Tensor): shape (batch_size, 1, time)
+ mu (torch.Tensor): output of encoder, shape (batch_size, n_feats, time)
908
+ t (torch.Tensor): shape (batch_size,)
909
+ spks (torch.Tensor, optional): shape (batch_size, condition_channels). Defaults to None.
910
+ cond (torch.Tensor, optional): placeholder for future use. Defaults to None.
911
+
912
+ Raises:
913
+ ValueError: _description_
914
+ ValueError: _description_
915
+
916
+ Returns:
917
+ torch.Tensor: output tensor, shape (batch_size, out_channels, time)
918
+ """
919
+
920
+ t = self.time_embeddings(t).to(t.dtype)
921
+ t = self.time_mlp(t)
922
+
923
+ x = pack([x, mu], "b * t")[0]
924
+
925
+ if spks is not None:
926
+ spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
927
+ x = pack([x, spks], "b * t")[0]
928
+ if cond is not None:
929
+ x = pack([x, cond], "b * t")[0]
930
+
931
+ hiddens = []
932
+ masks = [mask]
933
+ for resnet, transformer_blocks, downsample in self.down_blocks:
934
+ mask_down = masks[-1]
935
+ x = resnet(x, mask_down, t)
936
+ x = rearrange(x, "b c t -> b t c").contiguous()
937
+ # attn_mask = torch.matmul(mask_down.transpose(1, 2).contiguous(), mask_down)
938
+ attn_mask = add_optional_chunk_mask(x, mask_down.bool(), False, False, 0, self.static_chunk_size, -1)
939
+ attn_mask = mask_to_bias(attn_mask == 1, x.dtype)
940
+ for transformer_block in transformer_blocks:
941
+ x = transformer_block(
942
+ hidden_states=x,
943
+ attention_mask=attn_mask,
944
+ timestep=t,
945
+ )
946
+ x = rearrange(x, "b t c -> b c t").contiguous()
947
+ hiddens.append(x) # Save hidden states for skip connections
948
+ x = downsample(x * mask_down)
949
+ masks.append(mask_down[:, :, ::2])
950
+ masks = masks[:-1]
951
+ mask_mid = masks[-1]
952
+
953
+ for resnet, transformer_blocks in self.mid_blocks:
954
+ x = resnet(x, mask_mid, t)
955
+ x = rearrange(x, "b c t -> b t c").contiguous()
956
+ # attn_mask = torch.matmul(mask_mid.transpose(1, 2).contiguous(), mask_mid)
957
+ attn_mask = add_optional_chunk_mask(x, mask_mid.bool(), False, False, 0, self.static_chunk_size, -1)
958
+ attn_mask = mask_to_bias(attn_mask == 1, x.dtype)
959
+ for transformer_block in transformer_blocks:
960
+ x = transformer_block(
961
+ hidden_states=x,
962
+ attention_mask=attn_mask,
963
+ timestep=t,
964
+ )
965
+ x = rearrange(x, "b t c -> b c t").contiguous()
966
+
967
+ for resnet, transformer_blocks, upsample in self.up_blocks:
968
+ mask_up = masks.pop()
969
+ skip = hiddens.pop()
970
+ x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0]
971
+ x = resnet(x, mask_up, t)
972
+ x = rearrange(x, "b c t -> b t c").contiguous()
973
+ # attn_mask = torch.matmul(mask_up.transpose(1, 2).contiguous(), mask_up)
974
+ attn_mask = add_optional_chunk_mask(x, mask_up.bool(), False, False, 0, self.static_chunk_size, -1)
975
+ attn_mask = mask_to_bias(attn_mask == 1, x.dtype)
976
+ for transformer_block in transformer_blocks:
977
+ x = transformer_block(
978
+ hidden_states=x,
979
+ attention_mask=attn_mask,
980
+ timestep=t,
981
+ )
982
+ x = rearrange(x, "b t c -> b c t").contiguous()
983
+ x = upsample(x * mask_up)
984
+ x = self.final_block(x, mask_up)
985
+ output = self.final_proj(x * mask_up)
986
+ return output * mask
987
+
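In `ConditionalDecoder.forward`, `pack([x, mu], "b * t")` is simply a channel-wise concatenation: every tensor shares the batch and time axes and is stacked along the starred axis. A small sketch of how the conditioning tensors end up stacked; the shapes are illustrative, not taken from any real config:

```python
import torch
from einops import pack, repeat

x    = torch.randn(1, 80, 120)   # noisy mel, (batch, n_feats, time)
mu   = torch.randn(1, 80, 120)   # encoder output at the mel frame rate
spks = torch.randn(1, 80)        # projected speaker embedding
cond = torch.randn(1, 80, 120)   # prompt-mel condition

h, _ = pack([x, mu], "b * t")                                    # (1, 160, 120)
h, _ = pack([h, repeat(spks, "b c -> b c t", t=120)], "b * t")   # (1, 240, 120)
h, _ = pack([h, cond], "b * t")                                  # (1, 320, 120)
print(h.shape)   # with these illustrative shapes, the first down block sees 320 channels
```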
988
+ class ConditionalCFM(BASECFM):
989
+ def __init__(self, in_channels=240, cfm_params=None, n_spks=1, spk_emb_dim=64, estimator_config=None):
990
+ super().__init__(
991
+ n_feats=in_channels,
992
+ cfm_params=cfm_params,
993
+ n_spks=n_spks,
994
+ spk_emb_dim=spk_emb_dim,
995
+ )
996
+ self.t_scheduler = cfm_params.t_scheduler
997
+ self.training_cfg_rate = cfm_params.training_cfg_rate
998
+ self.inference_cfg_rate = cfm_params.inference_cfg_rate
999
+ in_channels = in_channels + (spk_emb_dim if n_spks > 0 else 0)
1000
+ # Just change the architecture of the estimator here
1001
+ self.estimator = ConditionalDecoder(**estimator_config)
1002
+ self.lock = threading.Lock()
1003
+
1004
+ @torch.inference_mode()
1005
+ def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None, prompt_len=0, flow_cache=torch.zeros(1, 80, 0, 2)):
1006
+ """Forward diffusion
1007
+
1008
+ Args:
1009
+ mu (torch.Tensor): output of encoder
1010
+ shape: (batch_size, n_feats, mel_timesteps)
1011
+ mask (torch.Tensor): output_mask
1012
+ shape: (batch_size, 1, mel_timesteps)
1013
+ n_timesteps (int): number of diffusion steps
1014
+ temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
1015
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
1016
+ shape: (batch_size, spk_emb_dim)
1017
+ cond: Not used but kept for future purposes
1018
+
1019
+ Returns:
1020
+ sample: generated mel-spectrogram
1021
+ shape: (batch_size, n_feats, mel_timesteps)
1022
+ """
1023
+
1024
+ z = torch.randn_like(mu).to(mu.device).to(mu.dtype) * temperature
1025
+ cache_size = flow_cache.shape[2]
1026
+ # fix prompt and overlap part mu and z
1027
+ if cache_size != 0:
1028
+ z[:, :, :cache_size] = flow_cache[:, :, :, 0]
1029
+ mu[:, :, :cache_size] = flow_cache[:, :, :, 1]
1030
+ z_cache = torch.concat([z[:, :, :prompt_len], z[:, :, -34:]], dim=2)
1031
+ mu_cache = torch.concat([mu[:, :, :prompt_len], mu[:, :, -34:]], dim=2)
1032
+ flow_cache = torch.stack([z_cache, mu_cache], dim=-1)
1033
+
1034
+ t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
1035
+ if self.t_scheduler == 'cosine':
1036
+ t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
1037
+ return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond), flow_cache
1038
+
1039
+ def solve_euler(self, x, t_span, mu, mask, spks, cond):
1040
+ """
1041
+ Fixed-step Euler solver for ODEs.
1042
+ Args:
1043
+ x (torch.Tensor): random noise
1044
+ t_span (torch.Tensor): n_timesteps interpolated
1045
+ shape: (n_timesteps + 1,)
1046
+ mu (torch.Tensor): output of encoder
1047
+ shape: (batch_size, n_feats, mel_timesteps)
1048
+ mask (torch.Tensor): output_mask
1049
+ shape: (batch_size, 1, mel_timesteps)
1050
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
1051
+ shape: (batch_size, spk_emb_dim)
1052
+ cond: Not used but kept for future purposes
1053
+ """
1054
+ t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
1055
+ t = t.unsqueeze(dim=0)
1056
+
1057
+ # I am storing this because I can later plot it by putting a debugger here and saving it to a file
1058
+ # Or a return_all_steps flag could be added in the future
1059
+ sol = []
1060
+
1061
+ # Do not use concat here: it may change the memory format and cause TensorRT inference to produce wrong results!
1062
+ x_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
1063
+ mask_in = torch.zeros([2, 1, x.size(2)], device=x.device, dtype=x.dtype)
1064
+ mu_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
1065
+ t_in = torch.zeros([2], device=x.device, dtype=x.dtype)
1066
+ spks_in = torch.zeros([2, 80], device=x.device, dtype=x.dtype)
1067
+ cond_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
1068
+ for step in range(1, len(t_span)):
1069
+ # Classifier-Free Guidance inference introduced in VoiceBox
1070
+ x_in[:] = x
1071
+ mask_in[:] = mask
1072
+ mu_in[0] = mu
1073
+ t_in[:] = t.unsqueeze(0)
1074
+ spks_in[0] = spks
1075
+ cond_in[0] = cond
1076
+ dphi_dt = self.forward_estimator(
1077
+ x_in, mask_in,
1078
+ mu_in, t_in,
1079
+ spks_in,
1080
+ cond_in
1081
+ )
1082
+ dphi_dt, cfg_dphi_dt = torch.split(dphi_dt, [x.size(0), x.size(0)], dim=0)
1083
+ dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - self.inference_cfg_rate * cfg_dphi_dt)
1084
+ x = x + dt * dphi_dt
1085
+ t = t + dt
1086
+ sol.append(x)
1087
+ if step < len(t_span) - 1:
1088
+ dt = t_span[step + 1] - t
1089
+
1090
+ return sol[-1].float()
1091
+
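`solve_euler` is a fixed-step Euler integration of the learned vector field, with classifier-free guidance formed from a conditional and an unconditional estimate computed in one batched call. A stripped-down sketch of the same update rule on a toy vector field; the `estimator` closure below is hypothetical and only illustrates the arithmetic:

```python
import torch

def estimator(x, t, cond):
    # Toy vector field standing in for the ConditionalDecoder call.
    return (cond - x) if cond is not None else -x

def euler_cfg(x, t_span, cond, cfg_rate=0.7):
    t = t_span[0]
    for step in range(1, len(t_span)):
        dt = t_span[step] - t
        v_cond   = estimator(x, t, cond)    # conditional branch
        v_uncond = estimator(x, t, None)    # unconditional branch
        v = (1.0 + cfg_rate) * v_cond - cfg_rate * v_uncond
        x = x + dt * v                      # Euler step
        t = t_span[step]
    return x

x0 = torch.randn(1, 80, 120)
t_span = torch.linspace(0, 1, 11)           # 10 steps, e.g. n_timesteps=10
out = euler_cfg(x0, t_span, cond=torch.zeros_like(x0))
print(out.shape)                            # torch.Size([1, 80, 120])
```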
1092
+ def forward_estimator(self, x, mask, mu, t, spks, cond):
1093
+ if isinstance(self.estimator, torch.nn.Module):
1094
+ return self.estimator.forward(x, mask, mu, t, spks, cond)
1095
+ else:
1096
+ with self.lock:
1097
+ self.estimator.set_input_shape('x', (2, 80, x.size(2)))
1098
+ self.estimator.set_input_shape('mask', (2, 1, x.size(2)))
1099
+ self.estimator.set_input_shape('mu', (2, 80, x.size(2)))
1100
+ self.estimator.set_input_shape('t', (2,))
1101
+ self.estimator.set_input_shape('spks', (2, 80))
1102
+ self.estimator.set_input_shape('cond', (2, 80, x.size(2)))
1103
+ # run trt engine
1104
+ self.estimator.execute_v2([x.contiguous().data_ptr(),
1105
+ mask.contiguous().data_ptr(),
1106
+ mu.contiguous().data_ptr(),
1107
+ t.contiguous().data_ptr(),
1108
+ spks.contiguous().data_ptr(),
1109
+ cond.contiguous().data_ptr(),
1110
+ x.data_ptr()])
1111
+ return x
1112
+
1113
+ def compute_loss(self, x1, mask, mu, spks=None, cond=None):
1114
+ """Computes diffusion loss
1115
+
1116
+ Args:
1117
+ x1 (torch.Tensor): Target
1118
+ shape: (batch_size, n_feats, mel_timesteps)
1119
+ mask (torch.Tensor): target mask
1120
+ shape: (batch_size, 1, mel_timesteps)
1121
+ mu (torch.Tensor): output of encoder
1122
+ shape: (batch_size, n_feats, mel_timesteps)
1123
+ spks (torch.Tensor, optional): speaker embedding. Defaults to None.
1124
+ shape: (batch_size, spk_emb_dim)
1125
+
1126
+ Returns:
1127
+ loss: conditional flow matching loss
1128
+ y: conditional flow
1129
+ shape: (batch_size, n_feats, mel_timesteps)
1130
+ """
1131
+ b, _, t = mu.shape
1132
+
1133
+ # random timestep
1134
+ t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
1135
+ if self.t_scheduler == 'cosine':
1136
+ t = 1 - torch.cos(t * 0.5 * torch.pi)
1137
+ # sample noise p(x_0)
1138
+ z = torch.randn_like(x1)
1139
+
1140
+ y = (1 - (1 - self.sigma_min) * t) * z + t * x1
1141
+ u = x1 - (1 - self.sigma_min) * z
1142
+
1143
+ # during training, we randomly drop condition to trade off mode coverage and sample fidelity
1144
+ if self.training_cfg_rate > 0:
1145
+ cfg_mask = torch.rand(b, device=x1.device) > self.training_cfg_rate
1146
+ mu = mu * cfg_mask.view(-1, 1, 1)
1147
+ spks = spks * cfg_mask.view(-1, 1)
1148
+ cond = cond * cfg_mask.view(-1, 1, 1)
1149
+
1150
+ pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond)
1151
+ loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1])
1152
+ return loss, y
1153
+
1154
+
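`compute_loss` regresses the estimator onto the optimal-transport flow-matching target: the point on the path is y_t = (1 - (1 - sigma_min) * t) * z + t * x1 and the target velocity is u = x1 - (1 - sigma_min) * z, which is exactly dy_t/dt. A small numerical check of that identity; sigma_min here is an assumed value, the real one comes from cfm_params:

```python
import torch

sigma_min = 1e-4                      # assumed value; the real one comes from cfm_params
x1 = torch.randn(2, 80, 100)          # target mel
z  = torch.randn_like(x1)             # noise sample x_0
t  = torch.rand(2, 1, 1)

y = (1 - (1 - sigma_min) * t) * z + t * x1   # point on the probability path
u = x1 - (1 - sigma_min) * z                 # regression target: the path's velocity

# u equals d(y)/dt for every t, so a finite difference recovers it.
eps = 1e-2
y_eps = (1 - (1 - sigma_min) * (t + eps)) * z + (t + eps) * x1
assert torch.allclose((y_eps - y) / eps, u, atol=1e-3)
```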
1155
+ class CausalConditionalCFM(ConditionalCFM):
1156
+ def __init__(self, in_channels=240, cfm_params=None, n_spks=1, spk_emb_dim=64, estimator_config=None):
1157
+ super().__init__(in_channels, cfm_params, n_spks, spk_emb_dim, estimator_config)
1158
+ self.rand_noise = torch.randn([1, 80, 50 * 300])
1159
+
1160
+ @torch.inference_mode()
1161
+ def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None):
1162
+ """Forward diffusion
1163
+
1164
+ Args:
1165
+ mu (torch.Tensor): output of encoder
1166
+ shape: (batch_size, n_feats, mel_timesteps)
1167
+ mask (torch.Tensor): output_mask
1168
+ shape: (batch_size, 1, mel_timesteps)
1169
+ n_timesteps (int): number of diffusion steps
1170
+ temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
1171
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
1172
+ shape: (batch_size, spk_emb_dim)
1173
+ cond: Not used but kept for future purposes
1174
+
1175
+ Returns:
1176
+ sample: generated mel-spectrogram
1177
+ shape: (batch_size, n_feats, mel_timesteps)
1178
+ """
1179
+
1180
+ z = self.rand_noise[:, :, :mu.size(2)].to(mu.device).to(mu.dtype) * temperature
1181
+ # fix prompt and overlap part mu and z
1182
+ t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
1183
+ if self.t_scheduler == 'cosine':
1184
+ t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
1185
+ return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond), None
1186
+
1187
+ class PositionwiseFeedForward(torch.nn.Module):
1188
+ """Positionwise feed forward layer.
1189
+
1190
+ The feed-forward layer is applied to each position of the sequence.
1191
+ The output dimension is the same as the input dimension.
1192
+
1193
+ Args:
1194
+ idim (int): Input dimension.
1195
+ hidden_units (int): The number of hidden units.
1196
+ dropout_rate (float): Dropout rate.
1197
+ activation (torch.nn.Module): Activation function
1198
+ """
1199
+
1200
+ def __init__(
1201
+ self,
1202
+ idim: int,
1203
+ hidden_units: int,
1204
+ dropout_rate: float,
1205
+ activation: torch.nn.Module = torch.nn.ReLU(),
1206
+ ):
1207
+ """Construct a PositionwiseFeedForward object."""
1208
+ super(PositionwiseFeedForward, self).__init__()
1209
+ self.w_1 = torch.nn.Linear(idim, hidden_units)
1210
+ self.activation = activation
1211
+ self.dropout = torch.nn.Dropout(dropout_rate)
1212
+ self.w_2 = torch.nn.Linear(hidden_units, idim)
1213
+
1214
+ def forward(self, xs: torch.Tensor) -> torch.Tensor:
1215
+ """Forward function.
1216
+
1217
+ Args:
1218
+ xs: input tensor (B, L, D)
1219
+ Returns:
1220
+ output tensor, (B, L, D)
1221
+ """
1222
+ return self.w_2(self.dropout(self.activation(self.w_1(xs))))
1223
+
1224
+ class ConformerEncoderLayer(nn.Module):
1225
+ """Encoder layer module.
1226
+ Args:
1227
+ size (int): Input dimension.
1228
+ self_attn (torch.nn.Module): Self-attention module instance.
1229
+ `MultiHeadedAttention` or `RelPositionMultiHeadedAttention`
1230
+ instance can be used as the argument.
1231
+ feed_forward (torch.nn.Module): Feed-forward module instance.
1232
+ `PositionwiseFeedForward` instance can be used as the argument.
1233
+ feed_forward_macaron (torch.nn.Module): Additional feed-forward module
1234
+ instance.
1235
+ `PositionwiseFeedForward` instance can be used as the argument.
1236
+ conv_module (torch.nn.Module): Convolution module instance.
1237
+ `ConvolutionModule` instance can be used as the argument.
1238
+ dropout_rate (float): Dropout rate.
1239
+ normalize_before (bool):
1240
+ True: use layer_norm before each sub-block.
1241
+ False: use layer_norm after each sub-block.
1242
+ """
1243
+
1244
+ def __init__(
1245
+ self,
1246
+ size: int,
1247
+ self_attn: torch.nn.Module,
1248
+ feed_forward: Optional[nn.Module] = None,
1249
+ feed_forward_macaron: Optional[nn.Module] = None,
1250
+ conv_module: Optional[nn.Module] = None,
1251
+ dropout_rate: float = 0.1,
1252
+ normalize_before: bool = True,
1253
+ ):
1254
+ """Construct an EncoderLayer object."""
1255
+ super().__init__()
1256
+ self.self_attn = self_attn
1257
+ self.feed_forward = feed_forward
1258
+ self.feed_forward_macaron = feed_forward_macaron
1259
+ self.conv_module = conv_module
1260
+ self.norm_ff = nn.LayerNorm(size, eps=1e-12) # for the FNN module
1261
+ self.norm_mha = nn.LayerNorm(size, eps=1e-12) # for the MHA module
1262
+ if feed_forward_macaron is not None:
1263
+ self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-12)
1264
+ self.ff_scale = 0.5
1265
+ else:
1266
+ self.ff_scale = 1.0
1267
+ if self.conv_module is not None:
1268
+ self.norm_conv = nn.LayerNorm(size, eps=1e-12) # for the CNN module
1269
+ self.norm_final = nn.LayerNorm(
1270
+ size, eps=1e-12) # for the final output of the block
1271
+ self.dropout = nn.Dropout(dropout_rate)
1272
+ self.size = size
1273
+ self.normalize_before = normalize_before
1274
+
1275
+ def forward(
1276
+ self,
1277
+ x: torch.Tensor,
1278
+ mask: torch.Tensor,
1279
+ pos_emb: torch.Tensor,
1280
+ mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
1281
+ att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
1282
+ cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
1283
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
1284
+ """Compute encoded features.
1285
+
1286
+ Args:
1287
+ x (torch.Tensor): (#batch, time, size)
1288
+ mask (torch.Tensor): Mask tensor for the input (#batch, time, time),
1289
+ (0, 0, 0) means fake mask.
1290
+ pos_emb (torch.Tensor): positional encoding, must not be None
1291
+ for ConformerEncoderLayer.
1292
+ mask_pad (torch.Tensor): batch padding mask used for conv module.
1293
+ (#batch, 1, time), (0, 0, 0) means fake mask.
1294
+ att_cache (torch.Tensor): Cache tensor of the KEY & VALUE
1295
+ (#batch=1, head, cache_t1, d_k * 2), head * d_k == size.
1296
+ cnn_cache (torch.Tensor): Convolution cache in conformer layer
1297
+ (#batch=1, size, cache_t2)
1298
+ Returns:
1299
+ torch.Tensor: Output tensor (#batch, time, size).
1300
+ torch.Tensor: Mask tensor (#batch, time, time).
1301
+ torch.Tensor: att_cache tensor,
1302
+ (#batch=1, head, cache_t1 + time, d_k * 2).
1303
+ torch.Tensor: cnn_cache tensor (#batch, size, cache_t2).
1304
+ """
1305
+
1306
+ # whether to use macaron style
1307
+ if self.feed_forward_macaron is not None:
1308
+ residual = x
1309
+ if self.normalize_before:
1310
+ x = self.norm_ff_macaron(x)
1311
+ x = residual + self.ff_scale * self.dropout(
1312
+ self.feed_forward_macaron(x))
1313
+ if not self.normalize_before:
1314
+ x = self.norm_ff_macaron(x)
1315
+
1316
+ # multi-headed self-attention module
1317
+ residual = x
1318
+ if self.normalize_before:
1319
+ x = self.norm_mha(x)
1320
+ x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb,
1321
+ att_cache)
1322
+ x = residual + self.dropout(x_att)
1323
+ if not self.normalize_before:
1324
+ x = self.norm_mha(x)
1325
+
1326
+ # convolution module
1327
+ # Fake new cnn cache here, and then change it in conv_module
1328
+ new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)
1329
+ if self.conv_module is not None:
1330
+ residual = x
1331
+ if self.normalize_before:
1332
+ x = self.norm_conv(x)
1333
+ x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache)
1334
+ x = residual + self.dropout(x)
1335
+
1336
+ if not self.normalize_before:
1337
+ x = self.norm_conv(x)
1338
+
1339
+ # feed forward module
1340
+ residual = x
1341
+ if self.normalize_before:
1342
+ x = self.norm_ff(x)
1343
+
1344
+ x = residual + self.ff_scale * self.dropout(self.feed_forward(x))
1345
+ if not self.normalize_before:
1346
+ x = self.norm_ff(x)
1347
+
1348
+ if self.conv_module is not None:
1349
+ x = self.norm_final(x)
1350
+
1351
+ return x, mask, new_att_cache, new_cnn_cache
1352
+
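With `normalize_before=True`, every sub-block in the layer above follows the pre-norm residual pattern: normalize the input, apply the module, scale, and add back onto the untouched residual; `ff_scale=0.5` is the macaron half-step. A minimal sketch of one such sub-block with illustrative sizes:

```python
import torch
import torch.nn as nn

size, ff_scale = 256, 0.5                       # 0.5 is the macaron half-step
norm = nn.LayerNorm(size, eps=1e-12)
ff = nn.Sequential(nn.Linear(size, 1024), nn.SiLU(), nn.Linear(1024, size))
dropout = nn.Dropout(0.1)

x = torch.randn(4, 50, size)
residual = x
x = residual + ff_scale * dropout(ff(norm(x)))  # pre-norm: normalize, apply, scale, add back
```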
1353
+ class ConvolutionModule(nn.Module):
1354
+ """ConvolutionModule in Conformer model."""
1355
+
1356
+ def __init__(self,
1357
+ channels: int,
1358
+ kernel_size: int = 15,
1359
+ activation: nn.Module = nn.ReLU(),
1360
+ norm: str = "batch_norm",
1361
+ causal: bool = False,
1362
+ bias: bool = True):
1363
+ """Construct an ConvolutionModule object.
1364
+ Args:
1365
+ channels (int): The number of channels of conv layers.
1366
+ kernel_size (int): Kernel size of conv layers.
1367
+ causal (bool): Whether to use causal convolution or not.
1368
+ """
1369
+ super().__init__()
1370
+
1371
+ self.pointwise_conv1 = nn.Conv1d(
1372
+ channels,
1373
+ 2 * channels,
1374
+ kernel_size=1,
1375
+ stride=1,
1376
+ padding=0,
1377
+ bias=bias,
1378
+ )
1379
+ # self.lorder is used to distinguish if it's a causal convolution,
1380
+ # if self.lorder > 0: it's a causal convolution, the input will be
1381
+ # padded with self.lorder frames on the left in forward.
1382
+ # else: it's a symmetrical convolution
1383
+ if causal:
1384
+ padding = 0
1385
+ self.lorder = kernel_size - 1
1386
+ else:
1387
+ # kernel_size should be an odd number for non-causal convolution
1388
+ assert (kernel_size - 1) % 2 == 0
1389
+ padding = (kernel_size - 1) // 2
1390
+ self.lorder = 0
1391
+ self.depthwise_conv = nn.Conv1d(
1392
+ channels,
1393
+ channels,
1394
+ kernel_size,
1395
+ stride=1,
1396
+ padding=padding,
1397
+ groups=channels,
1398
+ bias=bias,
1399
+ )
1400
+
1401
+ assert norm in ['batch_norm', 'layer_norm']
1402
+ if norm == "batch_norm":
1403
+ self.use_layer_norm = False
1404
+ self.norm = nn.BatchNorm1d(channels)
1405
+ else:
1406
+ self.use_layer_norm = True
1407
+ self.norm = nn.LayerNorm(channels)
1408
+
1409
+ self.pointwise_conv2 = nn.Conv1d(
1410
+ channels,
1411
+ channels,
1412
+ kernel_size=1,
1413
+ stride=1,
1414
+ padding=0,
1415
+ bias=bias,
1416
+ )
1417
+ self.activation = activation
1418
+
1419
+ def forward(
1420
+ self,
1421
+ x: torch.Tensor,
1422
+ mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
1423
+ cache: torch.Tensor = torch.zeros((0, 0, 0)),
1424
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
1425
+ """Compute convolution module.
1426
+ Args:
1427
+ x (torch.Tensor): Input tensor (#batch, time, channels).
1428
+ mask_pad (torch.Tensor): used for batch padding (#batch, 1, time),
1429
+ (0, 0, 0) means fake mask.
1430
+ cache (torch.Tensor): left context cache, it is only
1431
+ used in causal convolution (#batch, channels, cache_t),
1432
+ (0, 0, 0) means fake cache.
1433
+ Returns:
1434
+ torch.Tensor: Output tensor (#batch, time, channels).
+ torch.Tensor: New convolution cache (#batch, channels, cache_t).
1435
+ """
1436
+ # exchange the temporal dimension and the feature dimension
1437
+ x = x.transpose(1, 2) # (#batch, channels, time)
1438
+
1439
+ # mask batch padding
1440
+ if mask_pad.size(2) > 0: # time > 0
1441
+ x.masked_fill_(~mask_pad, 0.0)
1442
+
1443
+ if self.lorder > 0:
1444
+ if cache.size(2) == 0: # cache_t == 0
1445
+ x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0)
1446
+ else:
1447
+ assert cache.size(0) == x.size(0) # equal batch
1448
+ assert cache.size(1) == x.size(1) # equal channel
1449
+ x = torch.cat((cache, x), dim=2)
1450
+ assert (x.size(2) > self.lorder)
1451
+ new_cache = x[:, :, -self.lorder:]
1452
+ else:
1453
+ # It's better we just return None if no cache is required,
1454
+ # However, for JIT export, here we just fake one tensor instead of
1455
+ # None.
1456
+ new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)
1457
+
1458
+ # GLU mechanism
1459
+ x = self.pointwise_conv1(x) # (batch, 2*channel, dim)
1460
+ x = nn.functional.glu(x, dim=1) # (batch, channel, dim)
1461
+
1462
+ # 1D Depthwise Conv
1463
+ x = self.depthwise_conv(x)
1464
+ if self.use_layer_norm:
1465
+ x = x.transpose(1, 2)
1466
+ x = self.activation(self.norm(x))
1467
+ if self.use_layer_norm:
1468
+ x = x.transpose(1, 2)
1469
+ x = self.pointwise_conv2(x)
1470
+ # mask batch padding
1471
+ if mask_pad.size(2) > 0: # time > 0
1472
+ x.masked_fill_(~mask_pad, 0.0)
1473
+
1474
+ return x.transpose(1, 2), new_cache
1475
+
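The `lorder` handling above is what makes the depthwise convolution streamable: left-padding a whole utterance with `kernel_size - 1` zeros gives the same output as processing it chunk by chunk while carrying the last `lorder` frames as a cache. A small check of that equivalence with a plain depthwise Conv1d and illustrative sizes:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(0)
channels, kernel_size = 8, 5
lorder = kernel_size - 1
conv = nn.Conv1d(channels, channels, kernel_size, padding=0, groups=channels)

x = torch.randn(1, channels, 20)

# Whole utterance: pad `lorder` zeros on the left.
full = conv(F.pad(x, (lorder, 0)))

# Streaming: process two chunks, carrying the last `lorder` frames as cache.
cache = torch.zeros(1, channels, lorder)
outs = []
for chunk in x.split(10, dim=2):
    inp = torch.cat([cache, chunk], dim=2)
    outs.append(conv(inp))
    cache = inp[:, :, -lorder:]

assert torch.allclose(full, torch.cat(outs, dim=2), atol=1e-6)
```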
1476
+ class Upsample1D(nn.Module):
1477
+ """A 1D upsampling layer with an optional convolution.
1478
+
1479
+ Parameters:
1480
+ channels (`int`):
1481
+ number of channels in the inputs and outputs.
1482
+ out_channels (`int`):
1483
+ number of output channels.
1484
+ stride (`int`, default `2`):
1485
+ upsampling factor; the input is repeat-interpolated by this factor
1486
+ before the left-padded convolution.
1488
+ """
1489
+
1490
+ def __init__(self, channels: int, out_channels: int, stride: int = 2):
1491
+ super().__init__()
1492
+ self.channels = channels
1493
+ self.out_channels = out_channels
1494
+ self.stride = stride
1495
+ # In this mode, first repeat-interpolate (nearest), then conv with stride=1
1496
+ self.conv = nn.Conv1d(self.channels, self.out_channels, stride * 2 + 1, stride=1, padding=0)
1497
+
1498
+ def forward(self, inputs: torch.Tensor, input_lengths: torch.Tensor):
1499
+ outputs = F.interpolate(inputs, scale_factor=float(self.stride), mode="nearest")
1500
+ outputs = F.pad(outputs, (self.stride * 2, 0), value=0.0)
1501
+ outputs = self.conv(outputs)
1502
+ return outputs, input_lengths * self.stride
1503
+
1504
+
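`Upsample1D` raises the frame rate by nearest-neighbour repetition followed by a left-padded convolution, so the output length is exactly `stride * input_length` and no future frames are consulted. A quick shape check, assuming the class is importable from this file:

```python
import torch

up = Upsample1D(channels=512, out_channels=512, stride=2)   # class defined above
x = torch.randn(1, 512, 50)                                 # (batch, channels, frames)
y, y_len = up(x, torch.tensor([50]))
print(y.shape, y_len)   # torch.Size([1, 512, 100]) tensor([100])
```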
1505
+ class PreLookaheadLayer(nn.Module):
1506
+ def __init__(self, channels: int, pre_lookahead_len: int = 1):
1507
+ super().__init__()
1508
+ self.channels = channels
1509
+ self.pre_lookahead_len = pre_lookahead_len
1510
+ self.conv1 = nn.Conv1d(
1511
+ channels, channels,
1512
+ kernel_size=pre_lookahead_len + 1,
1513
+ stride=1, padding=0,
1514
+ )
1515
+ self.conv2 = nn.Conv1d(
1516
+ channels, channels,
1517
+ kernel_size=3, stride=1, padding=0,
1518
+ )
1519
+
1520
+ def forward(self, inputs: torch.Tensor) -> torch.Tensor:
1521
+ """
1522
+ inputs: (batch_size, seq_len, channels)
1523
+ """
1524
+ outputs = inputs.transpose(1, 2).contiguous()
1525
+ # look ahead
1526
+ outputs = F.pad(outputs, (0, self.pre_lookahead_len), mode='constant', value=0.0)
1527
+ outputs = F.leaky_relu(self.conv1(outputs))
1528
+ # outputs
1529
+ outputs = F.pad(outputs, (2, 0), mode='constant', value=0.0)
1530
+ outputs = self.conv2(outputs)
1531
+ outputs = outputs.transpose(1, 2).contiguous()
1532
+
1533
+ # residual connection
1534
+ outputs = outputs + inputs
1535
+ return outputs
1536
+
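`PreLookaheadLayer` pads `pre_lookahead_len` zeros on the right before a kernel of size `pre_lookahead_len + 1`, so each output frame sees exactly that many future frames while the sequence length is preserved; the second convolution is left-padded and therefore causal. A quick shape check, assuming the class is importable from this file:

```python
import torch

layer = PreLookaheadLayer(channels=512, pre_lookahead_len=3)   # class defined above
x = torch.randn(1, 40, 512)        # (batch, seq_len, channels)
print(layer(x).shape)              # torch.Size([1, 40, 512]): same length, residual added
```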
1537
+ class BaseSubsampling(torch.nn.Module):
1538
+
1539
+ def __init__(self):
1540
+ super().__init__()
1541
+ self.right_context = 0
1542
+ self.subsampling_rate = 1
1543
+
1544
+ def position_encoding(self, offset: Union[int, torch.Tensor],
1545
+ size: int) -> torch.Tensor:
1546
+ return self.pos_enc.position_encoding(offset, size)
1547
+
1548
+ class LinearNoSubsampling(BaseSubsampling):
1549
+ """Linear transform the input without subsampling
1550
+
1551
+ Args:
1552
+ idim (int): Input dimension.
1553
+ odim (int): Output dimension.
1554
+ dropout_rate (float): Dropout rate.
1555
+
1556
+ """
1557
+
1558
+ def __init__(self, idim: int, odim: int, dropout_rate: float,
1559
+ pos_enc_class: torch.nn.Module):
1560
+ """Construct an linear object."""
1561
+ super().__init__()
1562
+ self.out = torch.nn.Sequential(
1563
+ torch.nn.Linear(idim, odim),
1564
+ torch.nn.LayerNorm(odim, eps=1e-5),
1565
+ torch.nn.Dropout(dropout_rate),
1566
+ )
1567
+ self.pos_enc = pos_enc_class
1568
+ self.right_context = 0
1569
+ self.subsampling_rate = 1
1570
+
1571
+ def forward(
1572
+ self,
1573
+ x: torch.Tensor,
1574
+ x_mask: torch.Tensor,
1575
+ offset: Union[int, torch.Tensor] = 0
1576
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
1577
+ """Input x.
1578
+
1579
+ Args:
1580
+ x (torch.Tensor): Input tensor (#batch, time, idim).
1581
+ x_mask (torch.Tensor): Input mask (#batch, 1, time).
1582
+
1583
+ Returns:
1584
+ torch.Tensor: linear input tensor (#batch, time', odim),
1585
+ where time' == time.
1586
+ torch.Tensor: linear input mask (#batch, 1, time'),
1587
+ where time' == time.
1588
+
1589
+ """
1590
+ x = self.out(x)
1591
+ x, pos_emb = self.pos_enc(x, offset)
1592
+ return x, pos_emb, x_mask
1593
+
1594
+ class EspnetRelPositionalEncoding(torch.nn.Module):
1595
+ """Relative positional encoding module (new implementation).
1596
+
1597
+ Details can be found in https://github.com/espnet/espnet/pull/2816.
1598
+
1599
+ See : Appendix B in https://arxiv.org/abs/1901.02860
1600
+
1601
+ Args:
1602
+ d_model (int): Embedding dimension.
1603
+ dropout_rate (float): Dropout rate.
1604
+ max_len (int): Maximum input length.
1605
+
1606
+ """
1607
+
1608
+ def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000):
1609
+ """Construct an PositionalEncoding object."""
1610
+ super(EspnetRelPositionalEncoding, self).__init__()
1611
+ self.d_model = d_model
1612
+ self.xscale = math.sqrt(self.d_model)
1613
+ self.dropout = torch.nn.Dropout(p=dropout_rate)
1614
+ self.pe = None
1615
+ self.extend_pe(torch.tensor(0.0).expand(1, max_len))
1616
+
1617
+ def extend_pe(self, x: torch.Tensor):
1618
+ """Reset the positional encodings."""
1619
+ if self.pe is not None:
1620
+ # self.pe contains both positive and negative parts
1621
+ # the length of self.pe is 2 * input_len - 1
1622
+ if self.pe.size(1) >= x.size(1) * 2 - 1:
1623
+ if self.pe.dtype != x.dtype or self.pe.device != x.device:
1624
+ self.pe = self.pe.to(dtype=x.dtype, device=x.device)
1625
+ return
1626
+ # Suppose `i` is the position of the query vector and `j` is the
1627
+ # position of the key vector. We use positive relative positions when keys
1628
+ # are to the left (i > j) and negative relative positions otherwise (i < j).
1629
+ pe_positive = torch.zeros(x.size(1), self.d_model)
1630
+ pe_negative = torch.zeros(x.size(1), self.d_model)
1631
+ position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
1632
+ div_term = torch.exp(
1633
+ torch.arange(0, self.d_model, 2, dtype=torch.float32)
1634
+ * -(math.log(10000.0) / self.d_model)
1635
+ )
1636
+ pe_positive[:, 0::2] = torch.sin(position * div_term)
1637
+ pe_positive[:, 1::2] = torch.cos(position * div_term)
1638
+ pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
1639
+ pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)
1640
+
1641
+ # Reverse the order of positive indices and concat both positive and
1642
+ # negative indices. This is used to support the shifting trick
1643
+ # as in https://arxiv.org/abs/1901.02860
1644
+ pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
1645
+ pe_negative = pe_negative[1:].unsqueeze(0)
1646
+ pe = torch.cat([pe_positive, pe_negative], dim=1)
1647
+ self.pe = pe.to(device=x.device, dtype=x.dtype)
1648
+
1649
+ def forward(self, x: torch.Tensor, offset: Union[int, torch.Tensor] = 0) \
1650
+ -> Tuple[torch.Tensor, torch.Tensor]:
1651
+ """Add positional encoding.
1652
+
1653
+ Args:
1654
+ x (torch.Tensor): Input tensor (batch, time, `*`).
1655
+
1656
+ Returns:
1657
+ torch.Tensor: Encoded tensor (batch, time, `*`).
1658
+
1659
+ """
1660
+ self.extend_pe(x)
1661
+ x = x * self.xscale
1662
+ pos_emb = self.position_encoding(size=x.size(1), offset=offset)
1663
+ return self.dropout(x), self.dropout(pos_emb)
1664
+
1665
+ def position_encoding(self,
1666
+ offset: Union[int, torch.Tensor],
1667
+ size: int) -> torch.Tensor:
1668
+ """ For getting encoding in a streaming fashion
1669
+
1670
+ Attention!!!!!
1671
+ we apply dropout only once at the whole utterance level in the non-
1672
+ streaming case, but this function will be called several times with
1673
+ increasing input size in a streaming scenario, so the dropout will
1674
+ be applied several times.
1675
+
1676
+ Args:
1677
+ offset (int or torch.tensor): start offset
1678
+ size (int): required size of position encoding
1679
+
1680
+ Returns:
1681
+ torch.Tensor: Corresponding encoding
1682
+ """
1683
+ pos_emb = self.pe[
1684
+ :,
1685
+ self.pe.size(1) // 2 - size + 1: self.pe.size(1) // 2 + size,
1686
+ ]
1687
+ return pos_emb
1688
+
1689
+
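`EspnetRelPositionalEncoding` keeps positive and negative relative positions in a single table of length `2 * max_len - 1`, and `position_encoding` slices a window of length `2 * size - 1` centred on relative position zero. A quick check of the emitted shapes, assuming the class is importable from this file:

```python
import torch

pe = EspnetRelPositionalEncoding(d_model=512, dropout_rate=0.0, max_len=5000)  # from above
x = torch.randn(1, 40, 512)
x_scaled, pos_emb = pe(x)
print(x_scaled.shape, pos_emb.shape)   # torch.Size([1, 40, 512]) torch.Size([1, 79, 512])
```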
1690
+ class MultiHeadedAttention(nn.Module):
1691
+ """Multi-Head Attention layer.
1692
+
1693
+ Args:
1694
+ n_head (int): The number of heads.
1695
+ n_feat (int): The number of features.
1696
+ dropout_rate (float): Dropout rate.
1697
+
1698
+ """
1699
+
1700
+ def __init__(self,
1701
+ n_head: int,
1702
+ n_feat: int,
1703
+ dropout_rate: float,
1704
+ key_bias: bool = True):
1705
+ """Construct an MultiHeadedAttention object."""
1706
+ super().__init__()
1707
+ assert n_feat % n_head == 0
1708
+ # We assume d_v always equals d_k
1709
+ self.d_k = n_feat // n_head
1710
+ self.h = n_head
1711
+ self.linear_q = nn.Linear(n_feat, n_feat)
1712
+ self.linear_k = nn.Linear(n_feat, n_feat, bias=key_bias)
1713
+ self.linear_v = nn.Linear(n_feat, n_feat)
1714
+ self.linear_out = nn.Linear(n_feat, n_feat)
1715
+ self.dropout = nn.Dropout(p=dropout_rate)
1716
+
1717
+ def forward_qkv(
1718
+ self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
1719
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
1720
+ """Transform query, key and value.
1721
+
1722
+ Args:
1723
+ query (torch.Tensor): Query tensor (#batch, time1, size).
1724
+ key (torch.Tensor): Key tensor (#batch, time2, size).
1725
+ value (torch.Tensor): Value tensor (#batch, time2, size).
1726
+
1727
+ Returns:
1728
+ torch.Tensor: Transformed query tensor, size
1729
+ (#batch, n_head, time1, d_k).
1730
+ torch.Tensor: Transformed key tensor, size
1731
+ (#batch, n_head, time2, d_k).
1732
+ torch.Tensor: Transformed value tensor, size
1733
+ (#batch, n_head, time2, d_k).
1734
+
1735
+ """
1736
+ n_batch = query.size(0)
1737
+ q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
1738
+ k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
1739
+ v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
1740
+ q = q.transpose(1, 2) # (batch, head, time1, d_k)
1741
+ k = k.transpose(1, 2) # (batch, head, time2, d_k)
1742
+ v = v.transpose(1, 2) # (batch, head, time2, d_k)
1743
+
1744
+ return q, k, v
1745
+
1746
+ def forward_attention(
1747
+ self,
1748
+ value: torch.Tensor,
1749
+ scores: torch.Tensor,
1750
+ mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool)
1751
+ ) -> torch.Tensor:
1752
+ """Compute attention context vector.
1753
+
1754
+ Args:
1755
+ value (torch.Tensor): Transformed value, size
1756
+ (#batch, n_head, time2, d_k).
1757
+ scores (torch.Tensor): Attention score, size
1758
+ (#batch, n_head, time1, time2).
1759
+ mask (torch.Tensor): Mask, size (#batch, 1, time2) or
1760
+ (#batch, time1, time2), (0, 0, 0) means fake mask.
1761
+
1762
+ Returns:
1763
+ torch.Tensor: Transformed value (#batch, time1, d_model)
1764
+ weighted by the attention score (#batch, time1, time2).
1765
+
1766
+ """
1767
+ n_batch = value.size(0)
1768
+ # NOTE(xcsong): When will `if mask.size(2) > 0` be True?
1769
+ # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the
1770
+ # 1st chunk to ease the onnx export.]
1771
+ # 2. pytorch training
1772
+ if mask.size(2) > 0: # time2 > 0
1773
+ mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2)
1774
+ # For last chunk, time2 might be larger than scores.size(-1)
1775
+ mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2)
1776
+ scores = scores.masked_fill(mask, -float('inf'))
1777
+ attn = torch.softmax(scores, dim=-1).masked_fill(
1778
+ mask, 0.0) # (batch, head, time1, time2)
1779
+ # NOTE(xcsong): When will `if mask.size(2) > 0` be False?
1780
+ # 1. onnx(16/-1, -1/-1, 16/0)
1781
+ # 2. jit (16/-1, -1/-1, 16/0, 16/4)
1782
+ else:
1783
+ attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2)
1784
+
1785
+ p_attn = self.dropout(attn)
1786
+ x = torch.matmul(p_attn, value) # (batch, head, time1, d_k)
1787
+ x = (x.transpose(1, 2).contiguous().view(n_batch, -1,
1788
+ self.h * self.d_k)
1789
+ ) # (batch, time1, d_model)
1790
+
1791
+ return self.linear_out(x) # (batch, time1, d_model)
1792
+
1793
+ def forward(
1794
+ self,
1795
+ query: torch.Tensor,
1796
+ key: torch.Tensor,
1797
+ value: torch.Tensor,
1798
+ mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
1799
+ pos_emb: torch.Tensor = torch.empty(0),
1800
+ cache: torch.Tensor = torch.zeros((0, 0, 0, 0))
1801
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
1802
+ """Compute scaled dot product attention.
1803
+
1804
+ Args:
1805
+ query (torch.Tensor): Query tensor (#batch, time1, size).
1806
+ key (torch.Tensor): Key tensor (#batch, time2, size).
1807
+ value (torch.Tensor): Value tensor (#batch, time2, size).
1808
+ mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
1809
+ (#batch, time1, time2).
1810
+ 1.When applying cross attention between decoder and encoder,
1811
+ the batch padding mask for input is in (#batch, 1, T) shape.
1812
+ 2.When applying self attention of encoder,
1813
+ the mask is in (#batch, T, T) shape.
1814
+ 3.When applying self attention of decoder,
1815
+ the mask is in (#batch, L, L) shape.
1816
+ 4.If the different position in decoder see different block
1817
+ of the encoder, such as Mocha, the passed in mask could be
1818
+ in (#batch, L, T) shape. But there is no such case in current
1819
+ CosyVoice.
1820
+ cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
1821
+ where `cache_t == chunk_size * num_decoding_left_chunks`
1822
+ and `head * d_k == size`
1823
+
1824
+
1825
+ Returns:
1826
+ torch.Tensor: Output tensor (#batch, time1, d_model).
1827
+ torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
1828
+ where `cache_t == chunk_size * num_decoding_left_chunks`
1829
+ and `head * d_k == size`
1830
+
1831
+ """
1832
+ q, k, v = self.forward_qkv(query, key, value)
1833
+
1834
+ # NOTE(xcsong):
1835
+ # when export onnx model, for 1st chunk, we feed
1836
+ # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
1837
+ # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode).
1838
+ # In all modes, `if cache.size(0) > 0` will always be `True`
1839
+ # and we will always do splitting and
1840
+ # concatenation (this will simplify onnx export). Note that
1841
+ # it's OK to concat & split zero-shaped tensors(see code below).
1842
+ # when export jit model, for 1st chunk, we always feed
1843
+ # cache(0, 0, 0, 0) since jit supports dynamic if-branch.
1844
+ # >>> a = torch.ones((1, 2, 0, 4))
1845
+ # >>> b = torch.ones((1, 2, 3, 4))
1846
+ # >>> c = torch.cat((a, b), dim=2)
1847
+ # >>> torch.equal(b, c) # True
1848
+ # >>> d = torch.split(a, 2, dim=-1)
1849
+ # >>> torch.equal(d[0], d[1]) # True
1850
+ if cache.size(0) > 0:
1851
+ key_cache, value_cache = torch.split(cache,
1852
+ cache.size(-1) // 2,
1853
+ dim=-1)
1854
+ k = torch.cat([key_cache, k], dim=2)
1855
+ v = torch.cat([value_cache, v], dim=2)
1856
+ # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
1857
+ # non-trivial to calculate `next_cache_start` here.
1858
+ new_cache = torch.cat((k, v), dim=-1)
1859
+
1860
+ scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
1861
+ return self.forward_attention(v, scores, mask), new_cache
1862
+
1863
+
1864
+ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
1865
+ """Multi-Head Attention layer with relative position encoding.
1866
+ Paper: https://arxiv.org/abs/1901.02860
1867
+ Args:
1868
+ n_head (int): The number of heads.
1869
+ n_feat (int): The number of features.
1870
+ dropout_rate (float): Dropout rate.
1871
+ """
1872
+
1873
+ def __init__(self,
1874
+ n_head: int,
1875
+ n_feat: int,
1876
+ dropout_rate: float,
1877
+ key_bias: bool = True):
1878
+ """Construct an RelPositionMultiHeadedAttention object."""
1879
+ super().__init__(n_head, n_feat, dropout_rate, key_bias)
1880
+ # linear transformation for positional encoding
1881
+ self.linear_pos = nn.Linear(n_feat, n_feat, bias=False)
1882
+ # these two learnable bias are used in matrix c and matrix d
1883
+ # as described in https://arxiv.org/abs/1901.02860 Section 3.3
1884
+ self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k))
1885
+ self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k))
1886
+ torch.nn.init.xavier_uniform_(self.pos_bias_u)
1887
+ torch.nn.init.xavier_uniform_(self.pos_bias_v)
1888
+
1889
+ def rel_shift(self, x: torch.Tensor) -> torch.Tensor:
1890
+ """Compute relative positional encoding.
1891
+
1892
+ Args:
1893
+ x (torch.Tensor): Input tensor (batch, head, time1, 2*time1-1).
1894
+ time1 means the length of query vector.
1895
+
1896
+ Returns:
1897
+ torch.Tensor: Output tensor.
1898
+
1899
+ """
1900
+ zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1),
1901
+ device=x.device,
1902
+ dtype=x.dtype)
1903
+ x_padded = torch.cat([zero_pad, x], dim=-1)
1904
+
1905
+ x_padded = x_padded.view(x.size()[0],
1906
+ x.size()[1],
1907
+ x.size(3) + 1, x.size(2))
1908
+ x = x_padded[:, :, 1:].view_as(x)[
1909
+ :, :, :, : x.size(-1) // 2 + 1
1910
+ ] # only keep the positions from 0 to time2
1911
+ return x
1912
+
1913
+ def forward(
1914
+ self,
1915
+ query: torch.Tensor,
1916
+ key: torch.Tensor,
1917
+ value: torch.Tensor,
1918
+ mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
1919
+ pos_emb: torch.Tensor = torch.empty(0),
1920
+ cache: torch.Tensor = torch.zeros((0, 0, 0, 0))
1921
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
1922
+ """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
1923
+ Args:
1924
+ query (torch.Tensor): Query tensor (#batch, time1, size).
1925
+ key (torch.Tensor): Key tensor (#batch, time2, size).
1926
+ value (torch.Tensor): Value tensor (#batch, time2, size).
1927
+ mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
1928
+ (#batch, time1, time2), (0, 0, 0) means fake mask.
1929
+ pos_emb (torch.Tensor): Positional embedding tensor
1930
+ (#batch, time2, size).
1931
+ cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
1932
+ where `cache_t == chunk_size * num_decoding_left_chunks`
1933
+ and `head * d_k == size`
1934
+ Returns:
1935
+ torch.Tensor: Output tensor (#batch, time1, d_model).
1936
+ torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
1937
+ where `cache_t == chunk_size * num_decoding_left_chunks`
1938
+ and `head * d_k == size`
1939
+ """
1940
+ q, k, v = self.forward_qkv(query, key, value)
1941
+ q = q.transpose(1, 2) # (batch, time1, head, d_k)
1942
+
1943
+ # NOTE(xcsong):
1944
+ # when export onnx model, for 1st chunk, we feed
1945
+ # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
1946
+ # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode).
1947
+ # In all modes, `if cache.size(0) > 0` will always be `True`
1948
+ # and we will always do splitting and
1949
+ # concatenation (this will simplify onnx export). Note that
1950
+ # it's OK to concat & split zero-shaped tensors(see code below).
1951
+ # when export jit model, for 1st chunk, we always feed
1952
+ # cache(0, 0, 0, 0) since jit supports dynamic if-branch.
1953
+ # >>> a = torch.ones((1, 2, 0, 4))
1954
+ # >>> b = torch.ones((1, 2, 3, 4))
1955
+ # >>> c = torch.cat((a, b), dim=2)
1956
+ # >>> torch.equal(b, c) # True
1957
+ # >>> d = torch.split(a, 2, dim=-1)
1958
+ # >>> torch.equal(d[0], d[1]) # True
1959
+ if cache.size(0) > 0:
1960
+ key_cache, value_cache = torch.split(cache,
1961
+ cache.size(-1) // 2,
1962
+ dim=-1)
1963
+ k = torch.cat([key_cache, k], dim=2)
1964
+ v = torch.cat([value_cache, v], dim=2)
1965
+ # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
1966
+ # non-trivial to calculate `next_cache_start` here.
1967
+ new_cache = torch.cat((k, v), dim=-1)
1968
+
1969
+ n_batch_pos = pos_emb.size(0)
1970
+ p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
1971
+ p = p.transpose(1, 2) # (batch, head, time1, d_k)
1972
+
1973
+ # (batch, head, time1, d_k)
1974
+ q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2)
1975
+ # (batch, head, time1, d_k)
1976
+ q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2)
1977
+
1978
+ # compute attention score
1979
+ # first compute matrix a and matrix c
1980
+ # as described in https://arxiv.org/abs/1901.02860 Section 3.3
1981
+ # (batch, head, time1, time2)
1982
+ matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1))
1983
+
1984
+ # compute matrix b and matrix d
1985
+ # (batch, head, time1, time2)
1986
+ matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1))
1987
+ # NOTE(Xiang Lyu): Keep rel_shift since espnet rel_pos_emb is used
1988
+ if matrix_ac.shape != matrix_bd.shape:
1989
+ matrix_bd = self.rel_shift(matrix_bd)
1990
+
1991
+ scores = (matrix_ac + matrix_bd) / math.sqrt(
1992
+ self.d_k) # (batch, head, time1, time2)
1993
+
1994
+ return self.forward_attention(v, scores, mask), new_cache
1995
+
1996
+ class UpsampleConformerEncoder(torch.nn.Module):
1997
+
1998
+ def __init__(
1999
+ self,
2000
+ input_size: int,
2001
+ output_size: int = 256,
2002
+ attention_heads: int = 4,
2003
+ linear_units: int = 2048,
2004
+ num_blocks: int = 6,
2005
+ dropout_rate: float = 0.1,
2006
+ positional_dropout_rate: float = 0.1,
2007
+ attention_dropout_rate: float = 0.0,
2008
+ input_layer: str = "conv2d",
2009
+ pos_enc_layer_type: str = "rel_pos",
2010
+ normalize_before: bool = True,
2011
+ static_chunk_size: int = 0,
2012
+ use_dynamic_chunk: bool = False,
2013
+ global_cmvn: torch.nn.Module = None,
2014
+ use_dynamic_left_chunk: bool = False,
2015
+ positionwise_conv_kernel_size: int = 1,
2016
+ macaron_style: bool = True,
2017
+ selfattention_layer_type: str = "rel_selfattn",
2018
+ activation_type: str = "swish",
2019
+ use_cnn_module: bool = True,
2020
+ cnn_module_kernel: int = 15,
2021
+ causal: bool = False,
2022
+ cnn_module_norm: str = "batch_norm",
2023
+ key_bias: bool = True,
2024
+ gradient_checkpointing: bool = False,
2025
+ ):
2026
+ """
2027
+ Args:
2028
+ input_size (int): input dim
2029
+ output_size (int): dimension of attention
2030
+ attention_heads (int): the number of heads of multi head attention
2031
+ linear_units (int): the number of hidden units of the position-wise feed
2032
+ forward
2033
+ num_blocks (int): the number of encoder blocks
2034
+ dropout_rate (float): dropout rate
2035
+ attention_dropout_rate (float): dropout rate in attention
2036
+ positional_dropout_rate (float): dropout rate after adding
2037
+ positional encoding
2038
+ input_layer (str): input layer type.
2039
+ optional [linear, conv2d, conv2d6, conv2d8]
2040
+ pos_enc_layer_type (str): Encoder positional encoding layer type.
2041
+ optional [abs_pos, scaled_abs_pos, rel_pos, no_pos]
2042
+ normalize_before (bool):
2043
+ True: use layer_norm before each sub-block of a layer.
2044
+ False: use layer_norm after each sub-block of a layer.
2045
+ static_chunk_size (int): chunk size for static chunk training and
2046
+ decoding
2047
+ use_dynamic_chunk (bool): whether use dynamic chunk size for
2048
+ training or not. You can only use a fixed chunk (chunk_size > 0)
2049
+ or a dynamic chunk size (use_dynamic_chunk = True)
2050
+ global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module
2051
+ use_dynamic_left_chunk (bool): whether use dynamic left chunk in
2052
+ dynamic chunk training
2053
+ key_bias: whether use bias in attention.linear_k, False for whisper models.
2054
+ gradient_checkpointing: rerunning a forward-pass segment for each
2055
+ checkpointed segment during backward.
2056
+ """
2057
+ super().__init__()
2058
+ self._output_size = output_size
2059
+
2060
+ self.global_cmvn = global_cmvn
2061
+ # self.embed = COSYVOICE_SUBSAMPLE_CLASSES[input_layer](
2062
+ self.embed = LinearNoSubsampling(
2063
+ input_size,
2064
+ output_size,
2065
+ dropout_rate,
2066
+ # COSYVOICE_EMB_CLASSES[pos_enc_layer_type](
2067
+ EspnetRelPositionalEncoding(
2068
+ output_size,
2069
+ positional_dropout_rate,
2070
+ ),
2071
+ )
2072
+
2073
+ self.normalize_before = normalize_before
2074
+ self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5)
2075
+ self.static_chunk_size = static_chunk_size
2076
+ self.use_dynamic_chunk = use_dynamic_chunk
2077
+ self.use_dynamic_left_chunk = use_dynamic_left_chunk
2078
+ self.gradient_checkpointing = gradient_checkpointing
2079
+ # COSYVOICE_ACTIVATION_CLASSES[activation_type]()
2080
+ activation = getattr(torch.nn, "SiLU", Swish)()
2081
+ # self-attention module definition
2082
+ encoder_selfattn_layer_args = (
2083
+ attention_heads,
2084
+ output_size,
2085
+ attention_dropout_rate,
2086
+ key_bias,
2087
+ )
2088
+ # feed-forward module definition
2089
+ positionwise_layer_args = (
2090
+ output_size,
2091
+ linear_units,
2092
+ dropout_rate,
2093
+ activation,
2094
+ )
2095
+ # convolution module definition
2096
+ convolution_layer_args = (output_size, cnn_module_kernel, activation,
2097
+ cnn_module_norm, causal)
2098
+ self.pre_lookahead_layer = PreLookaheadLayer(channels=512, pre_lookahead_len=3)
2099
+ self.encoders = torch.nn.ModuleList([
2100
+ ConformerEncoderLayer(
2101
+ output_size,
2102
+ # COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type](
2103
+ RelPositionMultiHeadedAttention(
2104
+ *encoder_selfattn_layer_args),
2105
+ PositionwiseFeedForward(*positionwise_layer_args),
2106
+ PositionwiseFeedForward(
2107
+ *positionwise_layer_args) if macaron_style else None,
2108
+ ConvolutionModule(
2109
+ *convolution_layer_args) if use_cnn_module else None,
2110
+ dropout_rate,
2111
+ normalize_before,
2112
+ ) for _ in range(num_blocks)
2113
+ ])
2114
+ self.up_layer = Upsample1D(channels=512, out_channels=512, stride=2)
2115
+ # self.up_embed = COSYVOICE_SUBSAMPLE_CLASSES[input_layer](
2116
+ self.up_embed = LinearNoSubsampling(
2117
+ input_size,
2118
+ output_size,
2119
+ dropout_rate,
2120
+ # COSYVOICE_EMB_CLASSES[pos_enc_layer_type](
2121
+ EspnetRelPositionalEncoding(
2122
+ output_size,
2123
+ positional_dropout_rate,
2124
+ ),
2125
+ )
2126
+ self.up_encoders = torch.nn.ModuleList([
2127
+ ConformerEncoderLayer(
2128
+ output_size,
2129
+ # COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type](
2130
+ RelPositionMultiHeadedAttention(
2131
+ *encoder_selfattn_layer_args),
2132
+ PositionwiseFeedForward(*positionwise_layer_args),
2133
+ PositionwiseFeedForward(
2134
+ *positionwise_layer_args) if macaron_style else None,
2135
+ ConvolutionModule(
2136
+ *convolution_layer_args) if use_cnn_module else None,
2137
+ dropout_rate,
2138
+ normalize_before,
2139
+ ) for _ in range(4)
2140
+ ])
2141
+
2142
+ def output_size(self) -> int:
2143
+ return self._output_size
2144
+
2145
+ def forward(
2146
+ self,
2147
+ xs: torch.Tensor,
2148
+ xs_lens: torch.Tensor,
2149
+ decoding_chunk_size: int = 0,
2150
+ num_decoding_left_chunks: int = -1,
2151
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
2152
+ """Embed positions in tensor.
2153
+
2154
+ Args:
2155
+ xs: padded input tensor (B, T, D)
2156
+ xs_lens: input length (B)
2157
+ decoding_chunk_size: decoding chunk size for dynamic chunk
2158
+ 0: default for training, use random dynamic chunk.
2159
+ <0: for decoding, use full chunk.
2160
+ >0: for decoding, use fixed chunk size as set.
2161
+ num_decoding_left_chunks: number of left chunks, this is for decoding,
2162
+ the chunk size is decoding_chunk_size.
2163
+ >=0: use num_decoding_left_chunks
2164
+ <0: use all left chunks
2165
+ Returns:
2166
+ encoder output tensor xs, and subsampled masks
2167
+ xs: padded output tensor (B, T' ~= T/subsample_rate, D)
2168
+ masks: torch.Tensor batch padding mask after subsample
2169
+ (B, 1, T' ~= T/subsample_rate)
2170
+ NOTE(xcsong):
2171
+ We pass the `__call__` method of the modules instead of `forward` to the
2172
+ checkpointing API because `__call__` attaches all the hooks of the module.
2173
+ https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
2174
+ """
2175
+ T = xs.size(1)
2176
+ masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T)
2177
+ if self.global_cmvn is not None:
2178
+ xs = self.global_cmvn(xs)
2179
+ xs, pos_emb, masks = self.embed(xs, masks)
2180
+ mask_pad = masks # (B, 1, T/subsample_rate)
2181
+ chunk_masks = add_optional_chunk_mask(xs, masks,
2182
+ self.use_dynamic_chunk,
2183
+ self.use_dynamic_left_chunk,
2184
+ decoding_chunk_size,
2185
+ self.static_chunk_size,
2186
+ num_decoding_left_chunks)
2187
+ # lookahead + conformer encoder
2188
+ xs = self.pre_lookahead_layer(xs)
2189
+ xs = self.forward_layers(xs, chunk_masks, pos_emb, mask_pad)
2190
+
2191
+ # upsample + conformer encoder
2192
+ xs = xs.transpose(1, 2).contiguous()
2193
+ xs, xs_lens = self.up_layer(xs, xs_lens)
2194
+ xs = xs.transpose(1, 2).contiguous()
2195
+ T = xs.size(1)
2196
+ masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T)
2197
+ xs, pos_emb, masks = self.up_embed(xs, masks)
2198
+ mask_pad = masks # (B, 1, T/subsample_rate)
2199
+ chunk_masks = add_optional_chunk_mask(xs, masks,
2200
+ self.use_dynamic_chunk,
2201
+ self.use_dynamic_left_chunk,
2202
+ decoding_chunk_size,
2203
+ self.static_chunk_size * self.up_layer.stride,
2204
+ num_decoding_left_chunks)
2205
+ xs = self.forward_up_layers(xs, chunk_masks, pos_emb, mask_pad)
2206
+
2207
+ if self.normalize_before:
2208
+ xs = self.after_norm(xs)
2209
+ # Here we assume the mask is not changed in encoder layers, so just
2210
+ # return the masks before encoder layers, and the masks will be used
2211
+ # for cross attention with decoder later
2212
+ return xs, masks
2213
+
2214
+ def forward_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor,
2215
+ pos_emb: torch.Tensor,
2216
+ mask_pad: torch.Tensor) -> torch.Tensor:
2217
+ for layer in self.encoders:
2218
+ xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
2219
+ return xs
2220
+
2221
+ def forward_up_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor,
2222
+ pos_emb: torch.Tensor,
2223
+ mask_pad: torch.Tensor) -> torch.Tensor:
2224
+ for layer in self.up_encoders:
2225
+ xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
2226
+ return xs
2227
+
2228
+ class CausalMaskedDiffWithXvec(PreTrainedModel):
2229
+ """
2230
+ CosyVoice 2.0 flow module
2231
+ """
2232
+ def __init__(
2233
+ self,
2234
+ config: FlowConfig,
2235
+ mel_feat_conf: Dict = {
2236
+ 'n_fft': 1024,
2237
+ 'num_mels': 80,
2238
+ 'sampling_rate': 22050,
2239
+ 'hop_size': 256,
2240
+ 'win_size': 1024,
2241
+ 'fmin': 0,
2242
+ 'fmax': 8000,
2243
+ },
2244
+ ):
2245
+ super().__init__(config)
2246
+ self.input_size = config.input_size
2247
+ self.output_size = config.output_size
2248
+ self.decoder_conf = config.decoder_config
2249
+ self.mel_feat_conf = mel_feat_conf
2250
+ self.vocab_size = config.vocab_size # kept consistent with the speech tokenizer (6561)
2251
+ self.output_type = config.output_type
2252
+ self.input_frame_rate = config.input_frame_rate
2253
+ self.input_embedding = nn.Embedding(config.vocab_size, config.input_size)
2254
+ self.spk_embed_affine_layer = torch.nn.Linear(config.spk_embed_dim, config.output_size)
2255
+ self.encoder = UpsampleConformerEncoder(**config.encoder_config)
2256
+ self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), config.output_size)
2257
+
2258
+ decoder_config = copy.deepcopy(config.decoder_config)
2259
+ decoder_config['cfm_params'] = DictConfig(decoder_config['cfm_params'])
2260
+ self.decoder = CausalConditionalCFM(**decoder_config)
2261
+
2262
+ self.only_mask_loss = config.only_mask_loss
2263
+ self.token_mel_ratio = config.token_mel_ratio
2264
+ self.pre_lookahead_len = config.pre_lookahead_len
2265
+
2266
+ @torch.inference_mode()
2267
+ def inference(
2268
+ self,
2269
+ token,
2270
+ token_len,
2271
+ prompt_token,
2272
+ prompt_token_len,
2273
+ prompt_feat,
2274
+ prompt_feat_len,
2275
+ embedding,
2276
+ finalize,
2277
+ ):
2278
+ # if self.fp16 is True:
2279
+ # prompt_feat = prompt_feat.half()
2280
+ # embedding = embedding.half()
2281
+ # process
2282
+
2283
+ embedding = embedding.to(self.spk_embed_affine_layer.weight.data.dtype) # noqa, TODO
2284
+ prompt_feat = prompt_feat.to(self.spk_embed_affine_layer.weight.data.dtype) # noqa, TODO
2285
+
2286
+ assert token.shape[0] == 1
2287
+ # xvec projection
2288
+ embedding = F.normalize(embedding, dim=1)
2289
+ embedding = self.spk_embed_affine_layer(embedding)
2290
+
2291
+ # concat text and prompt_text
2292
+ token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len # concatenate the prompt tokens with the tokens to be generated
2293
+ mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding)
2294
+ token = self.input_embedding(torch.clamp(token, min=0)) * mask
2295
+
2296
+ # text encode
2297
+ h, h_lengths = self.encoder(token, token_len)
2298
+ if finalize is False:
2299
+ h = h[:, :-self.pre_lookahead_len * self.token_mel_ratio]
2300
+ mel_len1, mel_len2 = prompt_feat.shape[1], h.shape[1] - prompt_feat.shape[1]
2301
+ h = self.encoder_proj(h)
2302
+
2303
+ # get conditions
2304
+ conds = torch.zeros([1, mel_len1 + mel_len2, self.output_size], device=token.device).to(h.dtype)
2305
+ conds[:, :mel_len1] = prompt_feat # mel features of the prompt audio serve as the condition
2306
+ conds = conds.transpose(1, 2)
2307
+
2308
+ mask = (~make_pad_mask(torch.tensor([mel_len1 + mel_len2]))).to(h)
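+ # flow-matching decoder: synthesize mel frames from the encoder output (mu), conditioned on the speaker embedding and the prompt mel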
2309
+ feat, _ = self.decoder(
2310
+ mu=h.transpose(1, 2).contiguous(),
2311
+ mask=mask.unsqueeze(1),
2312
+ spks=embedding,
2313
+ cond=conds,
2314
+ n_timesteps=10
2315
+ )
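+ # drop the prompt region of the generated mel; keep only the newly generated frames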
2316
+ feat = feat[:, :, mel_len1:]
2317
+ assert feat.shape[2] == mel_len2
2318
+ return feat.float(), None
modeling_hifigan.py ADDED
@@ -0,0 +1,479 @@
1
+ # --------------------------------------------------------
2
+ # SenseTime
3
+ # Copyright (c) 2025 SenseTime
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ from typing import Dict, Optional, List
7
+ import numpy as np
8
+ from scipy.signal import get_window
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from torch.nn import ConvTranspose1d, Conv1d, Parameter
12
+ from torch.nn.utils import remove_weight_norm
13
+ from torch.nn.utils.parametrizations import weight_norm
14
+ from torch.distributions.uniform import Uniform
15
+ from torch import nn, sin, pow
16
+ from transformers.modeling_utils import PreTrainedModel
17
+
18
+ from .configuration_hifigan import HiFiGanConfig
19
+
20
+ def get_padding(kernel_size, dilation=1):
21
+ return int((kernel_size * dilation - dilation) / 2)
22
+
23
+ def init_weights(m, mean=0.0, std=0.01):
24
+ classname = m.__class__.__name__
25
+ if classname.find("Conv") != -1:
26
+ m.weight.data.normal_(mean, std)
27
+ return
28
+
29
+ # Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
30
+ # LICENSE is in incl_licenses directory.
31
+ class Snake(nn.Module):
32
+ '''
33
+ Implementation of a sine-based periodic activation function
34
+ Shape:
35
+ - Input: (B, C, T)
36
+ - Output: (B, C, T), same shape as the input
37
+ Parameters:
38
+ - alpha - trainable parameter
39
+ References:
40
+ - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
41
+ https://arxiv.org/abs/2006.08195
42
+ Examples:
43
+ >>> a1 = Snake(256)
44
+ >>> x = torch.randn(256)
45
+ >>> x = a1(x)
46
+ '''
47
+ def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
48
+ '''
49
+ Initialization.
50
+ INPUT:
51
+ - in_features: shape of the input
52
+ - alpha: trainable parameter
53
+ alpha is initialized to 1 by default, higher values = higher-frequency.
54
+ alpha will be trained along with the rest of your model.
55
+ '''
56
+ super(Snake, self).__init__()
57
+ self.in_features = in_features
58
+
59
+ # initialize alpha
60
+ self.alpha_logscale = alpha_logscale
61
+ if self.alpha_logscale: # log scale alphas initialized to zeros
62
+ self.alpha = Parameter(torch.zeros(in_features) * alpha)
63
+ else: # linear scale alphas initialized to ones
64
+ self.alpha = Parameter(torch.ones(in_features) * alpha)
65
+
66
+ self.alpha.requires_grad = alpha_trainable
67
+
68
+ self.no_div_by_zero = 0.000000001
69
+
70
+ def forward(self, x):
71
+ '''
72
+ Forward pass of the function.
73
+ Applies the function to the input elementwise.
74
+ Snake(x) := x + (1/a) * sin^2(a * x)
75
+ '''
76
+ alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T]
77
+ if self.alpha_logscale:
78
+ alpha = torch.exp(alpha)
79
+ x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
80
+
81
+ return x
82
+
83
+ class ConvRNNF0Predictor(nn.Module):
84
+ def __init__(self,
85
+ num_class: int = 1,
86
+ in_channels: int = 80,
87
+ cond_channels: int = 512
88
+ ):
89
+ super().__init__()
90
+
91
+ self.num_class = num_class
92
+ self.condnet = nn.Sequential(
93
+ weight_norm(
94
+ nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1)
95
+ ),
96
+ nn.ELU(),
97
+ weight_norm(
98
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
99
+ ),
100
+ nn.ELU(),
101
+ weight_norm(
102
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
103
+ ),
104
+ nn.ELU(),
105
+ weight_norm(
106
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
107
+ ),
108
+ nn.ELU(),
109
+ weight_norm(
110
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
111
+ ),
112
+ nn.ELU(),
113
+ )
114
+ self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class)
115
+
116
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
117
+ x = self.condnet(x)
118
+ x = x.transpose(1, 2)
119
+ return torch.abs(self.classifier(x).squeeze(-1))
120
+
121
+ class ResBlock(torch.nn.Module):
122
+ """Residual block module in HiFiGAN/BigVGAN."""
123
+ def __init__(
124
+ self,
125
+ channels: int = 512,
126
+ kernel_size: int = 3,
127
+ dilations: List[int] = [1, 3, 5],
128
+ ):
129
+ super(ResBlock, self).__init__()
130
+ self.convs1 = nn.ModuleList()
131
+ self.convs2 = nn.ModuleList()
132
+
133
+ for dilation in dilations:
134
+ self.convs1.append(
135
+ weight_norm(
136
+ Conv1d(
137
+ channels,
138
+ channels,
139
+ kernel_size,
140
+ 1,
141
+ dilation=dilation,
142
+ padding=get_padding(kernel_size, dilation)
143
+ )
144
+ )
145
+ )
146
+ self.convs2.append(
147
+ weight_norm(
148
+ Conv1d(
149
+ channels,
150
+ channels,
151
+ kernel_size,
152
+ 1,
153
+ dilation=1,
154
+ padding=get_padding(kernel_size, 1)
155
+ )
156
+ )
157
+ )
158
+ self.convs1.apply(init_weights)
159
+ self.convs2.apply(init_weights)
160
+ self.activations1 = nn.ModuleList([
161
+ Snake(channels, alpha_logscale=False)
162
+ for _ in range(len(self.convs1))
163
+ ])
164
+ self.activations2 = nn.ModuleList([
165
+ Snake(channels, alpha_logscale=False)
166
+ for _ in range(len(self.convs2))
167
+ ])
168
+
169
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
170
+ for idx in range(len(self.convs1)):
171
+ xt = self.activations1[idx](x)
172
+ xt = self.convs1[idx](xt)
173
+ xt = self.activations2[idx](xt)
174
+ xt = self.convs2[idx](xt)
175
+ x = xt + x
176
+ return x
177
+
178
+ def remove_weight_norm(self):
179
+ for idx in range(len(self.convs1)):
180
+ remove_weight_norm(self.convs1[idx])
181
+ remove_weight_norm(self.convs2[idx])
182
+
183
+
184
+ class SineGen(torch.nn.Module):
185
+ """ Definition of sine generator
186
+ SineGen(samp_rate, harmonic_num = 0,
187
+ sine_amp = 0.1, noise_std = 0.003,
188
+ voiced_threshold = 0,
189
+ flag_for_pulse=False)
190
+ samp_rate: sampling rate in Hz
191
+ harmonic_num: number of harmonic overtones (default 0)
192
+ sine_amp: amplitude of the sine waveform (default 0.1)
193
+ noise_std: std of Gaussian noise (default 0.003)
194
+ voiced_threshold: F0 threshold for U/V classification (default 0)
195
+ flag_for_pulse: this SineGen is used inside PulseGen (default False)
196
+ Note: when flag_for_pulse is True, the first time step of a voiced
197
+ segment is always sin(np.pi) or cos(0)
198
+ """
199
+
200
+ def __init__(self, samp_rate, harmonic_num=0,
201
+ sine_amp=0.1, noise_std=0.003,
202
+ voiced_threshold=0):
203
+ super(SineGen, self).__init__()
204
+ self.sine_amp = sine_amp
205
+ self.noise_std = noise_std
206
+ self.harmonic_num = harmonic_num
207
+ self.sampling_rate = samp_rate
208
+ self.voiced_threshold = voiced_threshold
209
+
210
+ def _f02uv(self, f0):
211
+ # generate uv signal
212
+ uv = (f0 > self.voiced_threshold).type(torch.float32)
213
+ return uv
214
+
215
+ @torch.no_grad()
216
+ def forward(self, f0):
217
+ """
218
+ :param f0: [B, 1, sample_len], Hz
219
+ :return: [B, 1, sample_len]
220
+ """
221
+
222
+ F_mat = torch.zeros((f0.size(0), self.harmonic_num + 1, f0.size(-1))).to(f0.device)
223
+ for i in range(self.harmonic_num + 1):
224
+ F_mat[:, i: i + 1, :] = f0 * (i + 1) / self.sampling_rate
225
+
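+ # instantaneous phase: cumulative sum of the normalized frequency, wrapped to [0, 1) and scaled to radians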
226
+ theta_mat = 2 * np.pi * (torch.cumsum(F_mat, dim=-1) % 1)
227
+ u_dist = Uniform(low=-np.pi, high=np.pi)
228
+ phase_vec = u_dist.sample(sample_shape=(f0.size(0), self.harmonic_num + 1, 1)).to(F_mat.device)
229
+ phase_vec[:, 0, :] = 0
230
+
231
+ # generate sine waveforms
232
+ sine_waves = self.sine_amp * torch.sin(theta_mat + phase_vec)
233
+
234
+ # generate uv signal
235
+ uv = self._f02uv(f0)
236
+
237
+ # noise: for unvoiced frames the noise std should be similar to sine_amp
238
+ # (std = self.sine_amp / 3 -> max value ~ self.sine_amp);
239
+ # for voiced frames the noise std is self.noise_std
240
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
241
+ noise = noise_amp * torch.randn_like(sine_waves)
242
+
243
+ # first: set the unvoiced part to 0 by uv
244
+ # then: additive noise
245
+ sine_waves = sine_waves * uv + noise
246
+ return sine_waves, uv, noise
247
+
248
+
249
+ class SourceModuleHnNSF(torch.nn.Module):
250
+ """ SourceModule for hn-nsf
251
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
252
+ add_noise_std=0.003, voiced_threshod=0)
253
+ sampling_rate: sampling_rate in Hz
254
+ harmonic_num: number of harmonic above F0 (default: 0)
255
+ sine_amp: amplitude of sine source signal (default: 0.1)
256
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
257
+ note that amplitude of noise in unvoiced is decided
258
+ by sine_amp
259
+ voiced_threshold: threshold to set U/V given F0 (default: 0)
260
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
261
+ F0_sampled (batchsize, length, 1)
262
+ Sine_source (batchsize, length, 1)
263
+ noise_source (batchsize, length, 1)
264
+ uv (batchsize, length, 1)
265
+ """
266
+
267
+ def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
268
+ add_noise_std=0.003, voiced_threshod=0):
269
+ super(SourceModuleHnNSF, self).__init__()
270
+
271
+ self.sine_amp = sine_amp
272
+ self.noise_std = add_noise_std
273
+
274
+ # to produce sine waveforms
275
+ self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
276
+ sine_amp, add_noise_std, voiced_threshod)
277
+
278
+ # to merge source harmonics into a single excitation
279
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
280
+ self.l_tanh = torch.nn.Tanh()
281
+
282
+ def forward(self, x):
283
+ """
284
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
285
+ F0_sampled (batchsize, length, 1)
286
+ Sine_source (batchsize, length, 1)
287
+ noise_source (batchsize, length, 1)
288
+ """
289
+ # source for harmonic branch
290
+ with torch.no_grad():
291
+ sine_wavs, uv, _ = self.l_sin_gen(x.transpose(1, 2))
292
+ sine_wavs = sine_wavs.transpose(1, 2)
293
+ uv = uv.transpose(1, 2)
294
+ sine_wavs = sine_wavs.to(self.l_linear.weight.data.dtype) # noqa, TODO
295
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
296
+
297
+ # source for noise branch, in the same shape as uv
298
+ noise = torch.randn_like(uv) * self.sine_amp / 3
299
+ return sine_merge, noise, uv
300
+
301
+ class HiFTGenerator(PreTrainedModel):
302
+ """
303
+ HiFTNet Generator: Neural Source Filter + ISTFTNet
304
+ https://arxiv.org/abs/2309.09493
305
+ """
306
+ def __init__(
307
+ self,
308
+ config: HiFiGanConfig
309
+ ):
310
+ super(HiFTGenerator, self).__init__(config)
311
+
312
+ self.out_channels = 1
313
+ self.nb_harmonics = config.nb_harmonics
314
+ self.sampling_rate = config.sampling_rate
315
+ self.istft_params = config.istft_params
316
+ self.lrelu_slope = config.lrelu_slope
317
+ self.audio_limit = config.audio_limit
318
+
319
+ self.num_kernels = len(config.resblock_kernel_sizes)
320
+ self.num_upsamples = len(config.upsample_rates)
321
+ self.m_source = SourceModuleHnNSF(
322
+ sampling_rate=config.sampling_rate,
323
+ upsample_scale=np.prod(config.upsample_rates) * config.istft_params["hop_len"],
324
+ harmonic_num=config.nb_harmonics,
325
+ sine_amp=config.nsf_alpha,
326
+ add_noise_std=config.nsf_sigma,
327
+ voiced_threshod=config.nsf_voiced_threshold)
328
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(config.upsample_rates) * config.istft_params["hop_len"])
329
+
330
+ self.conv_pre = weight_norm(
331
+ Conv1d(config.in_channels, config.base_channels, 7, 1, padding=3)
332
+ )
333
+
334
+ # Up
335
+ self.ups = nn.ModuleList()
336
+ for i, (u, k) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)):
337
+ self.ups.append(
338
+ weight_norm(
339
+ ConvTranspose1d(
340
+ config.base_channels // (2**i),
341
+ config.base_channels // (2**(i + 1)),
342
+ k,
343
+ u,
344
+ padding=(k - u) // 2,
345
+ )
346
+ )
347
+ )
348
+
349
+ # Down
350
+ self.source_downs = nn.ModuleList()
351
+ self.source_resblocks = nn.ModuleList()
352
+ downsample_rates = [1] + config.upsample_rates[::-1][:-1]
353
+ downsample_cum_rates = np.cumprod(downsample_rates)
354
+ for i, (u, k, d) in enumerate(zip(downsample_cum_rates[::-1], config.source_resblock_kernel_sizes, config.source_resblock_dilation_sizes)):
355
+ if u == 1:
356
+ self.source_downs.append(
357
+ Conv1d(config.istft_params["n_fft"] + 2, config.base_channels // (2 ** (i + 1)), 1, 1)
358
+ )
359
+ else:
360
+ self.source_downs.append(
361
+ Conv1d(config.istft_params["n_fft"] + 2, config.base_channels // (2 ** (i + 1)), u * 2, u, padding=(u // 2))
362
+ )
363
+
364
+ self.source_resblocks.append(
365
+ ResBlock(config.base_channels // (2 ** (i + 1)), k, d)
366
+ )
367
+
368
+ self.resblocks = nn.ModuleList()
369
+ for i in range(len(self.ups)):
370
+ ch = config.base_channels // (2**(i + 1))
371
+ for _, (k, d) in enumerate(zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes)):
372
+ self.resblocks.append(ResBlock(ch, k, d))
373
+
374
+ self.conv_post = weight_norm(Conv1d(ch, config.istft_params["n_fft"] + 2, 7, 1, padding=3))
375
+ self.ups.apply(init_weights)
376
+ self.conv_post.apply(init_weights)
377
+ self.reflection_pad = nn.ReflectionPad1d((1, 0))
378
+ self.stft_window = torch.from_numpy(get_window("hann", config.istft_params["n_fft"], fftbins=True).astype(np.float32))
379
+ self.f0_predictor = ConvRNNF0Predictor(**config.f0_predictor_config)
380
+
381
+ def remove_weight_norm(self):
382
+ print('Removing weight norm...')
383
+ for l in self.ups:
384
+ remove_weight_norm(l)
385
+ for l in self.resblocks:
386
+ l.remove_weight_norm()
387
+ remove_weight_norm(self.conv_pre)
388
+ remove_weight_norm(self.conv_post)
389
+ self.m_source.remove_weight_norm()
390
+ for l in self.source_downs:
391
+ remove_weight_norm(l)
392
+ for l in self.source_resblocks:
393
+ l.remove_weight_norm()
394
+
395
+ def _stft(self, x):
396
+ spec = torch.stft(
397
+ x,
398
+ self.istft_params["n_fft"], self.istft_params["hop_len"], self.istft_params["n_fft"], window=self.stft_window.to(x.device),
399
+ return_complex=True)
400
+ spec = torch.view_as_real(spec) # [B, F, TT, 2]
401
+ return spec[..., 0], spec[..., 1]
402
+
403
+ def _istft(self, magnitude, phase):
404
+ magnitude = torch.clip(magnitude, max=1e2)
405
+ real = magnitude * torch.cos(phase)
406
+ img = magnitude * torch.sin(phase)
407
+ inverse_transform = torch.istft(torch.complex(real, img), self.istft_params["n_fft"], self.istft_params["hop_len"],
408
+ self.istft_params["n_fft"], window=self.stft_window.to(magnitude.device))
409
+ return inverse_transform
410
+
411
+ def decode(self, x: torch.Tensor, s: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
412
+ s_stft_real, s_stft_imag = self._stft(s.squeeze(1))
413
+ s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1)
414
+ s_stft = s_stft.to(x) # noqa TODO
415
+ x = self.conv_pre(x)
416
+ for i in range(self.num_upsamples):
417
+ x = F.leaky_relu(x, self.lrelu_slope)
418
+ x = self.ups[i](x)
419
+
420
+ if i == self.num_upsamples - 1:
421
+ x = self.reflection_pad(x)
422
+
423
+ # fusion
424
+ si = self.source_downs[i](s_stft)
425
+ si = self.source_resblocks[i](si)
426
+ x = x + si
427
+
428
+ xs = None
429
+ for j in range(self.num_kernels):
430
+ if xs is None:
431
+ xs = self.resblocks[i * self.num_kernels + j](x)
432
+ else:
433
+ xs += self.resblocks[i * self.num_kernels + j](x)
434
+ x = xs / self.num_kernels
435
+
436
+ x = F.leaky_relu(x)
437
+ x = self.conv_post(x)
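+ # conv_post predicts n_fft + 2 channels: the first n_fft // 2 + 1 are log-magnitude, the rest encode phase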
438
+ magnitude = torch.exp(x[:, :self.istft_params["n_fft"] // 2 + 1, :])
439
+ phase = torch.sin(x[:, self.istft_params["n_fft"] // 2 + 1:, :]) # actually, applying sin here is redundant
440
+
441
+ magnitude = magnitude.to(torch.float) # noqa TODO
442
+ phase = phase.to(torch.float) # noqa TODO
443
+
444
+ x = self._istft(magnitude, phase)
445
+ x = torch.clamp(x, -self.audio_limit, self.audio_limit)
446
+ return x
447
+
448
+ def forward(
449
+ self,
450
+ batch: dict,
451
+ device: torch.device,
452
+ ) -> Dict[str, Optional[torch.Tensor]]:
453
+ speech_feat = batch['speech_feat'].transpose(1, 2).to(device)
454
+ # mel->f0
455
+ f0 = self.f0_predictor(speech_feat)
456
+ # f0->source
457
+ s = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
458
+ s, _, _ = self.m_source(s)
459
+ s = s.transpose(1, 2)
460
+ # mel+source->speech
461
+ generated_speech = self.decode(x=speech_feat, s=s)
462
+ return generated_speech, f0
463
+
464
+ @torch.inference_mode()
465
+ def inference(self, speech_feat: torch.Tensor, cache_source: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
466
+ # process data
467
+ speech_feat = speech_feat.to(self.f0_predictor.classifier.weight.data.dtype) # noqa, TODO
468
+ cache_source = cache_source.to(self.f0_predictor.classifier.weight.data.dtype) # noqa, TODO
469
+ # mel->f0
470
+ f0 = self.f0_predictor(speech_feat)
471
+ # f0->source
472
+ s = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
473
+ s, _, _ = self.m_source(s)
474
+ s = s.transpose(1, 2)
475
+ # use cache_source to avoid glitch
476
+ if cache_source.shape[2] != 0:
477
+ s[:, :, :cache_source.shape[2]] = cache_source
478
+ generated_speech = self.decode(x=speech_feat, s=s)
479
+ return generated_speech, s
modeling_interactiveomni.py ADDED
@@ -0,0 +1,773 @@
1
+ # --------------------------------------------------------
2
+ # SenseTime
3
+ # Copyright (c) 2025 SenseTime
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ import warnings
7
+ from typing import Any, List, Optional, Tuple, Union
8
+ import re
9
+ import json
10
+ import math
11
+ import librosa
12
+ import numpy as np
13
+ from PIL import Image
14
+ from decord import VideoReader, cpu
15
+ from torch import nn
16
+ import torch
17
+ import torchvision.transforms as T
18
+ from torchvision.transforms.functional import InterpolationMode
19
+ from transformers import (GenerationConfig, Qwen3ForCausalLM, WhisperFeatureExtractor)
20
+ from transformers.modeling_utils import PreTrainedModel
21
+ import onnxruntime
22
+ import torchaudio.compliance.kaldi as kaldi
23
+ import torchaudio
24
+ from transformers.utils.hub import cached_file
25
+
26
+ from .configuration_interactiveomni import InteractiveOmniConfig
27
+ from .modeling_intern_vit import InternVisionModel
28
+ from .modeling_whisper import AudioWhisperModel
29
+ from .modeling_voicelm import VoiceLM
30
+ from .conversation import get_conv_template
31
+
32
+ from .modeling_flow import CausalMaskedDiffWithXvec
33
+ from .modeling_hifigan import HiFTGenerator
34
+
35
+ import logging
36
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
37
+ logger = logging.getLogger(__name__)
38
+
39
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
40
+ IMAGENET_STD = (0.229, 0.224, 0.225)
41
+
42
+ IMG_START_TOKEN = '<img>'
43
+ IMG_END_TOKEN = '</img>'
44
+ IMG_CONTEXT_TOKEN = '<IMG_CONTEXT>'
45
+ AUDIO_START_TOKEN = '<audio>'
46
+ AUDIO_END_TOKEN = '</audio>'
47
+ AUDIO_CONTEXT_TOKEN = '<AUDIO_CONTEXT>'
48
+
49
+
50
+ class InteractiveOmniModel(PreTrainedModel):
51
+ config_class = InteractiveOmniConfig
52
+ main_input_name = 'pixel_values'
53
+ base_model_prefix = 'language_model'
54
+ _no_split_modules = ['InternVisionModel', 'AudioWhisperModel', 'Qwen3DecoderLayer', 'Qwen2DecoderLayer']
55
+
56
+ def __init__(self, config: InteractiveOmniConfig, vision_model=None, language_model=None, audio_model=None):
57
+ super().__init__(config)
58
+
59
+ image_size = config.force_image_size or config.vision_config.image_size
60
+ patch_size = config.vision_config.patch_size
61
+ self.patch_size = patch_size
62
+ self.select_layer = config.select_layer
63
+ self.template = config.template
64
+ self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio ** 2))
65
+ self.downsample_ratio = config.downsample_ratio
66
+ self.ps_version = config.ps_version
67
+ self.audio_feature_extractor = WhisperFeatureExtractor(**config.audio_preprocessor_config)
68
+ self.transform = self.build_transform(input_size=image_size)
69
+
70
+ self.campplus_session = None
71
+ self.default_speaker_embedding = None
72
+ self.default_wav_path = None
73
+
74
+ logger.info(f'num_image_token: {self.num_image_token}')
75
+ logger.info(f'ps_version: {self.ps_version}')
76
+ if vision_model is not None:
77
+ self.vision_model = vision_model
78
+ else:
79
+ self.vision_model = InternVisionModel(config.vision_config)
80
+ if audio_model is not None:
81
+ self.audio_model = audio_model
82
+ else:
83
+ self.audio_model = AudioWhisperModel(config.audio_config)
84
+ if language_model is not None:
85
+ self.language_model = language_model
86
+ else:
87
+ self.language_model = Qwen3ForCausalLM(config.llm_config)
88
+
89
+ self.voicelm_model = VoiceLM(config.voicelm_config)
90
+ self.flow_model = CausalMaskedDiffWithXvec(config.flow_config).float()
91
+ self.hifigan_model = HiFTGenerator(config.hifigan_config).float()
92
+
93
+ vit_hidden_size = config.vision_config.hidden_size
94
+ audio_hidden_size = config.audio_config.d_model
95
+ llm_hidden_size = config.llm_config.hidden_size
96
+
97
+ self.mlp1 = nn.Sequential(
98
+ nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2),
99
+ nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size),
100
+ nn.GELU(),
101
+ nn.Linear(llm_hidden_size, llm_hidden_size)
102
+ )
103
+ self.mlp2 = nn.Sequential(
104
+ nn.LayerNorm(audio_hidden_size),
105
+ nn.Linear(audio_hidden_size, llm_hidden_size),
106
+ nn.GELU(),
107
+ nn.Linear(llm_hidden_size, llm_hidden_size)
108
+ )
109
+
110
+ self.mlp_llm2voicelm = nn.Sequential(
111
+ nn.LayerNorm(llm_hidden_size),
112
+ nn.Linear(llm_hidden_size, config.voicelm_config.llm_input_size),
113
+ nn.GELU(),
114
+ nn.Linear(config.voicelm_config.llm_input_size, config.voicelm_config.llm_input_size)
115
+ )
116
+ self.gate = nn.Sequential(
117
+ nn.Linear(2 * llm_hidden_size, llm_hidden_size),
118
+ nn.Sigmoid()
119
+ )
120
+
121
+ self.img_context_token_id = None
122
+ self.audio_context_token_id = None
123
+ self.neftune_alpha = None
124
+
125
+ self.post_init()
126
+ pass
127
+
128
+ def fusion(self, rep, emb):
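+ # learned sigmoid gate blends the hidden representation with the corresponding token embedding element-wise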
129
+ gate = self.gate(torch.cat([rep, emb], dim=-1))
130
+ return rep * gate + emb * (1 - gate)
131
+
132
+ def __load_campplus_session(self, campplus_path:str):
133
+ '''load the CAM++ speaker-embedding ONNX session (CPU execution, single thread)'''
134
+ logger.info(f"load campplus session: {campplus_path}")
135
+ option = onnxruntime.SessionOptions()
136
+ option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
137
+ option.intra_op_num_threads = 1
138
+ campplus_session = onnxruntime.InferenceSession(
139
+ campplus_path,
140
+ sess_options=option,
141
+ providers=["CPUExecutionProvider"],
142
+ )
143
+ self.campplus_session = campplus_session
144
+ return campplus_session
145
+
146
+ def extract_speaker_embedding(self, prompt_wav:str):
147
+ '''extract speaker embedding tensor'''
148
+ logger.info(f"extract speaker embedding: {prompt_wav}")
149
+ target_sr = 16000
150
+ prompt_speech_16k, sample_rate = torchaudio.load(prompt_wav)
151
+ prompt_speech_16k = prompt_speech_16k.mean(dim=0, keepdim=True)
152
+ if sample_rate != target_sr:
153
+ assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr)
154
+ prompt_speech_16k = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(prompt_speech_16k)
155
+
156
+ feat = kaldi.fbank(
157
+ prompt_speech_16k,
158
+ num_mel_bins=80,
159
+ dither=0,
160
+ sample_frequency=target_sr,
161
+ )
162
+ feat = feat - feat.mean(dim=0, keepdim=True)
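+ # the CAM++ ONNX session maps the mean-normalized fbank features to a speaker embedding vector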
163
+ speaker_embedding = self.campplus_session.run(
164
+ None,
165
+ {self.campplus_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()},
166
+ )[0].flatten().tolist()
167
+ speaker_embedding = torch.tensor([speaker_embedding])
168
+ return speaker_embedding
169
+
170
+ def build_transform(self, input_size):
171
+ MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
172
+ transform = T.Compose([
173
+ T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
174
+ T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
175
+ T.ToTensor(),
176
+ T.Normalize(mean=MEAN, std=STD)
177
+ ])
178
+
179
+ return transform
180
+
181
+ def find_closest_aspect_ratio(self, image, min_num=1, max_num=6, image_size=448):
182
+ assert min_num == 1
183
+ original_width, original_height = image.size
184
+ log_ratio = math.log(original_width / original_height)
185
+ ratio = original_width * original_height / (image_size * image_size)
186
+ multiple = min(math.ceil(ratio), max_num)
187
+ if multiple <= 1:
188
+ return [1, 1]
189
+ candidate_split_grids_nums = []
190
+ for i in [multiple - 1, multiple, multiple + 1]:
191
+ if i > max_num:
192
+ continue
193
+ candidate_split_grids_nums.append(i)
194
+
195
+ candidate_grids = []
196
+ for split_grids_nums in candidate_split_grids_nums:
197
+ m = 1
198
+ while m <= split_grids_nums:
199
+ if split_grids_nums % m == 0:
200
+ candidate_grids.append([m, split_grids_nums // m])
201
+ m += 1
202
+ best_grid = [1, 1]
203
+ min_error = float("inf")
204
+ for grid in candidate_grids:
205
+ error = abs(log_ratio - math.log(grid[0] / grid[1]))
206
+ if error < min_error:
207
+ best_grid = grid
208
+ min_error = error
209
+
210
+ return best_grid
211
+
212
+ def dynamic_preprocess(self, image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
213
+ target_aspect_ratio = self.find_closest_aspect_ratio(image, min_num, max_num, image_size)
214
+ target_width = image_size * target_aspect_ratio[0]
215
+ target_height = image_size * target_aspect_ratio[1]
216
+ blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
217
+ # resize the image
218
+ resized_img = image.resize((target_width, target_height))
219
+ processed_images = []
220
+ for i in range(blocks):
221
+ box = (
222
+ (i % (target_width // image_size)) * image_size,
223
+ (i // (target_width // image_size)) * image_size,
224
+ ((i % (target_width // image_size)) + 1) * image_size,
225
+ ((i // (target_width // image_size)) + 1) * image_size
226
+ )
227
+ # split the image
228
+ split_img = resized_img.crop(box)
229
+ processed_images.append(split_img)
230
+ assert len(processed_images) == blocks
231
+ if use_thumbnail and len(processed_images) != 1:
232
+ thumbnail_img = image.resize((image_size, image_size))
233
+ processed_images.append(thumbnail_img)
234
+ return processed_images
235
+
236
+ def load_image(self, image, input_size=448, max_num=12):
237
+ if not isinstance(image, Image.Image):
238
+ image = Image.open(image).convert('RGB')
239
+ images = self.dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
240
+ return images
241
+
242
+ def pixel_shuffle(self, x, scale_factor=0.5):
243
+ n, w, h, c = x.size()
244
+ # N, W, H, C --> N, W, H * scale, C // scale
245
+ x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
246
+ # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
247
+ x = x.permute(0, 2, 1, 3).contiguous()
248
+ # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
249
+ x = x.view(n, int(h * scale_factor), int(w * scale_factor),
250
+ int(c / (scale_factor * scale_factor)))
251
+ if self.ps_version == 'v1':
252
+ warnings.warn("In ps_version 'v1', the height and width have not been swapped back, "
253
+ 'which results in a transposed image.')
254
+ else:
255
+ x = x.permute(0, 2, 1, 3).contiguous()
256
+ return x
257
+
258
+ def extract_feature(self, pixel_values):
259
+ if self.select_layer == -1:
260
+ vit_embeds = self.vision_model(
261
+ pixel_values=pixel_values,
262
+ output_hidden_states=False,
263
+ return_dict=True).last_hidden_state
264
+ else:
265
+ vit_embeds = self.vision_model(
266
+ pixel_values=pixel_values,
267
+ output_hidden_states=True,
268
+ return_dict=True).hidden_states[self.select_layer]
269
+ vit_embeds = vit_embeds[:, 1:, :]
270
+
271
+ if self.training and self.neftune_alpha is not None:
272
+ vit_embeds = self.noised_embed(vit_embeds, self.neftune_alpha)
273
+
274
+ h = w = int(vit_embeds.shape[1] ** 0.5)
275
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
276
+ vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
277
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
278
+ vit_embeds = self.mlp1(vit_embeds)#.to(pixel_values.device)
279
+ return vit_embeds
280
+
281
+ def get_T_after_cnn(self, L_in, dilation=1):
282
+ for (padding, kernel_size, stride) in [(1, 3, 1), (1, 3, 2)]:
283
+ L_out = L_in + 2 * padding - dilation * (kernel_size - 1) - 1
284
+ L_out = 1 + L_out // stride
285
+ L_in = L_out
286
+ return L_out
287
+
288
+ def process_audio(self, audio, return_tensors, sampling_rate=16000):
289
+ L = (audio.shape[0] if audio.shape[0] <= 480000 else 480000) # max_length < 30s
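+ # 160-sample hop at 16 kHz gives one mel frame per 10 ms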
290
+ mel_len = L // 160
291
+ audio_len_after_cnn = self.get_T_after_cnn(mel_len)
292
+ audio_token_num = (audio_len_after_cnn - 2) // 2 + 1
293
+ inputs = self.audio_feature_extractor(audio, return_tensors=return_tensors, sampling_rate=sampling_rate)
294
+ inputs['audio_len_after_cnn'] = torch.tensor(audio_len_after_cnn, dtype=torch.long)
295
+ inputs['audio_token_num'] = torch.tensor(audio_token_num, dtype=torch.long)
296
+ return inputs
297
+
298
+ def load_audio(self, audio_file, sampling_rate=16000):
299
+ audio_values, _ = librosa.load(audio_file, sr=sampling_rate) # sample rate should be 16000
300
+
301
+ audio_process_values = self.process_audio(audio_values, sampling_rate=sampling_rate, return_tensors="pt")
302
+ input_features = audio_process_values['input_features']
303
+ audio_len_after_cnn = audio_process_values['audio_len_after_cnn']
304
+ audio_token_num = audio_process_values['audio_token_num']
305
+
306
+ audio_input_dict = {'audio_values': input_features,
307
+ 'audio_len_after_cnn': audio_len_after_cnn,
308
+ 'audio_token_num': audio_token_num,
309
+ }
310
+ return audio_input_dict
311
+
312
+ def extract_audio_feature(self, audio_values, audio_len_after_cnn):
313
+
314
+ audio_values = audio_values.squeeze(1)
315
+ max_len_in_batch = int(torch.max(audio_len_after_cnn).item())
316
+ padding_mask = torch.ones([audio_values.size(0), max_len_in_batch]).to(dtype=audio_values.dtype, device=audio_values.device)
317
+ for index in range(len(audio_values)):
318
+ padding_mask[index, :int(audio_len_after_cnn[index].item())] = 0
319
+
320
+ last_hidden_state = self.audio_model(audio_values, padding_mask, audio_len_after_cnn) # (bs, max_token_num, 1280)
321
+
322
+ audio_embeds = self.mlp2(last_hidden_state)
323
+
324
+ return audio_embeds
325
+
326
+ def get_index(self, bound, fps, max_frame, first_idx=0, num_segments=32):
327
+ if bound:
328
+ start, end = bound[0], bound[1]
329
+ else:
330
+ start, end = -100000, 100000
331
+ start_idx = max(first_idx, round(start * fps))
332
+ end_idx = min(round(end * fps), max_frame)
333
+ seg_size = float(end_idx - start_idx) / num_segments
334
+ frame_indices = np.array([
335
+ int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
336
+ for idx in range(num_segments)
337
+ ])
338
+ return frame_indices
339
+
340
+ def load_video(self, video_path, bound=None, num_segments=32):
341
+ vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
342
+ max_frame = len(vr) - 1
343
+ fps = float(vr.get_avg_fps())
344
+ frame_indices = self.get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
345
+ frames = list()
346
+ for frame_index in frame_indices:
347
+ img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB')
348
+ frames.append(img)
349
+ return frames
350
+
351
+ def find_second_last_occurrence(self, input_ids_list, target_id):
352
+ '''find taget_id index'''
353
+ reversed_list = list(reversed(input_ids_list))
354
+ first_occurrence = -1
355
+ second_occurrence = -1
356
+ for idx, val in enumerate(reversed_list):
357
+ if val == target_id:
358
+ if first_occurrence == -1:
359
+ first_occurrence = idx # first index
360
+ elif second_occurrence == -1:
361
+ second_occurrence = idx # second index
362
+ break
363
+
364
+ if second_occurrence == -1:
365
+ return -1
366
+ return len(input_ids_list) - second_occurrence - 1
367
+
368
+ def decode_speech_tokens(
369
+ self,
370
+ speech_tokens,
371
+ speaker_embedding=None,
372
+ flow_prompt_speech_token=None,
373
+ prompt_speech_feat=None,
374
+ finalize=True,
375
+ token_offset=0,
376
+ ):
377
+ if speaker_embedding is None:
378
+ speaker_embedding = torch.zeros(1, 192)
379
+ pass
380
+ if flow_prompt_speech_token is None:
381
+ flow_prompt_speech_token = torch.zeros(1, 0, dtype=torch.int32)
382
+ pass
383
+ if prompt_speech_feat is None:
384
+ prompt_speech_feat = torch.zeros(1, 0, 80)
385
+ pass
386
+
387
+ self.flow_model.encoder.static_chunk_size = 2 * self.flow_model.input_frame_rate # 50
388
+ self.flow_model.decoder.estimator.static_chunk_size = 2 * self.flow_model.input_frame_rate * self.flow_model.token_mel_ratio # 100
389
+ device = speech_tokens.device
390
+
391
+ tts_mel, _ = self.flow_model.inference(
392
+ token=speech_tokens.to(device),
393
+ token_len=torch.tensor([speech_tokens.shape[1]], dtype=torch.int32).to(device),
394
+ prompt_token=flow_prompt_speech_token.to(device),
395
+ prompt_token_len=torch.tensor([flow_prompt_speech_token.shape[1]], dtype=torch.int32).to(device),
396
+ prompt_feat=prompt_speech_feat.to(device),
397
+ prompt_feat_len=torch.tensor([prompt_speech_feat.shape[1]], dtype=torch.int32).to(device),
398
+ embedding=speaker_embedding.to(device),
399
+ finalize=finalize,
400
+ )
401
+ tts_mel = tts_mel[:, :, token_offset * self.config.flow_config.token_mel_ratio:]
402
+
403
+ hift_cache_source = torch.zeros(1, 1, 0)
404
+ tts_speech, tts_source = self.hifigan_model.inference(speech_feat=tts_mel, cache_source=hift_cache_source) # [1, sampling point num]
405
+
406
+ return tts_speech
407
+
408
+ @torch.no_grad()
409
+ def generate(
410
+ self,
411
+ pixel_values: torch.FloatTensor,
412
+ input_ids: torch.FloatTensor,
413
+ attention_mask: torch.LongTensor,
414
+ visual_features: Optional[torch.FloatTensor] = None,
415
+ audio_values: Optional[torch.FloatTensor] = None,
416
+ audio_len_after_cnn: Optional[bool] = None,
417
+ audio_token_num: Optional[bool] = None,
418
+ generation_config: Optional[GenerationConfig] = None,
419
+ output_hidden_states: Optional[bool] = None,
420
+ start_token_id:int = 151644,
421
+ generate_audio:bool = False,
422
+ speaker_embedding:torch.Tensor = torch.zeros(1, 192),
423
+ mix_ratio:list=[5,25],
424
+ **generate_kwargs,
425
+ ) -> torch.LongTensor:
426
+ assert self.img_context_token_id is not None
427
+ assert self.audio_context_token_id is not None
428
+
429
+ vit_embeds = None
430
+ if visual_features is not None:
431
+ vit_embeds = visual_features
432
+ elif pixel_values is not None:
433
+ vit_embeds = self.extract_feature(pixel_values)
434
+ cur_conv_start_id = self.find_second_last_occurrence(input_ids.tolist()[0], start_token_id)
435
+
436
+ input_embeds = self.language_model.get_input_embeddings()(input_ids)
437
+ B, N, C = input_embeds.shape
438
+ input_embeds = input_embeds.reshape(B * N, C)
439
+
440
+ input_ids = input_ids.reshape(B * N)
441
+
442
+ if vit_embeds is not None:
443
+ selected = (input_ids == self.img_context_token_id)
444
+ input_embeds[selected] = vit_embeds.reshape(-1, C)
445
+
446
+ if audio_values is not None and audio_len_after_cnn is not None and audio_token_num is not None:
447
+ audio_embeds = self.extract_audio_feature(audio_values, audio_len_after_cnn)
448
+ output_audios = []
449
+ for i in range(len(audio_token_num)):
450
+ token_num = int(audio_token_num[i].item())
451
+ audio = audio_embeds[i][:token_num]
452
+ output_audios.append(audio)
453
+ output_audios = torch.cat(output_audios, dim=0)
454
+ selected = (input_ids == self.audio_context_token_id)
455
+ input_embeds[selected] = output_audios.reshape(-1, C)
456
+
457
+ input_embeds = input_embeds.reshape(B, N, C)
458
+
459
+ outputs = self.language_model.generate(
460
+ inputs_embeds=input_embeds,
461
+ attention_mask=attention_mask,
462
+ generation_config=generation_config,
463
+ output_hidden_states=output_hidden_states or generate_audio,
464
+ return_dict_in_generate=generate_audio,
465
+ use_cache=True,
466
+ **generate_kwargs,
467
+ )
468
+ if not generate_audio:
469
+ return outputs, None, None
470
+
471
+ hidden_states = torch.cat(
472
+ [outputs.hidden_states[0][-1][:, -1:, :]] + [outputs.hidden_states[i][-1] for i in range(1, len(outputs.hidden_states))],
473
+ dim=1,
474
+ )
475
+ sampled_token = outputs.sequences
476
+ if sampled_token.shape[1] == hidden_states.shape[1] + 1:
477
+ sampled_token = sampled_token[:, 1:]
478
+ sampled_token_embeddings = self.language_model.get_input_embeddings()(sampled_token)
479
+ target_text_token_hidden_states = self.fusion(hidden_states, sampled_token_embeddings)
480
+
481
+ input_token_hidden_states = outputs.hidden_states[0][-1][:, cur_conv_start_id:-1, :]
482
+ question_input_embeddings = input_embeds[:, cur_conv_start_id+1:, :]
483
+ input_token_hidden_states = self.fusion(input_token_hidden_states, question_input_embeddings)
484
+
485
+ input_feature = self.mlp_llm2voicelm(input_token_hidden_states)
486
+ target_text_feature = self.mlp_llm2voicelm(target_text_token_hidden_states)
487
+
488
+ try:
489
+ speech_tokens = self.voicelm_model.inference_bistream(input_feature, target_text_feature, mix_ratio=mix_ratio)
490
+ speech_tokens = torch.LongTensor([speech_tokens]).to(input_feature.device)
491
+ tts_speech = self.decode_speech_tokens(
492
+ speech_tokens,
493
+ speaker_embedding=speaker_embedding,
494
+ )
495
+ except Exception as e:
496
+ logger.warning(f"=========voice lm except:{e}")
497
+ return outputs.sequences,None, None
498
+ return outputs.sequences, speech_tokens, tts_speech
499
+
500
+ def chat(
501
+ self,
502
+ tokenizer,
503
+ generation_config,
504
+ messages,
505
+ max_patch_num=12,
506
+ frame=8,
507
+ generate_audio=False,
508
+ speaker_embedding=torch.zeros(1, 192),
509
+ print_flag=True,
510
+ ):
511
+ if self.flow_model.dtype != torch.float32 or self.hifigan_model.dtype != torch.float32:
512
+ logger.info("reset flow model and hifigan model dtype to float32")
513
+ self.reset_vocoder()
514
+ pass
515
+ if messages is None or len(messages) == 0:
516
+ raise RuntimeError('no messages')
517
+ role_transfer_dict = {
518
+ 'system': ['user'],
519
+ 'user': ['assistant'],
520
+ 'assistant': ['user'],
521
+ }
522
+
523
+ first_role = ['system', 'user']
524
+ last_role = ['user']
525
+ if messages[-1]['role'] not in last_role:
526
+ raise RuntimeError(f"last role error, expected {last_role}, but got {messages[-1]}")
527
+
528
+ current_role = None
529
+ dynamic_images = list()
530
+ dynamic_nums = list()
531
+ audio_values = list()
532
+ audio_len_after_cnn = list()
533
+ audio_token_num = list()
534
+ template = get_conv_template(self.template)
535
+ for index in range(len(messages)):
536
+ text = ''
537
+ audios = list()
538
+ images = list()
539
+ message = messages[index]
540
+ if index == 0:
541
+ if message['role'] not in first_role:
542
+ raise RuntimeError(f'first role error, expected {first_role}, but got {message}')
543
+ else:
544
+ if message['role'] not in current_role:
545
+ raise RuntimeError(f'role error, expected {current_role}, but got {message}')
546
+ current_role = message['role']
547
+ if isinstance(message["content"], list):
548
+ for item in message["content"]:
549
+ if item['type'] == 'text':
550
+ if item.get('text', None) is None:
551
+ continue
552
+ text += item['text']
553
+ elif item['type'] == 'audio':
554
+ if item.get('audio', None) is None:
555
+ continue
556
+ if type(item['audio']) is list:
557
+ assert len(item['audio']) == 1, f'only support 1 audio file in round, but got {item["audio"]}'
558
+ audio = item['audio'][0]
559
+ else:
560
+ audio = item['audio']
561
+ audios.append(audio)
562
+ elif item['type'] == 'image':
563
+ if item.get('image', None) is None:
564
+ continue
565
+ if type(item['image']) is not list:
566
+ images.append(item['image'])
567
+ else:
568
+ images.extend(item['image'])
569
+ elif item['type'] == 'video':
570
+ if item.get('video', None) is None:
571
+ continue
572
+ if type(item['video']) is list:
573
+ assert len(item['video']) == 1, f'only support 1 video file in round, but got {item["video"]}'
574
+ video = item['video'][0]
575
+ else:
576
+ video = item['video']
577
+ frames = self.load_video(video, num_segments=frame)
578
+ images.extend(frames)
579
+ else:
580
+ assert isinstance(message["content"], str), message["content"]
581
+ text = message["content"]
582
+
583
+ if len(audios) != 0:
584
+ assert len(audios) == 1, f'only support 1 audio file in round, but got {audios}'
585
+ if '<audio>' in text:
586
+ matches = re.findall(r"<audio>", text)
587
+ assert len(matches) == len(audios), f'<audio> error {text} {len(audios)}' + text
588
+ text = re.sub(r'(<audio>)(?!\n)', r'\1\n', text)
589
+ else:
590
+ text = '<audio>\n'*len(audios) + text
591
+
592
+ audio_path = audios[0]
593
+ audio_input_dict = self.load_audio(audio_path)
594
+ assert audio_input_dict['audio_token_num'].item() != 0, f'audio_token_num of {audio_path} is 0.'
595
+ audio_values.append(audio_input_dict['audio_values'])
596
+ audio_len_after_cnn.append(audio_input_dict['audio_len_after_cnn'])
597
+ audio_token_num.append(audio_input_dict['audio_token_num'])
598
+
599
+ if images is not None:
600
+ if '<image>' in text:
601
+ matches = re.findall(r"<image>", text)
602
+ assert len(matches) == len(images), f'<image> error {text} {len(images)}' + text
603
+ text = re.sub(r'(<image>)(?!\n)', r'\1\n', text)
604
+ else:
605
+ text = '<image>\n'*len(images) + text
606
+
607
+ for image in images:
608
+ dynamic_image = self.load_image(image, max_num=max_patch_num)
609
+ dynamic_images += dynamic_image
610
+ dynamic_nums.append(len(dynamic_image))
611
+
612
+ if message['role'] == 'system':
613
+ template.set_system_message(text)
614
+ elif message['role'] == 'user':
615
+ template.append_message(template.roles[0], text)
616
+ elif message['role'] == 'assistant':
617
+ template.append_message(template.roles[1], text)
618
+ else:
619
+ raise ValueError('unexpected role')
620
+
621
+ current_role = role_transfer_dict[current_role]
622
+
623
+ template.append_message(template.roles[1], None)
624
+
625
+ if len(audio_values) != 0:
626
+ audio_values = torch.cat(audio_values, dim=0).to(dtype=self.dtype).cuda() # [num_audio, 128, 3000]
627
+ audio_len_after_cnn = torch.stack(audio_len_after_cnn, dim=0) # [num_audio]
628
+ audio_token_num = torch.stack(audio_token_num, dim=0) # [num_audio]
629
+ else:
630
+ audio_values = None
631
+ audio_len_after_cnn = None
632
+ audio_token_num = None
633
+
634
+ if len(dynamic_images) != 0:
635
+ pixel_values = [self.transform(image) for image in dynamic_images]
636
+ pixel_values = torch.stack(pixel_values)
637
+ pixel_values = pixel_values.to(torch.bfloat16).cuda()
638
+ else:
639
+ pixel_values = None
640
+ dynamic_nums = None
641
+
642
+ img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
643
+ self.img_context_token_id = img_context_token_id
644
+ audio_context_token_id = tokenizer.convert_tokens_to_ids(AUDIO_CONTEXT_TOKEN)
645
+ self.audio_context_token_id = audio_context_token_id
646
+
647
+ # also add end-of-assistant token in eos token id to avoid unnecessary generation
648
+ eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]]
649
+ start_token_id = tokenizer.convert_tokens_to_ids(["<|im_start|>"])[0]
650
+
651
+ query = template.get_prompt()
652
+
653
+ if audio_values is not None:
654
+ if print_flag:
655
+ logger.info(f'audio num: {len(audio_token_num)}')
656
+ audio_tokens_list = list()
657
+ for index in range(len(audio_token_num)):
658
+ audio_token_num_i = audio_token_num[index]
659
+ if print_flag:
660
+ logger.info(f'audio_token_num: {audio_token_num_i}')
661
+ audio_tokens = AUDIO_START_TOKEN + AUDIO_CONTEXT_TOKEN * audio_token_num_i + AUDIO_END_TOKEN
662
+ audio_tokens_list.append(audio_tokens)
663
+
664
+ audio_tokens_iter = iter(audio_tokens_list)
665
+
666
+ query = re.sub(r"<audio>", lambda match:next(audio_tokens_iter), query)
667
+
668
+ if pixel_values is not None:
669
+ if print_flag:
670
+ logger.info(f'image num: {len(dynamic_nums)}')
671
+ image_tokens_list = list()
672
+ total_dynamic_num = 0
673
+ for index in range(len(dynamic_nums)):
674
+ dynamic_num = dynamic_nums[index]
675
+ total_dynamic_num += dynamic_num
676
+ if print_flag:
677
+ logger.info(f'dynamic ViT batch size: {dynamic_num}')
678
+ image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * dynamic_num + IMG_END_TOKEN
679
+ image_tokens_list.append(image_tokens)
680
+ assert total_dynamic_num == pixel_values.shape[0], f'dynamic num not equal, {total_dynamic_num}, {pixel_values.shape[0]}'
681
+
682
+ image_tokens_iter = iter(image_tokens_list)
683
+
684
+ query = re.sub(r"<image>", lambda match:next(image_tokens_iter), query)
685
+
686
+ model_inputs = tokenizer(query, return_tensors='pt', add_special_tokens=False)
687
+ input_ids = model_inputs['input_ids'].cuda()
688
+ attention_mask = model_inputs['attention_mask'].cuda()
689
+ generation_config['eos_token_id'] = eos_token_id
690
+ generation_output, speech_token, audio_bytes = self.generate(
691
+ pixel_values=pixel_values,
692
+ audio_values=audio_values,
693
+ audio_len_after_cnn=audio_len_after_cnn,
694
+ audio_token_num=audio_token_num,
695
+ input_ids=input_ids,
696
+ attention_mask=attention_mask,
697
+ generate_audio=generate_audio,
698
+ start_token_id=start_token_id,
699
+ speaker_embedding=speaker_embedding,
700
+ **generation_config
701
+ )
702
+ response = tokenizer.batch_decode(generation_output, skip_special_tokens=False)[0]
703
+ response = response.split("<|im_end|>")[0].replace('<|endoftext|>', '').strip()
704
+ query_to_print = query
705
+ if pixel_values is not None:
706
+ query_to_print = query_to_print.replace(IMG_CONTEXT_TOKEN, '')
707
+ query_to_print = query_to_print.replace(f'{IMG_START_TOKEN}{IMG_END_TOKEN}', '<image>')
708
+ if audio_values is not None:
709
+ query_to_print = query_to_print.replace(AUDIO_CONTEXT_TOKEN, '')
710
+ query_to_print = query_to_print.replace(f'{AUDIO_START_TOKEN}{AUDIO_END_TOKEN}', '<audio>')
711
+ if print_flag:
712
+ logger.info('query: ' + json.dumps(query_to_print, ensure_ascii=False))
713
+ logger.info('response: ' + response)
714
+
715
+ if generate_audio:
716
+ return response, audio_bytes
717
+ return response
718
+
719
+ def __cache_file(self, pretrained_model_name_or_path:str, filename:str, **kw):
720
+ '''resolve a file from the pretrained repo, downloading and caching it if needed'''
721
+ full_path = cached_file(
722
+ pretrained_model_name_or_path,
723
+ filename,
724
+ subfolder=kw.pop("subfolder", None),
725
+ cache_dir=kw.pop("cache_dir", None),
726
+ force_download=kw.pop("force_download", False),
727
+ proxies=kw.pop("proxies", None),
728
+ resume_download=kw.pop("resume_download", None),
729
+ local_files_only=kw.pop("local_files_only", False),
730
+ token=kw.pop("use_auth_token", None),
731
+ revision=kw.pop("revision", None),
732
+ )
733
+ if full_path is None:
734
+ raise ValueError(f"""{pretrained_model_name_or_path}/{filename} does not exist""")
735
+ return full_path
736
+
737
+ @classmethod
738
+ def from_pretrained(
739
+ cls,
740
+ pretrained_model_name_or_path,
741
+ *model_args,
742
+ config=None,
743
+ cache_dir=None,
744
+ ignore_mismatched_sizes=False,
745
+ force_download=False,
746
+ local_files_only=False,
747
+ token=None,
748
+ revision="main",
749
+ use_safetensors=None,
750
+ weights_only=True,
751
+ **kwargs,
752
+ ):
753
+ model = super().from_pretrained(
754
+ pretrained_model_name_or_path,
755
+ *model_args,
756
+ config=config,
757
+ cache_dir=cache_dir,
758
+ ignore_mismatched_sizes=ignore_mismatched_sizes,
759
+ force_download=force_download,
760
+ local_files_only=local_files_only,
761
+ token=token,
762
+ revision=revision,
763
+ use_safetensors=use_safetensors,
764
+ weights_only=weights_only,
765
+ **kwargs,
766
+ )
767
+ campplus_path = model.__cache_file(pretrained_model_name_or_path, "campplus.onnx", **kwargs)
768
+ model.__load_campplus_session(campplus_path)
769
+ default_wav_path = model.__cache_file(pretrained_model_name_or_path, "taozi.wav", **kwargs)
770
+ model.default_wav_path = default_wav_path
771
+ model.default_speaker_embedding = model.extract_speaker_embedding(default_wav_path)
772
+
773
+ return model
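A minimal usage sketch of the class above, assuming the repo's config maps AutoModel to InteractiveOmniModel via trust_remote_code; the checkpoint path, wav file, and generation settings are placeholders:

import torch
from transformers import AutoTokenizer, AutoModel

path = "path/to/InteractiveOmni"  # hypothetical local checkpoint directory or repo id
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModel.from_pretrained(path, trust_remote_code=True).cuda().eval()

# audio-only turn; chat() returns (text response, synthesized waveform) when generate_audio=True
messages = [{"role": "user", "content": [{"type": "audio", "audio": "question.wav"}]}]
generation_config = {"max_new_tokens": 256, "do_sample": False}
response, audio = model.chat(tokenizer, generation_config, messages, generate_audio=True, speaker_embedding=model.default_speaker_embedding)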
modeling_intern_vit.py ADDED
@@ -0,0 +1,427 @@
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2023 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ from typing import Optional, Tuple, Union
7
+
8
+ import torch
9
+ import torch.nn.functional as F
10
+ import torch.utils.checkpoint
11
+ from einops import rearrange
12
+ from timm.models.layers import DropPath
13
+ from torch import nn
14
+ from transformers.activations import ACT2FN
15
+ from transformers.modeling_outputs import (BaseModelOutput,
16
+ BaseModelOutputWithPooling)
17
+ from transformers.modeling_utils import PreTrainedModel
18
+ from transformers.utils import logging
19
+
20
+ from .configuration_intern_vit import InternVisionConfig
21
+
22
+ try:
23
+ from flash_attn.bert_padding import pad_input, unpad_input
24
+ from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func
25
+ has_flash_attn = True
26
+ except:
27
+ print('FlashAttention2 is not installed.')
28
+ has_flash_attn = False
29
+
30
+ logger = logging.get_logger(__name__)
31
+
32
+
33
+ class FlashAttention(nn.Module):
34
+ """Implement the scaled dot product attention with softmax.
35
+ Arguments
36
+ ---------
37
+ softmax_scale: The temperature to use for the softmax attention.
38
+ (default: 1/sqrt(d_keys) where d_keys is computed at
39
+ runtime)
40
+ attention_dropout: The dropout rate to apply to the attention
41
+ (default: 0.0)
42
+ """
43
+
44
+ def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None):
45
+ super().__init__()
46
+ self.softmax_scale = softmax_scale
47
+ self.dropout_p = attention_dropout
48
+
49
+ def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens=None,
50
+ max_s=None, need_weights=False):
51
+ """Implements the multihead softmax attention.
52
+ Arguments
53
+ ---------
54
+ qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None
55
+ if unpadded: (nnz, 3, h, d)
56
+ key_padding_mask: a bool tensor of shape (B, S)
57
+ """
58
+ assert not need_weights
59
+ assert qkv.dtype in [torch.float16, torch.bfloat16]
60
+ assert qkv.is_cuda
61
+
62
+ if cu_seqlens is None:
63
+ batch_size = qkv.shape[0]
64
+ seqlen = qkv.shape[1]
65
+ if key_padding_mask is None:
66
+ qkv = rearrange(qkv, 'b s ... -> (b s) ...')
67
+ max_s = seqlen
68
+ cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
69
+ device=qkv.device)
70
+ output = flash_attn_varlen_qkvpacked_func(
71
+ qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
72
+ softmax_scale=self.softmax_scale, causal=causal
73
+ )
74
+ output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
75
+ else:
76
+ nheads = qkv.shape[-2]
77
+ x = rearrange(qkv, 'b s three h d -> b s (three h d)')
78
+ x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask)
79
+ x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads)
80
+ output_unpad = flash_attn_varlen_qkvpacked_func(
81
+ x_unpad, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
82
+ softmax_scale=self.softmax_scale, causal=causal
83
+ )
84
+ output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'),
85
+ indices, batch_size, seqlen),
86
+ 'b s (h d) -> b s h d', h=nheads)
87
+ else:
88
+ assert max_s is not None
89
+ output = flash_attn_varlen_qkvpacked_func(
90
+ qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
91
+ softmax_scale=self.softmax_scale, causal=causal
92
+ )
93
+
94
+ return output, None
95
+
96
+
97
+ class InternRMSNorm(nn.Module):
98
+ def __init__(self, hidden_size, eps=1e-6):
99
+ super().__init__()
100
+ self.weight = nn.Parameter(torch.ones(hidden_size))
101
+ self.variance_epsilon = eps
102
+
103
+ def forward(self, hidden_states):
104
+ input_dtype = hidden_states.dtype
105
+ hidden_states = hidden_states.to(torch.float32)
106
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
107
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
108
+ return self.weight * hidden_states.to(input_dtype)
109
+
110
+
111
+ try:
112
+ from apex.normalization import FusedRMSNorm
113
+
114
+ InternRMSNorm = FusedRMSNorm # noqa
115
+
116
+ logger.info('Discovered apex.normalization.FusedRMSNorm - will use it instead of InternRMSNorm')
117
+ except ImportError:
118
+ # using the normal InternRMSNorm
119
+ pass
120
+ except Exception:
121
+ logger.warning('discovered apex but it failed to load, falling back to InternRMSNorm')
122
+ pass
123
+
124
+
125
+ NORM2FN = {
126
+ 'rms_norm': InternRMSNorm,
127
+ 'layer_norm': nn.LayerNorm,
128
+ }
129
+
130
+
131
+ class InternVisionEmbeddings(nn.Module):
132
+ def __init__(self, config: InternVisionConfig):
133
+ super().__init__()
134
+ self.config = config
135
+ self.embed_dim = config.hidden_size
136
+ self.image_size = config.image_size
137
+ self.patch_size = config.patch_size
138
+
139
+ self.class_embedding = nn.Parameter(
140
+ torch.randn(1, 1, self.embed_dim),
141
+ )
142
+
143
+ self.patch_embedding = nn.Conv2d(
144
+ in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
145
+ )
146
+
147
+ self.num_patches = (self.image_size // self.patch_size) ** 2
148
+ self.num_positions = self.num_patches + 1
149
+
150
+ self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))
151
+
152
+ def _get_pos_embed(self, pos_embed, H, W):
153
+ target_dtype = pos_embed.dtype
154
+ pos_embed = pos_embed.float().reshape(
155
+ 1, self.image_size // self.patch_size, self.image_size // self.patch_size, -1).permute(0, 3, 1, 2)
156
+ pos_embed = F.interpolate(pos_embed, size=(H, W), mode='bicubic', align_corners=False).\
157
+ reshape(1, -1, H * W).permute(0, 2, 1).to(target_dtype)
158
+ return pos_embed
159
+
160
+ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
161
+ target_dtype = self.patch_embedding.weight.dtype
162
+ patch_embeds = self.patch_embedding(pixel_values)  # shape = [*, embed_dim, height, width]
163
+ batch_size, _, height, width = patch_embeds.shape
164
+ patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
165
+ class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
166
+ embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
167
+ position_embedding = torch.cat([
168
+ self.position_embedding[:, :1, :],
169
+ self._get_pos_embed(self.position_embedding[:, 1:, :], height, width)
170
+ ], dim=1)
171
+ embeddings = embeddings + position_embedding.to(target_dtype)
172
+ return embeddings
173
+
174
+
175
+ class InternAttention(nn.Module):
176
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
177
+
178
+ def __init__(self, config: InternVisionConfig):
179
+ super().__init__()
180
+ self.config = config
181
+ self.embed_dim = config.hidden_size
182
+ self.num_heads = config.num_attention_heads
183
+ self.use_flash_attn = config.use_flash_attn and has_flash_attn
184
+ if config.use_flash_attn and not has_flash_attn:
185
+ print('Warning: Flash Attention is not available, use_flash_attn is set to False.')
186
+ self.head_dim = self.embed_dim // self.num_heads
187
+ if self.head_dim * self.num_heads != self.embed_dim:
188
+ raise ValueError(
189
+ f'embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:'
190
+ f' {self.num_heads}).'
191
+ )
192
+
193
+ self.scale = self.head_dim ** -0.5
194
+ self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=config.qkv_bias)
195
+ self.attn_drop = nn.Dropout(config.attention_dropout)
196
+ self.proj_drop = nn.Dropout(config.dropout)
197
+
198
+ self.qk_normalization = config.qk_normalization
199
+
200
+ if self.qk_normalization:
201
+ self.q_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
202
+ self.k_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
203
+
204
+ if self.use_flash_attn:
205
+ self.inner_attn = FlashAttention(attention_dropout=config.attention_dropout)
206
+ self.proj = nn.Linear(self.embed_dim, self.embed_dim)
207
+
208
+ def _naive_attn(self, x):
209
+ B, N, C = x.shape
210
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
211
+ q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple)
212
+
213
+ if self.qk_normalization:
214
+ B_, H_, N_, D_ = q.shape
215
+ q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
216
+ k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
217
+
218
+ attn = ((q * self.scale) @ k.transpose(-2, -1))
219
+ attn = attn.softmax(dim=-1)
220
+ attn = self.attn_drop(attn)
221
+
222
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
223
+ x = self.proj(x)
224
+ x = self.proj_drop(x)
225
+ return x
226
+
227
+ def _flash_attn(self, x, key_padding_mask=None, need_weights=False):
228
+ qkv = self.qkv(x)
229
+ qkv = rearrange(qkv, 'b s (three h d) -> b s three h d', three=3, h=self.num_heads)
230
+
231
+ if self.qk_normalization:
232
+ q, k, v = qkv.unbind(2)
233
+ q = self.q_norm(q.flatten(-2, -1)).view(q.shape)
234
+ k = self.k_norm(k.flatten(-2, -1)).view(k.shape)
235
+ qkv = torch.stack([q, k, v], dim=2)
236
+
237
+ context, _ = self.inner_attn(
238
+ qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=False
239
+ )
240
+ outs = self.proj(rearrange(context, 'b s h d -> b s (h d)'))
241
+ outs = self.proj_drop(outs)
242
+ return outs
243
+
244
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
245
+ x = self._naive_attn(hidden_states) if not self.use_flash_attn else self._flash_attn(hidden_states)
246
+ return x
247
+
248
+
249
+ class InternMLP(nn.Module):
250
+ def __init__(self, config: InternVisionConfig):
251
+ super().__init__()
252
+ self.config = config
253
+ self.act = ACT2FN[config.hidden_act]
254
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
255
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
256
+
257
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
258
+ hidden_states = self.fc1(hidden_states)
259
+ hidden_states = self.act(hidden_states)
260
+ hidden_states = self.fc2(hidden_states)
261
+ return hidden_states
262
+
263
+
264
+ class InternVisionEncoderLayer(nn.Module):
265
+ def __init__(self, config: InternVisionConfig, drop_path_rate: float):
266
+ super().__init__()
267
+ self.embed_dim = config.hidden_size
268
+ self.intermediate_size = config.intermediate_size
269
+ self.norm_type = config.norm_type
270
+
271
+ self.attn = InternAttention(config)
272
+ self.mlp = InternMLP(config)
273
+ self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
274
+ self.norm2 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
275
+
276
+ self.ls1 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
277
+ self.ls2 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
278
+ self.drop_path1 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
279
+ self.drop_path2 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
280
+
281
+ def forward(
282
+ self,
283
+ hidden_states: torch.Tensor,
284
+ ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor], Optional[Tuple[torch.FloatTensor]]]:
285
+ """
286
+ Args:
287
+ hidden_states (`Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]`): input to the layer of shape `(batch, seq_len, embed_dim)`
288
+ """
289
+ hidden_states = hidden_states + self.drop_path1(self.attn(self.norm1(hidden_states)) * self.ls1)
290
+
291
+ hidden_states = hidden_states + self.drop_path2(self.mlp(self.norm2(hidden_states)) * self.ls2)
292
+
293
+ return hidden_states
294
+
295
+
296
+ class InternVisionEncoder(nn.Module):
297
+ """
298
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
299
+ [`InternEncoderLayer`].
300
+
301
+ Args:
302
+ config (`InternConfig`):
303
+ The corresponding vision configuration for the `InternEncoder`.
304
+ """
305
+
306
+ def __init__(self, config: InternVisionConfig):
307
+ super().__init__()
308
+ self.config = config
309
+ # stochastic depth decay rule
310
+ dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
311
+ self.layers = nn.ModuleList([
312
+ InternVisionEncoderLayer(config, dpr[idx]) for idx in range(config.num_hidden_layers)])
313
+ self.gradient_checkpointing = True
314
+
315
+ def forward(
316
+ self,
317
+ inputs_embeds,
318
+ output_hidden_states: Optional[bool] = None,
319
+ return_dict: Optional[bool] = None,
320
+ ) -> Union[Tuple, BaseModelOutput]:
321
+ r"""
322
+ Args:
323
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
324
+ Embedded representation of the inputs. Should be float, not int tokens.
325
+ output_hidden_states (`bool`, *optional*):
326
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
327
+ for more detail.
328
+ return_dict (`bool`, *optional*):
329
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
330
+ """
331
+ output_hidden_states = (
332
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
333
+ )
334
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
335
+
336
+ encoder_states = () if output_hidden_states else None
337
+ hidden_states = inputs_embeds
338
+
339
+ for idx, encoder_layer in enumerate(self.layers):
340
+ if output_hidden_states:
341
+ encoder_states = encoder_states + (hidden_states,)
342
+ if self.gradient_checkpointing and self.training:
343
+ layer_outputs = torch.utils.checkpoint.checkpoint(
344
+ encoder_layer,
345
+ hidden_states)
346
+ else:
347
+ layer_outputs = encoder_layer(
348
+ hidden_states,
349
+ )
350
+ hidden_states = layer_outputs
351
+
352
+ if output_hidden_states:
353
+ encoder_states = encoder_states + (hidden_states,)
354
+
355
+ if not return_dict:
356
+ return tuple(v for v in [hidden_states, encoder_states] if v is not None)
357
+ return BaseModelOutput(
358
+ last_hidden_state=hidden_states, hidden_states=encoder_states
359
+ )
360
+
361
+
362
+ class InternVisionModel(PreTrainedModel):
363
+ main_input_name = 'pixel_values'
364
+ config_class = InternVisionConfig
365
+ _no_split_modules = ['InternVisionEncoderLayer']
366
+
367
+ def __init__(self, config: InternVisionConfig):
368
+ super().__init__(config)
369
+ self.config = config
370
+
371
+ self.embeddings = InternVisionEmbeddings(config)
372
+ self.encoder = InternVisionEncoder(config)
373
+
374
+ def resize_pos_embeddings(self, old_size, new_size, patch_size):
375
+ pos_emb = self.embeddings.position_embedding
376
+ _, num_positions, embed_dim = pos_emb.shape
377
+ cls_emb = pos_emb[:, :1, :]
378
+ pos_emb = pos_emb[:, 1:, :].reshape(1, old_size // patch_size, old_size // patch_size, -1).permute(0, 3, 1, 2)
379
+ pos_emb = F.interpolate(pos_emb.float(), size=new_size // patch_size, mode='bicubic', align_corners=False)
380
+ pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, -1).permute(0, 2, 1)
381
+ pos_emb = torch.cat([cls_emb, pos_emb], dim=1)
382
+ self.embeddings.position_embedding = nn.Parameter(pos_emb)
383
+ self.embeddings.image_size = new_size
384
+ logger.info('Resized position embeddings from {} to {}'.format(old_size, new_size))
385
+
386
+ def get_input_embeddings(self):
387
+ return self.embeddings
388
+
389
+ def forward(
390
+ self,
391
+ pixel_values: Optional[torch.FloatTensor] = None,
392
+ output_hidden_states: Optional[bool] = None,
393
+ return_dict: Optional[bool] = None,
394
+ pixel_embeds: Optional[torch.FloatTensor] = None,
395
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
396
+ output_hidden_states = (
397
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
398
+ )
399
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
400
+
401
+ if pixel_values is None and pixel_embeds is None:
402
+ raise ValueError('You have to specify pixel_values or pixel_embeds')
403
+
404
+ if pixel_embeds is not None:
405
+ hidden_states = pixel_embeds
406
+ else:
407
+ if len(pixel_values.shape) == 4:
408
+ hidden_states = self.embeddings(pixel_values)
409
+ else:
410
+ raise ValueError(f'wrong pixel_values size: {pixel_values.shape}')
411
+ encoder_outputs = self.encoder(
412
+ inputs_embeds=hidden_states,
413
+ output_hidden_states=output_hidden_states,
414
+ return_dict=return_dict,
415
+ )
416
+ last_hidden_state = encoder_outputs.last_hidden_state
417
+ pooled_output = last_hidden_state[:, 0, :]
418
+
419
+ if not return_dict:
420
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
421
+
422
+ return BaseModelOutputWithPooling(
423
+ last_hidden_state=last_hidden_state,
424
+ pooler_output=pooled_output,
425
+ hidden_states=encoder_outputs.hidden_states,
426
+ attentions=encoder_outputs.attentions,
427
+ )
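
A minimal sketch of exercising the vision encoder defined above, assuming it is run from a checkout of this repo (so `configuration_intern_vit.py` and `modeling_intern_vit.py` import directly) and that `InternVisionConfig` provides usable defaults; the random tensor stands in for a real preprocessed image batch, and the default config may describe the full-size tower, so treat this as illustrative rather than fast.

# Minimal sketch (not an official example): run the vision encoder above on a dummy batch.
import torch
from configuration_intern_vit import InternVisionConfig   # companion file in this repo
from modeling_intern_vit import InternVisionModel

config = InternVisionConfig()          # assumption: the config class ships sensible defaults
config.use_flash_attn = False          # keep the naive attention path so this also runs on CPU
model = InternVisionModel(config).eval()

pixel_values = torch.randn(1, 3, config.image_size, config.image_size)  # stand-in image batch
with torch.no_grad():
    out = model(pixel_values=pixel_values, return_dict=True)

print(out.pooler_output.shape)        # (1, hidden_size): CLS-pooled feature
print(out.last_hidden_state.shape)    # (1, 1 + num_patches, hidden_size)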
modeling_voicelm.py ADDED
@@ -0,0 +1,192 @@
1
+ # --------------------------------------------------------
2
+ # SenseTime
3
+ # Copyright (c) 2025 SenseTime
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ from typing import List
7
+ import math
8
+ import torch
9
+ from torch import nn
10
+ from transformers import Qwen2ForCausalLM
11
+ from transformers import PreTrainedModel
12
+ import logging
13
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
14
+ logger = logging.getLogger(__name__)
15
+
16
+ from .configuration_voicelm import VoiceLMConfig
17
+
18
+ class Qwen2Encoder(torch.nn.Module):
19
+ def __init__(self, config):
20
+ super().__init__()
21
+ self.model = Qwen2ForCausalLM(config)
22
+ pass
23
+
24
+ def forward_one_step(self, xs, masks, cache=None):
25
+ input_masks = masks[:, -1, :]
26
+ outs = self.model(
27
+ inputs_embeds=xs,
28
+ attention_mask=input_masks,
29
+ output_hidden_states=True,
30
+ return_dict=True,
31
+ use_cache=True,
32
+ past_key_values=cache,
33
+ )
34
+ xs = outs.hidden_states[-1]
35
+ new_cache = outs.past_key_values
36
+ return xs, new_cache
37
+
38
+ class VoiceLM(PreTrainedModel):
39
+ """
40
+ VoiceLM: autoregressive speech-token language model on a Qwen2 backbone.
41
+ """
42
+ def __init__(self, config: VoiceLMConfig):
43
+ super().__init__(config)
44
+ self.llm_input_size = config.llm_input_size
45
+ self.llm_output_size = config.llm_output_size
46
+ self.speech_token_size = config.speech_token_size # 6561
47
+ self.sampling_config = config.sampling_config
48
+
49
+ self.sos_eos = 0
50
+ self.task_id = 1
51
+ self.fill_token = 2
52
+
53
+ self.llm_embedding = torch.nn.Embedding(2, config.llm_input_size)
54
+ self.llm = Qwen2Encoder(config.llm_config)
55
+ self.llm_decoder = nn.Linear(config.llm_output_size, config.speech_token_size + 3)
56
+
57
+ # speech token embedding (6564, 896)
58
+ self.speech_embedding = torch.nn.Embedding(
59
+ config.speech_token_size + 3,
60
+ config.llm_input_size,
61
+ )
62
+ pass
63
+
64
+ # Repetition Aware Sampling in VALL-E 2
65
+ def ras_sampling(self, weighted_scores:torch.Tensor, decoded_tokens, sampling, top_p=0.8, top_k=25, win_size=10, tau_r=0.1):
66
+ top_ids = self.nucleus_sampling(weighted_scores, top_p=top_p, top_k=top_k)
67
+ rep_num = (torch.tensor(decoded_tokens[-win_size:]).to(weighted_scores.device) == top_ids).sum().item()
68
+ if rep_num >= win_size * tau_r:
69
+ top_ids = self.random_sampling(weighted_scores, decoded_tokens, sampling)
70
+ return top_ids
71
+
72
+ def nucleus_sampling(self, weighted_scores:torch.Tensor, top_p=0.8, top_k=25):
73
+ prob, indices = [], []
74
+ cum_prob = 0.0
75
+ sorted_value, sorted_idx = weighted_scores.softmax(dim=0).sort(descending=True, stable=True)
76
+ for i in range(len(sorted_idx)):
77
+ # sampling both top-p and numbers.
78
+ if cum_prob < top_p and len(prob) < top_k:
79
+ cum_prob += sorted_value[i]
80
+ prob.append(sorted_value[i])
81
+ indices.append(sorted_idx[i])
82
+ else:
83
+ break
84
+ prob = torch.tensor(prob).to(weighted_scores)
85
+ indices = torch.tensor(indices, dtype=torch.long).to(weighted_scores.device)
86
+ top_ids = indices[prob.multinomial(1, replacement=True)]
87
+ return top_ids
88
+
89
+ def random_sampling(self, weighted_scores:torch.Tensor, decoded_tokens, sampling):
90
+ top_ids = weighted_scores.softmax(dim=0).multinomial(1, replacement=True)
91
+ return top_ids
92
+
93
+ def sampling_ids(
94
+ self,
95
+ weighted_scores: torch.Tensor,
96
+ decoded_tokens: List,
97
+ sampling: int,
98
+ ignore_eos: bool = True,
99
+ ):
100
+ num_trials, max_trials = 0, 100
101
+ while True:
102
+ top_ids = self.ras_sampling(weighted_scores, decoded_tokens, sampling, **self.sampling_config)
103
+ if (not ignore_eos) or (self.speech_token_size not in top_ids):
104
+ break
105
+ num_trials += 1
106
+ if num_trials > max_trials:
107
+ raise RuntimeError('sampling reaches max_trials {} and still get eos when ignore_eos is True, check your input!'.format(max_trials))
108
+ return top_ids
109
+
110
+ @torch.inference_mode()
111
+ def inference_bistream(
112
+ self,
113
+ input_feature: torch.Tensor,
114
+ target_text_feature: torch.Tensor,
115
+ sampling: int = 25,
116
+ mix_ratio: List[int] = [5, 25],
117
+ ):
118
+ text_token_len = target_text_feature.size(1)
119
+ # 1. prepare input
120
+ sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
121
+ task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
122
+ lm_input = torch.concat([sos_eos_emb, input_feature], dim=1)
123
+
124
+ # 2. iterate text
125
+ out_tokens = []
126
+ return_out_tokens = []
127
+ cache = None
128
+
129
+ text_cache = target_text_feature
130
+ next_fill_index = -1
131
+
132
+ for j in range(int(math.floor((text_token_len) / mix_ratio[0] ))):
133
+ if (len(out_tokens) != 0 and out_tokens[-1] == self.speech_token_size + 2) or (len(out_tokens) == 0 and lm_input.size(1) == (1 + input_feature.size(1))):
134
+ logger.info('get fill token, need to append more text token')
135
+ if text_cache.size(1) >= mix_ratio[0]:
136
+ lm_input_text = text_cache[:, :mix_ratio[0]]
137
+ logger.info('append {} text token'.format(lm_input_text.size(1)))
138
+ if len(out_tokens) != 0 and out_tokens[-1] == self.speech_token_size + 2:
139
+ lm_input = lm_input_text
140
+ else:
141
+ lm_input = torch.concat([lm_input, lm_input_text], dim=1)
142
+ text_cache = text_cache[:, mix_ratio[0]:]
143
+ else:
144
+ logger.info('not enough text token to decode, wait for more')
145
+ continue
146
+ while True:
147
+ seq_len = lm_input.shape[1] if cache is None else lm_input.shape[1] + cache[0][0].size(2)
148
+ y_pred, cache = self.llm.forward_one_step(lm_input,
149
+ masks=torch.tril(torch.ones((1, seq_len, seq_len), device=lm_input.device)).to(torch.bool),
150
+ cache=cache)
151
+ logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
152
+ if next_fill_index != -1 and len(out_tokens) == next_fill_index:
153
+ top_ids = self.speech_token_size + 2
154
+ next_fill_index += (mix_ratio[1] + 1)
155
+ else:
156
+ top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True).item()
157
+ if top_ids == self.speech_token_size + 2:
158
+ next_fill_index = len(out_tokens) + mix_ratio[1] + 1
159
+ logger.info('fill_token index {} next fill_token index {}'.format(len(out_tokens), next_fill_index))
160
+ out_tokens.append(top_ids)
161
+ if top_ids >= self.speech_token_size:
162
+ if top_ids == self.speech_token_size + 2:
163
+ break
164
+ else:
165
+ raise ValueError('should not get token {}'.format(top_ids))
166
+ # yield top_ids
167
+
168
+ return_out_tokens.append(top_ids)
169
+ lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
170
+
171
+ # 3. final decode
172
+ lm_input = torch.concat([lm_input, text_cache, task_id_emb], dim=1)
173
+ logger.info('no more text token, decode until met eos')
174
+ while True:
175
+ seq_len = lm_input.shape[1] if cache is None else lm_input.shape[1] + cache[0][0].size(2)
176
+ y_pred, cache = self.llm.forward_one_step(lm_input,
177
+ masks=torch.tril(torch.ones((1, seq_len, seq_len), device=lm_input.device)).to(torch.bool),
178
+ cache=cache)
179
+ logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
180
+ top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=False).item()
181
+ out_tokens.append(top_ids)
182
+ if top_ids >= self.speech_token_size:
183
+ if top_ids == self.speech_token_size:
184
+ break
185
+ else:
186
+ raise ValueError('should not get token {}'.format(top_ids))
187
+ # in stream mode, yield token one by one
188
+ # yield top_ids
189
+
190
+ return_out_tokens.append(top_ids)
191
+ lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
192
+ return return_out_tokens
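
The decode loop above leans on nucleus sampling with a repetition-aware fallback (`ras_sampling`, after VALL-E 2). Below is a standalone sketch of that sampling logic only; the top_p/top_k/win_size/tau_r values are illustrative defaults, not this repo's `sampling_config`, and the random logits stand in for the scores produced by `llm_decoder`.

# Standalone sketch of repetition-aware nucleus sampling; thresholds are illustrative.
import torch

def nucleus_sample(logits: torch.Tensor, top_p: float = 0.8, top_k: int = 25) -> int:
    probs = logits.softmax(dim=0)
    sorted_p, sorted_idx = probs.sort(descending=True, stable=True)
    keep = min(top_k, int((sorted_p.cumsum(dim=0) < top_p).sum().item()) + 1)  # smallest top-p set
    kept_p, kept_idx = sorted_p[:keep], sorted_idx[:keep]
    return int(kept_idx[kept_p.multinomial(1)])

def ras_sample(logits: torch.Tensor, decoded: list, top_p=0.8, top_k=25, win_size=10, tau_r=0.1) -> int:
    tok = nucleus_sample(logits, top_p, top_k)
    if decoded[-win_size:].count(tok) >= win_size * tau_r:   # token repeats too often in the window
        tok = int(logits.softmax(dim=0).multinomial(1))       # fall back to the full distribution
    return tok

decoded = []
for _ in range(20):
    logits = torch.randn(6561 + 3)     # dummy scores with the speech vocab size used above
    decoded.append(ras_sample(logits, decoded))
print(decoded)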
modeling_whisper.py ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,330 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>",
16
+ "<IMG_CONTEXT>",
17
+ "<img>",
18
+ "</img>",
19
+ "<quad>",
20
+ "</quad>",
21
+ "<ref>",
22
+ "</ref>",
23
+ "<box>",
24
+ "</box>",
25
+ "<|action_start|>",
26
+ "<|action_end|>",
27
+ "<|plugin|>",
28
+ "<|interpreter|>",
29
+ "<FAKE_PAD_0>",
30
+ "<FAKE_PAD_1>",
31
+ "<FAKE_PAD_2>",
32
+ "<FAKE_PAD_3>",
33
+ "<FAKE_PAD_4>",
34
+ "<FAKE_PAD_5>",
35
+ "<FAKE_PAD_6>",
36
+ "<FAKE_PAD_7>",
37
+ "<FAKE_PAD_8>",
38
+ "<FAKE_PAD_9>",
39
+ "<FAKE_PAD_10>",
40
+ "<FAKE_PAD_11>",
41
+ "<FAKE_PAD_12>",
42
+ "<FAKE_PAD_13>",
43
+ "<FAKE_PAD_14>",
44
+ "<FAKE_PAD_15>",
45
+ "<FAKE_PAD_16>",
46
+ "<FAKE_PAD_17>",
47
+ "<FAKE_PAD_18>",
48
+ "<FAKE_PAD_19>",
49
+ "<FAKE_PAD_20>",
50
+ "<FAKE_PAD_21>",
51
+ "<FAKE_PAD_22>",
52
+ "<FAKE_PAD_23>",
53
+ "<FAKE_PAD_24>",
54
+ "<FAKE_PAD_25>",
55
+ "<FAKE_PAD_26>",
56
+ "<FAKE_PAD_27>",
57
+ "<FAKE_PAD_28>",
58
+ "<FAKE_PAD_29>",
59
+ "<FAKE_PAD_30>",
60
+ "<FAKE_PAD_31>",
61
+ "<FAKE_PAD_32>",
62
+ "<FAKE_PAD_33>",
63
+ "<FAKE_PAD_34>",
64
+ "<FAKE_PAD_35>",
65
+ "<FAKE_PAD_36>",
66
+ "<FAKE_PAD_37>",
67
+ "<FAKE_PAD_38>",
68
+ "<FAKE_PAD_39>",
69
+ "<FAKE_PAD_40>",
70
+ "<FAKE_PAD_41>",
71
+ "<FAKE_PAD_42>",
72
+ "<FAKE_PAD_43>",
73
+ "<FAKE_PAD_44>",
74
+ "<FAKE_PAD_45>",
75
+ "<FAKE_PAD_46>",
76
+ "<FAKE_PAD_47>",
77
+ "<FAKE_PAD_48>",
78
+ "<FAKE_PAD_49>",
79
+ "<FAKE_PAD_50>",
80
+ "<FAKE_PAD_51>",
81
+ "<FAKE_PAD_52>",
82
+ "<FAKE_PAD_53>",
83
+ "<FAKE_PAD_54>",
84
+ "<FAKE_PAD_55>",
85
+ "<FAKE_PAD_56>",
86
+ "<FAKE_PAD_57>",
87
+ "<FAKE_PAD_58>",
88
+ "<FAKE_PAD_59>",
89
+ "<FAKE_PAD_60>",
90
+ "<FAKE_PAD_61>",
91
+ "<FAKE_PAD_62>",
92
+ "<FAKE_PAD_63>",
93
+ "<FAKE_PAD_64>",
94
+ "<FAKE_PAD_65>",
95
+ "<FAKE_PAD_66>",
96
+ "<FAKE_PAD_67>",
97
+ "<FAKE_PAD_68>",
98
+ "<FAKE_PAD_69>",
99
+ "<FAKE_PAD_70>",
100
+ "<FAKE_PAD_71>",
101
+ "<FAKE_PAD_72>",
102
+ "<FAKE_PAD_73>",
103
+ "<FAKE_PAD_74>",
104
+ "<FAKE_PAD_75>",
105
+ "<FAKE_PAD_76>",
106
+ "<FAKE_PAD_77>",
107
+ "<FAKE_PAD_78>",
108
+ "<FAKE_PAD_79>",
109
+ "<FAKE_PAD_80>",
110
+ "<FAKE_PAD_81>",
111
+ "<FAKE_PAD_82>",
112
+ "<FAKE_PAD_83>",
113
+ "<FAKE_PAD_84>",
114
+ "<FAKE_PAD_85>",
115
+ "<FAKE_PAD_86>",
116
+ "<FAKE_PAD_87>",
117
+ "<FAKE_PAD_88>",
118
+ "<FAKE_PAD_89>",
119
+ "<FAKE_PAD_90>",
120
+ "<FAKE_PAD_91>",
121
+ "<FAKE_PAD_92>",
122
+ "<FAKE_PAD_93>",
123
+ "<FAKE_PAD_94>",
124
+ "<FAKE_PAD_95>",
125
+ "<FAKE_PAD_96>",
126
+ "<FAKE_PAD_97>",
127
+ "<FAKE_PAD_98>",
128
+ "<FAKE_PAD_99>",
129
+ "<FAKE_PAD_100>",
130
+ "<FAKE_PAD_101>",
131
+ "<FAKE_PAD_102>",
132
+ "<FAKE_PAD_103>",
133
+ "<FAKE_PAD_104>",
134
+ "<FAKE_PAD_105>",
135
+ "<FAKE_PAD_106>",
136
+ "<FAKE_PAD_107>",
137
+ "<FAKE_PAD_108>",
138
+ "<FAKE_PAD_109>",
139
+ "<FAKE_PAD_110>",
140
+ "<FAKE_PAD_111>",
141
+ "<FAKE_PAD_112>",
142
+ "<FAKE_PAD_113>",
143
+ "<FAKE_PAD_114>",
144
+ "<FAKE_PAD_115>",
145
+ "<FAKE_PAD_116>",
146
+ "<FAKE_PAD_117>",
147
+ "<FAKE_PAD_118>",
148
+ "<FAKE_PAD_119>",
149
+ "<FAKE_PAD_120>",
150
+ "<FAKE_PAD_121>",
151
+ "<FAKE_PAD_122>",
152
+ "<FAKE_PAD_123>",
153
+ "<FAKE_PAD_124>",
154
+ "<FAKE_PAD_125>",
155
+ "<FAKE_PAD_126>",
156
+ "<FAKE_PAD_127>",
157
+ "<FAKE_PAD_128>",
158
+ "<FAKE_PAD_129>",
159
+ "<FAKE_PAD_130>",
160
+ "<FAKE_PAD_131>",
161
+ "<FAKE_PAD_132>",
162
+ "<FAKE_PAD_133>",
163
+ "<FAKE_PAD_134>",
164
+ "<FAKE_PAD_135>",
165
+ "<FAKE_PAD_136>",
166
+ "<FAKE_PAD_137>",
167
+ "<FAKE_PAD_138>",
168
+ "<FAKE_PAD_139>",
169
+ "<FAKE_PAD_140>",
170
+ "<FAKE_PAD_141>",
171
+ "<FAKE_PAD_142>",
172
+ "<FAKE_PAD_143>",
173
+ "<FAKE_PAD_144>",
174
+ "<FAKE_PAD_145>",
175
+ "<FAKE_PAD_146>",
176
+ "<FAKE_PAD_147>",
177
+ "<FAKE_PAD_148>",
178
+ "<FAKE_PAD_149>",
179
+ "<FAKE_PAD_150>",
180
+ "<FAKE_PAD_151>",
181
+ "<FAKE_PAD_152>",
182
+ "<FAKE_PAD_153>",
183
+ "<FAKE_PAD_154>",
184
+ "<FAKE_PAD_155>",
185
+ "<FAKE_PAD_156>",
186
+ "<FAKE_PAD_157>",
187
+ "<FAKE_PAD_158>",
188
+ "<FAKE_PAD_159>",
189
+ "<FAKE_PAD_160>",
190
+ "<FAKE_PAD_161>",
191
+ "<FAKE_PAD_162>",
192
+ "<FAKE_PAD_163>",
193
+ "<FAKE_PAD_164>",
194
+ "<FAKE_PAD_165>",
195
+ "<FAKE_PAD_166>",
196
+ "<FAKE_PAD_167>",
197
+ "<FAKE_PAD_168>",
198
+ "<FAKE_PAD_169>",
199
+ "<FAKE_PAD_170>",
200
+ "<FAKE_PAD_171>",
201
+ "<FAKE_PAD_172>",
202
+ "<FAKE_PAD_173>",
203
+ "<FAKE_PAD_174>",
204
+ "<FAKE_PAD_175>",
205
+ "<FAKE_PAD_176>",
206
+ "<FAKE_PAD_177>",
207
+ "<FAKE_PAD_178>",
208
+ "<FAKE_PAD_179>",
209
+ "<FAKE_PAD_180>",
210
+ "<FAKE_PAD_181>",
211
+ "<FAKE_PAD_182>",
212
+ "<FAKE_PAD_183>",
213
+ "<FAKE_PAD_184>",
214
+ "<FAKE_PAD_185>",
215
+ "<FAKE_PAD_186>",
216
+ "<FAKE_PAD_187>",
217
+ "<FAKE_PAD_188>",
218
+ "<FAKE_PAD_189>",
219
+ "<FAKE_PAD_190>",
220
+ "<FAKE_PAD_191>",
221
+ "<FAKE_PAD_192>",
222
+ "<FAKE_PAD_193>",
223
+ "<FAKE_PAD_194>",
224
+ "<FAKE_PAD_195>",
225
+ "<FAKE_PAD_196>",
226
+ "<FAKE_PAD_197>",
227
+ "<FAKE_PAD_198>",
228
+ "<FAKE_PAD_199>",
229
+ "<FAKE_PAD_200>",
230
+ "<FAKE_PAD_201>",
231
+ "<FAKE_PAD_202>",
232
+ "<FAKE_PAD_203>",
233
+ "<FAKE_PAD_204>",
234
+ "<FAKE_PAD_205>",
235
+ "<FAKE_PAD_206>",
236
+ "<FAKE_PAD_207>",
237
+ "<FAKE_PAD_208>",
238
+ "<FAKE_PAD_209>",
239
+ "<FAKE_PAD_210>",
240
+ "<FAKE_PAD_211>",
241
+ "<FAKE_PAD_212>",
242
+ "<FAKE_PAD_213>",
243
+ "<FAKE_PAD_214>",
244
+ "<FAKE_PAD_215>",
245
+ "<FAKE_PAD_216>",
246
+ "<FAKE_PAD_217>",
247
+ "<FAKE_PAD_218>",
248
+ "<FAKE_PAD_219>",
249
+ "<FAKE_PAD_220>",
250
+ "<FAKE_PAD_221>",
251
+ "<FAKE_PAD_222>",
252
+ "<FAKE_PAD_223>",
253
+ "<FAKE_PAD_224>",
254
+ "<FAKE_PAD_225>",
255
+ "<FAKE_PAD_226>",
256
+ "<FAKE_PAD_227>",
257
+ "<FAKE_PAD_228>",
258
+ "<FAKE_PAD_229>",
259
+ "<FAKE_PAD_230>",
260
+ "<FAKE_PAD_231>",
261
+ "<FAKE_PAD_232>",
262
+ "<FAKE_PAD_233>",
263
+ "<FAKE_PAD_234>",
264
+ "<FAKE_PAD_235>",
265
+ "<FAKE_PAD_236>",
266
+ "<FAKE_PAD_237>",
267
+ "<FAKE_PAD_238>",
268
+ "<FAKE_PAD_239>",
269
+ "<FAKE_PAD_240>",
270
+ "<FAKE_PAD_241>",
271
+ "<FAKE_PAD_242>",
272
+ "<FAKE_PAD_243>",
273
+ "<FAKE_PAD_244>",
274
+ "<FAKE_PAD_245>",
275
+ "<FAKE_PAD_246>",
276
+ "<FAKE_PAD_247>",
277
+ "<FAKE_PAD_248>",
278
+ "<FAKE_PAD_249>",
279
+ "<FAKE_PAD_250>",
280
+ "<FAKE_PAD_251>",
281
+ "<FAKE_PAD_252>",
282
+ "<FAKE_PAD_253>",
283
+ "<audio>",
284
+ "</audio>",
285
+ "<AUDIO_CONTEXT>",
286
+ "<interrupt>",
287
+ "<FAKE_PAD_PAD_0>",
288
+ "<FAKE_PAD_PAD_1>",
289
+ "<FAKE_PAD_PAD_2>",
290
+ "<FAKE_PAD_PAD_3>",
291
+ "<FAKE_PAD_PAD_4>",
292
+ "<FAKE_PAD_PAD_5>",
293
+ "<FAKE_PAD_PAD_6>",
294
+ "<FAKE_PAD_PAD_7>",
295
+ "<FAKE_PAD_PAD_8>",
296
+ "<FAKE_PAD_PAD_9>",
297
+ "<FAKE_PAD_PAD_10>",
298
+ "<FAKE_PAD_PAD_11>",
299
+ "<FAKE_PAD_PAD_12>",
300
+ "<FAKE_PAD_PAD_13>",
301
+ "<FAKE_PAD_PAD_14>",
302
+ "<FAKE_PAD_PAD_15>",
303
+ "<FAKE_PAD_PAD_16>",
304
+ "<FAKE_PAD_PAD_17>",
305
+ "<FAKE_PAD_PAD_18>",
306
+ "<FAKE_PAD_PAD_19>",
307
+ "<FAKE_PAD_PAD_20>",
308
+ "<FAKE_PAD_PAD_21>",
309
+ "<FAKE_PAD_PAD_22>",
310
+ "<FAKE_PAD_PAD_23>",
311
+ "<FAKE_PAD_PAD_24>",
312
+ "<FAKE_PAD_PAD_25>",
313
+ "<FAKE_PAD_PAD_26>",
314
+ "<FAKE_PAD_PAD_27>"
315
+ ],
316
+ "eos_token": {
317
+ "content": "<|im_end|>",
318
+ "lstrip": false,
319
+ "normalized": false,
320
+ "rstrip": false,
321
+ "single_word": false
322
+ },
323
+ "pad_token": {
324
+ "content": "<|endoftext|>",
325
+ "lstrip": false,
326
+ "normalized": false,
327
+ "rstrip": false,
328
+ "single_word": false
329
+ }
330
+ }
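
A short sketch of consuming the token map above through `transformers`; the path below is a placeholder for wherever this repo is checked out, and only the standard `AutoTokenizer` API is assumed.

# Sketch: load the tokenizer shipped in this repo and inspect the special tokens above.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('/path/to/this/repo', trust_remote_code=True)  # placeholder path

print(tok.eos_token, tok.pad_token)                    # expected: <|im_end|> <|endoftext|>
for t in ('<audio>', '</audio>', '<AUDIO_CONTEXT>', '<interrupt>'):
    print(t, tok.convert_tokens_to_ids(t))             # each audio marker maps to a single id

print(tok.tokenize('<audio><AUDIO_CONTEXT></audio>'))  # special tokens are never split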
taozi.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3d286d93323ff1ed598503c40cf028dc3faa946c662fa8d509b201165d56356
3
+ size 807404
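
The three lines above are a Git LFS pointer, so a plain `git clone` leaves only this stub behind; after `git lfs pull`, the fetched audio can be checked against the pointer's digest and size, as in this small sketch.

# Sketch: verify the fetched taozi.wav against the LFS pointer above.
import hashlib

with open('taozi.wav', 'rb') as f:
    data = f.read()

print(len(data))                           # should equal the pointer's size (807404 bytes)
print(hashlib.sha256(data).hexdigest())    # should equal the pointer's sha256 oid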
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,2931 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "151643": {
7
+ "content": "<|endoftext|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "151644": {
15
+ "content": "<|im_start|>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "151645": {
23
+ "content": "<|im_end|>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "151646": {
31
+ "content": "<|object_ref_start|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "151647": {
39
+ "content": "<|object_ref_end|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ },
46
+ "151648": {
47
+ "content": "<|box_start|>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": true
53
+ },
54
+ "151649": {
55
+ "content": "<|box_end|>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": true
61
+ },
62
+ "151650": {
63
+ "content": "<|quad_start|>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": true
69
+ },
70
+ "151651": {
71
+ "content": "<|quad_end|>",
72
+ "lstrip": false,
73
+ "normalized": false,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": true
77
+ },
78
+ "151652": {
79
+ "content": "<|vision_start|>",
80
+ "lstrip": false,
81
+ "normalized": false,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": true
85
+ },
86
+ "151653": {
87
+ "content": "<|vision_end|>",
88
+ "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": true
93
+ },
94
+ "151654": {
95
+ "content": "<|vision_pad|>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": true
101
+ },
102
+ "151655": {
103
+ "content": "<|image_pad|>",
104
+ "lstrip": false,
105
+ "normalized": false,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": true
109
+ },
110
+ "151656": {
111
+ "content": "<|video_pad|>",
112
+ "lstrip": false,
113
+ "normalized": false,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": true
117
+ },
118
+ "151657": {
119
+ "content": "<tool_call>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": false
125
+ },
126
+ "151658": {
127
+ "content": "</tool_call>",
128
+ "lstrip": false,
129
+ "normalized": false,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": false
133
+ },
134
+ "151659": {
135
+ "content": "<|fim_prefix|>",
136
+ "lstrip": false,
137
+ "normalized": false,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": false
141
+ },
142
+ "151660": {
143
+ "content": "<|fim_middle|>",
144
+ "lstrip": false,
145
+ "normalized": false,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": false
149
+ },
150
+ "151661": {
151
+ "content": "<|fim_suffix|>",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": false
157
+ },
158
+ "151662": {
159
+ "content": "<|fim_pad|>",
160
+ "lstrip": false,
161
+ "normalized": false,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": false
165
+ },
166
+ "151663": {
167
+ "content": "<|repo_name|>",
168
+ "lstrip": false,
169
+ "normalized": false,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": false
173
+ },
174
+ "151664": {
175
+ "content": "<|file_sep|>",
176
+ "lstrip": false,
177
+ "normalized": false,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": false
181
+ },
182
+ "151665": {
183
+ "content": "<tool_response>",
184
+ "lstrip": false,
185
+ "normalized": false,
186
+ "rstrip": false,
187
+ "single_word": false,
188
+ "special": false
189
+ },
190
+ "151666": {
191
+ "content": "</tool_response>",
192
+ "lstrip": false,
193
+ "normalized": false,
194
+ "rstrip": false,
195
+ "single_word": false,
196
+ "special": false
197
+ },
198
+ "151667": {
199
+ "content": "<think>",
200
+ "lstrip": false,
201
+ "normalized": false,
202
+ "rstrip": false,
203
+ "single_word": false,
204
+ "special": false
205
+ },
206
+ "151668": {
207
+ "content": "</think>",
208
+ "lstrip": false,
209
+ "normalized": false,
210
+ "rstrip": false,
211
+ "single_word": false,
212
+ "special": false
213
+ },
214
+ "151669": {
215
+ "content": "<IMG_CONTEXT>",
216
+ "lstrip": false,
217
+ "normalized": false,
218
+ "rstrip": false,
219
+ "single_word": false,
220
+ "special": true
221
+ },
222
+ "151670": {
223
+ "content": "<img>",
224
+ "lstrip": false,
225
+ "normalized": false,
226
+ "rstrip": false,
227
+ "single_word": false,
228
+ "special": true
229
+ },
230
+ "151671": {
231
+ "content": "</img>",
232
+ "lstrip": false,
233
+ "normalized": false,
234
+ "rstrip": false,
235
+ "single_word": false,
236
+ "special": true
237
+ },
238
+ "151672": {
239
+ "content": "<quad>",
240
+ "lstrip": false,
241
+ "normalized": false,
242
+ "rstrip": false,
243
+ "single_word": false,
244
+ "special": true
245
+ },
246
+ "151673": {
247
+ "content": "</quad>",
248
+ "lstrip": false,
249
+ "normalized": false,
250
+ "rstrip": false,
251
+ "single_word": false,
252
+ "special": true
253
+ },
254
+ "151674": {
255
+ "content": "<ref>",
256
+ "lstrip": false,
257
+ "normalized": false,
258
+ "rstrip": false,
259
+ "single_word": false,
260
+ "special": true
261
+ },
262
+ "151675": {
263
+ "content": "</ref>",
264
+ "lstrip": false,
265
+ "normalized": false,
266
+ "rstrip": false,
267
+ "single_word": false,
268
+ "special": true
269
+ },
270
+ "151676": {
271
+ "content": "<box>",
272
+ "lstrip": false,
273
+ "normalized": false,
274
+ "rstrip": false,
275
+ "single_word": false,
276
+ "special": true
277
+ },
278
+ "151677": {
279
+ "content": "</box>",
280
+ "lstrip": false,
281
+ "normalized": false,
282
+ "rstrip": false,
283
+ "single_word": false,
284
+ "special": true
285
+ },
286
+ "151678": {
287
+ "content": "<|action_start|>",
288
+ "lstrip": false,
289
+ "normalized": false,
290
+ "rstrip": false,
291
+ "single_word": false,
292
+ "special": true
293
+ },
294
+ "151679": {
295
+ "content": "<|action_end|>",
296
+ "lstrip": false,
297
+ "normalized": false,
298
+ "rstrip": false,
299
+ "single_word": false,
300
+ "special": true
301
+ },
302
+ "151680": {
303
+ "content": "<|plugin|>",
304
+ "lstrip": false,
305
+ "normalized": false,
306
+ "rstrip": false,
307
+ "single_word": false,
308
+ "special": true
309
+ },
310
+ "151681": {
311
+ "content": "<|interpreter|>",
312
+ "lstrip": false,
313
+ "normalized": false,
314
+ "rstrip": false,
315
+ "single_word": false,
316
+ "special": true
317
+ },
318
+ "151682": {
319
+ "content": "<FAKE_PAD_0>",
320
+ "lstrip": false,
321
+ "normalized": false,
322
+ "rstrip": false,
323
+ "single_word": false,
324
+ "special": true
325
+ },
326
+ "151683": {
327
+ "content": "<FAKE_PAD_1>",
328
+ "lstrip": false,
329
+ "normalized": false,
330
+ "rstrip": false,
331
+ "single_word": false,
332
+ "special": true
333
+ },
334
+ "151684": {
335
+ "content": "<FAKE_PAD_2>",
336
+ "lstrip": false,
337
+ "normalized": false,
338
+ "rstrip": false,
339
+ "single_word": false,
340
+ "special": true
341
+ },
342
+ "151685": {
343
+ "content": "<FAKE_PAD_3>",
344
+ "lstrip": false,
345
+ "normalized": false,
346
+ "rstrip": false,
347
+ "single_word": false,
348
+ "special": true
349
+ },
350
+ "151686": {
351
+ "content": "<FAKE_PAD_4>",
352
+ "lstrip": false,
353
+ "normalized": false,
354
+ "rstrip": false,
355
+ "single_word": false,
356
+ "special": true
357
+ },
358
+ "151687": {
359
+ "content": "<FAKE_PAD_5>",
360
+ "lstrip": false,
361
+ "normalized": false,
362
+ "rstrip": false,
363
+ "single_word": false,
364
+ "special": true
365
+ },
366
+ "151688": {
367
+ "content": "<FAKE_PAD_6>",
368
+ "lstrip": false,
369
+ "normalized": false,
370
+ "rstrip": false,
371
+ "single_word": false,
372
+ "special": true
373
+ },
374
+ "151689": {
375
+ "content": "<FAKE_PAD_7>",
376
+ "lstrip": false,
377
+ "normalized": false,
378
+ "rstrip": false,
379
+ "single_word": false,
380
+ "special": true
381
+ },
382
+ "151690": {
383
+ "content": "<FAKE_PAD_8>",
384
+ "lstrip": false,
385
+ "normalized": false,
386
+ "rstrip": false,
387
+ "single_word": false,
388
+ "special": true
389
+ },
390
+ "151691": {
391
+ "content": "<FAKE_PAD_9>",
392
+ "lstrip": false,
393
+ "normalized": false,
394
+ "rstrip": false,
395
+ "single_word": false,
396
+ "special": true
397
+ },
398
+ "151692": {
399
+ "content": "<FAKE_PAD_10>",
400
+ "lstrip": false,
401
+ "normalized": false,
402
+ "rstrip": false,
403
+ "single_word": false,
404
+ "special": true
405
+ },
406
+ "151693": {
407
+ "content": "<FAKE_PAD_11>",
408
+ "lstrip": false,
409
+ "normalized": false,
410
+ "rstrip": false,
411
+ "single_word": false,
412
+ "special": true
413
+ },
414
+ "151694": {
415
+ "content": "<FAKE_PAD_12>",
416
+ "lstrip": false,
417
+ "normalized": false,
418
+ "rstrip": false,
419
+ "single_word": false,
420
+ "special": true
421
+ },
422
+ "151695": {
423
+ "content": "<FAKE_PAD_13>",
424
+ "lstrip": false,
425
+ "normalized": false,
426
+ "rstrip": false,
427
+ "single_word": false,
428
+ "special": true
429
+ },
430
+ "151696": {
431
+ "content": "<FAKE_PAD_14>",
432
+ "lstrip": false,
433
+ "normalized": false,
434
+ "rstrip": false,
435
+ "single_word": false,
436
+ "special": true
437
+ },
438
+ "151697": {
439
+ "content": "<FAKE_PAD_15>",
440
+ "lstrip": false,
441
+ "normalized": false,
442
+ "rstrip": false,
443
+ "single_word": false,
444
+ "special": true
445
+ },
446
+ "151698": {
447
+ "content": "<FAKE_PAD_16>",
448
+ "lstrip": false,
449
+ "normalized": false,
450
+ "rstrip": false,
451
+ "single_word": false,
452
+ "special": true
453
+ },
454
+ "151699": {
455
+ "content": "<FAKE_PAD_17>",
456
+ "lstrip": false,
457
+ "normalized": false,
458
+ "rstrip": false,
459
+ "single_word": false,
460
+ "special": true
461
+ },
462
+ "151700": {
463
+ "content": "<FAKE_PAD_18>",
464
+ "lstrip": false,
465
+ "normalized": false,
466
+ "rstrip": false,
467
+ "single_word": false,
468
+ "special": true
469
+ },
470
+ "151701": {
471
+ "content": "<FAKE_PAD_19>",
472
+ "lstrip": false,
473
+ "normalized": false,
474
+ "rstrip": false,
475
+ "single_word": false,
476
+ "special": true
477
+ },
478
+ "151702": {
479
+ "content": "<FAKE_PAD_20>",
480
+ "lstrip": false,
481
+ "normalized": false,
482
+ "rstrip": false,
483
+ "single_word": false,
484
+ "special": true
485
+ },
486
+ "151703": {
487
+ "content": "<FAKE_PAD_21>",
488
+ "lstrip": false,
489
+ "normalized": false,
490
+ "rstrip": false,
491
+ "single_word": false,
492
+ "special": true
493
+ },
494
+ "151704": {
495
+ "content": "<FAKE_PAD_22>",
496
+ "lstrip": false,
497
+ "normalized": false,
498
+ "rstrip": false,
499
+ "single_word": false,
500
+ "special": true
501
+ },
502
+ "151705": {
503
+ "content": "<FAKE_PAD_23>",
504
+ "lstrip": false,
505
+ "normalized": false,
506
+ "rstrip": false,
507
+ "single_word": false,
508
+ "special": true
509
+ },
510
+ "151706": {
511
+ "content": "<FAKE_PAD_24>",
512
+ "lstrip": false,
513
+ "normalized": false,
514
+ "rstrip": false,
515
+ "single_word": false,
516
+ "special": true
517
+ },
518
+ "151707": {
519
+ "content": "<FAKE_PAD_25>",
520
+ "lstrip": false,
521
+ "normalized": false,
522
+ "rstrip": false,
523
+ "single_word": false,
524
+ "special": true
525
+ },
526
+ "151708": {
527
+ "content": "<FAKE_PAD_26>",
528
+ "lstrip": false,
529
+ "normalized": false,
530
+ "rstrip": false,
531
+ "single_word": false,
532
+ "special": true
533
+ },
534
+ "151709": {
535
+ "content": "<FAKE_PAD_27>",
536
+ "lstrip": false,
537
+ "normalized": false,
538
+ "rstrip": false,
539
+ "single_word": false,
540
+ "special": true
541
+ },
542
+ "151710": {
543
+ "content": "<FAKE_PAD_28>",
544
+ "lstrip": false,
545
+ "normalized": false,
546
+ "rstrip": false,
547
+ "single_word": false,
548
+ "special": true
549
+ },
550
+ "151711": {
551
+ "content": "<FAKE_PAD_29>",
552
+ "lstrip": false,
553
+ "normalized": false,
554
+ "rstrip": false,
555
+ "single_word": false,
556
+ "special": true
557
+ },
558
+ "151712": {
559
+ "content": "<FAKE_PAD_30>",
560
+ "lstrip": false,
561
+ "normalized": false,
562
+ "rstrip": false,
563
+ "single_word": false,
564
+ "special": true
565
+ },
566
+ "151713": {
567
+ "content": "<FAKE_PAD_31>",
568
+ "lstrip": false,
569
+ "normalized": false,
570
+ "rstrip": false,
571
+ "single_word": false,
572
+ "special": true
573
+ },
574
+ "151714": {
575
+ "content": "<FAKE_PAD_32>",
576
+ "lstrip": false,
577
+ "normalized": false,
578
+ "rstrip": false,
579
+ "single_word": false,
580
+ "special": true
581
+ },
582
+ "151715": {
583
+ "content": "<FAKE_PAD_33>",
584
+ "lstrip": false,
585
+ "normalized": false,
586
+ "rstrip": false,
587
+ "single_word": false,
588
+ "special": true
589
+ },
590
+ "151716": {
591
+ "content": "<FAKE_PAD_34>",
592
+ "lstrip": false,
593
+ "normalized": false,
594
+ "rstrip": false,
595
+ "single_word": false,
596
+ "special": true
597
+ },
598
+ "151717": {
599
+ "content": "<FAKE_PAD_35>",
600
+ "lstrip": false,
601
+ "normalized": false,
602
+ "rstrip": false,
603
+ "single_word": false,
604
+ "special": true
605
+ },
606
+ "151718": {
607
+ "content": "<FAKE_PAD_36>",
608
+ "lstrip": false,
609
+ "normalized": false,
610
+ "rstrip": false,
611
+ "single_word": false,
612
+ "special": true
613
+ },
614
+ "151719": {
615
+ "content": "<FAKE_PAD_37>",
616
+ "lstrip": false,
617
+ "normalized": false,
618
+ "rstrip": false,
619
+ "single_word": false,
620
+ "special": true
621
+ },
622
+ "151720": {
623
+ "content": "<FAKE_PAD_38>",
624
+ "lstrip": false,
625
+ "normalized": false,
626
+ "rstrip": false,
627
+ "single_word": false,
628
+ "special": true
629
+ },
630
+ "151721": {
631
+ "content": "<FAKE_PAD_39>",
632
+ "lstrip": false,
633
+ "normalized": false,
634
+ "rstrip": false,
635
+ "single_word": false,
636
+ "special": true
637
+ },
638
+ "151722": {
639
+ "content": "<FAKE_PAD_40>",
640
+ "lstrip": false,
641
+ "normalized": false,
642
+ "rstrip": false,
643
+ "single_word": false,
644
+ "special": true
645
+ },
646
+ "151723": {
647
+ "content": "<FAKE_PAD_41>",
648
+ "lstrip": false,
649
+ "normalized": false,
650
+ "rstrip": false,
651
+ "single_word": false,
652
+ "special": true
653
+ },
654
+ "151724": {
655
+ "content": "<FAKE_PAD_42>",
656
+ "lstrip": false,
657
+ "normalized": false,
658
+ "rstrip": false,
659
+ "single_word": false,
660
+ "special": true
661
+ },
662
+ "151725": {
663
+ "content": "<FAKE_PAD_43>",
664
+ "lstrip": false,
665
+ "normalized": false,
666
+ "rstrip": false,
667
+ "single_word": false,
668
+ "special": true
669
+ },
670
+ "151726": {
671
+ "content": "<FAKE_PAD_44>",
672
+ "lstrip": false,
673
+ "normalized": false,
674
+ "rstrip": false,
675
+ "single_word": false,
676
+ "special": true
677
+ },
678
+ "151727": {
679
+ "content": "<FAKE_PAD_45>",
680
+ "lstrip": false,
681
+ "normalized": false,
682
+ "rstrip": false,
683
+ "single_word": false,
684
+ "special": true
685
+ },
686
+ "151728": {
687
+ "content": "<FAKE_PAD_46>",
688
+ "lstrip": false,
689
+ "normalized": false,
690
+ "rstrip": false,
691
+ "single_word": false,
692
+ "special": true
693
+ },
694
+ "151729": {
695
+ "content": "<FAKE_PAD_47>",
696
+ "lstrip": false,
697
+ "normalized": false,
698
+ "rstrip": false,
699
+ "single_word": false,
700
+ "special": true
701
+ },
702
+ "151730": {
703
+ "content": "<FAKE_PAD_48>",
704
+ "lstrip": false,
705
+ "normalized": false,
706
+ "rstrip": false,
707
+ "single_word": false,
708
+ "special": true
709
+ },
710
+ "151731": {
711
+ "content": "<FAKE_PAD_49>",
712
+ "lstrip": false,
713
+ "normalized": false,
714
+ "rstrip": false,
715
+ "single_word": false,
716
+ "special": true
717
+ },
718
+ "151732": {
719
+ "content": "<FAKE_PAD_50>",
720
+ "lstrip": false,
721
+ "normalized": false,
722
+ "rstrip": false,
723
+ "single_word": false,
724
+ "special": true
725
+ },
726
+ "151733": {
727
+ "content": "<FAKE_PAD_51>",
728
+ "lstrip": false,
729
+ "normalized": false,
730
+ "rstrip": false,
731
+ "single_word": false,
732
+ "special": true
733
+ },
734
+ "151734": {
735
+ "content": "<FAKE_PAD_52>",
736
+ "lstrip": false,
737
+ "normalized": false,
738
+ "rstrip": false,
739
+ "single_word": false,
740
+ "special": true
741
+ },
742
+ "151735": {
743
+ "content": "<FAKE_PAD_53>",
744
+ "lstrip": false,
745
+ "normalized": false,
746
+ "rstrip": false,
747
+ "single_word": false,
748
+ "special": true
749
+ },
750
+ "151736": {
751
+ "content": "<FAKE_PAD_54>",
752
+ "lstrip": false,
753
+ "normalized": false,
754
+ "rstrip": false,
755
+ "single_word": false,
756
+ "special": true
757
+ },
758
+ "151737": {
759
+ "content": "<FAKE_PAD_55>",
760
+ "lstrip": false,
761
+ "normalized": false,
762
+ "rstrip": false,
763
+ "single_word": false,
764
+ "special": true
765
+ },
766
+ "151738": {
767
+ "content": "<FAKE_PAD_56>",
768
+ "lstrip": false,
769
+ "normalized": false,
770
+ "rstrip": false,
771
+ "single_word": false,
772
+ "special": true
773
+ },
774
+ "151739": {
775
+ "content": "<FAKE_PAD_57>",
776
+ "lstrip": false,
777
+ "normalized": false,
778
+ "rstrip": false,
779
+ "single_word": false,
780
+ "special": true
781
+ },
782
+ "151740": {
783
+ "content": "<FAKE_PAD_58>",
784
+ "lstrip": false,
785
+ "normalized": false,
786
+ "rstrip": false,
787
+ "single_word": false,
788
+ "special": true
789
+ },
790
+ "151741": {
791
+ "content": "<FAKE_PAD_59>",
792
+ "lstrip": false,
793
+ "normalized": false,
794
+ "rstrip": false,
795
+ "single_word": false,
796
+ "special": true
797
+ },
798
+ "151742": {
799
+ "content": "<FAKE_PAD_60>",
800
+ "lstrip": false,
801
+ "normalized": false,
802
+ "rstrip": false,
803
+ "single_word": false,
804
+ "special": true
805
+ },
806
+ "151743": {
807
+ "content": "<FAKE_PAD_61>",
808
+ "lstrip": false,
809
+ "normalized": false,
810
+ "rstrip": false,
811
+ "single_word": false,
812
+ "special": true
813
+ },
814
+ "151744": {
815
+ "content": "<FAKE_PAD_62>",
816
+ "lstrip": false,
817
+ "normalized": false,
818
+ "rstrip": false,
819
+ "single_word": false,
820
+ "special": true
821
+ },
822
+ "151745": {
823
+ "content": "<FAKE_PAD_63>",
824
+ "lstrip": false,
825
+ "normalized": false,
826
+ "rstrip": false,
827
+ "single_word": false,
828
+ "special": true
829
+ },
830
+ "151746": {
831
+ "content": "<FAKE_PAD_64>",
832
+ "lstrip": false,
833
+ "normalized": false,
834
+ "rstrip": false,
835
+ "single_word": false,
836
+ "special": true
837
+ },
838
+ "151747": {
839
+ "content": "<FAKE_PAD_65>",
840
+ "lstrip": false,
841
+ "normalized": false,
842
+ "rstrip": false,
843
+ "single_word": false,
844
+ "special": true
845
+ },
846
+ "151748": {
847
+ "content": "<FAKE_PAD_66>",
848
+ "lstrip": false,
849
+ "normalized": false,
850
+ "rstrip": false,
851
+ "single_word": false,
852
+ "special": true
853
+ },
854
+ "151749": {
855
+ "content": "<FAKE_PAD_67>",
856
+ "lstrip": false,
857
+ "normalized": false,
858
+ "rstrip": false,
859
+ "single_word": false,
860
+ "special": true
861
+ },
862
+ "151750": {
863
+ "content": "<FAKE_PAD_68>",
864
+ "lstrip": false,
865
+ "normalized": false,
866
+ "rstrip": false,
867
+ "single_word": false,
868
+ "special": true
869
+ },
870
+ "151751": {
871
+ "content": "<FAKE_PAD_69>",
872
+ "lstrip": false,
873
+ "normalized": false,
874
+ "rstrip": false,
875
+ "single_word": false,
876
+ "special": true
877
+ },
878
+ "151752": {
879
+ "content": "<FAKE_PAD_70>",
880
+ "lstrip": false,
881
+ "normalized": false,
882
+ "rstrip": false,
883
+ "single_word": false,
884
+ "special": true
885
+ },
886
+ "151753": {
887
+ "content": "<FAKE_PAD_71>",
888
+ "lstrip": false,
889
+ "normalized": false,
890
+ "rstrip": false,
891
+ "single_word": false,
892
+ "special": true
893
+ },
894
+ "151754": {
895
+ "content": "<FAKE_PAD_72>",
896
+ "lstrip": false,
897
+ "normalized": false,
898
+ "rstrip": false,
899
+ "single_word": false,
900
+ "special": true
901
+ },
902
+ "151755": {
903
+ "content": "<FAKE_PAD_73>",
904
+ "lstrip": false,
905
+ "normalized": false,
906
+ "rstrip": false,
907
+ "single_word": false,
908
+ "special": true
909
+ },
910
+ "151756": {
911
+ "content": "<FAKE_PAD_74>",
912
+ "lstrip": false,
913
+ "normalized": false,
914
+ "rstrip": false,
915
+ "single_word": false,
916
+ "special": true
917
+ },
918
+ "151757": {
919
+ "content": "<FAKE_PAD_75>",
920
+ "lstrip": false,
921
+ "normalized": false,
922
+ "rstrip": false,
923
+ "single_word": false,
924
+ "special": true
925
+ },
926
+ "151758": {
927
+ "content": "<FAKE_PAD_76>",
928
+ "lstrip": false,
929
+ "normalized": false,
930
+ "rstrip": false,
931
+ "single_word": false,
932
+ "special": true
933
+ },
934
+ "151759": {
935
+ "content": "<FAKE_PAD_77>",
936
+ "lstrip": false,
937
+ "normalized": false,
938
+ "rstrip": false,
939
+ "single_word": false,
940
+ "special": true
941
+ },
942
+ "151760": {
943
+ "content": "<FAKE_PAD_78>",
944
+ "lstrip": false,
945
+ "normalized": false,
946
+ "rstrip": false,
947
+ "single_word": false,
948
+ "special": true
949
+ },
950
+ "151761": {
951
+ "content": "<FAKE_PAD_79>",
952
+ "lstrip": false,
953
+ "normalized": false,
954
+ "rstrip": false,
955
+ "single_word": false,
956
+ "special": true
957
+ },
958
+ "151762": {
959
+ "content": "<FAKE_PAD_80>",
960
+ "lstrip": false,
961
+ "normalized": false,
962
+ "rstrip": false,
963
+ "single_word": false,
964
+ "special": true
965
+ },
966
+ "151763": {
967
+ "content": "<FAKE_PAD_81>",
968
+ "lstrip": false,
969
+ "normalized": false,
970
+ "rstrip": false,
971
+ "single_word": false,
972
+ "special": true
973
+ },
974
+ "151764": {
975
+ "content": "<FAKE_PAD_82>",
976
+ "lstrip": false,
977
+ "normalized": false,
978
+ "rstrip": false,
979
+ "single_word": false,
980
+ "special": true
981
+ },
982
+ "151765": {
983
+ "content": "<FAKE_PAD_83>",
984
+ "lstrip": false,
985
+ "normalized": false,
986
+ "rstrip": false,
987
+ "single_word": false,
988
+ "special": true
989
+ },
990
+ "151766": {
991
+ "content": "<FAKE_PAD_84>",
992
+ "lstrip": false,
993
+ "normalized": false,
994
+ "rstrip": false,
995
+ "single_word": false,
996
+ "special": true
997
+ },
998
+ "151767": {
999
+ "content": "<FAKE_PAD_85>",
1000
+ "lstrip": false,
1001
+ "normalized": false,
1002
+ "rstrip": false,
1003
+ "single_word": false,
1004
+ "special": true
1005
+ },
1006
+ "151768": {
1007
+ "content": "<FAKE_PAD_86>",
1008
+ "lstrip": false,
1009
+ "normalized": false,
1010
+ "rstrip": false,
1011
+ "single_word": false,
1012
+ "special": true
1013
+ },
1014
+ "151769": {
1015
+ "content": "<FAKE_PAD_87>",
1016
+ "lstrip": false,
1017
+ "normalized": false,
1018
+ "rstrip": false,
1019
+ "single_word": false,
1020
+ "special": true
1021
+ },
1022
+ "151770": {
1023
+ "content": "<FAKE_PAD_88>",
1024
+ "lstrip": false,
1025
+ "normalized": false,
1026
+ "rstrip": false,
1027
+ "single_word": false,
1028
+ "special": true
1029
+ },
1030
+ "151771": {
1031
+ "content": "<FAKE_PAD_89>",
1032
+ "lstrip": false,
1033
+ "normalized": false,
1034
+ "rstrip": false,
1035
+ "single_word": false,
1036
+ "special": true
1037
+ },
1038
+ "151772": {
1039
+ "content": "<FAKE_PAD_90>",
1040
+ "lstrip": false,
1041
+ "normalized": false,
1042
+ "rstrip": false,
1043
+ "single_word": false,
1044
+ "special": true
1045
+ },
1046
+ "151773": {
1047
+ "content": "<FAKE_PAD_91>",
1048
+ "lstrip": false,
1049
+ "normalized": false,
1050
+ "rstrip": false,
1051
+ "single_word": false,
1052
+ "special": true
1053
+ },
1054
+ "151774": {
1055
+ "content": "<FAKE_PAD_92>",
1056
+ "lstrip": false,
1057
+ "normalized": false,
1058
+ "rstrip": false,
1059
+ "single_word": false,
1060
+ "special": true
1061
+ },
1062
+ "151775": {
1063
+ "content": "<FAKE_PAD_93>",
1064
+ "lstrip": false,
1065
+ "normalized": false,
1066
+ "rstrip": false,
1067
+ "single_word": false,
1068
+ "special": true
1069
+ },
1070
+ "151776": {
1071
+ "content": "<FAKE_PAD_94>",
1072
+ "lstrip": false,
1073
+ "normalized": false,
1074
+ "rstrip": false,
1075
+ "single_word": false,
1076
+ "special": true
1077
+ },
1078
+ "151777": {
1079
+ "content": "<FAKE_PAD_95>",
1080
+ "lstrip": false,
1081
+ "normalized": false,
1082
+ "rstrip": false,
1083
+ "single_word": false,
1084
+ "special": true
1085
+ },
1086
+ "151778": {
1087
+ "content": "<FAKE_PAD_96>",
1088
+ "lstrip": false,
1089
+ "normalized": false,
1090
+ "rstrip": false,
1091
+ "single_word": false,
1092
+ "special": true
1093
+ },
1094
+ "151779": {
1095
+ "content": "<FAKE_PAD_97>",
1096
+ "lstrip": false,
1097
+ "normalized": false,
1098
+ "rstrip": false,
1099
+ "single_word": false,
1100
+ "special": true
1101
+ },
1102
+ "151780": {
1103
+ "content": "<FAKE_PAD_98>",
1104
+ "lstrip": false,
1105
+ "normalized": false,
1106
+ "rstrip": false,
1107
+ "single_word": false,
1108
+ "special": true
1109
+ },
1110
+ "151781": {
1111
+ "content": "<FAKE_PAD_99>",
1112
+ "lstrip": false,
1113
+ "normalized": false,
1114
+ "rstrip": false,
1115
+ "single_word": false,
1116
+ "special": true
1117
+ },
1118
+ "151782": {
1119
+ "content": "<FAKE_PAD_100>",
1120
+ "lstrip": false,
1121
+ "normalized": false,
1122
+ "rstrip": false,
1123
+ "single_word": false,
1124
+ "special": true
1125
+ },
1126
+ "151783": {
1127
+ "content": "<FAKE_PAD_101>",
1128
+ "lstrip": false,
1129
+ "normalized": false,
1130
+ "rstrip": false,
1131
+ "single_word": false,
1132
+ "special": true
1133
+ },
1134
+ "151784": {
1135
+ "content": "<FAKE_PAD_102>",
1136
+ "lstrip": false,
1137
+ "normalized": false,
1138
+ "rstrip": false,
1139
+ "single_word": false,
1140
+ "special": true
1141
+ },
1142
+ "151785": {
1143
+ "content": "<FAKE_PAD_103>",
1144
+ "lstrip": false,
1145
+ "normalized": false,
1146
+ "rstrip": false,
1147
+ "single_word": false,
1148
+ "special": true
1149
+ },
1150
+ "151786": {
1151
+ "content": "<FAKE_PAD_104>",
1152
+ "lstrip": false,
1153
+ "normalized": false,
1154
+ "rstrip": false,
1155
+ "single_word": false,
1156
+ "special": true
1157
+ },
1158
+ "151787": {
1159
+ "content": "<FAKE_PAD_105>",
1160
+ "lstrip": false,
1161
+ "normalized": false,
1162
+ "rstrip": false,
1163
+ "single_word": false,
1164
+ "special": true
1165
+ },
1166
+ "151788": {
1167
+ "content": "<FAKE_PAD_106>",
1168
+ "lstrip": false,
1169
+ "normalized": false,
1170
+ "rstrip": false,
1171
+ "single_word": false,
1172
+ "special": true
1173
+ },
1174
+ "151789": {
1175
+ "content": "<FAKE_PAD_107>",
1176
+ "lstrip": false,
1177
+ "normalized": false,
1178
+ "rstrip": false,
1179
+ "single_word": false,
1180
+ "special": true
1181
+ },
1182
+ "151790": {
1183
+ "content": "<FAKE_PAD_108>",
1184
+ "lstrip": false,
1185
+ "normalized": false,
1186
+ "rstrip": false,
1187
+ "single_word": false,
1188
+ "special": true
1189
+ },
1190
+ "151791": {
1191
+ "content": "<FAKE_PAD_109>",
1192
+ "lstrip": false,
1193
+ "normalized": false,
1194
+ "rstrip": false,
1195
+ "single_word": false,
1196
+ "special": true
1197
+ },
1198
+ "151792": {
1199
+ "content": "<FAKE_PAD_110>",
1200
+ "lstrip": false,
1201
+ "normalized": false,
1202
+ "rstrip": false,
1203
+ "single_word": false,
1204
+ "special": true
1205
+ },
1206
+ "151793": {
1207
+ "content": "<FAKE_PAD_111>",
1208
+ "lstrip": false,
1209
+ "normalized": false,
1210
+ "rstrip": false,
1211
+ "single_word": false,
1212
+ "special": true
1213
+ },
1214
+ "151794": {
1215
+ "content": "<FAKE_PAD_112>",
1216
+ "lstrip": false,
1217
+ "normalized": false,
1218
+ "rstrip": false,
1219
+ "single_word": false,
1220
+ "special": true
1221
+ },
1222
+ "151795": {
1223
+ "content": "<FAKE_PAD_113>",
1224
+ "lstrip": false,
1225
+ "normalized": false,
1226
+ "rstrip": false,
1227
+ "single_word": false,
1228
+ "special": true
1229
+ },
1230
+ "151796": {
1231
+ "content": "<FAKE_PAD_114>",
1232
+ "lstrip": false,
1233
+ "normalized": false,
1234
+ "rstrip": false,
1235
+ "single_word": false,
1236
+ "special": true
1237
+ },
1238
+ "151797": {
1239
+ "content": "<FAKE_PAD_115>",
1240
+ "lstrip": false,
1241
+ "normalized": false,
1242
+ "rstrip": false,
1243
+ "single_word": false,
1244
+ "special": true
1245
+ },
1246
+ "151798": {
1247
+ "content": "<FAKE_PAD_116>",
1248
+ "lstrip": false,
1249
+ "normalized": false,
1250
+ "rstrip": false,
1251
+ "single_word": false,
1252
+ "special": true
1253
+ },
1254
+ "151799": {
1255
+ "content": "<FAKE_PAD_117>",
1256
+ "lstrip": false,
1257
+ "normalized": false,
1258
+ "rstrip": false,
1259
+ "single_word": false,
1260
+ "special": true
1261
+ },
1262
+ "151800": {
1263
+ "content": "<FAKE_PAD_118>",
1264
+ "lstrip": false,
1265
+ "normalized": false,
1266
+ "rstrip": false,
1267
+ "single_word": false,
1268
+ "special": true
1269
+ },
1270
+ "151801": {
1271
+ "content": "<FAKE_PAD_119>",
1272
+ "lstrip": false,
1273
+ "normalized": false,
1274
+ "rstrip": false,
1275
+ "single_word": false,
1276
+ "special": true
1277
+ },
1278
+ "151802": {
1279
+ "content": "<FAKE_PAD_120>",
1280
+ "lstrip": false,
1281
+ "normalized": false,
1282
+ "rstrip": false,
1283
+ "single_word": false,
1284
+ "special": true
1285
+ },
1286
+ "151803": {
1287
+ "content": "<FAKE_PAD_121>",
1288
+ "lstrip": false,
1289
+ "normalized": false,
1290
+ "rstrip": false,
1291
+ "single_word": false,
1292
+ "special": true
1293
+ },
1294
+ "151804": {
1295
+ "content": "<FAKE_PAD_122>",
1296
+ "lstrip": false,
1297
+ "normalized": false,
1298
+ "rstrip": false,
1299
+ "single_word": false,
1300
+ "special": true
1301
+ },
1302
+ "151805": {
1303
+ "content": "<FAKE_PAD_123>",
1304
+ "lstrip": false,
1305
+ "normalized": false,
1306
+ "rstrip": false,
1307
+ "single_word": false,
1308
+ "special": true
1309
+ },
1310
+ "151806": {
1311
+ "content": "<FAKE_PAD_124>",
1312
+ "lstrip": false,
1313
+ "normalized": false,
1314
+ "rstrip": false,
1315
+ "single_word": false,
1316
+ "special": true
1317
+ },
1318
+ "151807": {
1319
+ "content": "<FAKE_PAD_125>",
1320
+ "lstrip": false,
1321
+ "normalized": false,
1322
+ "rstrip": false,
1323
+ "single_word": false,
1324
+ "special": true
1325
+ },
1326
+ "151808": {
1327
+ "content": "<FAKE_PAD_126>",
1328
+ "lstrip": false,
1329
+ "normalized": false,
1330
+ "rstrip": false,
1331
+ "single_word": false,
1332
+ "special": true
1333
+ },
1334
+ "151809": {
1335
+ "content": "<FAKE_PAD_127>",
1336
+ "lstrip": false,
1337
+ "normalized": false,
1338
+ "rstrip": false,
1339
+ "single_word": false,
1340
+ "special": true
1341
+ },
1342
+ "151810": {
1343
+ "content": "<FAKE_PAD_128>",
1344
+ "lstrip": false,
1345
+ "normalized": false,
1346
+ "rstrip": false,
1347
+ "single_word": false,
1348
+ "special": true
1349
+ },
1350
+ "151811": {
1351
+ "content": "<FAKE_PAD_129>",
1352
+ "lstrip": false,
1353
+ "normalized": false,
1354
+ "rstrip": false,
1355
+ "single_word": false,
1356
+ "special": true
1357
+ },
1358
+ "151812": {
1359
+ "content": "<FAKE_PAD_130>",
1360
+ "lstrip": false,
1361
+ "normalized": false,
1362
+ "rstrip": false,
1363
+ "single_word": false,
1364
+ "special": true
1365
+ },
1366
+ "151813": {
1367
+ "content": "<FAKE_PAD_131>",
1368
+ "lstrip": false,
1369
+ "normalized": false,
1370
+ "rstrip": false,
1371
+ "single_word": false,
1372
+ "special": true
1373
+ },
1374
+ "151814": {
1375
+ "content": "<FAKE_PAD_132>",
1376
+ "lstrip": false,
1377
+ "normalized": false,
1378
+ "rstrip": false,
1379
+ "single_word": false,
1380
+ "special": true
1381
+ },
1382
+ "151815": {
1383
+ "content": "<FAKE_PAD_133>",
1384
+ "lstrip": false,
1385
+ "normalized": false,
1386
+ "rstrip": false,
1387
+ "single_word": false,
1388
+ "special": true
1389
+ },
1390
+ "151816": {
1391
+ "content": "<FAKE_PAD_134>",
1392
+ "lstrip": false,
1393
+ "normalized": false,
1394
+ "rstrip": false,
1395
+ "single_word": false,
1396
+ "special": true
1397
+ },
1398
+ "151817": {
1399
+ "content": "<FAKE_PAD_135>",
1400
+ "lstrip": false,
1401
+ "normalized": false,
1402
+ "rstrip": false,
1403
+ "single_word": false,
1404
+ "special": true
1405
+ },
1406
+ "151818": {
1407
+ "content": "<FAKE_PAD_136>",
1408
+ "lstrip": false,
1409
+ "normalized": false,
1410
+ "rstrip": false,
1411
+ "single_word": false,
1412
+ "special": true
1413
+ },
1414
+ "151819": {
1415
+ "content": "<FAKE_PAD_137>",
1416
+ "lstrip": false,
1417
+ "normalized": false,
1418
+ "rstrip": false,
1419
+ "single_word": false,
1420
+ "special": true
1421
+ },
1422
+ "151820": {
1423
+ "content": "<FAKE_PAD_138>",
1424
+ "lstrip": false,
1425
+ "normalized": false,
1426
+ "rstrip": false,
1427
+ "single_word": false,
1428
+ "special": true
1429
+ },
1430
+ "151821": {
1431
+ "content": "<FAKE_PAD_139>",
1432
+ "lstrip": false,
1433
+ "normalized": false,
1434
+ "rstrip": false,
1435
+ "single_word": false,
1436
+ "special": true
1437
+ },
1438
+ "151822": {
1439
+ "content": "<FAKE_PAD_140>",
1440
+ "lstrip": false,
1441
+ "normalized": false,
1442
+ "rstrip": false,
1443
+ "single_word": false,
1444
+ "special": true
1445
+ },
1446
+ "151823": {
1447
+ "content": "<FAKE_PAD_141>",
1448
+ "lstrip": false,
1449
+ "normalized": false,
1450
+ "rstrip": false,
1451
+ "single_word": false,
1452
+ "special": true
1453
+ },
1454
+ "151824": {
1455
+ "content": "<FAKE_PAD_142>",
1456
+ "lstrip": false,
1457
+ "normalized": false,
1458
+ "rstrip": false,
1459
+ "single_word": false,
1460
+ "special": true
1461
+ },
1462
+ "151825": {
1463
+ "content": "<FAKE_PAD_143>",
1464
+ "lstrip": false,
1465
+ "normalized": false,
1466
+ "rstrip": false,
1467
+ "single_word": false,
1468
+ "special": true
1469
+ },
1470
+ "151826": {
1471
+ "content": "<FAKE_PAD_144>",
1472
+ "lstrip": false,
1473
+ "normalized": false,
1474
+ "rstrip": false,
1475
+ "single_word": false,
1476
+ "special": true
1477
+ },
1478
+ "151827": {
1479
+ "content": "<FAKE_PAD_145>",
1480
+ "lstrip": false,
1481
+ "normalized": false,
1482
+ "rstrip": false,
1483
+ "single_word": false,
1484
+ "special": true
1485
+ },
1486
+ "151828": {
1487
+ "content": "<FAKE_PAD_146>",
1488
+ "lstrip": false,
1489
+ "normalized": false,
1490
+ "rstrip": false,
1491
+ "single_word": false,
1492
+ "special": true
1493
+ },
1494
+ "151829": {
1495
+ "content": "<FAKE_PAD_147>",
1496
+ "lstrip": false,
1497
+ "normalized": false,
1498
+ "rstrip": false,
1499
+ "single_word": false,
1500
+ "special": true
1501
+ },
1502
+ "151830": {
1503
+ "content": "<FAKE_PAD_148>",
1504
+ "lstrip": false,
1505
+ "normalized": false,
1506
+ "rstrip": false,
1507
+ "single_word": false,
1508
+ "special": true
1509
+ },
1510
+ "151831": {
1511
+ "content": "<FAKE_PAD_149>",
1512
+ "lstrip": false,
1513
+ "normalized": false,
1514
+ "rstrip": false,
1515
+ "single_word": false,
1516
+ "special": true
1517
+ },
1518
+ "151832": {
1519
+ "content": "<FAKE_PAD_150>",
1520
+ "lstrip": false,
1521
+ "normalized": false,
1522
+ "rstrip": false,
1523
+ "single_word": false,
1524
+ "special": true
1525
+ },
1526
+ "151833": {
1527
+ "content": "<FAKE_PAD_151>",
1528
+ "lstrip": false,
1529
+ "normalized": false,
1530
+ "rstrip": false,
1531
+ "single_word": false,
1532
+ "special": true
1533
+ },
1534
+ "151834": {
1535
+ "content": "<FAKE_PAD_152>",
1536
+ "lstrip": false,
1537
+ "normalized": false,
1538
+ "rstrip": false,
1539
+ "single_word": false,
1540
+ "special": true
1541
+ },
1542
+ "151835": {
1543
+ "content": "<FAKE_PAD_153>",
1544
+ "lstrip": false,
1545
+ "normalized": false,
1546
+ "rstrip": false,
1547
+ "single_word": false,
1548
+ "special": true
1549
+ },
1550
+ "151836": {
1551
+ "content": "<FAKE_PAD_154>",
1552
+ "lstrip": false,
1553
+ "normalized": false,
1554
+ "rstrip": false,
1555
+ "single_word": false,
1556
+ "special": true
1557
+ },
1558
+ "151837": {
1559
+ "content": "<FAKE_PAD_155>",
1560
+ "lstrip": false,
1561
+ "normalized": false,
1562
+ "rstrip": false,
1563
+ "single_word": false,
1564
+ "special": true
1565
+ },
1566
+ "151838": {
1567
+ "content": "<FAKE_PAD_156>",
1568
+ "lstrip": false,
1569
+ "normalized": false,
1570
+ "rstrip": false,
1571
+ "single_word": false,
1572
+ "special": true
1573
+ },
1574
+ "151839": {
1575
+ "content": "<FAKE_PAD_157>",
1576
+ "lstrip": false,
1577
+ "normalized": false,
1578
+ "rstrip": false,
1579
+ "single_word": false,
1580
+ "special": true
1581
+ },
1582
+ "151840": {
1583
+ "content": "<FAKE_PAD_158>",
1584
+ "lstrip": false,
1585
+ "normalized": false,
1586
+ "rstrip": false,
1587
+ "single_word": false,
1588
+ "special": true
1589
+ },
1590
+ "151841": {
1591
+ "content": "<FAKE_PAD_159>",
1592
+ "lstrip": false,
1593
+ "normalized": false,
1594
+ "rstrip": false,
1595
+ "single_word": false,
1596
+ "special": true
1597
+ },
1598
+ "151842": {
1599
+ "content": "<FAKE_PAD_160>",
1600
+ "lstrip": false,
1601
+ "normalized": false,
1602
+ "rstrip": false,
1603
+ "single_word": false,
1604
+ "special": true
1605
+ },
1606
+ "151843": {
1607
+ "content": "<FAKE_PAD_161>",
1608
+ "lstrip": false,
1609
+ "normalized": false,
1610
+ "rstrip": false,
1611
+ "single_word": false,
1612
+ "special": true
1613
+ },
1614
+ "151844": {
1615
+ "content": "<FAKE_PAD_162>",
1616
+ "lstrip": false,
1617
+ "normalized": false,
1618
+ "rstrip": false,
1619
+ "single_word": false,
1620
+ "special": true
1621
+ },
1622
+ "151845": {
1623
+ "content": "<FAKE_PAD_163>",
1624
+ "lstrip": false,
1625
+ "normalized": false,
1626
+ "rstrip": false,
1627
+ "single_word": false,
1628
+ "special": true
1629
+ },
1630
+ "151846": {
1631
+ "content": "<FAKE_PAD_164>",
1632
+ "lstrip": false,
1633
+ "normalized": false,
1634
+ "rstrip": false,
1635
+ "single_word": false,
1636
+ "special": true
1637
+ },
1638
+ "151847": {
1639
+ "content": "<FAKE_PAD_165>",
1640
+ "lstrip": false,
1641
+ "normalized": false,
1642
+ "rstrip": false,
1643
+ "single_word": false,
1644
+ "special": true
1645
+ },
1646
+ "151848": {
1647
+ "content": "<FAKE_PAD_166>",
1648
+ "lstrip": false,
1649
+ "normalized": false,
1650
+ "rstrip": false,
1651
+ "single_word": false,
1652
+ "special": true
1653
+ },
1654
+ "151849": {
1655
+ "content": "<FAKE_PAD_167>",
1656
+ "lstrip": false,
1657
+ "normalized": false,
1658
+ "rstrip": false,
1659
+ "single_word": false,
1660
+ "special": true
1661
+ },
1662
+ "151850": {
1663
+ "content": "<FAKE_PAD_168>",
1664
+ "lstrip": false,
1665
+ "normalized": false,
1666
+ "rstrip": false,
1667
+ "single_word": false,
1668
+ "special": true
1669
+ },
1670
+ "151851": {
1671
+ "content": "<FAKE_PAD_169>",
1672
+ "lstrip": false,
1673
+ "normalized": false,
1674
+ "rstrip": false,
1675
+ "single_word": false,
1676
+ "special": true
1677
+ },
1678
+ "151852": {
1679
+ "content": "<FAKE_PAD_170>",
1680
+ "lstrip": false,
1681
+ "normalized": false,
1682
+ "rstrip": false,
1683
+ "single_word": false,
1684
+ "special": true
1685
+ },
1686
+ "151853": {
1687
+ "content": "<FAKE_PAD_171>",
1688
+ "lstrip": false,
1689
+ "normalized": false,
1690
+ "rstrip": false,
1691
+ "single_word": false,
1692
+ "special": true
1693
+ },
1694
+ "151854": {
1695
+ "content": "<FAKE_PAD_172>",
1696
+ "lstrip": false,
1697
+ "normalized": false,
1698
+ "rstrip": false,
1699
+ "single_word": false,
1700
+ "special": true
1701
+ },
1702
+ "151855": {
1703
+ "content": "<FAKE_PAD_173>",
1704
+ "lstrip": false,
1705
+ "normalized": false,
1706
+ "rstrip": false,
1707
+ "single_word": false,
1708
+ "special": true
1709
+ },
1710
+ "151856": {
1711
+ "content": "<FAKE_PAD_174>",
1712
+ "lstrip": false,
1713
+ "normalized": false,
1714
+ "rstrip": false,
1715
+ "single_word": false,
1716
+ "special": true
1717
+ },
1718
+ "151857": {
1719
+ "content": "<FAKE_PAD_175>",
1720
+ "lstrip": false,
1721
+ "normalized": false,
1722
+ "rstrip": false,
1723
+ "single_word": false,
1724
+ "special": true
1725
+ },
1726
+ "151858": {
1727
+ "content": "<FAKE_PAD_176>",
1728
+ "lstrip": false,
1729
+ "normalized": false,
1730
+ "rstrip": false,
1731
+ "single_word": false,
1732
+ "special": true
1733
+ },
1734
+ "151859": {
1735
+ "content": "<FAKE_PAD_177>",
1736
+ "lstrip": false,
1737
+ "normalized": false,
1738
+ "rstrip": false,
1739
+ "single_word": false,
1740
+ "special": true
1741
+ },
1742
+ "151860": {
1743
+ "content": "<FAKE_PAD_178>",
1744
+ "lstrip": false,
1745
+ "normalized": false,
1746
+ "rstrip": false,
1747
+ "single_word": false,
1748
+ "special": true
1749
+ },
1750
+ "151861": {
1751
+ "content": "<FAKE_PAD_179>",
1752
+ "lstrip": false,
1753
+ "normalized": false,
1754
+ "rstrip": false,
1755
+ "single_word": false,
1756
+ "special": true
1757
+ },
1758
+ "151862": {
1759
+ "content": "<FAKE_PAD_180>",
1760
+ "lstrip": false,
1761
+ "normalized": false,
1762
+ "rstrip": false,
1763
+ "single_word": false,
1764
+ "special": true
1765
+ },
1766
+ "151863": {
1767
+ "content": "<FAKE_PAD_181>",
1768
+ "lstrip": false,
1769
+ "normalized": false,
1770
+ "rstrip": false,
1771
+ "single_word": false,
1772
+ "special": true
1773
+ },
1774
+ "151864": {
1775
+ "content": "<FAKE_PAD_182>",
1776
+ "lstrip": false,
1777
+ "normalized": false,
1778
+ "rstrip": false,
1779
+ "single_word": false,
1780
+ "special": true
1781
+ },
1782
+ "151865": {
1783
+ "content": "<FAKE_PAD_183>",
1784
+ "lstrip": false,
1785
+ "normalized": false,
1786
+ "rstrip": false,
1787
+ "single_word": false,
1788
+ "special": true
1789
+ },
1790
+ "151866": {
1791
+ "content": "<FAKE_PAD_184>",
1792
+ "lstrip": false,
1793
+ "normalized": false,
1794
+ "rstrip": false,
1795
+ "single_word": false,
1796
+ "special": true
1797
+ },
1798
+ "151867": {
1799
+ "content": "<FAKE_PAD_185>",
1800
+ "lstrip": false,
1801
+ "normalized": false,
1802
+ "rstrip": false,
1803
+ "single_word": false,
1804
+ "special": true
1805
+ },
1806
+ "151868": {
1807
+ "content": "<FAKE_PAD_186>",
1808
+ "lstrip": false,
1809
+ "normalized": false,
1810
+ "rstrip": false,
1811
+ "single_word": false,
1812
+ "special": true
1813
+ },
1814
+ "151869": {
1815
+ "content": "<FAKE_PAD_187>",
1816
+ "lstrip": false,
1817
+ "normalized": false,
1818
+ "rstrip": false,
1819
+ "single_word": false,
1820
+ "special": true
1821
+ },
1822
+ "151870": {
1823
+ "content": "<FAKE_PAD_188>",
1824
+ "lstrip": false,
1825
+ "normalized": false,
1826
+ "rstrip": false,
1827
+ "single_word": false,
1828
+ "special": true
1829
+ },
1830
+ "151871": {
1831
+ "content": "<FAKE_PAD_189>",
1832
+ "lstrip": false,
1833
+ "normalized": false,
1834
+ "rstrip": false,
1835
+ "single_word": false,
1836
+ "special": true
1837
+ },
1838
+ "151872": {
1839
+ "content": "<FAKE_PAD_190>",
1840
+ "lstrip": false,
1841
+ "normalized": false,
1842
+ "rstrip": false,
1843
+ "single_word": false,
1844
+ "special": true
1845
+ },
1846
+ "151873": {
1847
+ "content": "<FAKE_PAD_191>",
1848
+ "lstrip": false,
1849
+ "normalized": false,
1850
+ "rstrip": false,
1851
+ "single_word": false,
1852
+ "special": true
1853
+ },
1854
+ "151874": {
1855
+ "content": "<FAKE_PAD_192>",
1856
+ "lstrip": false,
1857
+ "normalized": false,
1858
+ "rstrip": false,
1859
+ "single_word": false,
1860
+ "special": true
1861
+ },
1862
+ "151875": {
1863
+ "content": "<FAKE_PAD_193>",
1864
+ "lstrip": false,
1865
+ "normalized": false,
1866
+ "rstrip": false,
1867
+ "single_word": false,
1868
+ "special": true
1869
+ },
1870
+ "151876": {
1871
+ "content": "<FAKE_PAD_194>",
1872
+ "lstrip": false,
1873
+ "normalized": false,
1874
+ "rstrip": false,
1875
+ "single_word": false,
1876
+ "special": true
1877
+ },
1878
+ "151877": {
1879
+ "content": "<FAKE_PAD_195>",
1880
+ "lstrip": false,
1881
+ "normalized": false,
1882
+ "rstrip": false,
1883
+ "single_word": false,
1884
+ "special": true
1885
+ },
1886
+ "151878": {
1887
+ "content": "<FAKE_PAD_196>",
1888
+ "lstrip": false,
1889
+ "normalized": false,
1890
+ "rstrip": false,
1891
+ "single_word": false,
1892
+ "special": true
1893
+ },
1894
+ "151879": {
1895
+ "content": "<FAKE_PAD_197>",
1896
+ "lstrip": false,
1897
+ "normalized": false,
1898
+ "rstrip": false,
1899
+ "single_word": false,
1900
+ "special": true
1901
+ },
1902
+ "151880": {
1903
+ "content": "<FAKE_PAD_198>",
1904
+ "lstrip": false,
1905
+ "normalized": false,
1906
+ "rstrip": false,
1907
+ "single_word": false,
1908
+ "special": true
1909
+ },
1910
+ "151881": {
1911
+ "content": "<FAKE_PAD_199>",
1912
+ "lstrip": false,
1913
+ "normalized": false,
1914
+ "rstrip": false,
1915
+ "single_word": false,
1916
+ "special": true
1917
+ },
1918
+ "151882": {
1919
+ "content": "<FAKE_PAD_200>",
1920
+ "lstrip": false,
1921
+ "normalized": false,
1922
+ "rstrip": false,
1923
+ "single_word": false,
1924
+ "special": true
1925
+ },
1926
+ "151883": {
1927
+ "content": "<FAKE_PAD_201>",
1928
+ "lstrip": false,
1929
+ "normalized": false,
1930
+ "rstrip": false,
1931
+ "single_word": false,
1932
+ "special": true
1933
+ },
1934
+ "151884": {
1935
+ "content": "<FAKE_PAD_202>",
1936
+ "lstrip": false,
1937
+ "normalized": false,
1938
+ "rstrip": false,
1939
+ "single_word": false,
1940
+ "special": true
1941
+ },
1942
+ "151885": {
1943
+ "content": "<FAKE_PAD_203>",
1944
+ "lstrip": false,
1945
+ "normalized": false,
1946
+ "rstrip": false,
1947
+ "single_word": false,
1948
+ "special": true
1949
+ },
1950
+ "151886": {
1951
+ "content": "<FAKE_PAD_204>",
1952
+ "lstrip": false,
1953
+ "normalized": false,
1954
+ "rstrip": false,
1955
+ "single_word": false,
1956
+ "special": true
1957
+ },
1958
+ "151887": {
1959
+ "content": "<FAKE_PAD_205>",
1960
+ "lstrip": false,
1961
+ "normalized": false,
1962
+ "rstrip": false,
1963
+ "single_word": false,
1964
+ "special": true
1965
+ },
1966
+ "151888": {
1967
+ "content": "<FAKE_PAD_206>",
1968
+ "lstrip": false,
1969
+ "normalized": false,
1970
+ "rstrip": false,
1971
+ "single_word": false,
1972
+ "special": true
1973
+ },
1974
+ "151889": {
1975
+ "content": "<FAKE_PAD_207>",
1976
+ "lstrip": false,
1977
+ "normalized": false,
1978
+ "rstrip": false,
1979
+ "single_word": false,
1980
+ "special": true
1981
+ },
1982
+ "151890": {
1983
+ "content": "<FAKE_PAD_208>",
1984
+ "lstrip": false,
1985
+ "normalized": false,
1986
+ "rstrip": false,
1987
+ "single_word": false,
1988
+ "special": true
1989
+ },
1990
+ "151891": {
1991
+ "content": "<FAKE_PAD_209>",
1992
+ "lstrip": false,
1993
+ "normalized": false,
1994
+ "rstrip": false,
1995
+ "single_word": false,
1996
+ "special": true
1997
+ },
1998
+ "151892": {
1999
+ "content": "<FAKE_PAD_210>",
2000
+ "lstrip": false,
2001
+ "normalized": false,
2002
+ "rstrip": false,
2003
+ "single_word": false,
2004
+ "special": true
2005
+ },
2006
+ "151893": {
2007
+ "content": "<FAKE_PAD_211>",
2008
+ "lstrip": false,
2009
+ "normalized": false,
2010
+ "rstrip": false,
2011
+ "single_word": false,
2012
+ "special": true
2013
+ },
2014
+ "151894": {
2015
+ "content": "<FAKE_PAD_212>",
2016
+ "lstrip": false,
2017
+ "normalized": false,
2018
+ "rstrip": false,
2019
+ "single_word": false,
2020
+ "special": true
2021
+ },
2022
+ "151895": {
2023
+ "content": "<FAKE_PAD_213>",
2024
+ "lstrip": false,
2025
+ "normalized": false,
2026
+ "rstrip": false,
2027
+ "single_word": false,
2028
+ "special": true
2029
+ },
2030
+ "151896": {
2031
+ "content": "<FAKE_PAD_214>",
2032
+ "lstrip": false,
2033
+ "normalized": false,
2034
+ "rstrip": false,
2035
+ "single_word": false,
2036
+ "special": true
2037
+ },
2038
+ "151897": {
2039
+ "content": "<FAKE_PAD_215>",
2040
+ "lstrip": false,
2041
+ "normalized": false,
2042
+ "rstrip": false,
2043
+ "single_word": false,
2044
+ "special": true
2045
+ },
2046
+ "151898": {
2047
+ "content": "<FAKE_PAD_216>",
2048
+ "lstrip": false,
2049
+ "normalized": false,
2050
+ "rstrip": false,
2051
+ "single_word": false,
2052
+ "special": true
2053
+ },
2054
+ "151899": {
2055
+ "content": "<FAKE_PAD_217>",
2056
+ "lstrip": false,
2057
+ "normalized": false,
2058
+ "rstrip": false,
2059
+ "single_word": false,
2060
+ "special": true
2061
+ },
2062
+ "151900": {
2063
+ "content": "<FAKE_PAD_218>",
2064
+ "lstrip": false,
2065
+ "normalized": false,
2066
+ "rstrip": false,
2067
+ "single_word": false,
2068
+ "special": true
2069
+ },
2070
+ "151901": {
2071
+ "content": "<FAKE_PAD_219>",
2072
+ "lstrip": false,
2073
+ "normalized": false,
2074
+ "rstrip": false,
2075
+ "single_word": false,
2076
+ "special": true
2077
+ },
2078
+ "151902": {
2079
+ "content": "<FAKE_PAD_220>",
2080
+ "lstrip": false,
2081
+ "normalized": false,
2082
+ "rstrip": false,
2083
+ "single_word": false,
2084
+ "special": true
2085
+ },
2086
+ "151903": {
2087
+ "content": "<FAKE_PAD_221>",
2088
+ "lstrip": false,
2089
+ "normalized": false,
2090
+ "rstrip": false,
2091
+ "single_word": false,
2092
+ "special": true
2093
+ },
2094
+ "151904": {
2095
+ "content": "<FAKE_PAD_222>",
2096
+ "lstrip": false,
2097
+ "normalized": false,
2098
+ "rstrip": false,
2099
+ "single_word": false,
2100
+ "special": true
2101
+ },
2102
+ "151905": {
2103
+ "content": "<FAKE_PAD_223>",
2104
+ "lstrip": false,
2105
+ "normalized": false,
2106
+ "rstrip": false,
2107
+ "single_word": false,
2108
+ "special": true
2109
+ },
2110
+ "151906": {
2111
+ "content": "<FAKE_PAD_224>",
2112
+ "lstrip": false,
2113
+ "normalized": false,
2114
+ "rstrip": false,
2115
+ "single_word": false,
2116
+ "special": true
2117
+ },
2118
+ "151907": {
2119
+ "content": "<FAKE_PAD_225>",
2120
+ "lstrip": false,
2121
+ "normalized": false,
2122
+ "rstrip": false,
2123
+ "single_word": false,
2124
+ "special": true
2125
+ },
2126
+ "151908": {
2127
+ "content": "<FAKE_PAD_226>",
2128
+ "lstrip": false,
2129
+ "normalized": false,
2130
+ "rstrip": false,
2131
+ "single_word": false,
2132
+ "special": true
2133
+ },
2134
+ "151909": {
2135
+ "content": "<FAKE_PAD_227>",
2136
+ "lstrip": false,
2137
+ "normalized": false,
2138
+ "rstrip": false,
2139
+ "single_word": false,
2140
+ "special": true
2141
+ },
2142
+ "151910": {
2143
+ "content": "<FAKE_PAD_228>",
2144
+ "lstrip": false,
2145
+ "normalized": false,
2146
+ "rstrip": false,
2147
+ "single_word": false,
2148
+ "special": true
2149
+ },
2150
+ "151911": {
2151
+ "content": "<FAKE_PAD_229>",
2152
+ "lstrip": false,
2153
+ "normalized": false,
2154
+ "rstrip": false,
2155
+ "single_word": false,
2156
+ "special": true
2157
+ },
2158
+ "151912": {
2159
+ "content": "<FAKE_PAD_230>",
2160
+ "lstrip": false,
2161
+ "normalized": false,
2162
+ "rstrip": false,
2163
+ "single_word": false,
2164
+ "special": true
2165
+ },
2166
+ "151913": {
2167
+ "content": "<FAKE_PAD_231>",
2168
+ "lstrip": false,
2169
+ "normalized": false,
2170
+ "rstrip": false,
2171
+ "single_word": false,
2172
+ "special": true
2173
+ },
2174
+ "151914": {
2175
+ "content": "<FAKE_PAD_232>",
2176
+ "lstrip": false,
2177
+ "normalized": false,
2178
+ "rstrip": false,
2179
+ "single_word": false,
2180
+ "special": true
2181
+ },
2182
+ "151915": {
2183
+ "content": "<FAKE_PAD_233>",
2184
+ "lstrip": false,
2185
+ "normalized": false,
2186
+ "rstrip": false,
2187
+ "single_word": false,
2188
+ "special": true
2189
+ },
2190
+ "151916": {
2191
+ "content": "<FAKE_PAD_234>",
2192
+ "lstrip": false,
2193
+ "normalized": false,
2194
+ "rstrip": false,
2195
+ "single_word": false,
2196
+ "special": true
2197
+ },
2198
+ "151917": {
2199
+ "content": "<FAKE_PAD_235>",
2200
+ "lstrip": false,
2201
+ "normalized": false,
2202
+ "rstrip": false,
2203
+ "single_word": false,
2204
+ "special": true
2205
+ },
2206
+ "151918": {
2207
+ "content": "<FAKE_PAD_236>",
2208
+ "lstrip": false,
2209
+ "normalized": false,
2210
+ "rstrip": false,
2211
+ "single_word": false,
2212
+ "special": true
2213
+ },
2214
+ "151919": {
2215
+ "content": "<FAKE_PAD_237>",
2216
+ "lstrip": false,
2217
+ "normalized": false,
2218
+ "rstrip": false,
2219
+ "single_word": false,
2220
+ "special": true
2221
+ },
2222
+ "151920": {
2223
+ "content": "<FAKE_PAD_238>",
2224
+ "lstrip": false,
2225
+ "normalized": false,
2226
+ "rstrip": false,
2227
+ "single_word": false,
2228
+ "special": true
2229
+ },
2230
+ "151921": {
2231
+ "content": "<FAKE_PAD_239>",
2232
+ "lstrip": false,
2233
+ "normalized": false,
2234
+ "rstrip": false,
2235
+ "single_word": false,
2236
+ "special": true
2237
+ },
2238
+ "151922": {
2239
+ "content": "<FAKE_PAD_240>",
2240
+ "lstrip": false,
2241
+ "normalized": false,
2242
+ "rstrip": false,
2243
+ "single_word": false,
2244
+ "special": true
2245
+ },
2246
+ "151923": {
2247
+ "content": "<FAKE_PAD_241>",
2248
+ "lstrip": false,
2249
+ "normalized": false,
2250
+ "rstrip": false,
2251
+ "single_word": false,
2252
+ "special": true
2253
+ },
2254
+ "151924": {
2255
+ "content": "<FAKE_PAD_242>",
2256
+ "lstrip": false,
2257
+ "normalized": false,
2258
+ "rstrip": false,
2259
+ "single_word": false,
2260
+ "special": true
2261
+ },
2262
+ "151925": {
2263
+ "content": "<FAKE_PAD_243>",
2264
+ "lstrip": false,
2265
+ "normalized": false,
2266
+ "rstrip": false,
2267
+ "single_word": false,
2268
+ "special": true
2269
+ },
2270
+ "151926": {
2271
+ "content": "<FAKE_PAD_244>",
2272
+ "lstrip": false,
2273
+ "normalized": false,
2274
+ "rstrip": false,
2275
+ "single_word": false,
2276
+ "special": true
2277
+ },
2278
+ "151927": {
2279
+ "content": "<FAKE_PAD_245>",
2280
+ "lstrip": false,
2281
+ "normalized": false,
2282
+ "rstrip": false,
2283
+ "single_word": false,
2284
+ "special": true
2285
+ },
2286
+ "151928": {
2287
+ "content": "<FAKE_PAD_246>",
2288
+ "lstrip": false,
2289
+ "normalized": false,
2290
+ "rstrip": false,
2291
+ "single_word": false,
2292
+ "special": true
2293
+ },
2294
+ "151929": {
2295
+ "content": "<FAKE_PAD_247>",
2296
+ "lstrip": false,
2297
+ "normalized": false,
2298
+ "rstrip": false,
2299
+ "single_word": false,
2300
+ "special": true
2301
+ },
2302
+ "151930": {
2303
+ "content": "<FAKE_PAD_248>",
2304
+ "lstrip": false,
2305
+ "normalized": false,
2306
+ "rstrip": false,
2307
+ "single_word": false,
2308
+ "special": true
2309
+ },
2310
+ "151931": {
2311
+ "content": "<FAKE_PAD_249>",
2312
+ "lstrip": false,
2313
+ "normalized": false,
2314
+ "rstrip": false,
2315
+ "single_word": false,
2316
+ "special": true
2317
+ },
2318
+ "151932": {
2319
+ "content": "<FAKE_PAD_250>",
2320
+ "lstrip": false,
2321
+ "normalized": false,
2322
+ "rstrip": false,
2323
+ "single_word": false,
2324
+ "special": true
2325
+ },
2326
+ "151933": {
2327
+ "content": "<FAKE_PAD_251>",
2328
+ "lstrip": false,
2329
+ "normalized": false,
2330
+ "rstrip": false,
2331
+ "single_word": false,
2332
+ "special": true
2333
+ },
2334
+ "151934": {
2335
+ "content": "<FAKE_PAD_252>",
2336
+ "lstrip": false,
2337
+ "normalized": false,
2338
+ "rstrip": false,
2339
+ "single_word": false,
2340
+ "special": true
2341
+ },
2342
+ "151935": {
2343
+ "content": "<FAKE_PAD_253>",
2344
+ "lstrip": false,
2345
+ "normalized": false,
2346
+ "rstrip": false,
2347
+ "single_word": false,
2348
+ "special": true
2349
+ },
2350
+ "151936": {
2351
+ "content": "<audio>",
2352
+ "lstrip": false,
2353
+ "normalized": false,
2354
+ "rstrip": false,
2355
+ "single_word": false,
2356
+ "special": true
2357
+ },
2358
+ "151937": {
2359
+ "content": "</audio>",
2360
+ "lstrip": false,
2361
+ "normalized": false,
2362
+ "rstrip": false,
2363
+ "single_word": false,
2364
+ "special": true
2365
+ },
2366
+ "151938": {
2367
+ "content": "<AUDIO_CONTEXT>",
2368
+ "lstrip": false,
2369
+ "normalized": false,
2370
+ "rstrip": false,
2371
+ "single_word": false,
2372
+ "special": true
2373
+ },
2374
+ "151939": {
2375
+ "content": "<interrupt>",
2376
+ "lstrip": false,
2377
+ "normalized": false,
2378
+ "rstrip": false,
2379
+ "single_word": false,
2380
+ "special": true
2381
+ },
2382
+ "151940": {
2383
+ "content": "<FAKE_PAD_PAD_0>",
2384
+ "lstrip": false,
2385
+ "normalized": false,
2386
+ "rstrip": false,
2387
+ "single_word": false,
2388
+ "special": true
2389
+ },
2390
+ "151941": {
2391
+ "content": "<FAKE_PAD_PAD_1>",
2392
+ "lstrip": false,
2393
+ "normalized": false,
2394
+ "rstrip": false,
2395
+ "single_word": false,
2396
+ "special": true
2397
+ },
2398
+ "151942": {
2399
+ "content": "<FAKE_PAD_PAD_2>",
2400
+ "lstrip": false,
2401
+ "normalized": false,
2402
+ "rstrip": false,
2403
+ "single_word": false,
2404
+ "special": true
2405
+ },
2406
+ "151943": {
2407
+ "content": "<FAKE_PAD_PAD_3>",
2408
+ "lstrip": false,
2409
+ "normalized": false,
2410
+ "rstrip": false,
2411
+ "single_word": false,
2412
+ "special": true
2413
+ },
2414
+ "151944": {
2415
+ "content": "<FAKE_PAD_PAD_4>",
2416
+ "lstrip": false,
2417
+ "normalized": false,
2418
+ "rstrip": false,
2419
+ "single_word": false,
2420
+ "special": true
2421
+ },
2422
+ "151945": {
2423
+ "content": "<FAKE_PAD_PAD_5>",
2424
+ "lstrip": false,
2425
+ "normalized": false,
2426
+ "rstrip": false,
2427
+ "single_word": false,
2428
+ "special": true
2429
+ },
2430
+ "151946": {
2431
+ "content": "<FAKE_PAD_PAD_6>",
2432
+ "lstrip": false,
2433
+ "normalized": false,
2434
+ "rstrip": false,
2435
+ "single_word": false,
2436
+ "special": true
2437
+ },
2438
+ "151947": {
2439
+ "content": "<FAKE_PAD_PAD_7>",
2440
+ "lstrip": false,
2441
+ "normalized": false,
2442
+ "rstrip": false,
2443
+ "single_word": false,
2444
+ "special": true
2445
+ },
2446
+ "151948": {
2447
+ "content": "<FAKE_PAD_PAD_8>",
2448
+ "lstrip": false,
2449
+ "normalized": false,
2450
+ "rstrip": false,
2451
+ "single_word": false,
2452
+ "special": true
2453
+ },
2454
+ "151949": {
2455
+ "content": "<FAKE_PAD_PAD_9>",
2456
+ "lstrip": false,
2457
+ "normalized": false,
2458
+ "rstrip": false,
2459
+ "single_word": false,
2460
+ "special": true
2461
+ },
2462
+ "151950": {
2463
+ "content": "<FAKE_PAD_PAD_10>",
2464
+ "lstrip": false,
2465
+ "normalized": false,
2466
+ "rstrip": false,
2467
+ "single_word": false,
2468
+ "special": true
2469
+ },
2470
+ "151951": {
2471
+ "content": "<FAKE_PAD_PAD_11>",
2472
+ "lstrip": false,
2473
+ "normalized": false,
2474
+ "rstrip": false,
2475
+ "single_word": false,
2476
+ "special": true
2477
+ },
2478
+ "151952": {
2479
+ "content": "<FAKE_PAD_PAD_12>",
2480
+ "lstrip": false,
2481
+ "normalized": false,
2482
+ "rstrip": false,
2483
+ "single_word": false,
2484
+ "special": true
2485
+ },
2486
+ "151953": {
2487
+ "content": "<FAKE_PAD_PAD_13>",
2488
+ "lstrip": false,
2489
+ "normalized": false,
2490
+ "rstrip": false,
2491
+ "single_word": false,
2492
+ "special": true
2493
+ },
2494
+ "151954": {
2495
+ "content": "<FAKE_PAD_PAD_14>",
2496
+ "lstrip": false,
2497
+ "normalized": false,
2498
+ "rstrip": false,
2499
+ "single_word": false,
2500
+ "special": true
2501
+ },
2502
+ "151955": {
2503
+ "content": "<FAKE_PAD_PAD_15>",
2504
+ "lstrip": false,
2505
+ "normalized": false,
2506
+ "rstrip": false,
2507
+ "single_word": false,
2508
+ "special": true
2509
+ },
2510
+ "151956": {
2511
+ "content": "<FAKE_PAD_PAD_16>",
2512
+ "lstrip": false,
2513
+ "normalized": false,
2514
+ "rstrip": false,
2515
+ "single_word": false,
2516
+ "special": true
2517
+ },
2518
+ "151957": {
2519
+ "content": "<FAKE_PAD_PAD_17>",
2520
+ "lstrip": false,
2521
+ "normalized": false,
2522
+ "rstrip": false,
2523
+ "single_word": false,
2524
+ "special": true
2525
+ },
2526
+ "151958": {
2527
+ "content": "<FAKE_PAD_PAD_18>",
2528
+ "lstrip": false,
2529
+ "normalized": false,
2530
+ "rstrip": false,
2531
+ "single_word": false,
2532
+ "special": true
2533
+ },
2534
+ "151959": {
2535
+ "content": "<FAKE_PAD_PAD_19>",
2536
+ "lstrip": false,
2537
+ "normalized": false,
2538
+ "rstrip": false,
2539
+ "single_word": false,
2540
+ "special": true
2541
+ },
2542
+ "151960": {
2543
+ "content": "<FAKE_PAD_PAD_20>",
2544
+ "lstrip": false,
2545
+ "normalized": false,
2546
+ "rstrip": false,
2547
+ "single_word": false,
2548
+ "special": true
2549
+ },
2550
+ "151961": {
2551
+ "content": "<FAKE_PAD_PAD_21>",
2552
+ "lstrip": false,
2553
+ "normalized": false,
2554
+ "rstrip": false,
2555
+ "single_word": false,
2556
+ "special": true
2557
+ },
2558
+ "151962": {
2559
+ "content": "<FAKE_PAD_PAD_22>",
2560
+ "lstrip": false,
2561
+ "normalized": false,
2562
+ "rstrip": false,
2563
+ "single_word": false,
2564
+ "special": true
2565
+ },
2566
+ "151963": {
2567
+ "content": "<FAKE_PAD_PAD_23>",
2568
+ "lstrip": false,
2569
+ "normalized": false,
2570
+ "rstrip": false,
2571
+ "single_word": false,
2572
+ "special": true
2573
+ },
2574
+ "151964": {
2575
+ "content": "<FAKE_PAD_PAD_24>",
2576
+ "lstrip": false,
2577
+ "normalized": false,
2578
+ "rstrip": false,
2579
+ "single_word": false,
2580
+ "special": true
2581
+ },
2582
+ "151965": {
2583
+ "content": "<FAKE_PAD_PAD_25>",
2584
+ "lstrip": false,
2585
+ "normalized": false,
2586
+ "rstrip": false,
2587
+ "single_word": false,
2588
+ "special": true
2589
+ },
2590
+ "151966": {
2591
+ "content": "<FAKE_PAD_PAD_26>",
2592
+ "lstrip": false,
2593
+ "normalized": false,
2594
+ "rstrip": false,
2595
+ "single_word": false,
2596
+ "special": true
2597
+ },
2598
+ "151967": {
2599
+ "content": "<FAKE_PAD_PAD_27>",
2600
+ "lstrip": false,
2601
+ "normalized": false,
2602
+ "rstrip": false,
2603
+ "single_word": false,
2604
+ "special": true
2605
+ }
2606
+ },
2607
+ "additional_special_tokens": [
2608
+ "<|im_start|>",
2609
+ "<|im_end|>",
2610
+ "<|object_ref_start|>",
2611
+ "<|object_ref_end|>",
2612
+ "<|box_start|>",
2613
+ "<|box_end|>",
2614
+ "<|quad_start|>",
2615
+ "<|quad_end|>",
2616
+ "<|vision_start|>",
2617
+ "<|vision_end|>",
2618
+ "<|vision_pad|>",
2619
+ "<|image_pad|>",
2620
+ "<|video_pad|>",
2621
+ "<IMG_CONTEXT>",
2622
+ "<img>",
2623
+ "</img>",
2624
+ "<quad>",
2625
+ "</quad>",
2626
+ "<ref>",
2627
+ "</ref>",
2628
+ "<box>",
2629
+ "</box>",
2630
+ "<|action_start|>",
2631
+ "<|action_end|>",
2632
+ "<|plugin|>",
2633
+ "<|interpreter|>",
2634
+ "<FAKE_PAD_0>",
2635
+ "<FAKE_PAD_1>",
2636
+ "<FAKE_PAD_2>",
2637
+ "<FAKE_PAD_3>",
2638
+ "<FAKE_PAD_4>",
2639
+ "<FAKE_PAD_5>",
2640
+ "<FAKE_PAD_6>",
2641
+ "<FAKE_PAD_7>",
2642
+ "<FAKE_PAD_8>",
2643
+ "<FAKE_PAD_9>",
2644
+ "<FAKE_PAD_10>",
2645
+ "<FAKE_PAD_11>",
2646
+ "<FAKE_PAD_12>",
2647
+ "<FAKE_PAD_13>",
2648
+ "<FAKE_PAD_14>",
2649
+ "<FAKE_PAD_15>",
2650
+ "<FAKE_PAD_16>",
2651
+ "<FAKE_PAD_17>",
2652
+ "<FAKE_PAD_18>",
2653
+ "<FAKE_PAD_19>",
2654
+ "<FAKE_PAD_20>",
2655
+ "<FAKE_PAD_21>",
2656
+ "<FAKE_PAD_22>",
2657
+ "<FAKE_PAD_23>",
2658
+ "<FAKE_PAD_24>",
2659
+ "<FAKE_PAD_25>",
2660
+ "<FAKE_PAD_26>",
2661
+ "<FAKE_PAD_27>",
2662
+ "<FAKE_PAD_28>",
2663
+ "<FAKE_PAD_29>",
2664
+ "<FAKE_PAD_30>",
2665
+ "<FAKE_PAD_31>",
2666
+ "<FAKE_PAD_32>",
2667
+ "<FAKE_PAD_33>",
2668
+ "<FAKE_PAD_34>",
2669
+ "<FAKE_PAD_35>",
2670
+ "<FAKE_PAD_36>",
2671
+ "<FAKE_PAD_37>",
2672
+ "<FAKE_PAD_38>",
2673
+ "<FAKE_PAD_39>",
2674
+ "<FAKE_PAD_40>",
2675
+ "<FAKE_PAD_41>",
2676
+ "<FAKE_PAD_42>",
2677
+ "<FAKE_PAD_43>",
2678
+ "<FAKE_PAD_44>",
2679
+ "<FAKE_PAD_45>",
2680
+ "<FAKE_PAD_46>",
2681
+ "<FAKE_PAD_47>",
2682
+ "<FAKE_PAD_48>",
2683
+ "<FAKE_PAD_49>",
2684
+ "<FAKE_PAD_50>",
2685
+ "<FAKE_PAD_51>",
2686
+ "<FAKE_PAD_52>",
2687
+ "<FAKE_PAD_53>",
2688
+ "<FAKE_PAD_54>",
2689
+ "<FAKE_PAD_55>",
2690
+ "<FAKE_PAD_56>",
2691
+ "<FAKE_PAD_57>",
2692
+ "<FAKE_PAD_58>",
2693
+ "<FAKE_PAD_59>",
2694
+ "<FAKE_PAD_60>",
2695
+ "<FAKE_PAD_61>",
2696
+ "<FAKE_PAD_62>",
2697
+ "<FAKE_PAD_63>",
2698
+ "<FAKE_PAD_64>",
2699
+ "<FAKE_PAD_65>",
2700
+ "<FAKE_PAD_66>",
2701
+ "<FAKE_PAD_67>",
2702
+ "<FAKE_PAD_68>",
2703
+ "<FAKE_PAD_69>",
2704
+ "<FAKE_PAD_70>",
2705
+ "<FAKE_PAD_71>",
2706
+ "<FAKE_PAD_72>",
2707
+ "<FAKE_PAD_73>",
2708
+ "<FAKE_PAD_74>",
2709
+ "<FAKE_PAD_75>",
2710
+ "<FAKE_PAD_76>",
2711
+ "<FAKE_PAD_77>",
2712
+ "<FAKE_PAD_78>",
2713
+ "<FAKE_PAD_79>",
2714
+ "<FAKE_PAD_80>",
2715
+ "<FAKE_PAD_81>",
2716
+ "<FAKE_PAD_82>",
2717
+ "<FAKE_PAD_83>",
2718
+ "<FAKE_PAD_84>",
2719
+ "<FAKE_PAD_85>",
2720
+ "<FAKE_PAD_86>",
2721
+ "<FAKE_PAD_87>",
2722
+ "<FAKE_PAD_88>",
2723
+ "<FAKE_PAD_89>",
2724
+ "<FAKE_PAD_90>",
2725
+ "<FAKE_PAD_91>",
2726
+ "<FAKE_PAD_92>",
2727
+ "<FAKE_PAD_93>",
2728
+ "<FAKE_PAD_94>",
2729
+ "<FAKE_PAD_95>",
2730
+ "<FAKE_PAD_96>",
2731
+ "<FAKE_PAD_97>",
2732
+ "<FAKE_PAD_98>",
2733
+ "<FAKE_PAD_99>",
2734
+ "<FAKE_PAD_100>",
2735
+ "<FAKE_PAD_101>",
2736
+ "<FAKE_PAD_102>",
2737
+ "<FAKE_PAD_103>",
2738
+ "<FAKE_PAD_104>",
2739
+ "<FAKE_PAD_105>",
2740
+ "<FAKE_PAD_106>",
2741
+ "<FAKE_PAD_107>",
2742
+ "<FAKE_PAD_108>",
2743
+ "<FAKE_PAD_109>",
2744
+ "<FAKE_PAD_110>",
2745
+ "<FAKE_PAD_111>",
2746
+ "<FAKE_PAD_112>",
2747
+ "<FAKE_PAD_113>",
2748
+ "<FAKE_PAD_114>",
2749
+ "<FAKE_PAD_115>",
2750
+ "<FAKE_PAD_116>",
2751
+ "<FAKE_PAD_117>",
2752
+ "<FAKE_PAD_118>",
2753
+ "<FAKE_PAD_119>",
2754
+ "<FAKE_PAD_120>",
2755
+ "<FAKE_PAD_121>",
2756
+ "<FAKE_PAD_122>",
2757
+ "<FAKE_PAD_123>",
2758
+ "<FAKE_PAD_124>",
2759
+ "<FAKE_PAD_125>",
2760
+ "<FAKE_PAD_126>",
2761
+ "<FAKE_PAD_127>",
2762
+ "<FAKE_PAD_128>",
2763
+ "<FAKE_PAD_129>",
2764
+ "<FAKE_PAD_130>",
2765
+ "<FAKE_PAD_131>",
2766
+ "<FAKE_PAD_132>",
2767
+ "<FAKE_PAD_133>",
2768
+ "<FAKE_PAD_134>",
2769
+ "<FAKE_PAD_135>",
2770
+ "<FAKE_PAD_136>",
2771
+ "<FAKE_PAD_137>",
2772
+ "<FAKE_PAD_138>",
2773
+ "<FAKE_PAD_139>",
2774
+ "<FAKE_PAD_140>",
2775
+ "<FAKE_PAD_141>",
2776
+ "<FAKE_PAD_142>",
2777
+ "<FAKE_PAD_143>",
2778
+ "<FAKE_PAD_144>",
2779
+ "<FAKE_PAD_145>",
2780
+ "<FAKE_PAD_146>",
2781
+ "<FAKE_PAD_147>",
2782
+ "<FAKE_PAD_148>",
2783
+ "<FAKE_PAD_149>",
2784
+ "<FAKE_PAD_150>",
2785
+ "<FAKE_PAD_151>",
2786
+ "<FAKE_PAD_152>",
2787
+ "<FAKE_PAD_153>",
2788
+ "<FAKE_PAD_154>",
2789
+ "<FAKE_PAD_155>",
2790
+ "<FAKE_PAD_156>",
2791
+ "<FAKE_PAD_157>",
2792
+ "<FAKE_PAD_158>",
2793
+ "<FAKE_PAD_159>",
2794
+ "<FAKE_PAD_160>",
2795
+ "<FAKE_PAD_161>",
2796
+ "<FAKE_PAD_162>",
2797
+ "<FAKE_PAD_163>",
2798
+ "<FAKE_PAD_164>",
2799
+ "<FAKE_PAD_165>",
2800
+ "<FAKE_PAD_166>",
2801
+ "<FAKE_PAD_167>",
2802
+ "<FAKE_PAD_168>",
2803
+ "<FAKE_PAD_169>",
2804
+ "<FAKE_PAD_170>",
2805
+ "<FAKE_PAD_171>",
2806
+ "<FAKE_PAD_172>",
2807
+ "<FAKE_PAD_173>",
2808
+ "<FAKE_PAD_174>",
2809
+ "<FAKE_PAD_175>",
2810
+ "<FAKE_PAD_176>",
2811
+ "<FAKE_PAD_177>",
2812
+ "<FAKE_PAD_178>",
2813
+ "<FAKE_PAD_179>",
2814
+ "<FAKE_PAD_180>",
2815
+ "<FAKE_PAD_181>",
2816
+ "<FAKE_PAD_182>",
2817
+ "<FAKE_PAD_183>",
2818
+ "<FAKE_PAD_184>",
2819
+ "<FAKE_PAD_185>",
2820
+ "<FAKE_PAD_186>",
2821
+ "<FAKE_PAD_187>",
2822
+ "<FAKE_PAD_188>",
2823
+ "<FAKE_PAD_189>",
2824
+ "<FAKE_PAD_190>",
2825
+ "<FAKE_PAD_191>",
2826
+ "<FAKE_PAD_192>",
2827
+ "<FAKE_PAD_193>",
2828
+ "<FAKE_PAD_194>",
2829
+ "<FAKE_PAD_195>",
2830
+ "<FAKE_PAD_196>",
2831
+ "<FAKE_PAD_197>",
2832
+ "<FAKE_PAD_198>",
2833
+ "<FAKE_PAD_199>",
2834
+ "<FAKE_PAD_200>",
2835
+ "<FAKE_PAD_201>",
2836
+ "<FAKE_PAD_202>",
2837
+ "<FAKE_PAD_203>",
2838
+ "<FAKE_PAD_204>",
2839
+ "<FAKE_PAD_205>",
2840
+ "<FAKE_PAD_206>",
2841
+ "<FAKE_PAD_207>",
2842
+ "<FAKE_PAD_208>",
2843
+ "<FAKE_PAD_209>",
2844
+ "<FAKE_PAD_210>",
2845
+ "<FAKE_PAD_211>",
2846
+ "<FAKE_PAD_212>",
2847
+ "<FAKE_PAD_213>",
2848
+ "<FAKE_PAD_214>",
2849
+ "<FAKE_PAD_215>",
2850
+ "<FAKE_PAD_216>",
2851
+ "<FAKE_PAD_217>",
2852
+ "<FAKE_PAD_218>",
2853
+ "<FAKE_PAD_219>",
2854
+ "<FAKE_PAD_220>",
2855
+ "<FAKE_PAD_221>",
2856
+ "<FAKE_PAD_222>",
2857
+ "<FAKE_PAD_223>",
2858
+ "<FAKE_PAD_224>",
2859
+ "<FAKE_PAD_225>",
2860
+ "<FAKE_PAD_226>",
2861
+ "<FAKE_PAD_227>",
2862
+ "<FAKE_PAD_228>",
2863
+ "<FAKE_PAD_229>",
2864
+ "<FAKE_PAD_230>",
2865
+ "<FAKE_PAD_231>",
2866
+ "<FAKE_PAD_232>",
2867
+ "<FAKE_PAD_233>",
2868
+ "<FAKE_PAD_234>",
2869
+ "<FAKE_PAD_235>",
2870
+ "<FAKE_PAD_236>",
2871
+ "<FAKE_PAD_237>",
2872
+ "<FAKE_PAD_238>",
2873
+ "<FAKE_PAD_239>",
2874
+ "<FAKE_PAD_240>",
2875
+ "<FAKE_PAD_241>",
2876
+ "<FAKE_PAD_242>",
2877
+ "<FAKE_PAD_243>",
2878
+ "<FAKE_PAD_244>",
2879
+ "<FAKE_PAD_245>",
2880
+ "<FAKE_PAD_246>",
2881
+ "<FAKE_PAD_247>",
2882
+ "<FAKE_PAD_248>",
2883
+ "<FAKE_PAD_249>",
2884
+ "<FAKE_PAD_250>",
2885
+ "<FAKE_PAD_251>",
2886
+ "<FAKE_PAD_252>",
2887
+ "<FAKE_PAD_253>",
2888
+ "<audio>",
2889
+ "</audio>",
2890
+ "<AUDIO_CONTEXT>",
2891
+ "<interrupt>",
2892
+ "<FAKE_PAD_PAD_0>",
2893
+ "<FAKE_PAD_PAD_1>",
2894
+ "<FAKE_PAD_PAD_2>",
2895
+ "<FAKE_PAD_PAD_3>",
2896
+ "<FAKE_PAD_PAD_4>",
2897
+ "<FAKE_PAD_PAD_5>",
2898
+ "<FAKE_PAD_PAD_6>",
2899
+ "<FAKE_PAD_PAD_7>",
2900
+ "<FAKE_PAD_PAD_8>",
2901
+ "<FAKE_PAD_PAD_9>",
2902
+ "<FAKE_PAD_PAD_10>",
2903
+ "<FAKE_PAD_PAD_11>",
2904
+ "<FAKE_PAD_PAD_12>",
2905
+ "<FAKE_PAD_PAD_13>",
2906
+ "<FAKE_PAD_PAD_14>",
2907
+ "<FAKE_PAD_PAD_15>",
2908
+ "<FAKE_PAD_PAD_16>",
2909
+ "<FAKE_PAD_PAD_17>",
2910
+ "<FAKE_PAD_PAD_18>",
2911
+ "<FAKE_PAD_PAD_19>",
2912
+ "<FAKE_PAD_PAD_20>",
2913
+ "<FAKE_PAD_PAD_21>",
2914
+ "<FAKE_PAD_PAD_22>",
2915
+ "<FAKE_PAD_PAD_23>",
2916
+ "<FAKE_PAD_PAD_24>",
2917
+ "<FAKE_PAD_PAD_25>",
2918
+ "<FAKE_PAD_PAD_26>",
2919
+ "<FAKE_PAD_PAD_27>"
2920
+ ],
2921
+ "bos_token": null,
2922
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in message.content %}\n {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and 
enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
2923
+ "clean_up_tokenization_spaces": false,
2924
+ "eos_token": "<|im_end|>",
2925
+ "errors": "replace",
2926
+ "model_max_length": 4096,
2927
+ "pad_token": "<|endoftext|>",
2928
+ "split_special_tokens": false,
2929
+ "tokenizer_class": "Qwen2Tokenizer",
2930
+ "unk_token": null
2931
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff
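Taken together, the tokenizer files in this commit extend a Qwen2Tokenizer with the audio control tokens (<audio>, </audio>, <AUDIO_CONTEXT>, <interrupt>) and the <FAKE_PAD_*> / <FAKE_PAD_PAD_*> placeholders, set <|im_end|> as the end-of-sequence token and <|endoftext|> as padding, and ship a Qwen-style chat template. Below is a minimal sketch of how the uploaded tokenizer could be inspected; the local path is a placeholder for wherever this repository is cloned, and the expected ids are the ones declared in added_tokens.json above.

# Minimal sketch, assuming the files in this commit are cloned to ./initial_model (placeholder path).
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./initial_model")

# Audio control tokens should resolve to the ids declared in added_tokens.json.
for t in ("<audio>", "</audio>", "<AUDIO_CONTEXT>", "<interrupt>"):
    print(t, tok.convert_tokens_to_ids(t))  # expected: 151936, 151937, 151938, 151939

# eos/pad come from tokenizer_config.json.
print(tok.eos_token, tok.pad_token)  # <|im_end|> <|endoftext|>

# The chat template wraps each turn in <|im_start|>/<|im_end|> markers.
msgs = [{"role": "user", "content": "hello"}]
print(tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True))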