sensefvg committed
Commit b3f3294 · verified · 1 Parent(s): 120295b

upload initial model

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ taozi.wav filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,3 @@
+ ---
+ license: mit
+ ---
added_tokens.json ADDED
@@ -0,0 +1,327 @@
1
+ {
2
+ "</audio>": 151937,
3
+ "</box>": 151677,
4
+ "</img>": 151671,
5
+ "</quad>": 151673,
6
+ "</ref>": 151675,
7
+ "</think>": 151668,
8
+ "</tool_call>": 151658,
9
+ "</tool_response>": 151666,
10
+ "<AUDIO_CONTEXT>": 151938,
11
+ "<FAKE_PAD_0>": 151682,
12
+ "<FAKE_PAD_100>": 151782,
13
+ "<FAKE_PAD_101>": 151783,
14
+ "<FAKE_PAD_102>": 151784,
15
+ "<FAKE_PAD_103>": 151785,
16
+ "<FAKE_PAD_104>": 151786,
17
+ "<FAKE_PAD_105>": 151787,
18
+ "<FAKE_PAD_106>": 151788,
19
+ "<FAKE_PAD_107>": 151789,
20
+ "<FAKE_PAD_108>": 151790,
21
+ "<FAKE_PAD_109>": 151791,
22
+ "<FAKE_PAD_10>": 151692,
23
+ "<FAKE_PAD_110>": 151792,
24
+ "<FAKE_PAD_111>": 151793,
25
+ "<FAKE_PAD_112>": 151794,
26
+ "<FAKE_PAD_113>": 151795,
27
+ "<FAKE_PAD_114>": 151796,
28
+ "<FAKE_PAD_115>": 151797,
29
+ "<FAKE_PAD_116>": 151798,
30
+ "<FAKE_PAD_117>": 151799,
31
+ "<FAKE_PAD_118>": 151800,
32
+ "<FAKE_PAD_119>": 151801,
33
+ "<FAKE_PAD_11>": 151693,
34
+ "<FAKE_PAD_120>": 151802,
35
+ "<FAKE_PAD_121>": 151803,
36
+ "<FAKE_PAD_122>": 151804,
37
+ "<FAKE_PAD_123>": 151805,
38
+ "<FAKE_PAD_124>": 151806,
39
+ "<FAKE_PAD_125>": 151807,
40
+ "<FAKE_PAD_126>": 151808,
41
+ "<FAKE_PAD_127>": 151809,
42
+ "<FAKE_PAD_128>": 151810,
43
+ "<FAKE_PAD_129>": 151811,
44
+ "<FAKE_PAD_12>": 151694,
45
+ "<FAKE_PAD_130>": 151812,
46
+ "<FAKE_PAD_131>": 151813,
47
+ "<FAKE_PAD_132>": 151814,
48
+ "<FAKE_PAD_133>": 151815,
49
+ "<FAKE_PAD_134>": 151816,
50
+ "<FAKE_PAD_135>": 151817,
51
+ "<FAKE_PAD_136>": 151818,
52
+ "<FAKE_PAD_137>": 151819,
53
+ "<FAKE_PAD_138>": 151820,
54
+ "<FAKE_PAD_139>": 151821,
55
+ "<FAKE_PAD_13>": 151695,
56
+ "<FAKE_PAD_140>": 151822,
57
+ "<FAKE_PAD_141>": 151823,
58
+ "<FAKE_PAD_142>": 151824,
59
+ "<FAKE_PAD_143>": 151825,
60
+ "<FAKE_PAD_144>": 151826,
61
+ "<FAKE_PAD_145>": 151827,
62
+ "<FAKE_PAD_146>": 151828,
63
+ "<FAKE_PAD_147>": 151829,
64
+ "<FAKE_PAD_148>": 151830,
65
+ "<FAKE_PAD_149>": 151831,
66
+ "<FAKE_PAD_14>": 151696,
67
+ "<FAKE_PAD_150>": 151832,
68
+ "<FAKE_PAD_151>": 151833,
69
+ "<FAKE_PAD_152>": 151834,
70
+ "<FAKE_PAD_153>": 151835,
71
+ "<FAKE_PAD_154>": 151836,
72
+ "<FAKE_PAD_155>": 151837,
73
+ "<FAKE_PAD_156>": 151838,
74
+ "<FAKE_PAD_157>": 151839,
75
+ "<FAKE_PAD_158>": 151840,
76
+ "<FAKE_PAD_159>": 151841,
77
+ "<FAKE_PAD_15>": 151697,
78
+ "<FAKE_PAD_160>": 151842,
79
+ "<FAKE_PAD_161>": 151843,
80
+ "<FAKE_PAD_162>": 151844,
81
+ "<FAKE_PAD_163>": 151845,
82
+ "<FAKE_PAD_164>": 151846,
83
+ "<FAKE_PAD_165>": 151847,
84
+ "<FAKE_PAD_166>": 151848,
85
+ "<FAKE_PAD_167>": 151849,
86
+ "<FAKE_PAD_168>": 151850,
87
+ "<FAKE_PAD_169>": 151851,
88
+ "<FAKE_PAD_16>": 151698,
89
+ "<FAKE_PAD_170>": 151852,
90
+ "<FAKE_PAD_171>": 151853,
91
+ "<FAKE_PAD_172>": 151854,
92
+ "<FAKE_PAD_173>": 151855,
93
+ "<FAKE_PAD_174>": 151856,
94
+ "<FAKE_PAD_175>": 151857,
95
+ "<FAKE_PAD_176>": 151858,
96
+ "<FAKE_PAD_177>": 151859,
97
+ "<FAKE_PAD_178>": 151860,
98
+ "<FAKE_PAD_179>": 151861,
99
+ "<FAKE_PAD_17>": 151699,
100
+ "<FAKE_PAD_180>": 151862,
101
+ "<FAKE_PAD_181>": 151863,
102
+ "<FAKE_PAD_182>": 151864,
103
+ "<FAKE_PAD_183>": 151865,
104
+ "<FAKE_PAD_184>": 151866,
105
+ "<FAKE_PAD_185>": 151867,
106
+ "<FAKE_PAD_186>": 151868,
107
+ "<FAKE_PAD_187>": 151869,
108
+ "<FAKE_PAD_188>": 151870,
109
+ "<FAKE_PAD_189>": 151871,
110
+ "<FAKE_PAD_18>": 151700,
111
+ "<FAKE_PAD_190>": 151872,
112
+ "<FAKE_PAD_191>": 151873,
113
+ "<FAKE_PAD_192>": 151874,
114
+ "<FAKE_PAD_193>": 151875,
115
+ "<FAKE_PAD_194>": 151876,
116
+ "<FAKE_PAD_195>": 151877,
117
+ "<FAKE_PAD_196>": 151878,
118
+ "<FAKE_PAD_197>": 151879,
119
+ "<FAKE_PAD_198>": 151880,
120
+ "<FAKE_PAD_199>": 151881,
121
+ "<FAKE_PAD_19>": 151701,
122
+ "<FAKE_PAD_1>": 151683,
123
+ "<FAKE_PAD_200>": 151882,
124
+ "<FAKE_PAD_201>": 151883,
125
+ "<FAKE_PAD_202>": 151884,
126
+ "<FAKE_PAD_203>": 151885,
127
+ "<FAKE_PAD_204>": 151886,
128
+ "<FAKE_PAD_205>": 151887,
129
+ "<FAKE_PAD_206>": 151888,
130
+ "<FAKE_PAD_207>": 151889,
131
+ "<FAKE_PAD_208>": 151890,
132
+ "<FAKE_PAD_209>": 151891,
133
+ "<FAKE_PAD_20>": 151702,
134
+ "<FAKE_PAD_210>": 151892,
135
+ "<FAKE_PAD_211>": 151893,
136
+ "<FAKE_PAD_212>": 151894,
137
+ "<FAKE_PAD_213>": 151895,
138
+ "<FAKE_PAD_214>": 151896,
139
+ "<FAKE_PAD_215>": 151897,
140
+ "<FAKE_PAD_216>": 151898,
141
+ "<FAKE_PAD_217>": 151899,
142
+ "<FAKE_PAD_218>": 151900,
143
+ "<FAKE_PAD_219>": 151901,
144
+ "<FAKE_PAD_21>": 151703,
145
+ "<FAKE_PAD_220>": 151902,
146
+ "<FAKE_PAD_221>": 151903,
147
+ "<FAKE_PAD_222>": 151904,
148
+ "<FAKE_PAD_223>": 151905,
149
+ "<FAKE_PAD_224>": 151906,
150
+ "<FAKE_PAD_225>": 151907,
151
+ "<FAKE_PAD_226>": 151908,
152
+ "<FAKE_PAD_227>": 151909,
153
+ "<FAKE_PAD_228>": 151910,
154
+ "<FAKE_PAD_229>": 151911,
155
+ "<FAKE_PAD_22>": 151704,
156
+ "<FAKE_PAD_230>": 151912,
157
+ "<FAKE_PAD_231>": 151913,
158
+ "<FAKE_PAD_232>": 151914,
159
+ "<FAKE_PAD_233>": 151915,
160
+ "<FAKE_PAD_234>": 151916,
161
+ "<FAKE_PAD_235>": 151917,
162
+ "<FAKE_PAD_236>": 151918,
163
+ "<FAKE_PAD_237>": 151919,
164
+ "<FAKE_PAD_238>": 151920,
165
+ "<FAKE_PAD_239>": 151921,
166
+ "<FAKE_PAD_23>": 151705,
167
+ "<FAKE_PAD_240>": 151922,
168
+ "<FAKE_PAD_241>": 151923,
169
+ "<FAKE_PAD_242>": 151924,
170
+ "<FAKE_PAD_243>": 151925,
171
+ "<FAKE_PAD_244>": 151926,
172
+ "<FAKE_PAD_245>": 151927,
173
+ "<FAKE_PAD_246>": 151928,
174
+ "<FAKE_PAD_247>": 151929,
175
+ "<FAKE_PAD_248>": 151930,
176
+ "<FAKE_PAD_249>": 151931,
177
+ "<FAKE_PAD_24>": 151706,
178
+ "<FAKE_PAD_250>": 151932,
179
+ "<FAKE_PAD_251>": 151933,
180
+ "<FAKE_PAD_252>": 151934,
181
+ "<FAKE_PAD_253>": 151935,
182
+ "<FAKE_PAD_25>": 151707,
183
+ "<FAKE_PAD_26>": 151708,
184
+ "<FAKE_PAD_27>": 151709,
185
+ "<FAKE_PAD_28>": 151710,
186
+ "<FAKE_PAD_29>": 151711,
187
+ "<FAKE_PAD_2>": 151684,
188
+ "<FAKE_PAD_30>": 151712,
189
+ "<FAKE_PAD_31>": 151713,
190
+ "<FAKE_PAD_32>": 151714,
191
+ "<FAKE_PAD_33>": 151715,
192
+ "<FAKE_PAD_34>": 151716,
193
+ "<FAKE_PAD_35>": 151717,
194
+ "<FAKE_PAD_36>": 151718,
195
+ "<FAKE_PAD_37>": 151719,
196
+ "<FAKE_PAD_38>": 151720,
197
+ "<FAKE_PAD_39>": 151721,
198
+ "<FAKE_PAD_3>": 151685,
199
+ "<FAKE_PAD_40>": 151722,
200
+ "<FAKE_PAD_41>": 151723,
201
+ "<FAKE_PAD_42>": 151724,
202
+ "<FAKE_PAD_43>": 151725,
203
+ "<FAKE_PAD_44>": 151726,
204
+ "<FAKE_PAD_45>": 151727,
205
+ "<FAKE_PAD_46>": 151728,
206
+ "<FAKE_PAD_47>": 151729,
207
+ "<FAKE_PAD_48>": 151730,
208
+ "<FAKE_PAD_49>": 151731,
209
+ "<FAKE_PAD_4>": 151686,
210
+ "<FAKE_PAD_50>": 151732,
211
+ "<FAKE_PAD_51>": 151733,
212
+ "<FAKE_PAD_52>": 151734,
213
+ "<FAKE_PAD_53>": 151735,
214
+ "<FAKE_PAD_54>": 151736,
215
+ "<FAKE_PAD_55>": 151737,
216
+ "<FAKE_PAD_56>": 151738,
217
+ "<FAKE_PAD_57>": 151739,
218
+ "<FAKE_PAD_58>": 151740,
219
+ "<FAKE_PAD_59>": 151741,
220
+ "<FAKE_PAD_5>": 151687,
221
+ "<FAKE_PAD_60>": 151742,
222
+ "<FAKE_PAD_61>": 151743,
223
+ "<FAKE_PAD_62>": 151744,
224
+ "<FAKE_PAD_63>": 151745,
225
+ "<FAKE_PAD_64>": 151746,
226
+ "<FAKE_PAD_65>": 151747,
227
+ "<FAKE_PAD_66>": 151748,
228
+ "<FAKE_PAD_67>": 151749,
229
+ "<FAKE_PAD_68>": 151750,
230
+ "<FAKE_PAD_69>": 151751,
231
+ "<FAKE_PAD_6>": 151688,
232
+ "<FAKE_PAD_70>": 151752,
233
+ "<FAKE_PAD_71>": 151753,
234
+ "<FAKE_PAD_72>": 151754,
235
+ "<FAKE_PAD_73>": 151755,
236
+ "<FAKE_PAD_74>": 151756,
237
+ "<FAKE_PAD_75>": 151757,
238
+ "<FAKE_PAD_76>": 151758,
239
+ "<FAKE_PAD_77>": 151759,
240
+ "<FAKE_PAD_78>": 151760,
241
+ "<FAKE_PAD_79>": 151761,
242
+ "<FAKE_PAD_7>": 151689,
243
+ "<FAKE_PAD_80>": 151762,
244
+ "<FAKE_PAD_81>": 151763,
245
+ "<FAKE_PAD_82>": 151764,
246
+ "<FAKE_PAD_83>": 151765,
247
+ "<FAKE_PAD_84>": 151766,
248
+ "<FAKE_PAD_85>": 151767,
249
+ "<FAKE_PAD_86>": 151768,
250
+ "<FAKE_PAD_87>": 151769,
251
+ "<FAKE_PAD_88>": 151770,
252
+ "<FAKE_PAD_89>": 151771,
253
+ "<FAKE_PAD_8>": 151690,
254
+ "<FAKE_PAD_90>": 151772,
255
+ "<FAKE_PAD_91>": 151773,
256
+ "<FAKE_PAD_92>": 151774,
257
+ "<FAKE_PAD_93>": 151775,
258
+ "<FAKE_PAD_94>": 151776,
259
+ "<FAKE_PAD_95>": 151777,
260
+ "<FAKE_PAD_96>": 151778,
261
+ "<FAKE_PAD_97>": 151779,
262
+ "<FAKE_PAD_98>": 151780,
263
+ "<FAKE_PAD_99>": 151781,
264
+ "<FAKE_PAD_9>": 151691,
265
+ "<FAKE_PAD_PAD_0>": 151940,
266
+ "<FAKE_PAD_PAD_10>": 151950,
267
+ "<FAKE_PAD_PAD_11>": 151951,
268
+ "<FAKE_PAD_PAD_12>": 151952,
269
+ "<FAKE_PAD_PAD_13>": 151953,
270
+ "<FAKE_PAD_PAD_14>": 151954,
271
+ "<FAKE_PAD_PAD_15>": 151955,
272
+ "<FAKE_PAD_PAD_16>": 151956,
273
+ "<FAKE_PAD_PAD_17>": 151957,
274
+ "<FAKE_PAD_PAD_18>": 151958,
275
+ "<FAKE_PAD_PAD_19>": 151959,
276
+ "<FAKE_PAD_PAD_1>": 151941,
277
+ "<FAKE_PAD_PAD_20>": 151960,
278
+ "<FAKE_PAD_PAD_21>": 151961,
279
+ "<FAKE_PAD_PAD_22>": 151962,
280
+ "<FAKE_PAD_PAD_23>": 151963,
281
+ "<FAKE_PAD_PAD_24>": 151964,
282
+ "<FAKE_PAD_PAD_25>": 151965,
283
+ "<FAKE_PAD_PAD_26>": 151966,
284
+ "<FAKE_PAD_PAD_27>": 151967,
285
+ "<FAKE_PAD_PAD_2>": 151942,
286
+ "<FAKE_PAD_PAD_3>": 151943,
287
+ "<FAKE_PAD_PAD_4>": 151944,
288
+ "<FAKE_PAD_PAD_5>": 151945,
289
+ "<FAKE_PAD_PAD_6>": 151946,
290
+ "<FAKE_PAD_PAD_7>": 151947,
291
+ "<FAKE_PAD_PAD_8>": 151948,
292
+ "<FAKE_PAD_PAD_9>": 151949,
293
+ "<IMG_CONTEXT>": 151669,
294
+ "<audio>": 151936,
295
+ "<box>": 151676,
296
+ "<img>": 151670,
297
+ "<interrupt>": 151939,
298
+ "<quad>": 151672,
299
+ "<ref>": 151674,
300
+ "<think>": 151667,
301
+ "<tool_call>": 151657,
302
+ "<tool_response>": 151665,
303
+ "<|action_end|>": 151679,
304
+ "<|action_start|>": 151678,
305
+ "<|box_end|>": 151649,
306
+ "<|box_start|>": 151648,
307
+ "<|endoftext|>": 151643,
308
+ "<|file_sep|>": 151664,
309
+ "<|fim_middle|>": 151660,
310
+ "<|fim_pad|>": 151662,
311
+ "<|fim_prefix|>": 151659,
312
+ "<|fim_suffix|>": 151661,
313
+ "<|im_end|>": 151645,
314
+ "<|im_start|>": 151644,
315
+ "<|image_pad|>": 151655,
316
+ "<|interpreter|>": 151681,
317
+ "<|object_ref_end|>": 151647,
318
+ "<|object_ref_start|>": 151646,
319
+ "<|plugin|>": 151680,
320
+ "<|quad_end|>": 151651,
321
+ "<|quad_start|>": 151650,
322
+ "<|repo_name|>": 151663,
323
+ "<|video_pad|>": 151656,
324
+ "<|vision_end|>": 151653,
325
+ "<|vision_pad|>": 151654,
326
+ "<|vision_start|>": 151652
327
+ }
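added_tokens.json extends the base Qwen vocabulary with the multimodal control tokens listed above (audio and image delimiters, `<interrupt>`, and the `<FAKE_PAD_*>` fillers that pad the vocabulary out to 151968). Below is a minimal sketch of how one might sanity-check the mapping after loading the tokenizer; the model path is a placeholder, not a path confirmed by this commit.

```python
# Hypothetical spot-check of the added-token ids against added_tokens.json.
# "path/to/InteractiveOmni" is a placeholder; substitute the real repo id or a
# local checkout of this commit.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/InteractiveOmni", trust_remote_code=True)

assert tokenizer.convert_tokens_to_ids("<audio>") == 151936
assert tokenizer.convert_tokens_to_ids("</audio>") == 151937
assert tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>") == 151669
assert tokenizer.convert_tokens_to_ids("<|im_end|>") == 151645
```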
campplus.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73
+ size 28303423
config.json ADDED
@@ -0,0 +1,628 @@
1
+ {
2
+ "_commit_hash": null,
3
+ "_name_or_path": "InteractiveOmni",
4
+ "architectures": [
5
+ "InteractiveOmniModel"
6
+ ],
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_interactiveomni.InteractiveOmniConfig",
9
+ "AutoModel": "modeling_interactiveomni.InteractiveOmniModel",
10
+ "AutoModelForCausalLM": "modeling_interactiveomni.InteractiveOmniModel"
11
+ },
12
+ "audio_config": {
13
+ "_name_or_path": "openai/whisper-large-v3",
14
+ "activation_dropout": 0.0,
15
+ "activation_function": "gelu",
16
+ "apply_spec_augment": false,
17
+ "architectures": [
18
+ "WhisperForConditionalGeneration"
19
+ ],
20
+ "attention_dropout": 0.0,
21
+ "begin_suppress_tokens": [
22
+ 220,
23
+ 50257
24
+ ],
25
+ "bos_token_id": 50257,
26
+ "classifier_proj_size": 256,
27
+ "d_model": 1280,
28
+ "decoder_attention_heads": 20,
29
+ "decoder_ffn_dim": 5120,
30
+ "decoder_layerdrop": 0.0,
31
+ "decoder_layers": 32,
32
+ "decoder_start_token_id": 50258,
33
+ "dropout": 0.0,
34
+ "encoder_attention_heads": 20,
35
+ "encoder_ffn_dim": 5120,
36
+ "encoder_layerdrop": 0.0,
37
+ "encoder_layers": 32,
38
+ "eos_token_id": 50257,
39
+ "init_std": 0.02,
40
+ "is_encoder_decoder": true,
41
+ "mask_feature_length": 10,
42
+ "mask_feature_min_masks": 0,
43
+ "mask_feature_prob": 0.0,
44
+ "mask_time_length": 10,
45
+ "mask_time_min_masks": 2,
46
+ "mask_time_prob": 0.05,
47
+ "max_length": 448,
48
+ "max_source_positions": 1500,
49
+ "max_target_positions": 448,
50
+ "median_filter_width": 7,
51
+ "model_type": "whisper",
52
+ "num_hidden_layers": 32,
53
+ "num_mel_bins": 128,
54
+ "pad_token_id": 50256,
55
+ "scale_embedding": false,
56
+ "torch_dtype": "float16",
57
+ "transformers_version": "4.36.0.dev0",
58
+ "use_cache": true,
59
+ "use_weighted_layer_sum": false,
60
+ "vocab_size": 51866
61
+ },
62
+ "audio_preprocessor_config": {
63
+ "chunk_length": 30,
64
+ "feature_extractor_type": "WhisperFeatureExtractor",
65
+ "feature_size": 128,
66
+ "hop_length": 160,
67
+ "n_fft": 400,
68
+ "n_samples": 480000,
69
+ "nb_max_frames": 3000,
70
+ "padding_side": "right",
71
+ "padding_value": 0.0,
72
+ "processor_class": "WhisperProcessor",
73
+ "return_attention_mask": false,
74
+ "sampling_rate": 16000
75
+ },
76
+ "downsample_ratio": 0.25,
77
+ "dynamic_image_size": true,
78
+ "force_image_size": 448,
79
+ "llm_config": {
80
+ "_name_or_path": "Qwen/Qwen3-8B",
81
+ "add_cross_attention": false,
82
+ "architectures": [
83
+ "Qwen3ForCausalLM"
84
+ ],
85
+ "attention_bias": false,
86
+ "attention_dropout": 0.0,
87
+ "attn_implementation": "flash_attention_2",
88
+ "bad_words_ids": null,
89
+ "begin_suppress_tokens": null,
90
+ "bias": false,
91
+ "bos_token_id": 151643,
92
+ "chunk_size_feed_forward": 0,
93
+ "cross_attention_hidden_size": null,
94
+ "decoder_start_token_id": null,
95
+ "diversity_penalty": 0.0,
96
+ "do_sample": false,
97
+ "early_stopping": false,
98
+ "encoder_no_repeat_ngram_size": 0,
99
+ "eos_token_id": 151645,
100
+ "exponential_decay_length_penalty": null,
101
+ "finetuning_task": null,
102
+ "forced_bos_token_id": null,
103
+ "forced_eos_token_id": null,
104
+ "head_dim": 128,
105
+ "hidden_act": "silu",
106
+ "hidden_size": 4096,
107
+ "id2label": {
108
+ "0": "LABEL_0",
109
+ "1": "LABEL_1"
110
+ },
111
+ "initializer_range": 0.02,
112
+ "intermediate_size": 12288,
113
+ "is_decoder": false,
114
+ "is_encoder_decoder": false,
115
+ "label2id": {
116
+ "LABEL_0": 0,
117
+ "LABEL_1": 1
118
+ },
119
+ "length_penalty": 1.0,
120
+ "max_length": 20,
121
+ "max_position_embeddings": 40960,
122
+ "max_window_layers": 40,
123
+ "min_length": 0,
124
+ "model_type": "qwen3",
125
+ "no_repeat_ngram_size": 0,
126
+ "num_attention_heads": 32,
127
+ "num_beam_groups": 1,
128
+ "num_beams": 1,
129
+ "num_hidden_layers": 36,
130
+ "num_key_value_heads": 8,
131
+ "num_return_sequences": 1,
132
+ "output_attentions": false,
133
+ "output_hidden_states": false,
134
+ "output_scores": false,
135
+ "pad_token_id": null,
136
+ "prefix": null,
137
+ "problem_type": null,
138
+ "pruned_heads": {},
139
+ "remove_invalid_values": false,
140
+ "repetition_penalty": 1.0,
141
+ "return_dict": true,
142
+ "return_dict_in_generate": false,
143
+ "rms_norm_eps": 1e-06,
144
+ "rope_scaling": {
145
+ "factor": 2.0,
146
+ "type": "dynamic"
147
+ },
148
+ "rope_theta": 1000000.0,
149
+ "sep_token_id": null,
150
+ "sliding_window": null,
151
+ "suppress_tokens": null,
152
+ "task_specific_params": null,
153
+ "temperature": 1.0,
154
+ "tf_legacy_loss": false,
155
+ "tie_encoder_decoder": false,
156
+ "tie_word_embeddings": false,
157
+ "tokenizer_class": null,
158
+ "top_k": 50,
159
+ "top_p": 1.0,
160
+ "torch_dtype": "bfloat16",
161
+ "torchscript": false,
162
+ "transformers_version": "4.51.0",
163
+ "typical_p": 1.0,
164
+ "use_bfloat16": false,
165
+ "use_cache": false,
166
+ "use_sliding_window": false,
167
+ "vocab_size": 151968
168
+ },
169
+ "max_dynamic_patch": 12,
170
+ "min_dynamic_patch": 1,
171
+ "model_type": "interactiveomni",
172
+ "pad2square": false,
173
+ "ps_version": "v2",
174
+ "select_layer": -1,
175
+ "template": "interactiveomni_template",
176
+ "torch_dtype": "bfloat16",
177
+ "transformers_version": null,
178
+ "use_backbone_lora": 0,
179
+ "use_llm_lora": 0,
180
+ "use_thumbnail": true,
181
+ "vision_config": {
182
+ "_name_or_path": "OpenGVLab/InternViT-300M-448px",
183
+ "add_cross_attention": false,
184
+ "architectures": [
185
+ "InternVisionModel"
186
+ ],
187
+ "auto_map": {
188
+ "AutoConfig": "configuration_intern_vit.InternVisionConfig",
189
+ "AutoModel": "modeling_intern_vit.InternVisionModel"
190
+ },
191
+ "attention_dropout": 0.0,
192
+ "drop_path_rate": 0.1,
193
+ "dropout": 0.0,
194
+ "hidden_act": "gelu",
195
+ "hidden_size": 1024,
196
+ "image_size": 448,
197
+ "initializer_factor": 1.0,
198
+ "initializer_range": 0.02,
199
+ "intermediate_size": 4096,
200
+ "layer_norm_eps": 1e-06,
201
+ "model_type": "intern_vit_6b",
202
+ "norm_type": "layer_norm",
203
+ "num_attention_heads": 16,
204
+ "num_channels": 3,
205
+ "num_hidden_layers": 24,
206
+ "qk_normalization": false,
207
+ "qkv_bias": true,
208
+ "torch_dtype": "bfloat16",
209
+ "transformers_version": "4.37.2",
210
+ "use_flash_attn": true
211
+ },
212
+ "flow_config": {
213
+ "_attn_implementation_internal": null,
214
+ "_commit_hash": null,
215
+ "_name_or_path": "",
216
+ "add_cross_attention": false,
217
+ "architectures": [
218
+ "CausalMaskedDiffWithXvec"
219
+ ],
220
+ "bad_words_ids": null,
221
+ "begin_suppress_tokens": null,
222
+ "bos_token_id": null,
223
+ "chunk_size_feed_forward": 0,
224
+ "cross_attention_hidden_size": null,
225
+ "decoder_config": {
226
+ "cfm_params": {
227
+ "inference_cfg_rate": 0.7,
228
+ "reg_loss_type": "l1",
229
+ "sigma_min": 1e-06,
230
+ "solver": "euler",
231
+ "t_scheduler": "cosine",
232
+ "training_cfg_rate": 0.2
233
+ },
234
+ "estimator_config": {
235
+ "act_fn": "gelu",
236
+ "attention_head_dim": 64,
237
+ "causal": true,
238
+ "channels": [
239
+ 256
240
+ ],
241
+ "dropout": 0.0,
242
+ "in_channels": 320,
243
+ "n_blocks": 4,
244
+ "num_heads": 8,
245
+ "num_mid_blocks": 12,
246
+ "out_channels": 80
247
+ },
248
+ "in_channels": 240,
249
+ "n_spks": 1,
250
+ "spk_emb_dim": 80
251
+ },
252
+ "decoder_start_token_id": null,
253
+ "diversity_penalty": 0.0,
254
+ "do_sample": false,
255
+ "early_stopping": false,
256
+ "encoder_config": {
257
+ "attention_dropout_rate": 0.1,
258
+ "attention_heads": 8,
259
+ "dropout_rate": 0.1,
260
+ "input_layer": "linear",
261
+ "input_size": 512,
262
+ "linear_units": 2048,
263
+ "macaron_style": false,
264
+ "normalize_before": true,
265
+ "num_blocks": 6,
266
+ "output_size": 512,
267
+ "pos_enc_layer_type": "rel_pos_espnet",
268
+ "positional_dropout_rate": 0.1,
269
+ "selfattention_layer_type": "rel_selfattn",
270
+ "use_cnn_module": false
271
+ },
272
+ "encoder_no_repeat_ngram_size": 0,
273
+ "eos_token_id": null,
274
+ "exponential_decay_length_penalty": null,
275
+ "finetuning_task": null,
276
+ "forced_bos_token_id": null,
277
+ "forced_eos_token_id": null,
278
+ "id2label": {
279
+ "0": "LABEL_0",
280
+ "1": "LABEL_1"
281
+ },
282
+ "input_frame_rate": 25,
283
+ "input_size": 512,
284
+ "is_decoder": false,
285
+ "is_encoder_decoder": false,
286
+ "label2id": {
287
+ "LABEL_0": 0,
288
+ "LABEL_1": 1
289
+ },
290
+ "length_penalty": 1.0,
291
+ "max_length": 20,
292
+ "min_length": 0,
293
+ "no_repeat_ngram_size": 0,
294
+ "num_beam_groups": 1,
295
+ "num_beams": 1,
296
+ "num_return_sequences": 1,
297
+ "only_mask_loss": true,
298
+ "output_attentions": false,
299
+ "output_hidden_states": false,
300
+ "output_scores": false,
301
+ "output_size": 80,
302
+ "output_type": "mel",
303
+ "pad_token_id": null,
304
+ "pre_lookahead_len": 3,
305
+ "prefix": null,
306
+ "problem_type": null,
307
+ "pruned_heads": {},
308
+ "remove_invalid_values": false,
309
+ "repetition_penalty": 1.0,
310
+ "return_dict": true,
311
+ "return_dict_in_generate": false,
312
+ "sep_token_id": null,
313
+ "spk_embed_dim": 192,
314
+ "suppress_tokens": null,
315
+ "task_specific_params": null,
316
+ "temperature": 1.0,
317
+ "tf_legacy_loss": false,
318
+ "tie_encoder_decoder": false,
319
+ "tie_word_embeddings": true,
320
+ "token_mel_ratio": 2,
321
+ "tokenizer_class": null,
322
+ "top_k": 50,
323
+ "top_p": 1.0,
324
+ "torch_dtype": "float32",
325
+ "torchscript": false,
326
+ "transformers_version": null,
327
+ "typical_p": 1.0,
328
+ "use_bfloat16": false,
329
+ "vocab_size": 6561
330
+ },
331
+ "hifigan_config": {
332
+ "_attn_implementation_internal": null,
333
+ "_commit_hash": null,
334
+ "_name_or_path": "",
335
+ "add_cross_attention": false,
336
+ "architectures": [
337
+ "HiFTGenerator"
338
+ ],
339
+ "audio_limit": 0.99,
340
+ "bad_words_ids": null,
341
+ "base_channels": 512,
342
+ "begin_suppress_tokens": null,
343
+ "bos_token_id": null,
344
+ "chunk_size_feed_forward": 0,
345
+ "cross_attention_hidden_size": null,
346
+ "decoder_start_token_id": null,
347
+ "diversity_penalty": 0.0,
348
+ "do_sample": false,
349
+ "early_stopping": false,
350
+ "encoder_no_repeat_ngram_size": 0,
351
+ "eos_token_id": null,
352
+ "exponential_decay_length_penalty": null,
353
+ "f0_predictor_config": {
354
+ "cond_channels": 512,
355
+ "in_channels": 80,
356
+ "num_class": 1
357
+ },
358
+ "finetuning_task": null,
359
+ "forced_bos_token_id": null,
360
+ "forced_eos_token_id": null,
361
+ "id2label": {
362
+ "0": "LABEL_0",
363
+ "1": "LABEL_1"
364
+ },
365
+ "in_channels": 80,
366
+ "is_decoder": false,
367
+ "is_encoder_decoder": false,
368
+ "istft_params": {
369
+ "hop_len": 4,
370
+ "n_fft": 16
371
+ },
372
+ "label2id": {
373
+ "LABEL_0": 0,
374
+ "LABEL_1": 1
375
+ },
376
+ "length_penalty": 1.0,
377
+ "lrelu_slope": 0.1,
378
+ "max_length": 20,
379
+ "min_length": 0,
380
+ "nb_harmonics": 8,
381
+ "no_repeat_ngram_size": 0,
382
+ "nsf_alpha": 0.1,
383
+ "nsf_sigma": 0.003,
384
+ "nsf_voiced_threshold": 10,
385
+ "num_beam_groups": 1,
386
+ "num_beams": 1,
387
+ "num_return_sequences": 1,
388
+ "output_attentions": false,
389
+ "output_hidden_states": false,
390
+ "output_scores": false,
391
+ "pad_token_id": null,
392
+ "prefix": null,
393
+ "problem_type": null,
394
+ "pruned_heads": {},
395
+ "remove_invalid_values": false,
396
+ "repetition_penalty": 1.0,
397
+ "resblock_dilation_sizes": [
398
+ [
399
+ 1,
400
+ 3,
401
+ 5
402
+ ],
403
+ [
404
+ 1,
405
+ 3,
406
+ 5
407
+ ],
408
+ [
409
+ 1,
410
+ 3,
411
+ 5
412
+ ]
413
+ ],
414
+ "resblock_kernel_sizes": [
415
+ 3,
416
+ 7,
417
+ 11
418
+ ],
419
+ "return_dict": true,
420
+ "return_dict_in_generate": false,
421
+ "sampling_rate": 24000,
422
+ "sep_token_id": null,
423
+ "source_resblock_dilation_sizes": [
424
+ [
425
+ 1,
426
+ 3,
427
+ 5
428
+ ],
429
+ [
430
+ 1,
431
+ 3,
432
+ 5
433
+ ],
434
+ [
435
+ 1,
436
+ 3,
437
+ 5
438
+ ]
439
+ ],
440
+ "source_resblock_kernel_sizes": [
441
+ 7,
442
+ 7,
443
+ 11
444
+ ],
445
+ "suppress_tokens": null,
446
+ "task_specific_params": null,
447
+ "temperature": 1.0,
448
+ "tf_legacy_loss": false,
449
+ "tie_encoder_decoder": false,
450
+ "tie_word_embeddings": true,
451
+ "tokenizer_class": null,
452
+ "top_k": 50,
453
+ "top_p": 1.0,
454
+ "torch_dtype": "float32",
455
+ "torchscript": false,
456
+ "transformers_version": null,
457
+ "typical_p": 1.0,
458
+ "upsample_kernel_sizes": [
459
+ 16,
460
+ 11,
461
+ 7
462
+ ],
463
+ "upsample_rates": [
464
+ 8,
465
+ 5,
466
+ 3
467
+ ],
468
+ "use_bfloat16": false
469
+ },
470
+ "voicelm_config": {
471
+ "_attn_implementation_internal": null,
472
+ "_commit_hash": null,
473
+ "_name_or_path": "",
474
+ "add_cross_attention": false,
475
+ "architectures": null,
476
+ "bad_words_ids": null,
477
+ "begin_suppress_tokens": null,
478
+ "bos_token_id": null,
479
+ "chunk_size_feed_forward": 0,
480
+ "cross_attention_hidden_size": null,
481
+ "decoder_start_token_id": null,
482
+ "diversity_penalty": 0.0,
483
+ "do_sample": false,
484
+ "early_stopping": false,
485
+ "encoder_no_repeat_ngram_size": 0,
486
+ "eos_token_id": null,
487
+ "exponential_decay_length_penalty": null,
488
+ "finetuning_task": null,
489
+ "forced_bos_token_id": null,
490
+ "forced_eos_token_id": null,
491
+ "id2label": {
492
+ "0": "LABEL_0",
493
+ "1": "LABEL_1"
494
+ },
495
+ "is_decoder": false,
496
+ "is_encoder_decoder": false,
497
+ "label2id": {
498
+ "LABEL_0": 0,
499
+ "LABEL_1": 1
500
+ },
501
+ "length_normalized_loss": true,
502
+ "length_penalty": 1.0,
503
+ "llm_config": {
504
+ "add_cross_attention": false,
505
+ "architectures": [
506
+ "Qwen2ForCausalLM"
507
+ ],
508
+ "attention_dropout": 0.0,
509
+ "bad_words_ids": null,
510
+ "begin_suppress_tokens": null,
511
+ "bos_token_id": 151643,
512
+ "chunk_size_feed_forward": 0,
513
+ "cross_attention_hidden_size": null,
514
+ "decoder_start_token_id": null,
515
+ "diversity_penalty": 0.0,
516
+ "do_sample": false,
517
+ "early_stopping": false,
518
+ "encoder_no_repeat_ngram_size": 0,
519
+ "eos_token_id": 151643,
520
+ "exponential_decay_length_penalty": null,
521
+ "finetuning_task": null,
522
+ "forced_bos_token_id": null,
523
+ "forced_eos_token_id": null,
524
+ "hidden_act": "silu",
525
+ "hidden_size": 896,
526
+ "id2label": {
527
+ "0": "LABEL_0",
528
+ "1": "LABEL_1"
529
+ },
530
+ "initializer_range": 0.02,
531
+ "intermediate_size": 4864,
532
+ "is_decoder": false,
533
+ "is_encoder_decoder": false,
534
+ "label2id": {
535
+ "LABEL_0": 0,
536
+ "LABEL_1": 1
537
+ },
538
+ "length_penalty": 1.0,
539
+ "max_length": 20,
540
+ "max_position_embeddings": 32768,
541
+ "max_window_layers": 24,
542
+ "min_length": 0,
543
+ "model_type": "qwen2",
544
+ "no_repeat_ngram_size": 0,
545
+ "num_attention_heads": 14,
546
+ "num_beam_groups": 1,
547
+ "num_beams": 1,
548
+ "num_hidden_layers": 24,
549
+ "num_key_value_heads": 2,
550
+ "num_return_sequences": 1,
551
+ "output_attentions": false,
552
+ "output_hidden_states": false,
553
+ "output_scores": false,
554
+ "pad_token_id": null,
555
+ "prefix": null,
556
+ "problem_type": null,
557
+ "pruned_heads": {},
558
+ "remove_invalid_values": false,
559
+ "repetition_penalty": 1.0,
560
+ "return_dict": true,
561
+ "return_dict_in_generate": false,
562
+ "rms_norm_eps": 1e-06,
563
+ "rope_theta": 1000000.0,
564
+ "sep_token_id": null,
565
+ "sliding_window": 32768,
566
+ "suppress_tokens": null,
567
+ "task_specific_params": null,
568
+ "temperature": 1.0,
569
+ "tf_legacy_loss": false,
570
+ "tie_encoder_decoder": false,
571
+ "tie_word_embeddings": true,
572
+ "tokenizer_class": null,
573
+ "top_k": 50,
574
+ "top_p": 1.0,
575
+ "torch_dtype": "bfloat16",
576
+ "torchscript": false,
577
+ "transformers_version": "4.37.2",
578
+ "typical_p": 1.0,
579
+ "use_bfloat16": false,
580
+ "use_cache": false,
581
+ "use_mrope": false,
582
+ "use_sliding_window": false,
583
+ "vocab_size": 151936
584
+ },
585
+ "llm_input_size": 896,
586
+ "llm_output_size": 896,
587
+ "lsm_weight": 0,
588
+ "max_length": 20,
589
+ "min_length": 0,
590
+ "no_repeat_ngram_size": 0,
591
+ "num_beam_groups": 1,
592
+ "num_beams": 1,
593
+ "num_return_sequences": 1,
594
+ "output_attentions": false,
595
+ "output_hidden_states": false,
596
+ "output_scores": false,
597
+ "pad_token_id": null,
598
+ "prefix": null,
599
+ "problem_type": null,
600
+ "pruned_heads": {},
601
+ "remove_invalid_values": false,
602
+ "repetition_penalty": 1.0,
603
+ "return_dict": true,
604
+ "return_dict_in_generate": false,
605
+ "sampling_config": {
606
+ "tau_r": 0.1,
607
+ "top_k": 15,
608
+ "top_p": 0.7,
609
+ "win_size": 10
610
+ },
611
+ "sep_token_id": null,
612
+ "speech_token_size": 6561,
613
+ "suppress_tokens": null,
614
+ "task_specific_params": null,
615
+ "temperature": 1.0,
616
+ "tf_legacy_loss": false,
617
+ "tie_encoder_decoder": false,
618
+ "tie_word_embeddings": true,
619
+ "tokenizer_class": null,
620
+ "top_k": 50,
621
+ "top_p": 1.0,
622
+ "torch_dtype": null,
623
+ "torchscript": false,
624
+ "transformers_version": null,
625
+ "typical_p": 1.0,
626
+ "use_bfloat16": false
627
+ }
628
+ }
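config.json wires the custom classes in through `auto_map`, so the stock `transformers` Auto classes can load this model once `trust_remote_code=True` is passed. A minimal loading sketch under that assumption; the path below is a placeholder for a local checkout or repo id.

```python
# Minimal loading sketch based on the "auto_map" entries above; the path is a
# placeholder, not a repository id confirmed by this commit.
import torch
from transformers import AutoConfig, AutoModel

model_path = "path/to/InteractiveOmni"  # placeholder

config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
print(config.model_type)             # "interactiveomni"
print(config.llm_config.model_type)  # "qwen3" backbone, per llm_config above

model = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,      # matches "torch_dtype": "bfloat16" above
    trust_remote_code=True,
)
```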
configuration_flow.py ADDED
@@ -0,0 +1,102 @@
1
+ # --------------------------------------------------------
2
+ # SenseTime
3
+ # Copyright (c) 2025 SenseTime
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ import copy
7
+
8
+ from transformers.configuration_utils import PretrainedConfig
9
+ from transformers.utils import logging
10
+
11
+ logger = logging.get_logger(__name__)
12
+
13
+ class FlowConfig(PretrainedConfig):
14
+ def __init__(
15
+ self,
16
+ input_size = 512,
17
+ output_size= 80,
18
+ spk_embed_dim = 192,
19
+ output_type = 'mel',
20
+ vocab_size = 6561,
21
+ input_frame_rate = 25,
22
+ only_mask_loss = True,
23
+ token_mel_ratio=2,
24
+ pre_lookahead_len=3,
25
+ encoder_config={'output_size': 512,
26
+ 'attention_heads': 8,
27
+ 'linear_units': 2048,
28
+ 'num_blocks': 6,
29
+ 'dropout_rate': 0.1,
30
+ 'positional_dropout_rate': 0.1,
31
+ 'attention_dropout_rate': 0.1,
32
+ 'normalize_before': True,
33
+ 'input_layer': 'linear',
34
+ 'pos_enc_layer_type': 'rel_pos_espnet',
35
+ 'selfattention_layer_type': 'rel_selfattn',
36
+ 'input_size': 512,
37
+ 'use_cnn_module': False,
38
+ 'macaron_style': False,
39
+ },
40
+ decoder_config={'in_channels': 240,
41
+ 'n_spks': 1,
42
+ 'spk_emb_dim': 80,
43
+ 'cfm_params': {
44
+ 'sigma_min': 1e-06,
45
+ 'solver': 'euler',
46
+ 't_scheduler': 'cosine',
47
+ 'training_cfg_rate': 0.2,
48
+ 'inference_cfg_rate': 0.7,
49
+ 'reg_loss_type': 'l1',
50
+ },
51
+ 'estimator_config':{
52
+ 'in_channels': 320,
53
+ 'out_channels': 80,
54
+ 'causal': True,
55
+ 'channels': [256],
56
+ 'dropout': 0.0,
57
+ 'attention_head_dim': 64,
58
+ 'n_blocks': 4,
59
+ 'num_mid_blocks': 12,
60
+ 'num_heads': 8,
61
+ 'act_fn': 'gelu'
62
+ }
63
+ },
64
+ **kwargs):
65
+ super().__init__(**kwargs)
66
+
67
+ self.encoder_config = encoder_config
68
+ self.decoder_config = decoder_config
69
+
70
+ self.input_size = input_size
71
+ self.output_size = output_size
72
+ self.spk_embed_dim = spk_embed_dim
73
+ self.output_type = output_type
74
+ self.vocab_size = vocab_size
75
+ self.input_frame_rate = input_frame_rate
76
+ self.only_mask_loss = only_mask_loss
77
+ self.token_mel_ratio = token_mel_ratio
78
+ self.pre_lookahead_len = pre_lookahead_len
79
+ pass
80
+
81
+ def to_dict(self):
82
+ """
83
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
84
+
85
+ Returns:
86
+ `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
87
+ """
88
+ output = copy.deepcopy(self.__dict__)
89
+ output['encoder_config'] = self.encoder_config
90
+ output['decoder_config'] = self.decoder_config
91
+
92
+ output['input_size'] = self.input_size
93
+ output['output_size'] = self.output_size
94
+ output['spk_embed_dim'] = self.spk_embed_dim
95
+ output['output_type'] = self.output_type
96
+ output['vocab_size'] = self.vocab_size
97
+ output['input_frame_rate'] = self.input_frame_rate
98
+ output['only_mask_loss'] = self.only_mask_loss
99
+ output['token_mel_ratio'] = self.token_mel_ratio
100
+ output['pre_lookahead_len'] = self.pre_lookahead_len
101
+
102
+ return output
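FlowConfig carries the same defaults that appear under `flow_config` in config.json, so it can be instantiated standalone and round-tripped through the overridden `to_dict()`. A small sketch, assuming the module is importable from a checkout of this commit:

```python
# Sketch: build FlowConfig from its defaults and inspect the serialized dict.
from configuration_flow import FlowConfig  # local module from this commit

cfg = FlowConfig()
d = cfg.to_dict()
assert d["vocab_size"] == 6561
assert d["token_mel_ratio"] == 2
assert d["decoder_config"]["estimator_config"]["num_mid_blocks"] == 12
```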
configuration_hifigan.py ADDED
@@ -0,0 +1,87 @@
1
+ # --------------------------------------------------------
2
+ # SenseTime
3
+ # Copyright (c) 2025 SenseTime
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ import copy
7
+
8
+ from transformers.configuration_utils import PretrainedConfig
9
+ from transformers.utils import logging
10
+
11
+ logger = logging.get_logger(__name__)
12
+
13
+ class HiFiGanConfig(PretrainedConfig):
14
+ def __init__(
15
+ self,
16
+ in_channels = 80,
17
+ base_channels = 512,
18
+ nb_harmonics = 8,
19
+ sampling_rate =24000,
20
+ nsf_alpha= 0.1,
21
+ nsf_sigma= 0.003,
22
+ nsf_voiced_threshold = 10,
23
+ upsample_rates = [8, 5, 3],
24
+ upsample_kernel_sizes = [16, 11, 7],
25
+ istft_params ={'n_fft': 16,
26
+ 'hop_len': 4,
27
+ },
28
+ resblock_kernel_sizes = [3, 7, 11],
29
+ resblock_dilation_sizes = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
30
+ source_resblock_kernel_sizes = [7, 7, 11],
31
+ source_resblock_dilation_sizes = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
32
+ lrelu_slope = 0.1,
33
+ audio_limit =0.99,
34
+ f0_predictor_config={
35
+ 'num_class': 1,
36
+ 'in_channels': 80,
37
+ 'cond_channels': 512
38
+ },
39
+ **kwargs):
40
+ super().__init__(**kwargs)
41
+
42
+ self.in_channels = in_channels
43
+ self.base_channels = base_channels
44
+ self.nb_harmonics = nb_harmonics
45
+ self.sampling_rate = sampling_rate
46
+ self.nsf_alpha = nsf_alpha
47
+ self.nsf_sigma = nsf_sigma
48
+ self.nsf_voiced_threshold = nsf_voiced_threshold
49
+ self.upsample_rates = upsample_rates
50
+ self.upsample_kernel_sizes = upsample_kernel_sizes
51
+ self.istft_params = istft_params
52
+ self.resblock_kernel_sizes = resblock_kernel_sizes
53
+ self.resblock_dilation_sizes= resblock_dilation_sizes
54
+ self.source_resblock_kernel_sizes = source_resblock_kernel_sizes
55
+ self.source_resblock_dilation_sizes = source_resblock_dilation_sizes
56
+ self.lrelu_slope = lrelu_slope
57
+ self.audio_limit = audio_limit
58
+ self.f0_predictor_config = f0_predictor_config
59
+ pass
60
+
61
+
62
+ def to_dict(self):
63
+ """
64
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
65
+
66
+ Returns:
67
+ `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
68
+ """
69
+ output = copy.deepcopy(self.__dict__)
70
+ output['in_channels'] = self.in_channels
71
+ output['base_channels'] = self.base_channels
72
+ output['nb_harmonics'] = self.nb_harmonics
73
+ output['sampling_rate'] = self.sampling_rate
74
+ output['nsf_alpha'] = self.nsf_alpha
75
+ output['nsf_sigma'] = self.nsf_sigma
76
+ output['nsf_voiced_threshold'] = self.nsf_voiced_threshold
77
+ output['upsample_rates'] = self.upsample_rates
78
+ output['upsample_kernel_sizes'] = self.upsample_kernel_sizes
79
+ output['istft_params'] = self.istft_params
80
+ output['resblock_kernel_sizes'] = self.resblock_kernel_sizes
81
+ output['resblock_dilation_sizes'] = self.resblock_dilation_sizes
82
+ output['source_resblock_dilation_sizes'] = self.source_resblock_dilation_sizes
83
+ output['lrelu_slope'] = self.lrelu_slope
84
+ output['audio_limit'] = self.audio_limit
85
+ output['f0_predictor_config'] = self.f0_predictor_config
86
+
87
+ return output
configuration_interactiveomni.py ADDED
@@ -0,0 +1,125 @@
1
+ # --------------------------------------------------------
2
+ # SenseTime
3
+ # Copyright (c) 2025 SenseTime
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ import copy
8
+
9
+ from transformers.configuration_utils import PretrainedConfig
10
+ from transformers.utils import logging
11
+ from transformers import LlamaConfig, Qwen2Config, Qwen3Config
12
+
13
+ from .configuration_intern_vit import InternVisionConfig
14
+ from .configuration_whisper import WhisperConfig
15
+ from .configuration_voicelm import VoiceLMConfig
16
+ from .configuration_flow import FlowConfig
17
+ from .configuration_hifigan import HiFiGanConfig
18
+
19
+ logger = logging.get_logger(__name__)
20
+
21
+ class InteractiveOmniConfig(PretrainedConfig):
22
+ model_type = 'interactiveomni'
23
+ is_composition = True
24
+
25
+ def __init__(
26
+ self,
27
+ vision_config=None,
28
+ llm_config=None,
29
+ audio_config=None,
30
+ voicelm_config=None,
31
+ flow_config=None,
32
+ hifigan_config=None,
33
+ use_backbone_lora=0,
34
+ use_llm_lora=0,
35
+ pad2square=False,
36
+ select_layer=-4,
37
+ force_image_size=None,
38
+ downsample_ratio=0.5,
39
+ template=None,
40
+ dynamic_image_size=False,
41
+ use_thumbnail=False,
42
+ ps_version='v1',
43
+ min_dynamic_patch=1,
44
+ max_dynamic_patch=6,
45
+ **kwargs):
46
+ super().__init__(**kwargs)
47
+
48
+ if vision_config is None:
49
+ vision_config = {}
50
+ logger.info('vision_config is None. Initializing the InternVisionConfig with default values.')
51
+
52
+ if llm_config is None:
53
+ llm_config = {}
54
+ logger.info('llm_config is None. Initializing the Qwen3Config as default values.')
55
+
56
+ if audio_config is None:
57
+ audio_config = {}
58
+ logger.info('audio_config is None. Initializing the WhisperConfig as default values.')
59
+
60
+ if voicelm_config is None:
61
+ voicelm_config = {}
62
+ logger.info('voicelm_config is None. Initializing the VoiceLMConfig as default values')
63
+
64
+ if flow_config is None:
65
+ flow_config = {}
66
+ logger.info('flow_config is None. Initializing the FlowConfig as default values')
67
+
68
+ if hifigan_config is None:
69
+ hifigan_config = {}
70
+ logger.info('hifigan_config is None. Initializing the HiFiGanConfig as default values')
71
+
72
+ self.vision_config = InternVisionConfig(**vision_config)
73
+ self.audio_config = WhisperConfig(**audio_config)
74
+ self.llm_config = Qwen3Config(**llm_config)
75
+ self.voicelm_config = VoiceLMConfig(**voicelm_config)
76
+ self.flow_config = FlowConfig(**flow_config)
77
+ self.hifigan_config = HiFiGanConfig(**hifigan_config)
78
+ self.use_backbone_lora = use_backbone_lora
79
+ self.use_llm_lora = use_llm_lora
80
+ self.pad2square = pad2square
81
+ self.select_layer = select_layer
82
+ self.force_image_size = force_image_size
83
+ self.downsample_ratio = downsample_ratio
84
+ self.template = template
85
+ self.dynamic_image_size = dynamic_image_size
86
+ self.use_thumbnail = use_thumbnail
87
+ self.ps_version = ps_version # pixel shuffle version
88
+ self.min_dynamic_patch = min_dynamic_patch
89
+ self.max_dynamic_patch = max_dynamic_patch
90
+
91
+ logger.info(f'vision_select_layer: {self.select_layer}')
92
+ logger.info(f'ps_version: {self.ps_version}')
93
+ logger.info(f'min_dynamic_patch: {self.min_dynamic_patch}')
94
+ logger.info(f'max_dynamic_patch: {self.max_dynamic_patch}')
95
+ pass
96
+
97
+ def to_dict(self):
98
+ """
99
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
100
+
101
+ Returns:
102
+ `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
103
+ """
104
+ output = copy.deepcopy(self.__dict__)
105
+ output['vision_config'] = self.vision_config.to_dict()
106
+ output['audio_config'] = self.audio_config.to_dict()
107
+ output['llm_config'] = self.llm_config.to_dict()
108
+ output['voicelm_config'] = self.voicelm_config.to_dict()
109
+ output['flow_config'] = self.flow_config.to_dict()
110
+ output['hifigan_config'] = self.hifigan_config.to_dict()
111
+ output['model_type'] = self.__class__.model_type
112
+ output['use_backbone_lora'] = self.use_backbone_lora
113
+ output['use_llm_lora'] = self.use_llm_lora
114
+ output['pad2square'] = self.pad2square
115
+ output['select_layer'] = self.select_layer
116
+ output['force_image_size'] = self.force_image_size
117
+ output['downsample_ratio'] = self.downsample_ratio
118
+ output['template'] = self.template
119
+ output['dynamic_image_size'] = self.dynamic_image_size
120
+ output['use_thumbnail'] = self.use_thumbnail
121
+ output['ps_version'] = self.ps_version
122
+ output['min_dynamic_patch'] = self.min_dynamic_patch
123
+ output['max_dynamic_patch'] = self.max_dynamic_patch
124
+
125
+ return output
configuration_intern_vit.py ADDED
@@ -0,0 +1,119 @@
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2023 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ import os
7
+ from typing import Union
8
+
9
+ from transformers.configuration_utils import PretrainedConfig
10
+ from transformers.utils import logging
11
+
12
+ logger = logging.get_logger(__name__)
13
+
14
+
15
+ class InternVisionConfig(PretrainedConfig):
16
+ r"""
17
+ This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to
18
+ instantiate a vision encoder according to the specified arguments, defining the model architecture.
19
+
20
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
21
+ documentation from [`PretrainedConfig`] for more information.
22
+
23
+ Args:
24
+ num_channels (`int`, *optional*, defaults to 3):
25
+ Number of color channels in the input images (e.g., 3 for RGB).
26
+ patch_size (`int`, *optional*, defaults to 14):
27
+ The size (resolution) of each patch.
28
+ image_size (`int`, *optional*, defaults to 224):
29
+ The size (resolution) of each image.
30
+ qkv_bias (`bool`, *optional*, defaults to `False`):
31
+ Whether to add a bias to the queries and values in the self-attention layers.
32
+ hidden_size (`int`, *optional*, defaults to 3200):
33
+ Dimensionality of the encoder layers and the pooler layer.
34
+ num_attention_heads (`int`, *optional*, defaults to 25):
35
+ Number of attention heads for each attention layer in the Transformer encoder.
36
+ intermediate_size (`int`, *optional*, defaults to 12800):
37
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
38
+ qk_normalization (`bool`, *optional*, defaults to `True`):
39
+ Whether to normalize the queries and keys in the self-attention layers.
40
+ num_hidden_layers (`int`, *optional*, defaults to 48):
41
+ Number of hidden layers in the Transformer encoder.
42
+ use_flash_attn (`bool`, *optional*, defaults to `True`):
43
+ Whether to use flash attention mechanism.
44
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
45
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
46
+ `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported.
47
+ layer_norm_eps (`float`, *optional*, defaults to 1e-6):
48
+ The epsilon used by the layer normalization layers.
49
+ dropout (`float`, *optional*, defaults to 0.0):
50
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
51
+ drop_path_rate (`float`, *optional*, defaults to 0.0):
52
+ Dropout rate for stochastic depth.
53
+ attention_dropout (`float`, *optional*, defaults to 0.0):
54
+ The dropout ratio for the attention probabilities.
55
+ initializer_range (`float`, *optional*, defaults to 0.02):
56
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
57
+ initializer_factor (`float`, *optional*, defaults to 0.1):
58
+ A factor for layer scale.
59
+ """
60
+
61
+ model_type = 'intern_vit_6b'
62
+
63
+ def __init__(
64
+ self,
65
+ num_channels=3,
66
+ patch_size=14,
67
+ image_size=224,
68
+ qkv_bias=False,
69
+ hidden_size=3200,
70
+ num_attention_heads=25,
71
+ intermediate_size=12800,
72
+ qk_normalization=True,
73
+ num_hidden_layers=48,
74
+ use_flash_attn=True,
75
+ hidden_act='gelu',
76
+ norm_type='rms_norm',
77
+ layer_norm_eps=1e-6,
78
+ dropout=0.0,
79
+ drop_path_rate=0.0,
80
+ attention_dropout=0.0,
81
+ initializer_range=0.02,
82
+ initializer_factor=0.1,
83
+ **kwargs,
84
+ ):
85
+ super().__init__(**kwargs)
86
+
87
+ self.hidden_size = hidden_size
88
+ self.intermediate_size = intermediate_size
89
+ self.dropout = dropout
90
+ self.drop_path_rate = drop_path_rate
91
+ self.num_hidden_layers = num_hidden_layers
92
+ self.num_attention_heads = num_attention_heads
93
+ self.num_channels = num_channels
94
+ self.patch_size = patch_size
95
+ self.image_size = image_size
96
+ self.initializer_range = initializer_range
97
+ self.initializer_factor = initializer_factor
98
+ self.attention_dropout = attention_dropout
99
+ self.layer_norm_eps = layer_norm_eps
100
+ self.hidden_act = hidden_act
101
+ self.norm_type = norm_type
102
+ self.qkv_bias = qkv_bias
103
+ self.qk_normalization = qk_normalization
104
+ self.use_flash_attn = use_flash_attn
105
+
106
+ @classmethod
107
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig':
108
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
109
+
110
+ if 'vision_config' in config_dict:
111
+ config_dict = config_dict['vision_config']
112
+
113
+ if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type:
114
+ logger.warning(
115
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
116
+ f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
117
+ )
118
+
119
+ return cls.from_dict(config_dict, **kwargs)
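The `from_pretrained` override above unwraps a nested `vision_config` key, so InternVisionConfig can be pointed either at a standalone InternViT checkpoint or at this model's composite config.json. A brief sketch under the same placeholder-path assumption used earlier:

```python
# Sketch: pull only the vision-tower configuration out of the composite config.
from configuration_intern_vit import InternVisionConfig  # local module from this commit

vision_cfg = InternVisionConfig.from_pretrained("path/to/InteractiveOmni")  # placeholder path
print(vision_cfg.image_size, vision_cfg.num_hidden_layers)  # 448, 24 per config.json
```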
configuration_voicelm.py ADDED
@@ -0,0 +1,63 @@
1
+ # --------------------------------------------------------
2
+ # SenseTime
3
+ # Copyright (c) 2025 SenseTime
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ import copy
7
+
8
+ from transformers.configuration_utils import PretrainedConfig
9
+ from transformers.utils import logging
10
+ from transformers import LlamaConfig, Qwen2Config
11
+
12
+ logger = logging.get_logger(__name__)
13
+
14
+ class VoiceLMConfig(PretrainedConfig):
15
+ def __init__(
16
+ self,
17
+ llm_input_size = 896,
18
+ llm_output_size = 896,
19
+ speech_token_size = 6561,
20
+ length_normalized_loss = True,
21
+ lsm_weight = 0,
22
+ llm_config=None,
23
+ sampling_config={
24
+ 'top_p': 0.8,
25
+ 'top_k': 25,
26
+ 'win_size': 10,
27
+ 'tau_r': 0.1,
28
+ },
29
+ **kwargs):
30
+ super().__init__(**kwargs)
31
+
32
+ self.llm_input_size = llm_input_size
33
+ self.llm_output_size = llm_output_size
34
+ self.speech_token_size = speech_token_size
35
+ self.length_normalized_loss = length_normalized_loss
36
+ self.lsm_weight = lsm_weight
37
+ self.sampling_config = sampling_config
38
+
39
+ if llm_config is None:
40
+ llm_config = {}
41
+ logger.info('llm_config is None. Initializing the llm config with default values (`Qwen2Config`).')
42
+
43
+ self.llm_config = Qwen2Config(**llm_config)
44
+ pass
45
+
46
+ def to_dict(self):
47
+ """
48
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
49
+
50
+ Returns:
51
+ `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
52
+ """
53
+ output = copy.deepcopy(self.__dict__)
54
+ output['llm_input_size'] = self.llm_input_size
55
+ output['llm_output_size'] = self.llm_output_size
56
+ output['speech_token_size'] = self.speech_token_size
57
+ output['length_normalized_loss'] = self.length_normalized_loss
58
+ output['lsm_weight'] = self.lsm_weight
59
+ output['sampling_config'] = self.sampling_config
60
+ output['llm_config'] = self.llm_config.to_dict()
61
+
62
+ return output
63
+
configuration_whisper.py ADDED
@@ -0,0 +1,340 @@
1
+ # coding=utf-8
2
+ # Copyright 2022 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Whisper model configuration"""
16
+
17
+ from collections import OrderedDict
18
+ from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
19
+
20
+ from transformers.configuration_utils import PretrainedConfig
21
+ from transformers.onnx import OnnxConfig, OnnxSeq2SeqConfigWithPast
22
+ from transformers.utils import logging
23
+
24
+
25
+ if TYPE_CHECKING:
26
+ from transformers.feature_extraction_utils import FeatureExtractionMixin
27
+ from transformers.tokenization_utils_base import PreTrainedTokenizerBase
28
+ from transformers.utils import TensorType
29
+
30
+ logger = logging.get_logger(__name__)
31
+
32
+
33
+ # fmt: off
34
+ NON_SPEECH_TOKENS = [
35
+ 1, 2, 7, 8, 9, 10, 14, 25,
36
+ 26, 27, 28, 29, 31, 58, 59, 60, 61, 62,
37
+ 63, 90, 91, 92, 93, 357, 366, 438, 532, 685,
38
+ 705, 796, 930, 1058, 1220, 1267, 1279, 1303, 1343, 1377,
39
+ 1391, 1635, 1782, 1875, 2162, 2361, 2488, 3467, 4008, 4211,
40
+ 4600, 4808, 5299, 5855, 6329, 7203, 9609, 9959, 10563, 10786,
41
+ 11420, 11709, 11907, 13163, 13697, 13700, 14808, 15306, 16410, 16791,
42
+ 17992, 19203, 19510, 20724, 22305, 22935, 27007, 30109, 30420, 33409,
43
+ 34949, 40283, 40493, 40549, 47282, 49146, 50257, 50359, 50360, 50361
44
+ ]
45
+ NON_SPEECH_TOKENS_MULTI = [
46
+ 1, 2, 7, 8, 9, 10, 14, 25,
47
+ 26, 27, 28, 29, 31, 58, 59, 60, 61, 62,
48
+ 63, 90, 91, 92, 93, 359, 503, 522, 542, 873,
49
+ 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627,
50
+ 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647,
51
+ 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793,
52
+ 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675,
53
+ 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865,
54
+ 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362
55
+ ]
56
+ # fmt: on
57
+
58
+
59
+ class WhisperConfig(PretrainedConfig):
60
+ r"""
61
+ This is the configuration class to store the configuration of a [`WhisperModel`]. It is used to instantiate a
62
+ Whisper model according to the specified arguments, defining the model architecture. Instantiating a configuration
63
+ with the defaults will yield a similar configuration to that of the Whisper
64
+ [openai/whisper-tiny](https://huggingface.co/openai/whisper-tiny) architecture.
65
+
66
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
67
+ documentation from [`PretrainedConfig`] for more information.
68
+
69
+
70
+ Args:
71
+ vocab_size (`int`, *optional*, defaults to 51865):
72
+ Vocabulary size of the Whisper model. Defines the number of different tokens that can be represented by the
73
+ `decoder_input_ids` passed when calling [`WhisperModel`]
74
+ num_mel_bins (`int`, *optional*, defaults to 80):
75
+ Number of mel features used per input features. Should correspond to the value used in the
76
+ `WhisperProcessor` class.
77
+ encoder_layers (`int`, *optional*, defaults to 4):
78
+ Number of encoder layers.
79
+ decoder_layers (`int`, *optional*, defaults to 4):
80
+ Number of decoder layers.
81
+ encoder_attention_heads (`int`, *optional*, defaults to 6):
82
+ Number of attention heads for each attention layer in the Transformer encoder.
83
+ decoder_attention_heads (`int`, *optional*, defaults to 6):
84
+ Number of attention heads for each attention layer in the Transformer decoder.
85
+ encoder_ffn_dim (`int`, *optional*, defaults to 1536):
86
+ Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
87
+ decoder_ffn_dim (`int`, *optional*, defaults to 1536):
88
+ Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
89
+ encoder_layerdrop (`float`, *optional*, defaults to 0.0):
90
+ The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
91
+ for more details.
92
+ decoder_layerdrop (`float`, *optional*, defaults to 0.0):
93
+ The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
94
+ for more details.
95
+ decoder_start_token_id (`int`, *optional*, defaults to 50257):
96
+ Corresponds to the "<|startoftranscript|>" token, which is automatically used when no `decoder_input_ids`
97
+ are provided to the `generate` function. It is used to guide the model`s generation process depending on
98
+ the task.
99
+ use_cache (`bool`, *optional*, defaults to `True`):
100
+ Whether or not the model should return the last key/values attentions (not used by all models).
101
+ is_encoder_decoder (`bool`, *optional*, defaults to `True`):
102
+ Whether the model is used as an encoder/decoder or not.
103
+ activation_function (`str`, *optional*, defaults to `"gelu"`):
104
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
105
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
106
+ d_model (`int`, *optional*, defaults to 384):
107
+ Dimensionality of the layers.
108
+ dropout (`float`, *optional*, defaults to 0.1):
109
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
110
+ attention_dropout (`float`, *optional*, defaults to 0.0):
111
+ The dropout ratio for the attention probabilities.
112
+ activation_dropout (`float`, *optional*, defaults to 0.0):
113
+ The dropout ratio for activations inside the fully connected layer.
114
+ init_std (`float`, *optional*, defaults to 0.02):
115
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
116
+ scale_embedding (`bool`, *optional*, defaults to `False`):
117
+ Scale embeddings by dividing by sqrt(d_model).
118
+ max_source_positions (`int`, *optional*, defaults to 1500):
119
+ The maximum sequence length of log-mel filter-bank features that this model might ever be used with.
120
+ max_target_positions (`int`, *optional*, defaults to 448):
121
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
122
+ just in case (e.g., 512 or 1024 or 2048).
123
+ pad_token_id (`int`, *optional*, defaults to 50256):
124
+ Padding token id.
125
+ bos_token_id (`int`, *optional*, defaults to 50256):
126
+ Beginning-of-stream token id.
127
+ eos_token_id (`int`, *optional*, defaults to 50256):
128
+ End-of-stream token id.
129
+ suppress_tokens (`List[int]`, *optional*):
130
+ A list containing the non-speech tokens that will be used by the logit processor in the `generate`
131
+ function. NON_SPEECH_TOKENS and NON_SPEECH_TOKENS_MULTI each correspond to the `english-only` and the
132
+ `multilingual` model.
133
+ begin_suppress_tokens (`List[int]`, *optional*, defaults to `[220,50256]`):
134
+ A list containing tokens that will be suppressed at the beginning of the sampling process. Initialized as
135
+ the token for `" "` (`blank_token_id`) and the `eos_token_id`
136
+ use_weighted_layer_sum (`bool`, *optional*, defaults to `False`):
137
+ Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
138
+ instance of [`WhisperForAudioClassification`].
139
+ classifier_proj_size (`int`, *optional*, defaults to 256):
140
+ Dimensionality of the projection before token mean-pooling for classification. Only relevant when using an
141
+ instance of [`WhisperForAudioClassification`].
142
+ apply_spec_augment (`bool`, *optional*, defaults to `False`):
143
+ Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
144
+ [SpecAugment: A Simple Data Augmentation Method for Automatic Speech
145
+ Recognition](https://arxiv.org/abs/1904.08779).
146
+ mask_time_prob (`float`, *optional*, defaults to 0.05):
147
+ Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
148
+ procedure generates `mask_time_prob*len(time_axis)/mask_time_length` independent masks over the axis. If
149
+ reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
150
+ masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
151
+ actual percentage of masked vectors. This is only relevant if `apply_spec_augment == True`.
152
+ mask_time_length (`int`, *optional*, defaults to 10):
153
+ Length of vector span along the time axis.
154
+ mask_time_min_masks (`int`, *optional*, defaults to 2):
155
+ The minimum number of masks of length `mask_time_length` generated along the time axis, each time step,
156
+ irrespective of `mask_time_prob`. Only relevant if `mask_time_prob*len(time_axis)/mask_time_length <
157
+ mask_time_min_masks`.
158
+ mask_feature_prob (`float`, *optional*, defaults to 0.0):
159
+ Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
160
+ masking procedure generates `mask_feature_prob*len(feature_axis)/mask_feature_length` independent masks over
161
+ the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
162
+ span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
163
+ may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
164
+ True`.
165
+ mask_feature_length (`int`, *optional*, defaults to 10):
166
+ Length of vector span along the feature axis.
167
+ mask_feature_min_masks (`int`, *optional*, defaults to 0):
168
+ The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
169
+ step, irrespective of `mask_feature_prob`. Only relevant if
170
+ `mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks`.
171
+ median_filter_width (`int`, *optional*, defaults to 7):
172
+ Width of the median filter used to smooth the cross-attention outputs when computing token timestamps.
173
+ Should be an odd number.
174
+
175
+ Example:
176
+
177
+ ```python
178
+ >>> from transformers import WhisperConfig, WhisperModel
179
+
180
+ >>> # Initializing a Whisper tiny style configuration
181
+ >>> configuration = WhisperConfig()
182
+
183
+ >>> # Initializing a model (with random weights) from the tiny style configuration
184
+ >>> model = WhisperModel(configuration)
185
+
186
+ >>> # Accessing the model configuration
187
+ >>> configuration = model.config
188
+ ```"""
189
+
190
+ model_type = "whisper"
191
+ keys_to_ignore_at_inference = ["past_key_values"]
192
+ attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
193
+
194
+ def __init__(
195
+ self,
196
+ vocab_size=51865,
197
+ num_mel_bins=80,
198
+ encoder_layers=4,
199
+ encoder_attention_heads=6,
200
+ decoder_layers=4,
201
+ decoder_attention_heads=6,
202
+ decoder_ffn_dim=1536,
203
+ encoder_ffn_dim=1536,
204
+ encoder_layerdrop=0.0,
205
+ decoder_layerdrop=0.0,
206
+ decoder_start_token_id=50257,
207
+ use_cache=True,
208
+ is_encoder_decoder=True,
209
+ activation_function="gelu",
210
+ d_model=384,
211
+ dropout=0.0,
212
+ attention_dropout=0.0,
213
+ activation_dropout=0.0,
214
+ init_std=0.02,
215
+ scale_embedding=False,
216
+ max_source_positions=1500,
217
+ max_target_positions=448,
218
+ pad_token_id=50256,
219
+ bos_token_id=50256,
220
+ eos_token_id=50256,
221
+ suppress_tokens=None,
222
+ begin_suppress_tokens=[220, 50256],
223
+ use_weighted_layer_sum=False,
224
+ classifier_proj_size=256,
225
+ apply_spec_augment=False,
226
+ mask_time_prob=0.05,
227
+ mask_time_length=10,
228
+ mask_time_min_masks=2,
229
+ mask_feature_prob=0.0,
230
+ mask_feature_length=10,
231
+ mask_feature_min_masks=0,
232
+ median_filter_width=7,
233
+ **kwargs,
234
+ ):
235
+ self.vocab_size = vocab_size
236
+ self.num_mel_bins = num_mel_bins
237
+ self.d_model = d_model
238
+ self.encoder_layers = encoder_layers
239
+ self.encoder_attention_heads = encoder_attention_heads
240
+ self.decoder_layers = decoder_layers
241
+ self.decoder_attention_heads = decoder_attention_heads
242
+ self.decoder_ffn_dim = decoder_ffn_dim
243
+ self.encoder_ffn_dim = encoder_ffn_dim
244
+ self.dropout = dropout
245
+ self.attention_dropout = attention_dropout
246
+ self.activation_dropout = activation_dropout
247
+ self.activation_function = activation_function
248
+ self.init_std = init_std
249
+ self.encoder_layerdrop = encoder_layerdrop
250
+ self.decoder_layerdrop = decoder_layerdrop
251
+ self.use_cache = use_cache
252
+ self.num_hidden_layers = encoder_layers
253
+ self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
254
+ self.max_source_positions = max_source_positions
255
+ self.max_target_positions = max_target_positions
256
+
257
+ # Audio Classification-specific parameters. Feel free to ignore for other classes.
258
+ self.classifier_proj_size = classifier_proj_size
259
+ self.use_weighted_layer_sum = use_weighted_layer_sum
260
+
261
+ # fine-tuning config parameters for SpecAugment: https://arxiv.org/abs/1904.08779
262
+ self.apply_spec_augment = apply_spec_augment
263
+ self.mask_time_prob = mask_time_prob
264
+ self.mask_time_length = mask_time_length
265
+ self.mask_time_min_masks = mask_time_min_masks
266
+ self.mask_feature_prob = mask_feature_prob
267
+ self.mask_feature_length = mask_feature_length
268
+ self.mask_feature_min_masks = mask_feature_min_masks
269
+
270
+ self.median_filter_width = median_filter_width
271
+
272
+ super().__init__(
273
+ pad_token_id=pad_token_id,
274
+ bos_token_id=bos_token_id,
275
+ eos_token_id=eos_token_id,
276
+ is_encoder_decoder=is_encoder_decoder,
277
+ decoder_start_token_id=decoder_start_token_id,
278
+ suppress_tokens=suppress_tokens,
279
+ begin_suppress_tokens=begin_suppress_tokens,
280
+ **kwargs,
281
+ )
282
+
283
+
284
+ class WhisperOnnxConfig(OnnxSeq2SeqConfigWithPast):
285
+ @property
286
+ def inputs(self) -> Mapping[str, Mapping[int, str]]:
287
+ common_inputs = OrderedDict(
288
+ [
289
+ ("input_features", {0: "batch", 1: "feature_size", 2: "encoder_sequence"}),
290
+ ]
291
+ )
292
+ if self.use_past:
293
+ common_inputs["decoder_input_ids"] = {0: "batch"}
294
+ else:
295
+ common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
296
+
297
+ if self.use_past:
298
+ self.fill_with_past_key_values_(common_inputs, direction="inputs")
299
+
300
+ return common_inputs
301
+
302
+ def generate_dummy_inputs(
303
+ self,
304
+ preprocessor: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin"],
305
+ batch_size: int = -1,
306
+ seq_length: int = -1,
307
+ is_pair: bool = False,
308
+ framework: Optional["TensorType"] = None,
309
+ sampling_rate: int = 22050,
310
+ time_duration: float = 5.0,
311
+ frequency: int = 220,
312
+ ) -> Mapping[str, Any]:
313
+ dummy_inputs = OrderedDict()
314
+ encoder_inputs = OnnxConfig.generate_dummy_inputs(
315
+ self,
316
+ preprocessor=preprocessor.feature_extractor,
317
+ batch_size=batch_size,
318
+ framework=framework,
319
+ sampling_rate=sampling_rate,
320
+ time_duration=time_duration,
321
+ frequency=frequency,
322
+ )
323
+ encoder_sequence_length = encoder_inputs["input_features"].shape[2]
324
+ seq_length = encoder_sequence_length // 2 if self.use_past else seq_length
325
+
326
+ decoder_inputs = super().generate_dummy_inputs(
327
+ preprocessor.tokenizer, batch_size, seq_length, is_pair, framework
328
+ )
329
+
330
+ dummy_inputs["input_features"] = encoder_inputs.pop("input_features")
331
+ dummy_inputs["decoder_input_ids"] = decoder_inputs.pop("decoder_input_ids")
332
+
333
+ if "past_key_values" in decoder_inputs:
334
+ dummy_inputs["past_key_values"] = decoder_inputs.pop("past_key_values")
335
+
336
+ return dummy_inputs
337
+
338
+ @property
339
+ def atol_for_validation(self) -> float:
340
+ return 1e-3
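The SpecAugment bookkeeping spelled out in the `mask_time_prob` docstring above can be illustrated with a short sketch. This is illustrative arithmetic only, using the defaults from this file; it is not part of the uploaded configuration module:

```python
# Hedged sketch: how many time masks the defaults above imply per example.
mask_time_prob = 0.05      # default above
mask_time_length = 10      # default above
mask_time_min_masks = 2    # default above
time_axis_len = 1500       # max_source_positions, i.e. the encoder frame count

expected = mask_time_prob * time_axis_len / mask_time_length  # 7.5 independent masks on average
n_masks = max(int(expected), mask_time_min_masks)             # never fewer than mask_time_min_masks
print(expected, n_masks)   # 7.5 7
```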
conversation.py ADDED
@@ -0,0 +1,340 @@
1
+ """
2
+ Conversation prompt templates.
3
+
4
+ We kindly request that you import fastchat instead of copying this file if you wish to use it.
5
+ If you have any changes in mind, please contribute back so the community can benefit collectively and continue to maintain these valuable templates.
6
+ """
7
+
8
+ import dataclasses
9
+ from enum import IntEnum, auto
10
+ from typing import Any, Dict, List, Tuple, Union
11
+
12
+
13
+ class SeparatorStyle(IntEnum):
14
+ """Separator styles."""
15
+
16
+ ADD_COLON_SINGLE = auto()
17
+ ADD_COLON_TWO = auto()
18
+ ADD_COLON_SPACE_SINGLE = auto()
19
+ NO_COLON_SINGLE = auto()
20
+ NO_COLON_TWO = auto()
21
+ ADD_NEW_LINE_SINGLE = auto()
22
+ LLAMA2 = auto()
23
+ CHATGLM = auto()
24
+ CHATML = auto()
25
+ CHATINTERN = auto()
26
+ DOLLY = auto()
27
+ RWKV = auto()
28
+ PHOENIX = auto()
29
+ ROBIN = auto()
30
+ FALCON_CHAT = auto()
31
+ CHATGLM3 = auto()
32
+ MPT = auto()
33
+
34
+
35
+ @dataclasses.dataclass
36
+ class Conversation:
37
+ """A class that manages prompt templates and keeps all conversation history."""
38
+
39
+ # The name of this template
40
+ name: str
41
+ # The template of the system prompt
42
+ system_template: str = '{system_message}'
43
+ # The system message
44
+ system_message: str = ''
45
+ # The names of two roles
46
+ roles: Tuple[str] = ('USER', 'ASSISTANT')
47
+ # All messages. Each item is (role, message).
48
+ messages: List[List[str]] = ()
49
+ # The number of few shot examples
50
+ offset: int = 0
51
+ # The separator style and configurations
52
+ sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE
53
+ sep: str = '\n'
54
+ sep2: str = None
55
+ # Stop criteria (the default one is EOS token)
56
+ stop_str: Union[str, List[str]] = None
57
+ # Stops generation if meeting any token in this list
58
+ stop_token_ids: List[int] = None
59
+
60
+ def get_prompt(self) -> str:
61
+ """Get the prompt for generation."""
62
+ system_prompt = self.system_template.format(system_message=self.system_message)
63
+ if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE:
64
+ ret = system_prompt + self.sep
65
+ for role, message in self.messages:
66
+ if message:
67
+ ret += role + ': ' + message + self.sep
68
+ else:
69
+ ret += role + ':'
70
+ return ret
71
+ elif self.sep_style == SeparatorStyle.ADD_COLON_TWO:
72
+ seps = [self.sep, self.sep2]
73
+ ret = system_prompt + seps[0]
74
+ for i, (role, message) in enumerate(self.messages):
75
+ if message:
76
+ ret += role + ': ' + message + seps[i % 2]
77
+ else:
78
+ ret += role + ':'
79
+ return ret
80
+ elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE:
81
+ ret = system_prompt + self.sep
82
+ for role, message in self.messages:
83
+ if message:
84
+ ret += role + ': ' + message + self.sep
85
+ else:
86
+ ret += role + ': ' # must end with a space
87
+ return ret
88
+ elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE:
89
+ ret = '' if system_prompt == '' else system_prompt + self.sep
90
+ for role, message in self.messages:
91
+ if message:
92
+ ret += role + '\n' + message + self.sep
93
+ else:
94
+ ret += role + '\n'
95
+ return ret
96
+ elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
97
+ ret = system_prompt
98
+ for role, message in self.messages:
99
+ if message:
100
+ ret += role + message + self.sep
101
+ else:
102
+ ret += role
103
+ return ret
104
+ elif self.sep_style == SeparatorStyle.NO_COLON_TWO:
105
+ seps = [self.sep, self.sep2]
106
+ ret = system_prompt
107
+ for i, (role, message) in enumerate(self.messages):
108
+ if message:
109
+ ret += role + message + seps[i % 2]
110
+ else:
111
+ ret += role
112
+ return ret
113
+ elif self.sep_style == SeparatorStyle.RWKV:
114
+ ret = system_prompt
115
+ for i, (role, message) in enumerate(self.messages):
116
+ if message:
117
+ ret += (
118
+ role
119
+ + ': '
120
+ + message.replace('\r\n', '\n').replace('\n\n', '\n')
121
+ )
122
+ ret += '\n\n'
123
+ else:
124
+ ret += role + ':'
125
+ return ret
126
+ elif self.sep_style == SeparatorStyle.LLAMA2:
127
+ seps = [self.sep, self.sep2]
128
+ if self.system_message:
129
+ ret = system_prompt
130
+ else:
131
+ ret = '[INST] '
132
+ for i, (role, message) in enumerate(self.messages):
133
+ tag = self.roles[i % 2]
134
+ if message:
135
+ if i == 0:
136
+ ret += message + ' '
137
+ else:
138
+ ret += tag + ' ' + message + seps[i % 2]
139
+ else:
140
+ ret += tag
141
+ return ret
142
+ elif self.sep_style == SeparatorStyle.CHATGLM:
143
+ # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308
144
+ # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926
145
+ round_add_n = 1 if self.name == 'chatglm2' else 0
146
+ if system_prompt:
147
+ ret = system_prompt + self.sep
148
+ else:
149
+ ret = ''
150
+
151
+ for i, (role, message) in enumerate(self.messages):
152
+ if i % 2 == 0:
153
+ ret += f'[Round {i//2 + round_add_n}]{self.sep}'
154
+
155
+ if message:
156
+ ret += f'{role}:{message}{self.sep}'
157
+ else:
158
+ ret += f'{role}:'
159
+ return ret
160
+ elif self.sep_style == SeparatorStyle.CHATML:
161
+ ret = '' if system_prompt == '' else system_prompt + self.sep + '\n'
162
+ for role, message in self.messages:
163
+ if message:
164
+ ret += role + '\n' + message + self.sep + '\n'
165
+ else:
166
+ ret += role + '\n'
167
+ return ret
168
+ elif self.sep_style == SeparatorStyle.CHATGLM3:
169
+ ret = ''
170
+ if self.system_message:
171
+ ret += system_prompt
172
+ for role, message in self.messages:
173
+ if message:
174
+ ret += role + '\n' + ' ' + message
175
+ else:
176
+ ret += role
177
+ return ret
178
+ elif self.sep_style == SeparatorStyle.CHATINTERN:
179
+ # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771
180
+ seps = [self.sep, self.sep2]
181
+ ret = system_prompt
182
+ for i, (role, message) in enumerate(self.messages):
183
+ # if i % 2 == 0:
184
+ # ret += "<s>"
185
+ if message:
186
+ ret += role + ':' + message + seps[i % 2] + '\n'
187
+ else:
188
+ ret += role + ':'
189
+ return ret
190
+ elif self.sep_style == SeparatorStyle.DOLLY:
191
+ seps = [self.sep, self.sep2]
192
+ ret = system_prompt
193
+ for i, (role, message) in enumerate(self.messages):
194
+ if message:
195
+ ret += role + ':\n' + message + seps[i % 2]
196
+ if i % 2 == 1:
197
+ ret += '\n\n'
198
+ else:
199
+ ret += role + ':\n'
200
+ return ret
201
+ elif self.sep_style == SeparatorStyle.PHOENIX:
202
+ ret = system_prompt
203
+ for role, message in self.messages:
204
+ if message:
205
+ ret += role + ': ' + '<s>' + message + '</s>'
206
+ else:
207
+ ret += role + ': ' + '<s>'
208
+ return ret
209
+ elif self.sep_style == SeparatorStyle.ROBIN:
210
+ ret = system_prompt + self.sep
211
+ for role, message in self.messages:
212
+ if message:
213
+ ret += role + ':\n' + message + self.sep
214
+ else:
215
+ ret += role + ':\n'
216
+ return ret
217
+ elif self.sep_style == SeparatorStyle.FALCON_CHAT:
218
+ ret = ''
219
+ if self.system_message:
220
+ ret += system_prompt + self.sep
221
+ for role, message in self.messages:
222
+ if message:
223
+ ret += role + ': ' + message + self.sep
224
+ else:
225
+ ret += role + ':'
226
+
227
+ return ret
228
+ elif self.sep_style == SeparatorStyle.MPT:
229
+ if self.system_message == '':
230
+ ret = ''
231
+ else:
232
+ ret = system_prompt + self.sep
233
+ for role, message in self.messages:
234
+ if message:
235
+ if type(message) is tuple:
236
+ message, _, _ = message
237
+ ret += role + message + self.sep
238
+ else:
239
+ ret += role
240
+ return ret
241
+ else:
242
+ raise ValueError(f'Invalid style: {self.sep_style}')
243
+
244
+ def set_system_message(self, system_message: str):
245
+ """Set the system message."""
246
+ self.system_message = system_message
247
+
248
+ def append_message(self, role: str, message: str):
249
+ """Append a new message."""
250
+ self.messages.append([role, message])
251
+
252
+ def update_last_message(self, message: str):
253
+ """Update the last output.
254
+
255
+ The last message is typically set to be None when constructing the prompt,
256
+ so we need to update it in-place after getting the response from a model.
257
+ """
258
+ self.messages[-1][1] = message
259
+
260
+ def to_gradio_chatbot(self):
261
+ """Convert the conversation to gradio chatbot format."""
262
+ ret = []
263
+ for i, (role, msg) in enumerate(self.messages[self.offset :]):
264
+ if i % 2 == 0:
265
+ ret.append([msg, None])
266
+ else:
267
+ ret[-1][-1] = msg
268
+ return ret
269
+
270
+ def to_openai_api_messages(self):
271
+ """Convert the conversation to OpenAI chat completion format."""
272
+ ret = [{'role': 'system', 'content': self.system_message}]
273
+
274
+ for i, (_, msg) in enumerate(self.messages[self.offset :]):
275
+ if i % 2 == 0:
276
+ ret.append({'role': 'user', 'content': msg})
277
+ else:
278
+ if msg is not None:
279
+ ret.append({'role': 'assistant', 'content': msg})
280
+ return ret
281
+
282
+ def copy(self):
283
+ return Conversation(
284
+ name=self.name,
285
+ system_template=self.system_template,
286
+ system_message=self.system_message,
287
+ roles=self.roles,
288
+ messages=[[x, y] for x, y in self.messages],
289
+ offset=self.offset,
290
+ sep_style=self.sep_style,
291
+ sep=self.sep,
292
+ sep2=self.sep2,
293
+ stop_str=self.stop_str,
294
+ stop_token_ids=self.stop_token_ids,
295
+ )
296
+
297
+ def dict(self):
298
+ return {
299
+ 'template_name': self.name,
300
+ 'system_message': self.system_message,
301
+ 'roles': self.roles,
302
+ 'messages': self.messages,
303
+ 'offset': self.offset,
304
+ }
305
+
306
+
307
+ # A global registry for all conversation templates
308
+ conv_templates: Dict[str, Conversation] = {}
309
+
310
+
311
+ def register_conv_template(template: Conversation, override: bool = False):
312
+ """Register a new conversation template."""
313
+ if not override:
314
+ assert (
315
+ template.name not in conv_templates
316
+ ), f'{template.name} has been registered.'
317
+
318
+ conv_templates[template.name] = template
319
+
320
+
321
+ def get_conv_template(name: str) -> Conversation:
322
+ """Get a conversation template."""
323
+ return conv_templates[name].copy()
324
+
325
+
326
+ register_conv_template(
327
+ Conversation(
328
+ name='interactiveomni_template',
329
+ system_template='<|im_start|>system\n{system_message}',
330
+ system_message='You are a highly advanced multimodal conversational AI designed for human-like interaction. You can perceive auditory, visual, speech, and textual inputs, and generate text and speech.',
331
+ roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
332
+ sep_style=SeparatorStyle.MPT,
333
+ sep='<|im_end|>\n',
334
+ stop_token_ids=[
335
+ 2,
336
+ 92543,
337
+ 92542
338
+ ]
339
+ )
340
+ )
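A hedged usage sketch for the template registered above (not part of the uploaded file; the import path `conversation` assumes the module is loaded straight from the repository root):

```python
from conversation import get_conv_template

conv = get_conv_template('interactiveomni_template')
conv.append_message(conv.roles[0], 'Please transcribe the attached audio.')
conv.append_message(conv.roles[1], None)   # placeholder, filled in after generation
prompt = conv.get_prompt()                 # MPT-style: system prompt + '<|im_end|>\n' separators
print(prompt)
conv.update_last_message('Here is the transcription ...')
```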
generation_config.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "transformers_version": "4.51.3"
4
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5b2da752eea0e481167b8203c4b792c8cd7b5f4dfe44490a577b8ed5db6ee15
3
+ size 4990472920
model-00002-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fb6caa54bb12b742ba39f1d44963057aa2cdc177206f39ccabb4a61a5922d27
3
+ size 4999848424
model-00003-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:849eeeb4f6b5233a4d4749eabacd79375f3ac4340c0057fdc85d93af65e4c45d
3
+ size 4983071360
model-00004-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10149f10dbd934bc38e316409cd12432aeb21061e35bbc754c8d70c387c2d6ee
3
+ size 4999999724
model-00005-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c57621a543541dc6e0fd8aa9f7bfcae153ddfd549a570435f106467d37654b0
3
+ size 129569282
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_flow.py ADDED
@@ -0,0 +1,2318 @@
1
+ # --------------------------------------------------------
2
+ # SenseTime
3
+ # Copyright (c) 2025 SenseTime
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ from transformers.modeling_utils import PreTrainedModel
7
+ from typing import Dict, Tuple, Optional, Union, Any
8
+ from torch import nn
9
+ from torch.nn import functional as F
10
+ import torch
11
+ import copy
12
+ from omegaconf import DictConfig
13
+ import threading
14
+ import math
15
+ from abc import ABC
16
+
17
+ from diffusers.models.activations import get_activation
18
+ from einops import pack, rearrange, repeat
19
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
20
+ from diffusers.models.attention import (
21
+ GEGLU,
22
+ GELU,
23
+ AdaLayerNorm,
24
+ AdaLayerNormZero,
25
+ ApproximateGELU,
26
+ )
27
+ from diffusers.models.attention_processor import Attention
28
+ from diffusers.models.lora import LoRACompatibleLinear
29
+
30
+ from .configuration_flow import FlowConfig
31
+
32
+ def subsequent_chunk_mask(
33
+ size: int,
34
+ chunk_size: int,
35
+ num_left_chunks: int = -1,
36
+ device: torch.device = torch.device("cpu"),
37
+ ) -> torch.Tensor:
38
+ """Create mask for subsequent steps (size, size) with chunk size,
39
+ this is for streaming encoder
40
+
41
+ Args:
42
+ size (int): size of mask
43
+ chunk_size (int): size of chunk
44
+ num_left_chunks (int): number of left chunks
45
+ <0: use full chunk
46
+ >=0: use num_left_chunks
47
+ device (torch.device): "cpu" or "cuda" or torch.Tensor.device
48
+
49
+ Returns:
50
+ torch.Tensor: mask
51
+
52
+ Examples:
53
+ >>> subsequent_chunk_mask(4, 2)
54
+ [[1, 1, 0, 0],
55
+ [1, 1, 0, 0],
56
+ [1, 1, 1, 1],
57
+ [1, 1, 1, 1]]
58
+ """
59
+ # NOTE this modified implementation meets onnx export requirements, but it doesn't support num_left_chunks
60
+ # actually this is not needed after we have inference cache implemented, will remove it later
61
+ pos_idx = torch.arange(size, device=device)
62
+ block_value = (torch.div(pos_idx, chunk_size, rounding_mode='trunc') + 1) * chunk_size
63
+ ret = pos_idx.unsqueeze(0) < block_value.unsqueeze(1)
64
+ return ret
65
+
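# Illustrative note (not part of the uploaded module): the docstring example above
# renders as follows; each query position (row) may attend to every position up to
# the end of its own chunk (columns), which is what makes streaming inference possible.
# >>> subsequent_chunk_mask(4, 2).int()
# tensor([[1, 1, 0, 0],
#         [1, 1, 0, 0],
#         [1, 1, 1, 1],
#         [1, 1, 1, 1]])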
66
+ def add_optional_chunk_mask(xs: torch.Tensor,
67
+ masks: torch.Tensor,
68
+ use_dynamic_chunk: bool,
69
+ use_dynamic_left_chunk: bool,
70
+ decoding_chunk_size: int,
71
+ static_chunk_size: int,
72
+ num_decoding_left_chunks: int,
73
+ enable_full_context: bool = True):
74
+ """ Apply optional mask for encoder.
75
+
76
+ Args:
77
+ xs (torch.Tensor): padded input, (B, L, D), L for max length
78
+ mask (torch.Tensor): mask for xs, (B, 1, L)
79
+ use_dynamic_chunk (bool): whether to use dynamic chunk or not
80
+ use_dynamic_left_chunk (bool): whether to use dynamic left chunk for
81
+ training.
82
+ decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's
83
+ 0: default for training, use random dynamic chunk.
84
+ <0: for decoding, use full chunk.
85
+ >0: for decoding, use fixed chunk size as set.
86
+ static_chunk_size (int): chunk size for static chunk training/decoding
87
+ if it's greater than 0, if use_dynamic_chunk is true,
88
+ this parameter will be ignored
89
+ num_decoding_left_chunks: number of left chunks, this is for decoding,
90
+ the chunk size is decoding_chunk_size.
91
+ >=0: use num_decoding_left_chunks
92
+ <0: use all left chunks
93
+ enable_full_context (bool):
94
+ True: chunk size is either [1, 25] or full context(max_len)
95
+ False: chunk size ~ U[1, 25]
96
+
97
+ Returns:
98
+ torch.Tensor: chunk mask of the input xs.
99
+ """
100
+ # Whether to use chunk mask or not
101
+ if use_dynamic_chunk:
102
+ max_len = xs.size(1)
103
+ if decoding_chunk_size < 0:
104
+ chunk_size = max_len
105
+ num_left_chunks = -1
106
+ elif decoding_chunk_size > 0:
107
+ chunk_size = decoding_chunk_size
108
+ num_left_chunks = num_decoding_left_chunks
109
+ else:
110
+ # chunk size is either [1, 25] or full context(max_len).
111
+ # Since we use 4 times subsampling and allow up to 1s(100 frames)
112
+ # delay, the maximum frame is 100 / 4 = 25.
113
+ chunk_size = torch.randint(1, max_len, (1, )).item()
114
+ num_left_chunks = -1
115
+ if chunk_size > max_len // 2 and enable_full_context:
116
+ chunk_size = max_len
117
+ else:
118
+ chunk_size = chunk_size % 25 + 1
119
+ if use_dynamic_left_chunk:
120
+ max_left_chunks = (max_len - 1) // chunk_size
121
+ num_left_chunks = torch.randint(0, max_left_chunks,
122
+ (1, )).item()
123
+ chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size,
124
+ num_left_chunks,
125
+ xs.device) # (L, L)
126
+ chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L)
127
+ chunk_masks = masks & chunk_masks # (B, L, L)
128
+ elif static_chunk_size > 0:
129
+ num_left_chunks = num_decoding_left_chunks
130
+ chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size,
131
+ num_left_chunks,
132
+ xs.device) # (L, L)
133
+ chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L)
134
+ chunk_masks = masks & chunk_masks # (B, L, L)
135
+ else:
136
+ chunk_masks = masks
137
+ return chunk_masks
138
+
139
+ def mask_to_bias(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
140
+ assert mask.dtype == torch.bool
141
+ assert dtype in [torch.float32, torch.bfloat16, torch.float16]
142
+ mask = mask.to(dtype)
143
+ # attention mask bias
144
+ # NOTE(Mddct): torch.finfo jit issues
145
+ # chunk_masks = (1.0 - chunk_masks) * torch.finfo(dtype).min
146
+ mask = (1.0 - mask) * torch.finfo(dtype).min
147
+ return mask
148
+
149
+ def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
150
+ """Make mask tensor containing indices of padded part.
151
+
152
+ See description of make_non_pad_mask.
153
+
154
+ Args:
155
+ lengths (torch.Tensor): Batch of lengths (B,).
156
+ Returns:
157
+ torch.Tensor: Mask tensor containing indices of padded part.
158
+
159
+ Examples:
160
+ >>> lengths = [5, 3, 2]
161
+ >>> make_pad_mask(lengths)
162
+ masks = [[0, 0, 0, 0 ,0],
163
+ [0, 0, 0, 1, 1],
164
+ [0, 0, 1, 1, 1]]
165
+ """
166
+ batch_size = lengths.size(0)
167
+ max_len = max_len if max_len > 0 else lengths.max().item()
168
+ seq_range = torch.arange(0,
169
+ max_len,
170
+ dtype=torch.int64,
171
+ device=lengths.device)
172
+ seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
173
+ seq_length_expand = lengths.unsqueeze(-1)
174
+ mask = seq_range_expand >= seq_length_expand
175
+ return mask
176
+
177
+ class Swish(torch.nn.Module):
178
+ """Construct an Swish object."""
179
+
180
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
181
+ """Return Swish activation function."""
182
+ return x * torch.sigmoid(x)
183
+
184
+ class BASECFM(torch.nn.Module, ABC):
185
+ def __init__(
186
+ self,
187
+ n_feats,
188
+ cfm_params,
189
+ n_spks=1,
190
+ spk_emb_dim=128,
191
+ ):
192
+ super().__init__()
193
+ self.n_feats = n_feats
194
+ self.n_spks = n_spks
195
+ self.spk_emb_dim = spk_emb_dim
196
+ self.solver = cfm_params.solver
197
+ if hasattr(cfm_params, "sigma_min"):
198
+ self.sigma_min = cfm_params.sigma_min
199
+ else:
200
+ self.sigma_min = 1e-4
201
+
202
+ self.estimator = None
203
+
204
+ @torch.inference_mode()
205
+ def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None):
206
+ """Forward diffusion
207
+
208
+ Args:
209
+ mu (torch.Tensor): output of encoder
210
+ shape: (batch_size, n_feats, mel_timesteps)
211
+ mask (torch.Tensor): output_mask
212
+ shape: (batch_size, 1, mel_timesteps)
213
+ n_timesteps (int): number of diffusion steps
214
+ temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
215
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
216
+ shape: (batch_size, spk_emb_dim)
217
+ cond: Not used but kept for future purposes
218
+
219
+ Returns:
220
+ sample: generated mel-spectrogram
221
+ shape: (batch_size, n_feats, mel_timesteps)
222
+ """
223
+ z = torch.randn_like(mu) * temperature
224
+ t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device)
225
+ return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond)
226
+
227
+ def solve_euler(self, x, t_span, mu, mask, spks, cond):
228
+ """
229
+ Fixed euler solver for ODEs.
230
+ Args:
231
+ x (torch.Tensor): random noise
232
+ t_span (torch.Tensor): n_timesteps interpolated
233
+ shape: (n_timesteps + 1,)
234
+ mu (torch.Tensor): output of encoder
235
+ shape: (batch_size, n_feats, mel_timesteps)
236
+ mask (torch.Tensor): output_mask
237
+ shape: (batch_size, 1, mel_timesteps)
238
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
239
+ shape: (batch_size, spk_emb_dim)
240
+ cond: Not used but kept for future purposes
241
+ """
242
+ t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
243
+
244
+ # I am storing this because I can later plot it by putting a debugger here and saving it to a file
245
+ # Or in future might add like a return_all_steps flag
246
+ sol = []
247
+
248
+ for step in range(1, len(t_span)):
249
+ dphi_dt = self.estimator(x, mask, mu, t, spks, cond)
250
+
251
+ x = x + dt * dphi_dt
252
+ t = t + dt
253
+ sol.append(x)
254
+ if step < len(t_span) - 1:
255
+ dt = t_span[step + 1] - t
256
+
257
+ return sol[-1]
258
+
259
+ def compute_loss(self, x1, mask, mu, spks=None, cond=None):
260
+ """Computes diffusion loss
261
+
262
+ Args:
263
+ x1 (torch.Tensor): Target
264
+ shape: (batch_size, n_feats, mel_timesteps)
265
+ mask (torch.Tensor): target mask
266
+ shape: (batch_size, 1, mel_timesteps)
267
+ mu (torch.Tensor): output of encoder
268
+ shape: (batch_size, n_feats, mel_timesteps)
269
+ spks (torch.Tensor, optional): speaker embedding. Defaults to None.
270
+ shape: (batch_size, spk_emb_dim)
271
+
272
+ Returns:
273
+ loss: conditional flow matching loss
274
+ y: conditional flow
275
+ shape: (batch_size, n_feats, mel_timesteps)
276
+ """
277
+ b, _, t = mu.shape
278
+
279
+ # random timestep
280
+ t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
281
+ # sample noise p(x_0)
282
+ z = torch.randn_like(x1)
283
+
284
+ y = (1 - (1 - self.sigma_min) * t) * z + t * x1
285
+ u = x1 - (1 - self.sigma_min) * z
286
+
287
+ loss = F.mse_loss(self.estimator(y, mask, mu, t.squeeze(), spks), u, reduction="sum") / (
288
+ torch.sum(mask) * u.shape[1]
289
+ )
290
+ return loss, y
291
+
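# Worked note (illustrative, not part of the uploaded module): compute_loss builds
#   y(t) = (1 - (1 - sigma_min) * t) * z + t * x1
# whose time derivative is
#   dy/dt = x1 - (1 - sigma_min) * z = u,
# so the estimator is regressed onto the constant velocity of the straight path from
# noise z (t = 0) to (almost) the data x1 (t = 1), which is exactly the vector field
# that solve_euler integrates at inference time.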
292
+ class Transpose(torch.nn.Module):
293
+ def __init__(self, dim0: int, dim1: int):
294
+ super().__init__()
295
+ self.dim0 = dim0
296
+ self.dim1 = dim1
297
+
298
+ def forward(self, x: torch.Tensor):
299
+ x = torch.transpose(x, self.dim0, self.dim1)
300
+ return x
301
+
302
+
303
+ class Block1D(torch.nn.Module):
304
+ def __init__(self, dim, dim_out, groups=8):
305
+ super().__init__()
306
+ self.block = torch.nn.Sequential(
307
+ torch.nn.Conv1d(dim, dim_out, 3, padding=1),
308
+ torch.nn.GroupNorm(groups, dim_out),
309
+ nn.Mish(),
310
+ )
311
+
312
+ def forward(self, x, mask):
313
+ output = self.block(x * mask)
314
+ return output * mask
315
+
316
+ class CausalBlock1D(Block1D):
317
+ def __init__(self, dim: int, dim_out: int):
318
+ super(CausalBlock1D, self).__init__(dim, dim_out)
319
+ self.block = torch.nn.Sequential(
320
+ CausalConv1d(dim, dim_out, 3),
321
+ Transpose(1, 2),
322
+ nn.LayerNorm(dim_out),
323
+ Transpose(1, 2),
324
+ nn.Mish(),
325
+ )
326
+
327
+ def forward(self, x: torch.Tensor, mask: torch.Tensor):
328
+ output = self.block(x * mask)
329
+ return output * mask
330
+
331
+ class ResnetBlock1D(torch.nn.Module):
332
+ def __init__(self, dim, dim_out, time_emb_dim, groups=8):
333
+ super().__init__()
334
+ self.mlp = torch.nn.Sequential(nn.Mish(), torch.nn.Linear(time_emb_dim, dim_out))
335
+
336
+ self.block1 = Block1D(dim, dim_out, groups=groups)
337
+ self.block2 = Block1D(dim_out, dim_out, groups=groups)
338
+
339
+ self.res_conv = torch.nn.Conv1d(dim, dim_out, 1)
340
+
341
+ def forward(self, x, mask, time_emb):
342
+ h = self.block1(x, mask)
343
+ h += self.mlp(time_emb).unsqueeze(-1)
344
+ h = self.block2(h, mask)
345
+ output = h + self.res_conv(x * mask)
346
+ return output
347
+
348
+
349
+ class CausalResnetBlock1D(ResnetBlock1D):
350
+ def __init__(self, dim: int, dim_out: int, time_emb_dim: int, groups: int = 8):
351
+ super(CausalResnetBlock1D, self).__init__(dim, dim_out, time_emb_dim, groups)
352
+ self.block1 = CausalBlock1D(dim, dim_out)
353
+ self.block2 = CausalBlock1D(dim_out, dim_out)
354
+
355
+
356
+ class CausalConv1d(torch.nn.Conv1d):
357
+ def __init__(
358
+ self,
359
+ in_channels: int,
360
+ out_channels: int,
361
+ kernel_size: int,
362
+ stride: int = 1,
363
+ dilation: int = 1,
364
+ groups: int = 1,
365
+ bias: bool = True,
366
+ padding_mode: str = 'zeros',
367
+ device=None,
368
+ dtype=None
369
+ ) -> None:
370
+ super(CausalConv1d, self).__init__(in_channels, out_channels,
371
+ kernel_size, stride,
372
+ padding=0, dilation=dilation,
373
+ groups=groups, bias=bias,
374
+ padding_mode=padding_mode,
375
+ device=device, dtype=dtype)
376
+ assert stride == 1
377
+ self.causal_padding = (kernel_size - 1, 0)
378
+
379
+ def forward(self, x: torch.Tensor):
380
+ x = F.pad(x, self.causal_padding)
381
+ x = super(CausalConv1d, self).forward(x)
382
+ return x
383
+
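# Illustrative check (not part of the uploaded module): the left-only padding of
# (kernel_size - 1) frames keeps the sequence length unchanged while making every
# output frame depend only on current and past inputs.
# >>> conv = CausalConv1d(80, 80, kernel_size=3)
# >>> conv(torch.randn(1, 80, 100)).shape
# torch.Size([1, 80, 100])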
384
+ class ResnetBlock1D(torch.nn.Module):
385
+ def __init__(self, dim, dim_out, time_emb_dim, groups=8):
386
+ super().__init__()
387
+ self.mlp = torch.nn.Sequential(nn.Mish(), torch.nn.Linear(time_emb_dim, dim_out))
388
+
389
+ self.block1 = Block1D(dim, dim_out, groups=groups)
390
+ self.block2 = Block1D(dim_out, dim_out, groups=groups)
391
+
392
+ self.res_conv = torch.nn.Conv1d(dim, dim_out, 1)
393
+
394
+ def forward(self, x, mask, time_emb):
395
+ h = self.block1(x, mask)
396
+ h += self.mlp(time_emb).unsqueeze(-1)
397
+ h = self.block2(h, mask)
398
+ output = h + self.res_conv(x * mask)
399
+ return output
400
+
401
+ class SinusoidalPosEmb(torch.nn.Module):
402
+ def __init__(self, dim):
403
+ super().__init__()
404
+ self.dim = dim
405
+ assert self.dim % 2 == 0, "SinusoidalPosEmb requires dim to be even"
406
+
407
+ def forward(self, x, scale=1000):
408
+ if x.ndim < 1:
409
+ x = x.unsqueeze(0)
410
+ device = x.device
411
+ half_dim = self.dim // 2
412
+ emb = math.log(10000) / (half_dim - 1)
413
+ emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb)
414
+ emb = scale * x.unsqueeze(1) * emb.unsqueeze(0)
415
+ emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
416
+ return emb
417
+
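# Illustrative check (not part of the uploaded module): a batch of scalar timesteps
# is lifted to a (batch, dim) embedding, sin features in the first half and cos
# features in the second half.
# >>> emb = SinusoidalPosEmb(320)
# >>> emb(torch.tensor([0.1, 0.9])).shape
# torch.Size([2, 320])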
418
+ class SnakeBeta(nn.Module):
419
+ """
420
+ A modified Snake function which uses separate parameters for the magnitude of the periodic components
421
+ Shape:
422
+ - Input: (B, C, T)
423
+ - Output: (B, C, T), same shape as the input
424
+ Parameters:
425
+ - alpha - trainable parameter that controls frequency
426
+ - beta - trainable parameter that controls magnitude
427
+ References:
428
+ - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
429
+ https://arxiv.org/abs/2006.08195
430
+ Examples:
431
+ >>> a1 = snakebeta(256)
432
+ >>> x = torch.randn(256)
433
+ >>> x = a1(x)
434
+ """
435
+
436
+ def __init__(self, in_features, out_features, alpha=1.0, alpha_trainable=True, alpha_logscale=True):
437
+ """
438
+ Initialization.
439
+ INPUT:
440
+ - in_features: shape of the input
441
+ - alpha - trainable parameter that controls frequency
442
+ - beta - trainable parameter that controls magnitude
443
+ alpha is initialized to 1 by default, higher values = higher-frequency.
444
+ beta is initialized to 1 by default, higher values = higher-magnitude.
445
+ alpha will be trained along with the rest of your model.
446
+ """
447
+ super().__init__()
448
+ self.in_features = out_features if isinstance(out_features, list) else [out_features]
449
+ self.proj = LoRACompatibleLinear(in_features, out_features)
450
+
451
+ # initialize alpha
452
+ self.alpha_logscale = alpha_logscale
453
+ if self.alpha_logscale: # log scale alphas initialized to zeros
454
+ self.alpha = nn.Parameter(torch.zeros(self.in_features) * alpha)
455
+ self.beta = nn.Parameter(torch.zeros(self.in_features) * alpha)
456
+ else: # linear scale alphas initialized to ones
457
+ self.alpha = nn.Parameter(torch.ones(self.in_features) * alpha)
458
+ self.beta = nn.Parameter(torch.ones(self.in_features) * alpha)
459
+
460
+ self.alpha.requires_grad = alpha_trainable
461
+ self.beta.requires_grad = alpha_trainable
462
+
463
+ self.no_div_by_zero = 0.000000001
464
+
465
+ def forward(self, x):
466
+ """
467
+ Forward pass of the function.
468
+ Applies the function to the input elementwise.
469
+ SnakeBeta(x) := x + (1 / beta) * sin^2(alpha * x)
470
+ """
471
+ x = self.proj(x)
472
+ if self.alpha_logscale:
473
+ alpha = torch.exp(self.alpha)
474
+ beta = torch.exp(self.beta)
475
+ else:
476
+ alpha = self.alpha
477
+ beta = self.beta
478
+
479
+ x = x + (1.0 / (beta + self.no_div_by_zero)) * torch.pow(torch.sin(x * alpha), 2)
480
+
481
+ return x
482
+
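# Illustrative check (not part of the uploaded module): this variant first projects
# in_features -> out_features and then applies x + (1/beta) * sin^2(alpha * x)
# channel-wise, so only the last dimension changes.
# >>> act = SnakeBeta(256, 1024)
# >>> act(torch.randn(2, 50, 256)).shape
# torch.Size([2, 50, 1024])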
483
+ class FeedForward(nn.Module):
484
+ r"""
485
+ A feed-forward layer.
486
+
487
+ Parameters:
488
+ dim (`int`): The number of channels in the input.
489
+ dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
490
+ mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
491
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
492
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
493
+ final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
494
+ """
495
+
496
+ def __init__(
497
+ self,
498
+ dim: int,
499
+ dim_out: Optional[int] = None,
500
+ mult: int = 4,
501
+ dropout: float = 0.0,
502
+ activation_fn: str = "geglu",
503
+ final_dropout: bool = False,
504
+ ):
505
+ super().__init__()
506
+ inner_dim = int(dim * mult)
507
+ dim_out = dim_out if dim_out is not None else dim
508
+
509
+ if activation_fn == "gelu":
510
+ act_fn = GELU(dim, inner_dim)
511
+ if activation_fn == "gelu-approximate":
512
+ act_fn = GELU(dim, inner_dim, approximate="tanh")
513
+ elif activation_fn == "geglu":
514
+ act_fn = GEGLU(dim, inner_dim)
515
+ elif activation_fn == "geglu-approximate":
516
+ act_fn = ApproximateGELU(dim, inner_dim)
517
+ elif activation_fn == "snakebeta":
518
+ act_fn = SnakeBeta(dim, inner_dim)
519
+
520
+ self.net = nn.ModuleList([])
521
+ # project in
522
+ self.net.append(act_fn)
523
+ # project dropout
524
+ self.net.append(nn.Dropout(dropout))
525
+ # project out
526
+ self.net.append(LoRACompatibleLinear(inner_dim, dim_out))
527
+ # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
528
+ if final_dropout:
529
+ self.net.append(nn.Dropout(dropout))
530
+
531
+ def forward(self, hidden_states):
532
+ for module in self.net:
533
+ hidden_states = module(hidden_states)
534
+ return hidden_states
535
+
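# Illustrative check (not part of the uploaded module): with activation_fn="snakebeta"
# the block is SnakeBeta(dim -> 4 * dim) -> Dropout -> Linear(4 * dim -> dim),
# applied position-wise, so the hidden shape is preserved.
# >>> ff = FeedForward(256, activation_fn="snakebeta")
# >>> ff(torch.randn(2, 50, 256)).shape
# torch.Size([2, 50, 256])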
536
+ @maybe_allow_in_graph
537
+ class BasicTransformerBlock(nn.Module):
538
+ r"""
539
+ A basic Transformer block.
540
+
541
+ Parameters:
542
+ dim (`int`): The number of channels in the input and output.
543
+ num_attention_heads (`int`): The number of heads to use for multi-head attention.
544
+ attention_head_dim (`int`): The number of channels in each head.
545
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
546
+ cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
547
+ only_cross_attention (`bool`, *optional*):
548
+ Whether to use only cross-attention layers. In this case two cross attention layers are used.
549
+ double_self_attention (`bool`, *optional*):
550
+ Whether to use two self-attention layers. In this case no cross attention layers are used.
551
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
552
+ num_embeds_ada_norm (:
553
+ obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
554
+ attention_bias (:
555
+ obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
556
+ """
557
+
558
+ def __init__(
559
+ self,
560
+ dim: int,
561
+ num_attention_heads: int,
562
+ attention_head_dim: int,
563
+ dropout=0.0,
564
+ cross_attention_dim: Optional[int] = None,
565
+ activation_fn: str = "geglu",
566
+ num_embeds_ada_norm: Optional[int] = None,
567
+ attention_bias: bool = False,
568
+ only_cross_attention: bool = False,
569
+ double_self_attention: bool = False,
570
+ upcast_attention: bool = False,
571
+ norm_elementwise_affine: bool = True,
572
+ norm_type: str = "layer_norm",
573
+ final_dropout: bool = False,
574
+ ):
575
+ super().__init__()
576
+ self.only_cross_attention = only_cross_attention
577
+
578
+ self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
579
+ self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
580
+
581
+ if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
582
+ raise ValueError(
583
+ f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
584
+ f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
585
+ )
586
+
587
+ # Define 3 blocks. Each block has its own normalization layer.
588
+ # 1. Self-Attn
589
+ if self.use_ada_layer_norm:
590
+ self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
591
+ elif self.use_ada_layer_norm_zero:
592
+ self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
593
+ else:
594
+ self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
595
+ self.attn1 = Attention(
596
+ query_dim=dim,
597
+ heads=num_attention_heads,
598
+ dim_head=attention_head_dim,
599
+ dropout=dropout,
600
+ bias=attention_bias,
601
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
602
+ upcast_attention=upcast_attention,
603
+ )
604
+
605
+ # 2. Cross-Attn
606
+ if cross_attention_dim is not None or double_self_attention:
607
+ # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
608
+ # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
609
+ # the second cross attention block.
610
+ self.norm2 = (
611
+ AdaLayerNorm(dim, num_embeds_ada_norm)
612
+ if self.use_ada_layer_norm
613
+ else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
614
+ )
615
+ self.attn2 = Attention(
616
+ query_dim=dim,
617
+ cross_attention_dim=cross_attention_dim if not double_self_attention else None,
618
+ heads=num_attention_heads,
619
+ dim_head=attention_head_dim,
620
+ dropout=dropout,
621
+ bias=attention_bias,
622
+ upcast_attention=upcast_attention,
623
+ # scale_qk=False, # uncomment this to not to use flash attention
624
+ ) # is self-attn if encoder_hidden_states is none
625
+ else:
626
+ self.norm2 = None
627
+ self.attn2 = None
628
+
629
+ # 3. Feed-forward
630
+ self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
631
+ self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)
632
+
633
+ # let chunk size default to None
634
+ self._chunk_size = None
635
+ self._chunk_dim = 0
636
+
637
+ def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int):
638
+ # Sets chunk feed-forward
639
+ self._chunk_size = chunk_size
640
+ self._chunk_dim = dim
641
+
642
+ def forward(
643
+ self,
644
+ hidden_states: torch.FloatTensor,
645
+ attention_mask: Optional[torch.FloatTensor] = None,
646
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
647
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
648
+ timestep: Optional[torch.LongTensor] = None,
649
+ cross_attention_kwargs: Dict[str, Any] = None,
650
+ class_labels: Optional[torch.LongTensor] = None,
651
+ ):
652
+ # Notice that normalization is always applied before the real computation in the following blocks.
653
+ # 1. Self-Attention
654
+ if self.use_ada_layer_norm:
655
+ norm_hidden_states = self.norm1(hidden_states, timestep)
656
+ elif self.use_ada_layer_norm_zero:
657
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
658
+ hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
659
+ )
660
+ else:
661
+ norm_hidden_states = self.norm1(hidden_states)
662
+
663
+ cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
664
+
665
+ attn_output = self.attn1(
666
+ norm_hidden_states,
667
+ encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
668
+ attention_mask=encoder_attention_mask if self.only_cross_attention else attention_mask,
669
+ **cross_attention_kwargs,
670
+ )
671
+ if self.use_ada_layer_norm_zero:
672
+ attn_output = gate_msa.unsqueeze(1) * attn_output
673
+ hidden_states = attn_output + hidden_states
674
+
675
+ # 2. Cross-Attention
676
+ if self.attn2 is not None:
677
+ norm_hidden_states = (
678
+ self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
679
+ )
680
+
681
+ attn_output = self.attn2(
682
+ norm_hidden_states,
683
+ encoder_hidden_states=encoder_hidden_states,
684
+ attention_mask=encoder_attention_mask,
685
+ **cross_attention_kwargs,
686
+ )
687
+ hidden_states = attn_output + hidden_states
688
+
689
+ # 3. Feed-forward
690
+ norm_hidden_states = self.norm3(hidden_states)
691
+
692
+ if self.use_ada_layer_norm_zero:
693
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
694
+
695
+ if self._chunk_size is not None:
696
+ # "feed_forward_chunk_size" can be used to save memory
697
+ if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0:
698
+ raise ValueError(
699
+ f"`hidden_states` dimension to be chunked: {norm_hidden_states.shape[self._chunk_dim]} has to be divisible by chunk size: {self._chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
700
+ )
701
+
702
+ num_chunks = norm_hidden_states.shape[self._chunk_dim] // self._chunk_size
703
+ ff_output = torch.cat(
704
+ [self.ff(hid_slice) for hid_slice in norm_hidden_states.chunk(num_chunks, dim=self._chunk_dim)],
705
+ dim=self._chunk_dim,
706
+ )
707
+ else:
708
+ ff_output = self.ff(norm_hidden_states)
709
+
710
+ if self.use_ada_layer_norm_zero:
711
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
712
+
713
+ hidden_states = ff_output + hidden_states
714
+
715
+ return hidden_states
716
+
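# Illustrative check (not part of the uploaded module): with no cross_attention_dim
# this is a plain pre-norm self-attention block and the hidden shape is preserved.
# >>> blk = BasicTransformerBlock(dim=256, num_attention_heads=4,
# ...                             attention_head_dim=64, activation_fn="snakebeta")
# >>> blk(torch.randn(2, 50, 256)).shape
# torch.Size([2, 50, 256])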
717
+ class Downsample1D(nn.Module):
718
+ def __init__(self, dim):
719
+ super().__init__()
720
+ self.conv = torch.nn.Conv1d(dim, dim, 3, 2, 1)
721
+
722
+ def forward(self, x):
723
+ return self.conv(x)
724
+
725
+
726
+ class TimestepEmbedding(nn.Module):
727
+ def __init__(
728
+ self,
729
+ in_channels: int,
730
+ time_embed_dim: int,
731
+ act_fn: str = "silu",
732
+ out_dim: int = None,
733
+ post_act_fn: Optional[str] = None,
734
+ cond_proj_dim=None,
735
+ ):
736
+ super().__init__()
737
+
738
+ self.linear_1 = nn.Linear(in_channels, time_embed_dim)
739
+
740
+ if cond_proj_dim is not None:
741
+ self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False)
742
+ else:
743
+ self.cond_proj = None
744
+
745
+ self.act = get_activation(act_fn)
746
+
747
+ if out_dim is not None:
748
+ time_embed_dim_out = out_dim
749
+ else:
750
+ time_embed_dim_out = time_embed_dim
751
+ self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out)
752
+
753
+ if post_act_fn is None:
754
+ self.post_act = None
755
+ else:
756
+ self.post_act = get_activation(post_act_fn)
757
+
758
+ def forward(self, sample, condition=None):
759
+ if condition is not None:
760
+ sample = sample + self.cond_proj(condition)
761
+ sample = self.linear_1(sample)
762
+
763
+ if self.act is not None:
764
+ sample = self.act(sample)
765
+
766
+ sample = self.linear_2(sample)
767
+
768
+ if self.post_act is not None:
769
+ sample = self.post_act(sample)
770
+ return sample
771
+
772
+ class ConditionalDecoder(nn.Module):
773
+ def __init__(
774
+ self,
775
+ in_channels,
776
+ out_channels,
777
+ causal=False,
778
+ channels=(256, 256),
779
+ dropout=0.05,
780
+ attention_head_dim=64,
781
+ n_blocks=1,
782
+ num_mid_blocks=2,
783
+ num_heads=4,
784
+ act_fn="snake",
785
+ ):
786
+ """
787
+ This decoder requires an input with the same shape as the target. So, if your text content
789
+ is shorter or longer than the outputs, please re-sample it before feeding it to the decoder.
789
+ """
790
+ super().__init__()
791
+ channels = tuple(channels)
792
+ self.in_channels = in_channels
793
+ self.out_channels = out_channels
794
+ self.causal = causal
795
+ self.time_embeddings = SinusoidalPosEmb(in_channels)
796
+ time_embed_dim = channels[0] * 4
797
+ self.time_mlp = TimestepEmbedding(
798
+ in_channels=in_channels,
799
+ time_embed_dim=time_embed_dim,
800
+ act_fn="silu",
801
+ )
802
+ self.down_blocks = nn.ModuleList([])
803
+ self.mid_blocks = nn.ModuleList([])
804
+ self.up_blocks = nn.ModuleList([])
805
+
806
+ output_channel = in_channels
807
+ for i in range(len(channels)): # pylint: disable=consider-using-enumerate
808
+ input_channel = output_channel
809
+ output_channel = channels[i]
810
+ is_last = i == len(channels) - 1
811
+ resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal else \
812
+ ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
813
+ transformer_blocks = nn.ModuleList(
814
+ [
815
+ BasicTransformerBlock(
816
+ dim=output_channel,
817
+ num_attention_heads=num_heads,
818
+ attention_head_dim=attention_head_dim,
819
+ dropout=dropout,
820
+ activation_fn=act_fn,
821
+ )
822
+ for _ in range(n_blocks)
823
+ ]
824
+ )
825
+ downsample = (
826
+ Downsample1D(output_channel) if not is_last else
827
+ CausalConv1d(output_channel, output_channel, 3) if self.causal else nn.Conv1d(output_channel, output_channel, 3, padding=1)
828
+ )
829
+ self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample]))
830
+
831
+ for _ in range(num_mid_blocks):
832
+ input_channel = channels[-1]
833
+ out_channels = channels[-1]
834
+ resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal else \
835
+ ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
836
+
837
+ transformer_blocks = nn.ModuleList(
838
+ [
839
+ BasicTransformerBlock(
840
+ dim=output_channel,
841
+ num_attention_heads=num_heads,
842
+ attention_head_dim=attention_head_dim,
843
+ dropout=dropout,
844
+ activation_fn=act_fn,
845
+ )
846
+ for _ in range(n_blocks)
847
+ ]
848
+ )
849
+
850
+ self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks]))
851
+
852
+ channels = channels[::-1] + (channels[0],)
853
+ for i in range(len(channels) - 1):
854
+ input_channel = channels[i] * 2
855
+ output_channel = channels[i + 1]
856
+ is_last = i == len(channels) - 2
857
+ resnet = CausalResnetBlock1D(
858
+ dim=input_channel,
859
+ dim_out=output_channel,
860
+ time_emb_dim=time_embed_dim,
861
+ ) if self.causal else ResnetBlock1D(
862
+ dim=input_channel,
863
+ dim_out=output_channel,
864
+ time_emb_dim=time_embed_dim,
865
+ )
866
+ transformer_blocks = nn.ModuleList(
867
+ [
868
+ BasicTransformerBlock(
869
+ dim=output_channel,
870
+ num_attention_heads=num_heads,
871
+ attention_head_dim=attention_head_dim,
872
+ dropout=dropout,
873
+ activation_fn=act_fn,
874
+ )
875
+ for _ in range(n_blocks)
876
+ ]
877
+ )
878
+ upsample = (
879
+ Upsample1D(output_channel, use_conv_transpose=True)
880
+ if not is_last
881
+ else CausalConv1d(output_channel, output_channel, 3) if self.causal else nn.Conv1d(output_channel, output_channel, 3, padding=1)
882
+ )
883
+ self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))
884
+ self.final_block = CausalBlock1D(channels[-1], channels[-1]) if self.causal else Block1D(channels[-1], channels[-1])
885
+ self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)
886
+ self.initialize_weights()
887
+
888
+ def initialize_weights(self):
889
+ for m in self.modules():
890
+ if isinstance(m, nn.Conv1d):
891
+ nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
892
+ if m.bias is not None:
893
+ nn.init.constant_(m.bias, 0)
894
+ elif isinstance(m, nn.GroupNorm):
895
+ nn.init.constant_(m.weight, 1)
896
+ nn.init.constant_(m.bias, 0)
897
+ elif isinstance(m, nn.Linear):
898
+ nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
899
+ if m.bias is not None:
900
+ nn.init.constant_(m.bias, 0)
901
+
902
+ def forward(self, x, mask, mu, t, spks=None, cond=None):
903
+ """Forward pass of the UNet1DConditional model.
904
+
905
+ Args:
906
+ x (torch.Tensor): shape (batch_size, in_channels, time)
907
+ mask (torch.Tensor): shape (batch_size, 1, time)
+ mu (torch.Tensor): output of encoder, shape (batch_size, n_feats, time)
908
+ t (torch.Tensor): shape (batch_size,)
909
+ spks (torch.Tensor, optional): shape (batch_size, condition_channels). Defaults to None.
910
+ cond (torch.Tensor, optional): placeholder for future use. Defaults to None.
911
+
912
+ Raises:
913
+ ValueError: _description_
914
+ ValueError: _description_
915
+
916
+ Returns:
917
+ torch.Tensor: output tensor, shape (batch_size, out_channels, time)
918
+ """
919
+
920
+ t = self.time_embeddings(t).to(t.dtype)
921
+ t = self.time_mlp(t)
922
+
923
+ x = pack([x, mu], "b * t")[0]
924
+
925
+ if spks is not None:
926
+ spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
927
+ x = pack([x, spks], "b * t")[0]
928
+ if cond is not None:
929
+ x = pack([x, cond], "b * t")[0]
930
+
931
+ hiddens = []
932
+ masks = [mask]
933
+ for resnet, transformer_blocks, downsample in self.down_blocks:
934
+ mask_down = masks[-1]
935
+ x = resnet(x, mask_down, t)
936
+ x = rearrange(x, "b c t -> b t c").contiguous()
937
+ # attn_mask = torch.matmul(mask_down.transpose(1, 2).contiguous(), mask_down)
938
+ attn_mask = add_optional_chunk_mask(x, mask_down.bool(), False, False, 0, self.static_chunk_size, -1)
939
+ attn_mask = mask_to_bias(attn_mask == 1, x.dtype)
940
+ for transformer_block in transformer_blocks:
941
+ x = transformer_block(
942
+ hidden_states=x,
943
+ attention_mask=attn_mask,
944
+ timestep=t,
945
+ )
946
+ x = rearrange(x, "b t c -> b c t").contiguous()
947
+ hiddens.append(x) # Save hidden states for skip connections
948
+ x = downsample(x * mask_down)
949
+ masks.append(mask_down[:, :, ::2])
950
+ masks = masks[:-1]
951
+ mask_mid = masks[-1]
952
+
953
+ for resnet, transformer_blocks in self.mid_blocks:
954
+ x = resnet(x, mask_mid, t)
955
+ x = rearrange(x, "b c t -> b t c").contiguous()
956
+ # attn_mask = torch.matmul(mask_mid.transpose(1, 2).contiguous(), mask_mid)
957
+ attn_mask = add_optional_chunk_mask(x, mask_mid.bool(), False, False, 0, self.static_chunk_size, -1)
958
+ attn_mask = mask_to_bias(attn_mask == 1, x.dtype)
959
+ for transformer_block in transformer_blocks:
960
+ x = transformer_block(
961
+ hidden_states=x,
962
+ attention_mask=attn_mask,
963
+ timestep=t,
964
+ )
965
+ x = rearrange(x, "b t c -> b c t").contiguous()
966
+
967
+ for resnet, transformer_blocks, upsample in self.up_blocks:
968
+ mask_up = masks.pop()
969
+ skip = hiddens.pop()
970
+ x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0]
971
+ x = resnet(x, mask_up, t)
972
+ x = rearrange(x, "b c t -> b t c").contiguous()
973
+ # attn_mask = torch.matmul(mask_up.transpose(1, 2).contiguous(), mask_up)
974
+ attn_mask = add_optional_chunk_mask(x, mask_up.bool(), False, False, 0, self.static_chunk_size, -1)
975
+ attn_mask = mask_to_bias(attn_mask == 1, x.dtype)
976
+ for transformer_block in transformer_blocks:
977
+ x = transformer_block(
978
+ hidden_states=x,
979
+ attention_mask=attn_mask,
980
+ timestep=t,
981
+ )
982
+ x = rearrange(x, "b t c -> b c t").contiguous()
983
+ x = upsample(x * mask_up)
984
+ x = self.final_block(x, mask_up)
985
+ output = self.final_proj(x * mask_up)
986
+ return output * mask
987
+
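In `ConditionalDecoder.forward`, `pack([x, mu], "b * t")` is simply a channel-wise concatenation: every tensor shares the batch and time axes and is stacked along the starred axis. A small sketch of how the conditioning tensors end up stacked; the shapes are illustrative, not taken from any real config:

```python
import torch
from einops import pack, repeat

x    = torch.randn(1, 80, 120)   # noisy mel, (batch, n_feats, time)
mu   = torch.randn(1, 80, 120)   # encoder output at the mel frame rate
spks = torch.randn(1, 80)        # projected speaker embedding
cond = torch.randn(1, 80, 120)   # prompt-mel condition

h, _ = pack([x, mu], "b * t")                                    # (1, 160, 120)
h, _ = pack([h, repeat(spks, "b c -> b c t", t=120)], "b * t")   # (1, 240, 120)
h, _ = pack([h, cond], "b * t")                                  # (1, 320, 120)
print(h.shape)   # with these illustrative shapes, the first down block sees 320 channels
```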
988
+ class ConditionalCFM(BASECFM):
989
+ def __init__(self, in_channels=240, cfm_params=None, n_spks=1, spk_emb_dim=64, estimator_config=None):
990
+ super().__init__(
991
+ n_feats=in_channels,
992
+ cfm_params=cfm_params,
993
+ n_spks=n_spks,
994
+ spk_emb_dim=spk_emb_dim,
995
+ )
996
+ self.t_scheduler = cfm_params.t_scheduler
997
+ self.training_cfg_rate = cfm_params.training_cfg_rate
998
+ self.inference_cfg_rate = cfm_params.inference_cfg_rate
999
+ in_channels = in_channels + (spk_emb_dim if n_spks > 0 else 0)
1000
+ # Just change the architecture of the estimator here
1001
+ self.estimator = ConditionalDecoder(**estimator_config)
1002
+ self.lock = threading.Lock()
1003
+
1004
+ @torch.inference_mode()
1005
+ def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None, prompt_len=0, flow_cache=torch.zeros(1, 80, 0, 2)):
1006
+ """Forward diffusion
1007
+
1008
+ Args:
1009
+ mu (torch.Tensor): output of encoder
1010
+ shape: (batch_size, n_feats, mel_timesteps)
1011
+ mask (torch.Tensor): output_mask
1012
+ shape: (batch_size, 1, mel_timesteps)
1013
+ n_timesteps (int): number of diffusion steps
1014
+ temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
1015
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
1016
+ shape: (batch_size, spk_emb_dim)
1017
+ cond: Not used but kept for future purposes
1018
+
1019
+ Returns:
1020
+ sample: generated mel-spectrogram
1021
+ shape: (batch_size, n_feats, mel_timesteps)
1022
+ """
1023
+
1024
+ z = torch.randn_like(mu).to(mu.device).to(mu.dtype) * temperature
1025
+ cache_size = flow_cache.shape[2]
1026
+ # fix prompt and overlap part mu and z
1027
+ if cache_size != 0:
1028
+ z[:, :, :cache_size] = flow_cache[:, :, :, 0]
1029
+ mu[:, :, :cache_size] = flow_cache[:, :, :, 1]
1030
+ z_cache = torch.concat([z[:, :, :prompt_len], z[:, :, -34:]], dim=2)
1031
+ mu_cache = torch.concat([mu[:, :, :prompt_len], mu[:, :, -34:]], dim=2)
1032
+ flow_cache = torch.stack([z_cache, mu_cache], dim=-1)
1033
+
1034
+ t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
1035
+ if self.t_scheduler == 'cosine':
1036
+ t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
1037
+ return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond), flow_cache
1038
+
1039
+ def solve_euler(self, x, t_span, mu, mask, spks, cond):
1040
+ """
1041
+ Fixed-step Euler solver for ODEs.
1042
+ Args:
1043
+ x (torch.Tensor): random noise
1044
+ t_span (torch.Tensor): n_timesteps interpolated
1045
+ shape: (n_timesteps + 1,)
1046
+ mu (torch.Tensor): output of encoder
1047
+ shape: (batch_size, n_feats, mel_timesteps)
1048
+ mask (torch.Tensor): output_mask
1049
+ shape: (batch_size, 1, mel_timesteps)
1050
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
1051
+ shape: (batch_size, spk_emb_dim)
1052
+ cond: Not used but kept for future purposes
1053
+ """
1054
+ t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
1055
+ t = t.unsqueeze(dim=0)
1056
+
1057
+ # I am storing this because I can later plot it by putting a debugger here and saving it to a file
1058
+ # Or a return_all_steps flag could be added in the future
1059
+ sol = []
1060
+
1061
+ # Do not use concat here: it may change the memory format and cause TensorRT inference to produce wrong results!
1062
+ x_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
1063
+ mask_in = torch.zeros([2, 1, x.size(2)], device=x.device, dtype=x.dtype)
1064
+ mu_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
1065
+ t_in = torch.zeros([2], device=x.device, dtype=x.dtype)
1066
+ spks_in = torch.zeros([2, 80], device=x.device, dtype=x.dtype)
1067
+ cond_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
1068
+ for step in range(1, len(t_span)):
1069
+ # Classifier-Free Guidance inference introduced in VoiceBox
1070
+ x_in[:] = x
1071
+ mask_in[:] = mask
1072
+ mu_in[0] = mu
1073
+ t_in[:] = t.unsqueeze(0)
1074
+ spks_in[0] = spks
1075
+ cond_in[0] = cond
1076
+ dphi_dt = self.forward_estimator(
1077
+ x_in, mask_in,
1078
+ mu_in, t_in,
1079
+ spks_in,
1080
+ cond_in
1081
+ )
1082
+ dphi_dt, cfg_dphi_dt = torch.split(dphi_dt, [x.size(0), x.size(0)], dim=0)
1083
+ dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - self.inference_cfg_rate * cfg_dphi_dt)
1084
+ x = x + dt * dphi_dt
1085
+ t = t + dt
1086
+ sol.append(x)
1087
+ if step < len(t_span) - 1:
1088
+ dt = t_span[step + 1] - t
1089
+
1090
+ return sol[-1].float()
1091
+
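`solve_euler` is a fixed-step Euler integration of the learned vector field, with classifier-free guidance formed from a conditional and an unconditional estimate computed in one batched call. A stripped-down sketch of the same update rule on a toy vector field; the `estimator` closure below is hypothetical and only illustrates the arithmetic:

```python
import torch

def estimator(x, t, cond):
    # Toy vector field standing in for the ConditionalDecoder call.
    return (cond - x) if cond is not None else -x

def euler_cfg(x, t_span, cond, cfg_rate=0.7):
    t = t_span[0]
    for step in range(1, len(t_span)):
        dt = t_span[step] - t
        v_cond   = estimator(x, t, cond)    # conditional branch
        v_uncond = estimator(x, t, None)    # unconditional branch
        v = (1.0 + cfg_rate) * v_cond - cfg_rate * v_uncond
        x = x + dt * v                      # Euler step
        t = t_span[step]
    return x

x0 = torch.randn(1, 80, 120)
t_span = torch.linspace(0, 1, 11)           # 10 steps, e.g. n_timesteps=10
out = euler_cfg(x0, t_span, cond=torch.zeros_like(x0))
print(out.shape)                            # torch.Size([1, 80, 120])
```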
1092
+ def forward_estimator(self, x, mask, mu, t, spks, cond):
1093
+ if isinstance(self.estimator, torch.nn.Module):
1094
+ return self.estimator.forward(x, mask, mu, t, spks, cond)
1095
+ else:
1096
+ with self.lock:
1097
+ self.estimator.set_input_shape('x', (2, 80, x.size(2)))
1098
+ self.estimator.set_input_shape('mask', (2, 1, x.size(2)))
1099
+ self.estimator.set_input_shape('mu', (2, 80, x.size(2)))
1100
+ self.estimator.set_input_shape('t', (2,))
1101
+ self.estimator.set_input_shape('spks', (2, 80))
1102
+ self.estimator.set_input_shape('cond', (2, 80, x.size(2)))
1103
+ # run trt engine
1104
+ self.estimator.execute_v2([x.contiguous().data_ptr(),
1105
+ mask.contiguous().data_ptr(),
1106
+ mu.contiguous().data_ptr(),
1107
+ t.contiguous().data_ptr(),
1108
+ spks.contiguous().data_ptr(),
1109
+ cond.contiguous().data_ptr(),
1110
+ x.data_ptr()])
1111
+ return x
1112
+
1113
+ def compute_loss(self, x1, mask, mu, spks=None, cond=None):
1114
+ """Computes diffusion loss
1115
+
1116
+ Args:
1117
+ x1 (torch.Tensor): Target
1118
+ shape: (batch_size, n_feats, mel_timesteps)
1119
+ mask (torch.Tensor): target mask
1120
+ shape: (batch_size, 1, mel_timesteps)
1121
+ mu (torch.Tensor): output of encoder
1122
+ shape: (batch_size, n_feats, mel_timesteps)
1123
+ spks (torch.Tensor, optional): speaker embedding. Defaults to None.
1124
+ shape: (batch_size, spk_emb_dim)
1125
+
1126
+ Returns:
1127
+ loss: conditional flow matching loss
1128
+ y: conditional flow
1129
+ shape: (batch_size, n_feats, mel_timesteps)
1130
+ """
1131
+ b, _, t = mu.shape
1132
+
1133
+ # random timestep
1134
+ t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
1135
+ if self.t_scheduler == 'cosine':
1136
+ t = 1 - torch.cos(t * 0.5 * torch.pi)
1137
+ # sample noise p(x_0)
1138
+ z = torch.randn_like(x1)
1139
+
1140
+ y = (1 - (1 - self.sigma_min) * t) * z + t * x1
1141
+ u = x1 - (1 - self.sigma_min) * z
1142
+
1143
+ # during training, we randomly drop condition to trade off mode coverage and sample fidelity
1144
+ if self.training_cfg_rate > 0:
1145
+ cfg_mask = torch.rand(b, device=x1.device) > self.training_cfg_rate
1146
+ mu = mu * cfg_mask.view(-1, 1, 1)
1147
+ spks = spks * cfg_mask.view(-1, 1)
1148
+ cond = cond * cfg_mask.view(-1, 1, 1)
1149
+
1150
+ pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond)
1151
+ loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1])
1152
+ return loss, y
1153
+
1154
+
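`compute_loss` regresses the estimator onto the optimal-transport flow-matching target: the point on the path is y_t = (1 - (1 - sigma_min) * t) * z + t * x1 and the target velocity is u = x1 - (1 - sigma_min) * z, which is exactly dy_t/dt. A small numerical check of that identity; sigma_min here is an assumed value, the real one comes from cfm_params:

```python
import torch

sigma_min = 1e-4                      # assumed value; the real one comes from cfm_params
x1 = torch.randn(2, 80, 100)          # target mel
z  = torch.randn_like(x1)             # noise sample x_0
t  = torch.rand(2, 1, 1)

y = (1 - (1 - sigma_min) * t) * z + t * x1   # point on the probability path
u = x1 - (1 - sigma_min) * z                 # regression target: the path's velocity

# u equals d(y)/dt for every t, so a finite difference recovers it.
eps = 1e-2
y_eps = (1 - (1 - sigma_min) * (t + eps)) * z + (t + eps) * x1
assert torch.allclose((y_eps - y) / eps, u, atol=1e-3)
```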
1155
+ class CausalConditionalCFM(ConditionalCFM):
1156
+ def __init__(self, in_channels=240, cfm_params=None, n_spks=1, spk_emb_dim=64, estimator_config=None):
1157
+ super().__init__(in_channels, cfm_params, n_spks, spk_emb_dim, estimator_config)
1158
+ self.rand_noise = torch.randn([1, 80, 50 * 300])
1159
+
1160
+ @torch.inference_mode()
1161
+ def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None):
1162
+ """Forward diffusion
1163
+
1164
+ Args:
1165
+ mu (torch.Tensor): output of encoder
1166
+ shape: (batch_size, n_feats, mel_timesteps)
1167
+ mask (torch.Tensor): output_mask
1168
+ shape: (batch_size, 1, mel_timesteps)
1169
+ n_timesteps (int): number of diffusion steps
1170
+ temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
1171
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
1172
+ shape: (batch_size, spk_emb_dim)
1173
+ cond: Not used but kept for future purposes
1174
+
1175
+ Returns:
1176
+ sample: generated mel-spectrogram
1177
+ shape: (batch_size, n_feats, mel_timesteps)
1178
+ """
1179
+
1180
+ z = self.rand_noise[:, :, :mu.size(2)].to(mu.device).to(mu.dtype) * temperature
1181
+ # fix prompt and overlap part mu and z
1182
+ t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
1183
+ if self.t_scheduler == 'cosine':
1184
+ t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
1185
+ return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond), None
1186
+
1187
+ class PositionwiseFeedForward(torch.nn.Module):
1188
+ """Positionwise feed forward layer.
1189
+
1190
+ The feed-forward layer is applied to each position of the sequence.
1191
+ The output dimension is the same as the input dimension.
1192
+
1193
+ Args:
1194
+ idim (int): Input dimension.
1195
+ hidden_units (int): The number of hidden units.
1196
+ dropout_rate (float): Dropout rate.
1197
+ activation (torch.nn.Module): Activation function
1198
+ """
1199
+
1200
+ def __init__(
1201
+ self,
1202
+ idim: int,
1203
+ hidden_units: int,
1204
+ dropout_rate: float,
1205
+ activation: torch.nn.Module = torch.nn.ReLU(),
1206
+ ):
1207
+ """Construct a PositionwiseFeedForward object."""
1208
+ super(PositionwiseFeedForward, self).__init__()
1209
+ self.w_1 = torch.nn.Linear(idim, hidden_units)
1210
+ self.activation = activation
1211
+ self.dropout = torch.nn.Dropout(dropout_rate)
1212
+ self.w_2 = torch.nn.Linear(hidden_units, idim)
1213
+
1214
+ def forward(self, xs: torch.Tensor) -> torch.Tensor:
1215
+ """Forward function.
1216
+
1217
+ Args:
1218
+ xs: input tensor (B, L, D)
1219
+ Returns:
1220
+ output tensor, (B, L, D)
1221
+ """
1222
+ return self.w_2(self.dropout(self.activation(self.w_1(xs))))
1223
+
1224
+ class ConformerEncoderLayer(nn.Module):
1225
+ """Encoder layer module.
1226
+ Args:
1227
+ size (int): Input dimension.
1228
+ self_attn (torch.nn.Module): Self-attention module instance.
1229
+ `MultiHeadedAttention` or `RelPositionMultiHeadedAttention`
1230
+ instance can be used as the argument.
1231
+ feed_forward (torch.nn.Module): Feed-forward module instance.
1232
+ `PositionwiseFeedForward` instance can be used as the argument.
1233
+ feed_forward_macaron (torch.nn.Module): Additional feed-forward module
1234
+ instance.
1235
+ `PositionwiseFeedForward` instance can be used as the argument.
1236
+ conv_module (torch.nn.Module): Convolution module instance.
1237
+ `ConvolutionModule` instance can be used as the argument.
1238
+ dropout_rate (float): Dropout rate.
1239
+ normalize_before (bool):
1240
+ True: use layer_norm before each sub-block.
1241
+ False: use layer_norm after each sub-block.
1242
+ """
1243
+
1244
+ def __init__(
1245
+ self,
1246
+ size: int,
1247
+ self_attn: torch.nn.Module,
1248
+ feed_forward: Optional[nn.Module] = None,
1249
+ feed_forward_macaron: Optional[nn.Module] = None,
1250
+ conv_module: Optional[nn.Module] = None,
1251
+ dropout_rate: float = 0.1,
1252
+ normalize_before: bool = True,
1253
+ ):
1254
+ """Construct an EncoderLayer object."""
1255
+ super().__init__()
1256
+ self.self_attn = self_attn
1257
+ self.feed_forward = feed_forward
1258
+ self.feed_forward_macaron = feed_forward_macaron
1259
+ self.conv_module = conv_module
1260
+ self.norm_ff = nn.LayerNorm(size, eps=1e-12) # for the FNN module
1261
+ self.norm_mha = nn.LayerNorm(size, eps=1e-12) # for the MHA module
1262
+ if feed_forward_macaron is not None:
1263
+ self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-12)
1264
+ self.ff_scale = 0.5
1265
+ else:
1266
+ self.ff_scale = 1.0
1267
+ if self.conv_module is not None:
1268
+ self.norm_conv = nn.LayerNorm(size, eps=1e-12) # for the CNN module
1269
+ self.norm_final = nn.LayerNorm(
1270
+ size, eps=1e-12) # for the final output of the block
1271
+ self.dropout = nn.Dropout(dropout_rate)
1272
+ self.size = size
1273
+ self.normalize_before = normalize_before
1274
+
1275
+ def forward(
1276
+ self,
1277
+ x: torch.Tensor,
1278
+ mask: torch.Tensor,
1279
+ pos_emb: torch.Tensor,
1280
+ mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
1281
+ att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
1282
+ cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
1283
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
1284
+ """Compute encoded features.
1285
+
1286
+ Args:
1287
+ x (torch.Tensor): (#batch, time, size)
1288
+ mask (torch.Tensor): Mask tensor for the input (#batch, time, time),
1289
+ (0, 0, 0) means fake mask.
1290
+ pos_emb (torch.Tensor): positional encoding, must not be None
1291
+ for ConformerEncoderLayer.
1292
+ mask_pad (torch.Tensor): batch padding mask used for conv module.
1293
+ (#batch, 1, time), (0, 0, 0) means fake mask.
1294
+ att_cache (torch.Tensor): Cache tensor of the KEY & VALUE
1295
+ (#batch=1, head, cache_t1, d_k * 2), head * d_k == size.
1296
+ cnn_cache (torch.Tensor): Convolution cache in conformer layer
1297
+ (#batch=1, size, cache_t2)
1298
+ Returns:
1299
+ torch.Tensor: Output tensor (#batch, time, size).
1300
+ torch.Tensor: Mask tensor (#batch, time, time).
1301
+ torch.Tensor: att_cache tensor,
1302
+ (#batch=1, head, cache_t1 + time, d_k * 2).
1303
+ torch.Tensor: cnn_cache tensor (#batch, size, cache_t2).
1304
+ """
1305
+
1306
+ # whether to use macaron style
1307
+ if self.feed_forward_macaron is not None:
1308
+ residual = x
1309
+ if self.normalize_before:
1310
+ x = self.norm_ff_macaron(x)
1311
+ x = residual + self.ff_scale * self.dropout(
1312
+ self.feed_forward_macaron(x))
1313
+ if not self.normalize_before:
1314
+ x = self.norm_ff_macaron(x)
1315
+
1316
+ # multi-headed self-attention module
1317
+ residual = x
1318
+ if self.normalize_before:
1319
+ x = self.norm_mha(x)
1320
+ x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb,
1321
+ att_cache)
1322
+ x = residual + self.dropout(x_att)
1323
+ if not self.normalize_before:
1324
+ x = self.norm_mha(x)
1325
+
1326
+ # convolution module
1327
+ # Fake new cnn cache here, and then change it in conv_module
1328
+ new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)
1329
+ if self.conv_module is not None:
1330
+ residual = x
1331
+ if self.normalize_before:
1332
+ x = self.norm_conv(x)
1333
+ x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache)
1334
+ x = residual + self.dropout(x)
1335
+
1336
+ if not self.normalize_before:
1337
+ x = self.norm_conv(x)
1338
+
1339
+ # feed forward module
1340
+ residual = x
1341
+ if self.normalize_before:
1342
+ x = self.norm_ff(x)
1343
+
1344
+ x = residual + self.ff_scale * self.dropout(self.feed_forward(x))
1345
+ if not self.normalize_before:
1346
+ x = self.norm_ff(x)
1347
+
1348
+ if self.conv_module is not None:
1349
+ x = self.norm_final(x)
1350
+
1351
+ return x, mask, new_att_cache, new_cnn_cache
1352
+
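With `normalize_before=True`, every sub-block in the layer above follows the pre-norm residual pattern: normalize the input, apply the module, scale, and add back onto the untouched residual; `ff_scale=0.5` is the macaron half-step. A minimal sketch of one such sub-block with illustrative sizes:

```python
import torch
import torch.nn as nn

size, ff_scale = 256, 0.5                       # 0.5 is the macaron half-step
norm = nn.LayerNorm(size, eps=1e-12)
ff = nn.Sequential(nn.Linear(size, 1024), nn.SiLU(), nn.Linear(1024, size))
dropout = nn.Dropout(0.1)

x = torch.randn(4, 50, size)
residual = x
x = residual + ff_scale * dropout(ff(norm(x)))  # pre-norm: normalize, apply, scale, add back
```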
1353
+ class ConvolutionModule(nn.Module):
1354
+ """ConvolutionModule in Conformer model."""
1355
+
1356
+ def __init__(self,
1357
+ channels: int,
1358
+ kernel_size: int = 15,
1359
+ activation: nn.Module = nn.ReLU(),
1360
+ norm: str = "batch_norm",
1361
+ causal: bool = False,
1362
+ bias: bool = True):
1363
+ """Construct an ConvolutionModule object.
1364
+ Args:
1365
+ channels (int): The number of channels of conv layers.
1366
+ kernel_size (int): Kernel size of conv layers.
1367
+ causal (bool): Whether to use causal convolution or not.
1368
+ """
1369
+ super().__init__()
1370
+
1371
+ self.pointwise_conv1 = nn.Conv1d(
1372
+ channels,
1373
+ 2 * channels,
1374
+ kernel_size=1,
1375
+ stride=1,
1376
+ padding=0,
1377
+ bias=bias,
1378
+ )
1379
+ # self.lorder is used to distinguish if it's a causal convolution,
1380
+ # if self.lorder > 0: it's a causal convolution, the input will be
1381
+ # padded with self.lorder frames on the left in forward.
1382
+ # else: it's a symmetrical convolution
1383
+ if causal:
1384
+ padding = 0
1385
+ self.lorder = kernel_size - 1
1386
+ else:
1387
+ # kernel_size should be an odd number for non-causal convolution
1388
+ assert (kernel_size - 1) % 2 == 0
1389
+ padding = (kernel_size - 1) // 2
1390
+ self.lorder = 0
1391
+ self.depthwise_conv = nn.Conv1d(
1392
+ channels,
1393
+ channels,
1394
+ kernel_size,
1395
+ stride=1,
1396
+ padding=padding,
1397
+ groups=channels,
1398
+ bias=bias,
1399
+ )
1400
+
1401
+ assert norm in ['batch_norm', 'layer_norm']
1402
+ if norm == "batch_norm":
1403
+ self.use_layer_norm = False
1404
+ self.norm = nn.BatchNorm1d(channels)
1405
+ else:
1406
+ self.use_layer_norm = True
1407
+ self.norm = nn.LayerNorm(channels)
1408
+
1409
+ self.pointwise_conv2 = nn.Conv1d(
1410
+ channels,
1411
+ channels,
1412
+ kernel_size=1,
1413
+ stride=1,
1414
+ padding=0,
1415
+ bias=bias,
1416
+ )
1417
+ self.activation = activation
1418
+
1419
+ def forward(
1420
+ self,
1421
+ x: torch.Tensor,
1422
+ mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
1423
+ cache: torch.Tensor = torch.zeros((0, 0, 0)),
1424
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
1425
+ """Compute convolution module.
1426
+ Args:
1427
+ x (torch.Tensor): Input tensor (#batch, time, channels).
1428
+ mask_pad (torch.Tensor): used for batch padding (#batch, 1, time),
1429
+ (0, 0, 0) means fake mask.
1430
+ cache (torch.Tensor): left context cache, it is only
1431
+ used in causal convolution (#batch, channels, cache_t),
1432
+ (0, 0, 0) means fake cache.
1433
+ Returns:
1434
+ torch.Tensor: Output tensor (#batch, time, channels).
+ torch.Tensor: New convolution cache (#batch, channels, cache_t).
1435
+ """
1436
+ # exchange the temporal dimension and the feature dimension
1437
+ x = x.transpose(1, 2) # (#batch, channels, time)
1438
+
1439
+ # mask batch padding
1440
+ if mask_pad.size(2) > 0: # time > 0
1441
+ x.masked_fill_(~mask_pad, 0.0)
1442
+
1443
+ if self.lorder > 0:
1444
+ if cache.size(2) == 0: # cache_t == 0
1445
+ x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0)
1446
+ else:
1447
+ assert cache.size(0) == x.size(0) # equal batch
1448
+ assert cache.size(1) == x.size(1) # equal channel
1449
+ x = torch.cat((cache, x), dim=2)
1450
+ assert (x.size(2) > self.lorder)
1451
+ new_cache = x[:, :, -self.lorder:]
1452
+ else:
1453
+ # It's better we just return None if no cache is required,
1454
+ # However, for JIT export, here we just fake one tensor instead of
1455
+ # None.
1456
+ new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)
1457
+
1458
+ # GLU mechanism
1459
+ x = self.pointwise_conv1(x) # (batch, 2*channel, dim)
1460
+ x = nn.functional.glu(x, dim=1) # (batch, channel, dim)
1461
+
1462
+ # 1D Depthwise Conv
1463
+ x = self.depthwise_conv(x)
1464
+ if self.use_layer_norm:
1465
+ x = x.transpose(1, 2)
1466
+ x = self.activation(self.norm(x))
1467
+ if self.use_layer_norm:
1468
+ x = x.transpose(1, 2)
1469
+ x = self.pointwise_conv2(x)
1470
+ # mask batch padding
1471
+ if mask_pad.size(2) > 0: # time > 0
1472
+ x.masked_fill_(~mask_pad, 0.0)
1473
+
1474
+ return x.transpose(1, 2), new_cache
1475
+
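The `lorder` handling above is what makes the depthwise convolution streamable: left-padding a whole utterance with `kernel_size - 1` zeros gives the same output as processing it chunk by chunk while carrying the last `lorder` frames as a cache. A small check of that equivalence with a plain depthwise Conv1d and illustrative sizes:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(0)
channels, kernel_size = 8, 5
lorder = kernel_size - 1
conv = nn.Conv1d(channels, channels, kernel_size, padding=0, groups=channels)

x = torch.randn(1, channels, 20)

# Whole utterance: pad `lorder` zeros on the left.
full = conv(F.pad(x, (lorder, 0)))

# Streaming: process two chunks, carrying the last `lorder` frames as cache.
cache = torch.zeros(1, channels, lorder)
outs = []
for chunk in x.split(10, dim=2):
    inp = torch.cat([cache, chunk], dim=2)
    outs.append(conv(inp))
    cache = inp[:, :, -lorder:]

assert torch.allclose(full, torch.cat(outs, dim=2), atol=1e-6)
```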
1476
+ class Upsample1D(nn.Module):
1477
+ """A 1D upsampling layer with an optional convolution.
1478
+
1479
+ Parameters:
1480
+ channels (`int`):
1481
+ number of channels in the inputs and outputs.
1482
+ out_channels (`int`):
1483
+ number of output channels.
1484
+ stride (`int`, default `2`):
1485
+ upsampling factor; the input is repeat-interpolated by this factor
1486
+ before the left-padded convolution.
1488
+ """
1489
+
1490
+ def __init__(self, channels: int, out_channels: int, stride: int = 2):
1491
+ super().__init__()
1492
+ self.channels = channels
1493
+ self.out_channels = out_channels
1494
+ self.stride = stride
1495
+ # In this mode, first repeat-interpolate (nearest), then conv with stride=1
1496
+ self.conv = nn.Conv1d(self.channels, self.out_channels, stride * 2 + 1, stride=1, padding=0)
1497
+
1498
+ def forward(self, inputs: torch.Tensor, input_lengths: torch.Tensor):
1499
+ outputs = F.interpolate(inputs, scale_factor=float(self.stride), mode="nearest")
1500
+ outputs = F.pad(outputs, (self.stride * 2, 0), value=0.0)
1501
+ outputs = self.conv(outputs)
1502
+ return outputs, input_lengths * self.stride
1503
+
1504
+
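`Upsample1D` raises the frame rate by nearest-neighbour repetition followed by a left-padded convolution, so the output length is exactly `stride * input_length` and no future frames are consulted. A quick shape check, assuming the class is importable from this file:

```python
import torch

up = Upsample1D(channels=512, out_channels=512, stride=2)   # class defined above
x = torch.randn(1, 512, 50)                                 # (batch, channels, frames)
y, y_len = up(x, torch.tensor([50]))
print(y.shape, y_len)   # torch.Size([1, 512, 100]) tensor([100])
```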
1505
+ class PreLookaheadLayer(nn.Module):
1506
+ def __init__(self, channels: int, pre_lookahead_len: int = 1):
1507
+ super().__init__()
1508
+ self.channels = channels
1509
+ self.pre_lookahead_len = pre_lookahead_len
1510
+ self.conv1 = nn.Conv1d(
1511
+ channels, channels,
1512
+ kernel_size=pre_lookahead_len + 1,
1513
+ stride=1, padding=0,
1514
+ )
1515
+ self.conv2 = nn.Conv1d(
1516
+ channels, channels,
1517
+ kernel_size=3, stride=1, padding=0,
1518
+ )
1519
+
1520
+ def forward(self, inputs: torch.Tensor) -> torch.Tensor:
1521
+ """
1522
+ inputs: (batch_size, seq_len, channels)
1523
+ """
1524
+ outputs = inputs.transpose(1, 2).contiguous()
1525
+ # look ahead
1526
+ outputs = F.pad(outputs, (0, self.pre_lookahead_len), mode='constant', value=0.0)
1527
+ outputs = F.leaky_relu(self.conv1(outputs))
1528
+ # outputs
1529
+ outputs = F.pad(outputs, (2, 0), mode='constant', value=0.0)
1530
+ outputs = self.conv2(outputs)
1531
+ outputs = outputs.transpose(1, 2).contiguous()
1532
+
1533
+ # residual connection
1534
+ outputs = outputs + inputs
1535
+ return outputs
1536
+
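`PreLookaheadLayer` pads `pre_lookahead_len` zeros on the right before a kernel of size `pre_lookahead_len + 1`, so each output frame sees exactly that many future frames while the sequence length is preserved; the second convolution is left-padded and therefore causal. A quick shape check, assuming the class is importable from this file:

```python
import torch

layer = PreLookaheadLayer(channels=512, pre_lookahead_len=3)   # class defined above
x = torch.randn(1, 40, 512)        # (batch, seq_len, channels)
print(layer(x).shape)              # torch.Size([1, 40, 512]): same length, residual added
```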
1537
+ class BaseSubsampling(torch.nn.Module):
1538
+
1539
+ def __init__(self):
1540
+ super().__init__()
1541
+ self.right_context = 0
1542
+ self.subsampling_rate = 1
1543
+
1544
+ def position_encoding(self, offset: Union[int, torch.Tensor],
1545
+ size: int) -> torch.Tensor:
1546
+ return self.pos_enc.position_encoding(offset, size)
1547
+
1548
+ class LinearNoSubsampling(BaseSubsampling):
1549
+ """Linear transform the input without subsampling
1550
+
1551
+ Args:
1552
+ idim (int): Input dimension.
1553
+ odim (int): Output dimension.
1554
+ dropout_rate (float): Dropout rate.
1555
+
1556
+ """
1557
+
1558
+ def __init__(self, idim: int, odim: int, dropout_rate: float,
1559
+ pos_enc_class: torch.nn.Module):
1560
+ """Construct an linear object."""
1561
+ super().__init__()
1562
+ self.out = torch.nn.Sequential(
1563
+ torch.nn.Linear(idim, odim),
1564
+ torch.nn.LayerNorm(odim, eps=1e-5),
1565
+ torch.nn.Dropout(dropout_rate),
1566
+ )
1567
+ self.pos_enc = pos_enc_class
1568
+ self.right_context = 0
1569
+ self.subsampling_rate = 1
1570
+
1571
+ def forward(
1572
+ self,
1573
+ x: torch.Tensor,
1574
+ x_mask: torch.Tensor,
1575
+ offset: Union[int, torch.Tensor] = 0
1576
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
1577
+ """Input x.
1578
+
1579
+ Args:
1580
+ x (torch.Tensor): Input tensor (#batch, time, idim).
1581
+ x_mask (torch.Tensor): Input mask (#batch, 1, time).
1582
+
1583
+ Returns:
1584
+ torch.Tensor: linear input tensor (#batch, time', odim),
1585
+ where time' == time.
1586
+ torch.Tensor: linear input mask (#batch, 1, time'),
1587
+ where time' == time.
1588
+
1589
+ """
1590
+ x = self.out(x)
1591
+ x, pos_emb = self.pos_enc(x, offset)
1592
+ return x, pos_emb, x_mask
1593
+
1594
+ class EspnetRelPositionalEncoding(torch.nn.Module):
1595
+ """Relative positional encoding module (new implementation).
1596
+
1597
+ Details can be found in https://github.com/espnet/espnet/pull/2816.
1598
+
1599
+ See : Appendix B in https://arxiv.org/abs/1901.02860
1600
+
1601
+ Args:
1602
+ d_model (int): Embedding dimension.
1603
+ dropout_rate (float): Dropout rate.
1604
+ max_len (int): Maximum input length.
1605
+
1606
+ """
1607
+
1608
+ def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000):
1609
+ """Construct an PositionalEncoding object."""
1610
+ super(EspnetRelPositionalEncoding, self).__init__()
1611
+ self.d_model = d_model
1612
+ self.xscale = math.sqrt(self.d_model)
1613
+ self.dropout = torch.nn.Dropout(p=dropout_rate)
1614
+ self.pe = None
1615
+ self.extend_pe(torch.tensor(0.0).expand(1, max_len))
1616
+
1617
+ def extend_pe(self, x: torch.Tensor):
1618
+ """Reset the positional encodings."""
1619
+ if self.pe is not None:
1620
+ # self.pe contains both positive and negative parts
1621
+ # the length of self.pe is 2 * input_len - 1
1622
+ if self.pe.size(1) >= x.size(1) * 2 - 1:
1623
+ if self.pe.dtype != x.dtype or self.pe.device != x.device:
1624
+ self.pe = self.pe.to(dtype=x.dtype, device=x.device)
1625
+ return
1626
+ # Suppose `i` is the position of the query vector and `j` is the
1627
+ # position of the key vector. We use positive relative positions when keys
1628
+ # are to the left (i > j) and negative relative positions otherwise (i < j).
1629
+ pe_positive = torch.zeros(x.size(1), self.d_model)
1630
+ pe_negative = torch.zeros(x.size(1), self.d_model)
1631
+ position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
1632
+ div_term = torch.exp(
1633
+ torch.arange(0, self.d_model, 2, dtype=torch.float32)
1634
+ * -(math.log(10000.0) / self.d_model)
1635
+ )
1636
+ pe_positive[:, 0::2] = torch.sin(position * div_term)
1637
+ pe_positive[:, 1::2] = torch.cos(position * div_term)
1638
+ pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
1639
+ pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)
1640
+
1641
+ # Reverse the order of positive indices and concat both positive and
1642
+ # negative indices. This is used to support the shifting trick
1643
+ # as in https://arxiv.org/abs/1901.02860
1644
+ pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
1645
+ pe_negative = pe_negative[1:].unsqueeze(0)
1646
+ pe = torch.cat([pe_positive, pe_negative], dim=1)
1647
+ self.pe = pe.to(device=x.device, dtype=x.dtype)
1648
+
1649
+ def forward(self, x: torch.Tensor, offset: Union[int, torch.Tensor] = 0) \
1650
+ -> Tuple[torch.Tensor, torch.Tensor]:
1651
+ """Add positional encoding.
1652
+
1653
+ Args:
1654
+ x (torch.Tensor): Input tensor (batch, time, `*`).
1655
+
1656
+ Returns:
1657
+ torch.Tensor: Encoded tensor (batch, time, `*`).
1658
+
1659
+ """
1660
+ self.extend_pe(x)
1661
+ x = x * self.xscale
1662
+ pos_emb = self.position_encoding(size=x.size(1), offset=offset)
1663
+ return self.dropout(x), self.dropout(pos_emb)
1664
+
1665
+ def position_encoding(self,
1666
+ offset: Union[int, torch.Tensor],
1667
+ size: int) -> torch.Tensor:
1668
+ """ For getting encoding in a streaming fashion
1669
+
1670
+ Attention!!!!!
1671
+ we apply dropout only once at the whole utterance level in the non-
1672
+ streaming case, but this function will be called several times with
1673
+ increasing input size in a streaming scenario, so the dropout will
1674
+ be applied several times.
1675
+
1676
+ Args:
1677
+ offset (int or torch.tensor): start offset
1678
+ size (int): required size of position encoding
1679
+
1680
+ Returns:
1681
+ torch.Tensor: Corresponding encoding
1682
+ """
1683
+ pos_emb = self.pe[
1684
+ :,
1685
+ self.pe.size(1) // 2 - size + 1: self.pe.size(1) // 2 + size,
1686
+ ]
1687
+ return pos_emb
1688
+
1689
+
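`EspnetRelPositionalEncoding` keeps positive and negative relative positions in a single table of length `2 * max_len - 1`, and `position_encoding` slices a window of length `2 * size - 1` centred on relative position zero. A quick check of the emitted shapes, assuming the class is importable from this file:

```python
import torch

pe = EspnetRelPositionalEncoding(d_model=512, dropout_rate=0.0, max_len=5000)  # from above
x = torch.randn(1, 40, 512)
x_scaled, pos_emb = pe(x)
print(x_scaled.shape, pos_emb.shape)   # torch.Size([1, 40, 512]) torch.Size([1, 79, 512])
```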
1690
+ class MultiHeadedAttention(nn.Module):
1691
+ """Multi-Head Attention layer.
1692
+
1693
+ Args:
1694
+ n_head (int): The number of heads.
1695
+ n_feat (int): The number of features.
1696
+ dropout_rate (float): Dropout rate.
1697
+
1698
+ """
1699
+
1700
+ def __init__(self,
1701
+ n_head: int,
1702
+ n_feat: int,
1703
+ dropout_rate: float,
1704
+ key_bias: bool = True):
1705
+ """Construct an MultiHeadedAttention object."""
1706
+ super().__init__()
1707
+ assert n_feat % n_head == 0
1708
+ # We assume d_v always equals d_k
1709
+ self.d_k = n_feat // n_head
1710
+ self.h = n_head
1711
+ self.linear_q = nn.Linear(n_feat, n_feat)
1712
+ self.linear_k = nn.Linear(n_feat, n_feat, bias=key_bias)
1713
+ self.linear_v = nn.Linear(n_feat, n_feat)
1714
+ self.linear_out = nn.Linear(n_feat, n_feat)
1715
+ self.dropout = nn.Dropout(p=dropout_rate)
1716
+
1717
+ def forward_qkv(
1718
+ self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
1719
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
1720
+ """Transform query, key and value.
1721
+
1722
+ Args:
1723
+ query (torch.Tensor): Query tensor (#batch, time1, size).
1724
+ key (torch.Tensor): Key tensor (#batch, time2, size).
1725
+ value (torch.Tensor): Value tensor (#batch, time2, size).
1726
+
1727
+ Returns:
1728
+ torch.Tensor: Transformed query tensor, size
1729
+ (#batch, n_head, time1, d_k).
1730
+ torch.Tensor: Transformed key tensor, size
1731
+ (#batch, n_head, time2, d_k).
1732
+ torch.Tensor: Transformed value tensor, size
1733
+ (#batch, n_head, time2, d_k).
1734
+
1735
+ """
1736
+ n_batch = query.size(0)
1737
+ q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
1738
+ k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
1739
+ v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
1740
+ q = q.transpose(1, 2) # (batch, head, time1, d_k)
1741
+ k = k.transpose(1, 2) # (batch, head, time2, d_k)
1742
+ v = v.transpose(1, 2) # (batch, head, time2, d_k)
1743
+
1744
+ return q, k, v
1745
+
1746
+ def forward_attention(
1747
+ self,
1748
+ value: torch.Tensor,
1749
+ scores: torch.Tensor,
1750
+ mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool)
1751
+ ) -> torch.Tensor:
1752
+ """Compute attention context vector.
1753
+
1754
+ Args:
1755
+ value (torch.Tensor): Transformed value, size
1756
+ (#batch, n_head, time2, d_k).
1757
+ scores (torch.Tensor): Attention score, size
1758
+ (#batch, n_head, time1, time2).
1759
+ mask (torch.Tensor): Mask, size (#batch, 1, time2) or
1760
+ (#batch, time1, time2), (0, 0, 0) means fake mask.
1761
+
1762
+ Returns:
1763
+ torch.Tensor: Transformed value (#batch, time1, d_model)
1764
+ weighted by the attention score (#batch, time1, time2).
1765
+
1766
+ """
1767
+ n_batch = value.size(0)
1768
+ # NOTE(xcsong): When will `if mask.size(2) > 0` be True?
1769
+ # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the
1770
+ # 1st chunk to ease the onnx export.]
1771
+ # 2. pytorch training
1772
+ if mask.size(2) > 0: # time2 > 0
1773
+ mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2)
1774
+ # For last chunk, time2 might be larger than scores.size(-1)
1775
+ mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2)
1776
+ scores = scores.masked_fill(mask, -float('inf'))
1777
+ attn = torch.softmax(scores, dim=-1).masked_fill(
1778
+ mask, 0.0) # (batch, head, time1, time2)
1779
+ # NOTE(xcsong): When will `if mask.size(2) > 0` be False?
1780
+ # 1. onnx(16/-1, -1/-1, 16/0)
1781
+ # 2. jit (16/-1, -1/-1, 16/0, 16/4)
1782
+ else:
1783
+ attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2)
1784
+
1785
+ p_attn = self.dropout(attn)
1786
+ x = torch.matmul(p_attn, value) # (batch, head, time1, d_k)
1787
+ x = (x.transpose(1, 2).contiguous().view(n_batch, -1,
1788
+ self.h * self.d_k)
1789
+ ) # (batch, time1, d_model)
1790
+
1791
+ return self.linear_out(x) # (batch, time1, d_model)
1792
+
1793
+ def forward(
1794
+ self,
1795
+ query: torch.Tensor,
1796
+ key: torch.Tensor,
1797
+ value: torch.Tensor,
1798
+ mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
1799
+ pos_emb: torch.Tensor = torch.empty(0),
1800
+ cache: torch.Tensor = torch.zeros((0, 0, 0, 0))
1801
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
1802
+ """Compute scaled dot product attention.
1803
+
1804
+ Args:
1805
+ query (torch.Tensor): Query tensor (#batch, time1, size).
1806
+ key (torch.Tensor): Key tensor (#batch, time2, size).
1807
+ value (torch.Tensor): Value tensor (#batch, time2, size).
1808
+ mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
1809
+ (#batch, time1, time2).
1810
+ 1.When applying cross attention between decoder and encoder,
1811
+ the batch padding mask for input is in (#batch, 1, T) shape.
1812
+ 2.When applying self attention of encoder,
1813
+ the mask is in (#batch, T, T) shape.
1814
+ 3.When applying self attention of decoder,
1815
+ the mask is in (#batch, L, L) shape.
1816
+ 4.If the different position in decoder see different block
1817
+ of the encoder, such as Mocha, the passed in mask could be
1818
+ in (#batch, L, T) shape. But there is no such case in current
1819
+ CosyVoice.
1820
+ cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
1821
+ where `cache_t == chunk_size * num_decoding_left_chunks`
1822
+ and `head * d_k == size`
1823
+
1824
+
1825
+ Returns:
1826
+ torch.Tensor: Output tensor (#batch, time1, d_model).
1827
+ torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
1828
+ where `cache_t == chunk_size * num_decoding_left_chunks`
1829
+ and `head * d_k == size`
1830
+
1831
+ """
1832
+ q, k, v = self.forward_qkv(query, key, value)
1833
+
1834
+ # NOTE(xcsong):
1835
+ # when export onnx model, for 1st chunk, we feed
1836
+ # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
1837
+ # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode).
1838
+ # In all modes, `if cache.size(0) > 0` will always be `True`
1839
+ # and we will always do splitting and
1840
+ # concatenation (this will simplify onnx export). Note that
1841
+ # it's OK to concat & split zero-shaped tensors(see code below).
1842
+ # when export jit model, for 1st chunk, we always feed
1843
+ # cache(0, 0, 0, 0) since jit supports dynamic if-branch.
1844
+ # >>> a = torch.ones((1, 2, 0, 4))
1845
+ # >>> b = torch.ones((1, 2, 3, 4))
1846
+ # >>> c = torch.cat((a, b), dim=2)
1847
+ # >>> torch.equal(b, c) # True
1848
+ # >>> d = torch.split(a, 2, dim=-1)
1849
+ # >>> torch.equal(d[0], d[1]) # True
1850
+ if cache.size(0) > 0:
1851
+ key_cache, value_cache = torch.split(cache,
1852
+ cache.size(-1) // 2,
1853
+ dim=-1)
1854
+ k = torch.cat([key_cache, k], dim=2)
1855
+ v = torch.cat([value_cache, v], dim=2)
1856
+ # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
1857
+ # non-trivial to calculate `next_cache_start` here.
1858
+ new_cache = torch.cat((k, v), dim=-1)
1859
+
1860
+ scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
1861
+ return self.forward_attention(v, scores, mask), new_cache
1862
+
1863
+
1864
+ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
1865
+ """Multi-Head Attention layer with relative position encoding.
1866
+ Paper: https://arxiv.org/abs/1901.02860
1867
+ Args:
1868
+ n_head (int): The number of heads.
1869
+ n_feat (int): The number of features.
1870
+ dropout_rate (float): Dropout rate.
1871
+ """
1872
+
1873
+ def __init__(self,
1874
+ n_head: int,
1875
+ n_feat: int,
1876
+ dropout_rate: float,
1877
+ key_bias: bool = True):
1878
+ """Construct an RelPositionMultiHeadedAttention object."""
1879
+ super().__init__(n_head, n_feat, dropout_rate, key_bias)
1880
+ # linear transformation for positional encoding
1881
+ self.linear_pos = nn.Linear(n_feat, n_feat, bias=False)
1882
+ # these two learnable bias are used in matrix c and matrix d
1883
+ # as described in https://arxiv.org/abs/1901.02860 Section 3.3
1884
+ self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k))
1885
+ self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k))
1886
+ torch.nn.init.xavier_uniform_(self.pos_bias_u)
1887
+ torch.nn.init.xavier_uniform_(self.pos_bias_v)
1888
+
1889
+ def rel_shift(self, x: torch.Tensor) -> torch.Tensor:
1890
+ """Compute relative positional encoding.
1891
+
1892
+ Args:
1893
+ x (torch.Tensor): Input tensor (batch, head, time1, 2*time1-1).
1894
+ time1 means the length of query vector.
1895
+
1896
+ Returns:
1897
+ torch.Tensor: Output tensor.
1898
+
1899
+ """
1900
+ zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1),
1901
+ device=x.device,
1902
+ dtype=x.dtype)
1903
+ x_padded = torch.cat([zero_pad, x], dim=-1)
1904
+
1905
+ x_padded = x_padded.view(x.size()[0],
1906
+ x.size()[1],
1907
+ x.size(3) + 1, x.size(2))
1908
+ x = x_padded[:, :, 1:].view_as(x)[
1909
+ :, :, :, : x.size(-1) // 2 + 1
1910
+ ] # only keep the positions from 0 to time2
1911
+ return x
1912
+
1913
+ def forward(
1914
+ self,
1915
+ query: torch.Tensor,
1916
+ key: torch.Tensor,
1917
+ value: torch.Tensor,
1918
+ mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
1919
+ pos_emb: torch.Tensor = torch.empty(0),
1920
+ cache: torch.Tensor = torch.zeros((0, 0, 0, 0))
1921
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
1922
+ """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
1923
+ Args:
1924
+ query (torch.Tensor): Query tensor (#batch, time1, size).
1925
+ key (torch.Tensor): Key tensor (#batch, time2, size).
1926
+ value (torch.Tensor): Value tensor (#batch, time2, size).
1927
+ mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
1928
+ (#batch, time1, time2), (0, 0, 0) means fake mask.
1929
+ pos_emb (torch.Tensor): Positional embedding tensor
1930
+ (#batch, time2, size).
1931
+ cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
1932
+ where `cache_t == chunk_size * num_decoding_left_chunks`
1933
+ and `head * d_k == size`
1934
+ Returns:
1935
+ torch.Tensor: Output tensor (#batch, time1, d_model).
1936
+ torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
1937
+ where `cache_t == chunk_size * num_decoding_left_chunks`
1938
+ and `head * d_k == size`
1939
+ """
1940
+ q, k, v = self.forward_qkv(query, key, value)
1941
+ q = q.transpose(1, 2) # (batch, time1, head, d_k)
1942
+
1943
+ # NOTE(xcsong):
1944
+ # when export onnx model, for 1st chunk, we feed
1945
+ # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
1946
+ # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode).
1947
+ # In all modes, `if cache.size(0) > 0` will always be `True`
1948
+ # and we will always do splitting and
1949
+ # concatenation (this will simplify onnx export). Note that
1950
+ # it's OK to concat & split zero-shaped tensors(see code below).
1951
+ # when export jit model, for 1st chunk, we always feed
1952
+ # cache(0, 0, 0, 0) since jit supports dynamic if-branch.
1953
+ # >>> a = torch.ones((1, 2, 0, 4))
1954
+ # >>> b = torch.ones((1, 2, 3, 4))
1955
+ # >>> c = torch.cat((a, b), dim=2)
1956
+ # >>> torch.equal(b, c) # True
1957
+ # >>> d = torch.split(a, 2, dim=-1)
1958
+ # >>> torch.equal(d[0], d[1]) # True
1959
+ if cache.size(0) > 0:
1960
+ key_cache, value_cache = torch.split(cache,
1961
+ cache.size(-1) // 2,
1962
+ dim=-1)
1963
+ k = torch.cat([key_cache, k], dim=2)
1964
+ v = torch.cat([value_cache, v], dim=2)
1965
+ # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
1966
+ # non-trivial to calculate `next_cache_start` here.
1967
+ new_cache = torch.cat((k, v), dim=-1)
1968
+
1969
+ n_batch_pos = pos_emb.size(0)
1970
+ p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
1971
+ p = p.transpose(1, 2) # (batch, head, time1, d_k)
1972
+
1973
+ # (batch, head, time1, d_k)
1974
+ q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2)
1975
+ # (batch, head, time1, d_k)
1976
+ q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2)
1977
+
1978
+ # compute attention score
1979
+ # first compute matrix a and matrix c
1980
+ # as described in https://arxiv.org/abs/1901.02860 Section 3.3
1981
+ # (batch, head, time1, time2)
1982
+ matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1))
1983
+
1984
+ # compute matrix b and matrix d
1985
+ # (batch, head, time1, time2)
1986
+ matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1))
1987
+ # NOTE(Xiang Lyu): Keep rel_shift since espnet rel_pos_emb is used
1988
+ if matrix_ac.shape != matrix_bd.shape:
1989
+ matrix_bd = self.rel_shift(matrix_bd)
1990
+
1991
+ scores = (matrix_ac + matrix_bd) / math.sqrt(
1992
+ self.d_k) # (batch, head, time1, time2)
1993
+
1994
+ return self.forward_attention(v, scores, mask), new_cache
1995
+
1996
+ class UpsampleConformerEncoder(torch.nn.Module):
1997
+
1998
+ def __init__(
1999
+ self,
2000
+ input_size: int,
2001
+ output_size: int = 256,
2002
+ attention_heads: int = 4,
2003
+ linear_units: int = 2048,
2004
+ num_blocks: int = 6,
2005
+ dropout_rate: float = 0.1,
2006
+ positional_dropout_rate: float = 0.1,
2007
+ attention_dropout_rate: float = 0.0,
2008
+ input_layer: str = "conv2d",
2009
+ pos_enc_layer_type: str = "rel_pos",
2010
+ normalize_before: bool = True,
2011
+ static_chunk_size: int = 0,
2012
+ use_dynamic_chunk: bool = False,
2013
+ global_cmvn: torch.nn.Module = None,
2014
+ use_dynamic_left_chunk: bool = False,
2015
+ positionwise_conv_kernel_size: int = 1,
2016
+ macaron_style: bool = True,
2017
+ selfattention_layer_type: str = "rel_selfattn",
2018
+ activation_type: str = "swish",
2019
+ use_cnn_module: bool = True,
2020
+ cnn_module_kernel: int = 15,
2021
+ causal: bool = False,
2022
+ cnn_module_norm: str = "batch_norm",
2023
+ key_bias: bool = True,
2024
+ gradient_checkpointing: bool = False,
2025
+ ):
2026
+ """
2027
+ Args:
2028
+ input_size (int): input dim
2029
+ output_size (int): dimension of attention
2030
+ attention_heads (int): the number of heads of multi head attention
2031
+ linear_units (int): the number of hidden units of the position-wise feed
2032
+ forward
2033
+ num_blocks (int): the number of encoder blocks
2034
+ dropout_rate (float): dropout rate
2035
+ attention_dropout_rate (float): dropout rate in attention
2036
+ positional_dropout_rate (float): dropout rate after adding
2037
+ positional encoding
2038
+ input_layer (str): input layer type.
2039
+ optional [linear, conv2d, conv2d6, conv2d8]
2040
+ pos_enc_layer_type (str): Encoder positional encoding layer type.
2041
+ optional [abs_pos, scaled_abs_pos, rel_pos, no_pos]
2042
+ normalize_before (bool):
2043
+ True: use layer_norm before each sub-block of a layer.
2044
+ False: use layer_norm after each sub-block of a layer.
2045
+ static_chunk_size (int): chunk size for static chunk training and
2046
+ decoding
2047
+ use_dynamic_chunk (bool): whether use dynamic chunk size for
2048
+ training or not. You can only use a fixed chunk (chunk_size > 0)
2049
+ or a dynamic chunk size (use_dynamic_chunk = True)
2050
+ global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module
2051
+ use_dynamic_left_chunk (bool): whether use dynamic left chunk in
2052
+ dynamic chunk training
2053
+ key_bias: whether use bias in attention.linear_k, False for whisper models.
2054
+ gradient_checkpointing: rerunning a forward-pass segment for each
2055
+ checkpointed segment during backward.
2056
+ """
2057
+ super().__init__()
2058
+ self._output_size = output_size
2059
+
2060
+ self.global_cmvn = global_cmvn
2061
+ # self.embed = COSYVOICE_SUBSAMPLE_CLASSES[input_layer](
2062
+ self.embed = LinearNoSubsampling(
2063
+ input_size,
2064
+ output_size,
2065
+ dropout_rate,
2066
+ # COSYVOICE_EMB_CLASSES[pos_enc_layer_type](
2067
+ EspnetRelPositionalEncoding(
2068
+ output_size,
2069
+ positional_dropout_rate,
2070
+ ),
2071
+ )
2072
+
2073
+ self.normalize_before = normalize_before
2074
+ self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5)
2075
+ self.static_chunk_size = static_chunk_size
2076
+ self.use_dynamic_chunk = use_dynamic_chunk
2077
+ self.use_dynamic_left_chunk = use_dynamic_left_chunk
2078
+ self.gradient_checkpointing = gradient_checkpointing
2079
+ # COSYVOICE_ACTIVATION_CLASSES[activation_type]()
2080
+ activation = getattr(torch.nn, "SiLU", Swish)()
2081
+ # self-attention module definition
2082
+ encoder_selfattn_layer_args = (
2083
+ attention_heads,
2084
+ output_size,
2085
+ attention_dropout_rate,
2086
+ key_bias,
2087
+ )
2088
+ # feed-forward module definition
2089
+ positionwise_layer_args = (
2090
+ output_size,
2091
+ linear_units,
2092
+ dropout_rate,
2093
+ activation,
2094
+ )
2095
+ # convolution module definition
2096
+ convolution_layer_args = (output_size, cnn_module_kernel, activation,
2097
+ cnn_module_norm, causal)
2098
+ self.pre_lookahead_layer = PreLookaheadLayer(channels=512, pre_lookahead_len=3)
2099
+ self.encoders = torch.nn.ModuleList([
2100
+ ConformerEncoderLayer(
2101
+ output_size,
2102
+ # COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type](
2103
+ RelPositionMultiHeadedAttention(
2104
+ *encoder_selfattn_layer_args),
2105
+ PositionwiseFeedForward(*positionwise_layer_args),
2106
+ PositionwiseFeedForward(
2107
+ *positionwise_layer_args) if macaron_style else None,
2108
+ ConvolutionModule(
2109
+ *convolution_layer_args) if use_cnn_module else None,
2110
+ dropout_rate,
2111
+ normalize_before,
2112
+ ) for _ in range(num_blocks)
2113
+ ])
2114
+ self.up_layer = Upsample1D(channels=512, out_channels=512, stride=2)
2115
+ # self.up_embed = COSYVOICE_SUBSAMPLE_CLASSES[input_layer](
2116
+ self.up_embed = LinearNoSubsampling(
2117
+ input_size,
2118
+ output_size,
2119
+ dropout_rate,
2120
+ # COSYVOICE_EMB_CLASSES[pos_enc_layer_type](
2121
+ EspnetRelPositionalEncoding(
2122
+ output_size,
2123
+ positional_dropout_rate,
2124
+ ),
2125
+ )
2126
+ self.up_encoders = torch.nn.ModuleList([
2127
+ ConformerEncoderLayer(
2128
+ output_size,
2129
+ # COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type](
2130
+ RelPositionMultiHeadedAttention(
2131
+ *encoder_selfattn_layer_args),
2132
+ PositionwiseFeedForward(*positionwise_layer_args),
2133
+ PositionwiseFeedForward(
2134
+ *positionwise_layer_args) if macaron_style else None,
2135
+ ConvolutionModule(
2136
+ *convolution_layer_args) if use_cnn_module else None,
2137
+ dropout_rate,
2138
+ normalize_before,
2139
+ ) for _ in range(4)
2140
+ ])
2141
+
2142
+ def output_size(self) -> int:
2143
+ return self._output_size
2144
+
2145
+ def forward(
2146
+ self,
2147
+ xs: torch.Tensor,
2148
+ xs_lens: torch.Tensor,
2149
+ decoding_chunk_size: int = 0,
2150
+ num_decoding_left_chunks: int = -1,
2151
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
2152
+ """Embed positions in tensor.
2153
+
2154
+ Args:
2155
+ xs: padded input tensor (B, T, D)
2156
+ xs_lens: input length (B)
2157
+ decoding_chunk_size: decoding chunk size for dynamic chunk
2158
+ 0: default for training, use random dynamic chunk.
2159
+ <0: for decoding, use full chunk.
2160
+ >0: for decoding, use fixed chunk size as set.
2161
+ num_decoding_left_chunks: number of left chunks, this is for decoding,
2162
+ the chunk size is decoding_chunk_size.
2163
+ >=0: use num_decoding_left_chunks
2164
+ <0: use all left chunks
2165
+ Returns:
2166
+ encoder output tensor xs, and subsampled masks
2167
+ xs: padded output tensor (B, T' ~= T/subsample_rate, D)
2168
+ masks: torch.Tensor batch padding mask after subsample
2169
+ (B, 1, T' ~= T/subsample_rate)
2170
+ NOTE(xcsong):
2171
+ We pass the `__call__` method of the modules instead of `forward` to the
2172
+ checkpointing API because `__call__` attaches all the hooks of the module.
2173
+ https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
2174
+ """
2175
+ T = xs.size(1)
2176
+ masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T)
2177
+ if self.global_cmvn is not None:
2178
+ xs = self.global_cmvn(xs)
2179
+ xs, pos_emb, masks = self.embed(xs, masks)
2180
+ mask_pad = masks # (B, 1, T/subsample_rate)
2181
+ chunk_masks = add_optional_chunk_mask(xs, masks,
2182
+ self.use_dynamic_chunk,
2183
+ self.use_dynamic_left_chunk,
2184
+ decoding_chunk_size,
2185
+ self.static_chunk_size,
2186
+ num_decoding_left_chunks)
2187
+ # lookahead + conformer encoder
2188
+ xs = self.pre_lookahead_layer(xs)
2189
+ xs = self.forward_layers(xs, chunk_masks, pos_emb, mask_pad)
2190
+
2191
+ # upsample + conformer encoder
2192
+ xs = xs.transpose(1, 2).contiguous()
2193
+ xs, xs_lens = self.up_layer(xs, xs_lens)
2194
+ xs = xs.transpose(1, 2).contiguous()
2195
+ T = xs.size(1)
2196
+ masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T)
2197
+ xs, pos_emb, masks = self.up_embed(xs, masks)
2198
+ mask_pad = masks # (B, 1, T/subsample_rate)
2199
+ chunk_masks = add_optional_chunk_mask(xs, masks,
2200
+ self.use_dynamic_chunk,
2201
+ self.use_dynamic_left_chunk,
2202
+ decoding_chunk_size,
2203
+ self.static_chunk_size * self.up_layer.stride,
2204
+ num_decoding_left_chunks)
2205
+ xs = self.forward_up_layers(xs, chunk_masks, pos_emb, mask_pad)
2206
+
2207
+ if self.normalize_before:
2208
+ xs = self.after_norm(xs)
2209
+ # Here we assume the mask is not changed in encoder layers, so just
2210
+ # return the masks before encoder layers, and the masks will be used
2211
+ # for cross attention with decoder later
2212
+ return xs, masks
2213
+
2214
+ def forward_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor,
2215
+ pos_emb: torch.Tensor,
2216
+ mask_pad: torch.Tensor) -> torch.Tensor:
2217
+ for layer in self.encoders:
2218
+ xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
2219
+ return xs
2220
+
2221
+ def forward_up_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor,
2222
+ pos_emb: torch.Tensor,
2223
+ mask_pad: torch.Tensor) -> torch.Tensor:
2224
+ for layer in self.up_encoders:
2225
+ xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
2226
+ return xs
2227
+
2228
+ class CausalMaskedDiffWithXvec(PreTrainedModel):
2229
+ """
2230
+ CosyVoice 2.0 flow module
2231
+ """
2232
+ def __init__(
2233
+ self,
2234
+ config: FlowConfig,
2235
+ mel_feat_conf: Dict = {
2236
+ 'n_fft': 1024,
2237
+ 'num_mels': 80,
2238
+ 'sampling_rate': 22050,
2239
+ 'hop_size': 256,
2240
+ 'win_size': 1024,
2241
+ 'fmin': 0,
2242
+ 'fmax': 8000,
2243
+ },
2244
+ ):
2245
+ super().__init__(config)
2246
+ self.input_size = config.input_size
2247
+ self.output_size = config.output_size
2248
+ self.decoder_conf = config.decoder_config
2249
+ self.mel_feat_conf = mel_feat_conf
2250
+ self.vocab_size = config.vocab_size # kept consistent with the speech tokenizer (6561)
2251
+ self.output_type = config.output_type
2252
+ self.input_frame_rate = config.input_frame_rate
2253
+ self.input_embedding = nn.Embedding(config.vocab_size, config.input_size)
2254
+ self.spk_embed_affine_layer = torch.nn.Linear(config.spk_embed_dim, config.output_size)
2255
+ self.encoder = UpsampleConformerEncoder(**config.encoder_config)
2256
+ self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), config.output_size)
2257
+
2258
+ decoder_config = copy.deepcopy(config.decoder_config)
2259
+ decoder_config['cfm_params'] = DictConfig(decoder_config['cfm_params'])
2260
+ self.decoder = CausalConditionalCFM(**decoder_config)
2261
+
2262
+ self.only_mask_loss = config.only_mask_loss
2263
+ self.token_mel_ratio = config.token_mel_ratio
2264
+ self.pre_lookahead_len = config.pre_lookahead_len
2265
+
2266
+ @torch.inference_mode()
2267
+ def inference(
2268
+ self,
2269
+ token,
2270
+ token_len,
2271
+ prompt_token,
2272
+ prompt_token_len,
2273
+ prompt_feat,
2274
+ prompt_feat_len,
2275
+ embedding,
2276
+ finalize,
2277
+ ):
2278
+ # if self.fp16 is True:
2279
+ # prompt_feat = prompt_feat.half()
2280
+ # embedding = embedding.half()
2281
+ # process
2282
+
2283
+ embedding = embedding.to(self.spk_embed_affine_layer.weight.data.dtype) # noqa, TODO
2284
+ prompt_feat = prompt_feat.to(self.spk_embed_affine_layer.weight.data.dtype) # noqa, TODO
2285
+
2286
+ assert token.shape[0] == 1
2287
+ # xvec projection
2288
+ embedding = F.normalize(embedding, dim=1)
2289
+ embedding = self.spk_embed_affine_layer(embedding)
2290
+
2291
+ # concat text and prompt_text
2292
+ token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len # concatenate the prompt tokens with the tokens to be generated
2293
+ mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding)
2294
+ token = self.input_embedding(torch.clamp(token, min=0)) * mask
2295
+
2296
+ # text encode
2297
+ h, h_lengths = self.encoder(token, token_len)
2298
+ if finalize is False:
2299
+ h = h[:, :-self.pre_lookahead_len * self.token_mel_ratio]
2300
+ mel_len1, mel_len2 = prompt_feat.shape[1], h.shape[1] - prompt_feat.shape[1]
2301
+ h = self.encoder_proj(h)
2302
+
2303
+ # get conditions
2304
+ conds = torch.zeros([1, mel_len1 + mel_len2, self.output_size], device=token.device).to(h.dtype)
2305
+ conds[:, :mel_len1] = prompt_feat # mel features of the prompt audio serve as the condition
2306
+ conds = conds.transpose(1, 2)
2307
+
2308
+ mask = (~make_pad_mask(torch.tensor([mel_len1 + mel_len2]))).to(h)
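+ # flow-matching decoder: synthesize mel frames from the encoder output (mu), conditioned on the speaker embedding and the prompt mel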
2309
+ feat, _ = self.decoder(
2310
+ mu=h.transpose(1, 2).contiguous(),
2311
+ mask=mask.unsqueeze(1),
2312
+ spks=embedding,
2313
+ cond=conds,
2314
+ n_timesteps=10
2315
+ )
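+ # drop the prompt region of the generated mel; keep only the newly generated frames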
2316
+ feat = feat[:, :, mel_len1:]
2317
+ assert feat.shape[2] == mel_len2
2318
+ return feat.float(), None
modeling_hifigan.py ADDED
@@ -0,0 +1,479 @@
1
+ # --------------------------------------------------------
2
+ # SenseTime
3
+ # Copyright (c) 2025 SenseTime
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ from typing import Dict, Optional, List
7
+ import numpy as np
8
+ from scipy.signal import get_window
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from torch.nn import ConvTranspose1d, Conv1d, Parameter
12
+ from torch.nn.utils import remove_weight_norm
13
+ from torch.nn.utils.parametrizations import weight_norm
14
+ from torch.distributions.uniform import Uniform
15
+ from torch import nn, sin, pow
16
+ from transformers.modeling_utils import PreTrainedModel
17
+
18
+ from .configuration_hifigan import HiFiGanConfig
19
+
20
+ def get_padding(kernel_size, dilation=1):
21
+ return int((kernel_size * dilation - dilation) / 2)
22
+
23
+ def init_weights(m, mean=0.0, std=0.01):
24
+ classname = m.__class__.__name__
25
+ if classname.find("Conv") != -1:
26
+ m.weight.data.normal_(mean, std)
27
+ return
28
+
29
+ # Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
30
+ # LICENSE is in incl_licenses directory.
31
+ class Snake(nn.Module):
32
+ '''
33
+ Implementation of a sine-based periodic activation function
34
+ Shape:
35
+ - Input: (B, C, T)
36
+ - Output: (B, C, T), same shape as the input
37
+ Parameters:
38
+ - alpha - trainable parameter
39
+ References:
40
+ - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
41
+ https://arxiv.org/abs/2006.08195
42
+ Examples:
43
+ >>> a1 = Snake(256)
44
+ >>> x = torch.randn(256)
45
+ >>> x = a1(x)
46
+ '''
47
+ def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
48
+ '''
49
+ Initialization.
50
+ INPUT:
51
+ - in_features: shape of the input
52
+ - alpha: trainable parameter
53
+ alpha is initialized to 1 by default, higher values = higher-frequency.
54
+ alpha will be trained along with the rest of your model.
55
+ '''
56
+ super(Snake, self).__init__()
57
+ self.in_features = in_features
58
+
59
+ # initialize alpha
60
+ self.alpha_logscale = alpha_logscale
61
+ if self.alpha_logscale: # log scale alphas initialized to zeros
62
+ self.alpha = Parameter(torch.zeros(in_features) * alpha)
63
+ else: # linear scale alphas initialized to ones
64
+ self.alpha = Parameter(torch.ones(in_features) * alpha)
65
+
66
+ self.alpha.requires_grad = alpha_trainable
67
+
68
+ self.no_div_by_zero = 0.000000001
69
+
70
+ def forward(self, x):
71
+ '''
72
+ Forward pass of the function.
73
+ Applies the function to the input elementwise.
74
+ Snake(x) := x + (1/a) * sin^2(a * x)
75
+ '''
76
+ alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T]
77
+ if self.alpha_logscale:
78
+ alpha = torch.exp(alpha)
79
+ x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
80
+
81
+ return x
82
+
83
+ class ConvRNNF0Predictor(nn.Module):
84
+ def __init__(self,
85
+ num_class: int = 1,
86
+ in_channels: int = 80,
87
+ cond_channels: int = 512
88
+ ):
89
+ super().__init__()
90
+
91
+ self.num_class = num_class
92
+ self.condnet = nn.Sequential(
93
+ weight_norm(
94
+ nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1)
95
+ ),
96
+ nn.ELU(),
97
+ weight_norm(
98
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
99
+ ),
100
+ nn.ELU(),
101
+ weight_norm(
102
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
103
+ ),
104
+ nn.ELU(),
105
+ weight_norm(
106
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
107
+ ),
108
+ nn.ELU(),
109
+ weight_norm(
110
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
111
+ ),
112
+ nn.ELU(),
113
+ )
114
+ self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class)
115
+
116
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
117
+ x = self.condnet(x)
118
+ x = x.transpose(1, 2)
119
+ return torch.abs(self.classifier(x).squeeze(-1))
120
+
121
+ class ResBlock(torch.nn.Module):
122
+ """Residual block module in HiFiGAN/BigVGAN."""
123
+ def __init__(
124
+ self,
125
+ channels: int = 512,
126
+ kernel_size: int = 3,
127
+ dilations: List[int] = [1, 3, 5],
128
+ ):
129
+ super(ResBlock, self).__init__()
130
+ self.convs1 = nn.ModuleList()
131
+ self.convs2 = nn.ModuleList()
132
+
133
+ for dilation in dilations:
134
+ self.convs1.append(
135
+ weight_norm(
136
+ Conv1d(
137
+ channels,
138
+ channels,
139
+ kernel_size,
140
+ 1,
141
+ dilation=dilation,
142
+ padding=get_padding(kernel_size, dilation)
143
+ )
144
+ )
145
+ )
146
+ self.convs2.append(
147
+ weight_norm(
148
+ Conv1d(
149
+ channels,
150
+ channels,
151
+ kernel_size,
152
+ 1,
153
+ dilation=1,
154
+ padding=get_padding(kernel_size, 1)
155
+ )
156
+ )
157
+ )
158
+ self.convs1.apply(init_weights)
159
+ self.convs2.apply(init_weights)
160
+ self.activations1 = nn.ModuleList([
161
+ Snake(channels, alpha_logscale=False)
162
+ for _ in range(len(self.convs1))
163
+ ])
164
+ self.activations2 = nn.ModuleList([
165
+ Snake(channels, alpha_logscale=False)
166
+ for _ in range(len(self.convs2))
167
+ ])
168
+
169
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
170
+ for idx in range(len(self.convs1)):
171
+ xt = self.activations1[idx](x)
172
+ xt = self.convs1[idx](xt)
173
+ xt = self.activations2[idx](xt)
174
+ xt = self.convs2[idx](xt)
175
+ x = xt + x
176
+ return x
177
+
178
+ def remove_weight_norm(self):
179
+ for idx in range(len(self.convs1)):
180
+ remove_weight_norm(self.convs1[idx])
181
+ remove_weight_norm(self.convs2[idx])
182
+
183
+
184
+ class SineGen(torch.nn.Module):
185
+ """ Definition of sine generator
186
+ SineGen(samp_rate, harmonic_num = 0,
187
+ sine_amp = 0.1, noise_std = 0.003,
188
+ voiced_threshold = 0,
189
+ flag_for_pulse=False)
190
+ samp_rate: sampling rate in Hz
191
+ harmonic_num: number of harmonic overtones (default 0)
192
+ sine_amp: amplitude of the sine waveform (default 0.1)
193
+ noise_std: std of Gaussian noise (default 0.003)
194
+ voiced_threshold: F0 threshold for U/V classification (default 0)
195
+ flag_for_pulse: this SineGen is used inside PulseGen (default False)
196
+ Note: when flag_for_pulse is True, the first time step of a voiced
197
+ segment is always sin(np.pi) or cos(0)
198
+ """
199
+
200
+ def __init__(self, samp_rate, harmonic_num=0,
201
+ sine_amp=0.1, noise_std=0.003,
202
+ voiced_threshold=0):
203
+ super(SineGen, self).__init__()
204
+ self.sine_amp = sine_amp
205
+ self.noise_std = noise_std
206
+ self.harmonic_num = harmonic_num
207
+ self.sampling_rate = samp_rate
208
+ self.voiced_threshold = voiced_threshold
209
+
210
+ def _f02uv(self, f0):
211
+ # generate uv signal
212
+ uv = (f0 > self.voiced_threshold).type(torch.float32)
213
+ return uv
214
+
215
+ @torch.no_grad()
216
+ def forward(self, f0):
217
+ """
218
+ :param f0: [B, 1, sample_len], Hz
219
+ :return: [B, 1, sample_len]
220
+ """
221
+
222
+ F_mat = torch.zeros((f0.size(0), self.harmonic_num + 1, f0.size(-1))).to(f0.device)
223
+ for i in range(self.harmonic_num + 1):
224
+ F_mat[:, i: i + 1, :] = f0 * (i + 1) / self.sampling_rate
225
+
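+ # instantaneous phase: cumulative sum of the normalized frequency, wrapped to [0, 1) and scaled to radians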
226
+ theta_mat = 2 * np.pi * (torch.cumsum(F_mat, dim=-1) % 1)
227
+ u_dist = Uniform(low=-np.pi, high=np.pi)
228
+ phase_vec = u_dist.sample(sample_shape=(f0.size(0), self.harmonic_num + 1, 1)).to(F_mat.device)
229
+ phase_vec[:, 0, :] = 0
230
+
231
+ # generate sine waveforms
232
+ sine_waves = self.sine_amp * torch.sin(theta_mat + phase_vec)
233
+
234
+ # generate uv signal
235
+ uv = self._f02uv(f0)
236
+
237
+ # noise: for unvoiced frames the noise std should be similar to sine_amp
238
+ # (std = self.sine_amp / 3 -> max value ~ self.sine_amp);
239
+ # for voiced frames the noise std is self.noise_std
240
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
241
+ noise = noise_amp * torch.randn_like(sine_waves)
242
+
243
+ # first: set the unvoiced part to 0 by uv
244
+ # then: additive noise
245
+ sine_waves = sine_waves * uv + noise
246
+ return sine_waves, uv, noise
247
+
248
+
249
+ class SourceModuleHnNSF(torch.nn.Module):
250
+ """ SourceModule for hn-nsf
251
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
252
+ add_noise_std=0.003, voiced_threshod=0)
253
+ sampling_rate: sampling_rate in Hz
254
+ harmonic_num: number of harmonic above F0 (default: 0)
255
+ sine_amp: amplitude of sine source signal (default: 0.1)
256
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
257
+ note that amplitude of noise in unvoiced is decided
258
+ by sine_amp
259
+ voiced_threshold: threshold to set U/V given F0 (default: 0)
260
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
261
+ F0_sampled (batchsize, length, 1)
262
+ Sine_source (batchsize, length, 1)
263
+ noise_source (batchsize, length, 1)
264
+ uv (batchsize, length, 1)
265
+ """
266
+
267
+ def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
268
+ add_noise_std=0.003, voiced_threshod=0):
269
+ super(SourceModuleHnNSF, self).__init__()
270
+
271
+ self.sine_amp = sine_amp
272
+ self.noise_std = add_noise_std
273
+
274
+ # to produce sine waveforms
275
+ self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
276
+ sine_amp, add_noise_std, voiced_threshod)
277
+
278
+ # to merge source harmonics into a single excitation
279
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
280
+ self.l_tanh = torch.nn.Tanh()
281
+
282
+ def forward(self, x):
283
+ """
284
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
285
+ F0_sampled (batchsize, length, 1)
286
+ Sine_source (batchsize, length, 1)
287
+ noise_source (batchsize, length, 1)
288
+ """
289
+ # source for harmonic branch
290
+ with torch.no_grad():
291
+ sine_wavs, uv, _ = self.l_sin_gen(x.transpose(1, 2))
292
+ sine_wavs = sine_wavs.transpose(1, 2)
293
+ uv = uv.transpose(1, 2)
294
+ sine_wavs = sine_wavs.to(self.l_linear.weight.data.dtype) # noqa, TODO
295
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
296
+
297
+ # source for noise branch, in the same shape as uv
298
+ noise = torch.randn_like(uv) * self.sine_amp / 3
299
+ return sine_merge, noise, uv
300
+
301
+ class HiFTGenerator(PreTrainedModel):
302
+ """
303
+ HiFTNet Generator: Neural Source Filter + ISTFTNet
304
+ https://arxiv.org/abs/2309.09493
305
+ """
306
+ def __init__(
307
+ self,
308
+ config: HiFiGanConfig
309
+ ):
310
+ super(HiFTGenerator, self).__init__(config)
311
+
312
+ self.out_channels = 1
313
+ self.nb_harmonics = config.nb_harmonics
314
+ self.sampling_rate = config.sampling_rate
315
+ self.istft_params = config.istft_params
316
+ self.lrelu_slope = config.lrelu_slope
317
+ self.audio_limit = config.audio_limit
318
+
319
+ self.num_kernels = len(config.resblock_kernel_sizes)
320
+ self.num_upsamples = len(config.upsample_rates)
321
+ self.m_source = SourceModuleHnNSF(
322
+ sampling_rate=config.sampling_rate,
323
+ upsample_scale=np.prod(config.upsample_rates) * config.istft_params["hop_len"],
324
+ harmonic_num=config.nb_harmonics,
325
+ sine_amp=config.nsf_alpha,
326
+ add_noise_std=config.nsf_sigma,
327
+ voiced_threshod=config.nsf_voiced_threshold)
328
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(config.upsample_rates) * config.istft_params["hop_len"])
329
+
330
+ self.conv_pre = weight_norm(
331
+ Conv1d(config.in_channels, config.base_channels, 7, 1, padding=3)
332
+ )
333
+
334
+ # Up
335
+ self.ups = nn.ModuleList()
336
+ for i, (u, k) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)):
337
+ self.ups.append(
338
+ weight_norm(
339
+ ConvTranspose1d(
340
+ config.base_channels // (2**i),
341
+ config.base_channels // (2**(i + 1)),
342
+ k,
343
+ u,
344
+ padding=(k - u) // 2,
345
+ )
346
+ )
347
+ )
348
+
349
+ # Down
350
+ self.source_downs = nn.ModuleList()
351
+ self.source_resblocks = nn.ModuleList()
352
+ downsample_rates = [1] + config.upsample_rates[::-1][:-1]
353
+ downsample_cum_rates = np.cumprod(downsample_rates)
354
+ for i, (u, k, d) in enumerate(zip(downsample_cum_rates[::-1], config.source_resblock_kernel_sizes, config.source_resblock_dilation_sizes)):
355
+ if u == 1:
356
+ self.source_downs.append(
357
+ Conv1d(config.istft_params["n_fft"] + 2, config.base_channels // (2 ** (i + 1)), 1, 1)
358
+ )
359
+ else:
360
+ self.source_downs.append(
361
+ Conv1d(config.istft_params["n_fft"] + 2, config.base_channels // (2 ** (i + 1)), u * 2, u, padding=(u // 2))
362
+ )
363
+
364
+ self.source_resblocks.append(
365
+ ResBlock(config.base_channels // (2 ** (i + 1)), k, d)
366
+ )
367
+
368
+ self.resblocks = nn.ModuleList()
369
+ for i in range(len(self.ups)):
370
+ ch = config.base_channels // (2**(i + 1))
371
+ for _, (k, d) in enumerate(zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes)):
372
+ self.resblocks.append(ResBlock(ch, k, d))
373
+
374
+ self.conv_post = weight_norm(Conv1d(ch, config.istft_params["n_fft"] + 2, 7, 1, padding=3))
375
+ self.ups.apply(init_weights)
376
+ self.conv_post.apply(init_weights)
377
+ self.reflection_pad = nn.ReflectionPad1d((1, 0))
378
+ self.stft_window = torch.from_numpy(get_window("hann", config.istft_params["n_fft"], fftbins=True).astype(np.float32))
379
+ self.f0_predictor = ConvRNNF0Predictor(**config.f0_predictor_config)
380
+
381
+ def remove_weight_norm(self):
382
+ print('Removing weight norm...')
383
+ for l in self.ups:
384
+ remove_weight_norm(l)
385
+ for l in self.resblocks:
386
+ l.remove_weight_norm()
387
+ remove_weight_norm(self.conv_pre)
388
+ remove_weight_norm(self.conv_post)
389
+ self.m_source.remove_weight_norm()
390
+ for l in self.source_downs:
391
+ remove_weight_norm(l)
392
+ for l in self.source_resblocks:
393
+ l.remove_weight_norm()
394
+
395
+ def _stft(self, x):
396
+ spec = torch.stft(
397
+ x,
398
+ self.istft_params["n_fft"], self.istft_params["hop_len"], self.istft_params["n_fft"], window=self.stft_window.to(x.device),
399
+ return_complex=True)
400
+ spec = torch.view_as_real(spec) # [B, F, TT, 2]
401
+ return spec[..., 0], spec[..., 1]
402
+
403
+ def _istft(self, magnitude, phase):
404
+ magnitude = torch.clip(magnitude, max=1e2)
405
+ real = magnitude * torch.cos(phase)
406
+ img = magnitude * torch.sin(phase)
407
+ inverse_transform = torch.istft(torch.complex(real, img), self.istft_params["n_fft"], self.istft_params["hop_len"],
408
+ self.istft_params["n_fft"], window=self.stft_window.to(magnitude.device))
409
+ return inverse_transform
410
+
411
+ def decode(self, x: torch.Tensor, s: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
412
+ s_stft_real, s_stft_imag = self._stft(s.squeeze(1))
413
+ s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1)
414
+ s_stft = s_stft.to(x) # noqa TODO
415
+ x = self.conv_pre(x)
416
+ for i in range(self.num_upsamples):
417
+ x = F.leaky_relu(x, self.lrelu_slope)
418
+ x = self.ups[i](x)
419
+
420
+ if i == self.num_upsamples - 1:
421
+ x = self.reflection_pad(x)
422
+
423
+ # fusion
424
+ si = self.source_downs[i](s_stft)
425
+ si = self.source_resblocks[i](si)
426
+ x = x + si
427
+
428
+ xs = None
429
+ for j in range(self.num_kernels):
430
+ if xs is None:
431
+ xs = self.resblocks[i * self.num_kernels + j](x)
432
+ else:
433
+ xs += self.resblocks[i * self.num_kernels + j](x)
434
+ x = xs / self.num_kernels
435
+
436
+ x = F.leaky_relu(x)
437
+ x = self.conv_post(x)
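+ # conv_post predicts n_fft + 2 channels: the first n_fft // 2 + 1 are log-magnitude, the rest encode phase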
438
+ magnitude = torch.exp(x[:, :self.istft_params["n_fft"] // 2 + 1, :])
439
+ phase = torch.sin(x[:, self.istft_params["n_fft"] // 2 + 1:, :]) # actually, applying sin here is redundant
440
+
441
+ magnitude = magnitude.to(torch.float) # noqa TODO
442
+ phase = phase.to(torch.float) # noqa TODO
443
+
444
+ x = self._istft(magnitude, phase)
445
+ x = torch.clamp(x, -self.audio_limit, self.audio_limit)
446
+ return x
447
+
448
+ def forward(
449
+ self,
450
+ batch: dict,
451
+ device: torch.device,
452
+ ) -> Dict[str, Optional[torch.Tensor]]:
453
+ speech_feat = batch['speech_feat'].transpose(1, 2).to(device)
454
+ # mel->f0
455
+ f0 = self.f0_predictor(speech_feat)
456
+ # f0->source
457
+ s = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
458
+ s, _, _ = self.m_source(s)
459
+ s = s.transpose(1, 2)
460
+ # mel+source->speech
461
+ generated_speech = self.decode(x=speech_feat, s=s)
462
+ return generated_speech, f0
463
+
464
+ @torch.inference_mode()
465
+ def inference(self, speech_feat: torch.Tensor, cache_source: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
466
+ # process data
467
+ speech_feat = speech_feat.to(self.f0_predictor.classifier.weight.data.dtype) # noqa, TODO
468
+ cache_source = cache_source.to(self.f0_predictor.classifier.weight.data.dtype) # noqa, TODO
469
+ # mel->f0
470
+ f0 = self.f0_predictor(speech_feat)
471
+ # f0->source
472
+ s = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
473
+ s, _, _ = self.m_source(s)
474
+ s = s.transpose(1, 2)
475
+ # use cache_source to avoid glitch
476
+ if cache_source.shape[2] != 0:
477
+ s[:, :, :cache_source.shape[2]] = cache_source
478
+ generated_speech = self.decode(x=speech_feat, s=s)
479
+ return generated_speech, s
modeling_interactiveomni.py ADDED
@@ -0,0 +1,773 @@
1
+ # --------------------------------------------------------
2
+ # SenseTime
3
+ # Copyright (c) 2025 SenseTime
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ import warnings
7
+ from typing import Any, List, Optional, Tuple, Union
8
+ import re
9
+ import json
10
+ import math
11
+ import librosa
12
+ import numpy as np
13
+ from PIL import Image
14
+ from decord import VideoReader, cpu
15
+ from torch import nn
16
+ import torch
17
+ import torchvision.transforms as T
18
+ from torchvision.transforms.functional import InterpolationMode
19
+ from transformers import (GenerationConfig, Qwen3ForCausalLM, WhisperFeatureExtractor)
20
+ from transformers.modeling_utils import PreTrainedModel
21
+ import onnxruntime
22
+ import torchaudio.compliance.kaldi as kaldi
23
+ import torchaudio
24
+ from transformers.utils.hub import cached_file
25
+
26
+ from .configuration_interactiveomni import InteractiveOmniConfig
27
+ from .modeling_intern_vit import InternVisionModel
28
+ from .modeling_whisper import AudioWhisperModel
29
+ from .modeling_voicelm import VoiceLM
30
+ from .conversation import get_conv_template
31
+
32
+ from .modeling_flow import CausalMaskedDiffWithXvec
33
+ from .modeling_hifigan import HiFTGenerator
34
+
35
+ import logging
36
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
37
+ logger = logging.getLogger(__name__)
38
+
39
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
40
+ IMAGENET_STD = (0.229, 0.224, 0.225)
41
+
42
+ IMG_START_TOKEN = '<img>'
43
+ IMG_END_TOKEN = '</img>'
44
+ IMG_CONTEXT_TOKEN = '<IMG_CONTEXT>'
45
+ AUDIO_START_TOKEN = '<audio>'
46
+ AUDIO_END_TOKEN = '</audio>'
47
+ AUDIO_CONTEXT_TOKEN = '<AUDIO_CONTEXT>'
48
+
49
+
50
+ class InteractiveOmniModel(PreTrainedModel):
51
+ config_class = InteractiveOmniConfig
52
+ main_input_name = 'pixel_values'
53
+ base_model_prefix = 'language_model'
54
+ _no_split_modules = ['InternVisionModel', 'AudioWhisperModel', 'Qwen3DecoderLayer', 'Qwen2DecoderLayer']
55
+
56
+ def __init__(self, config: InteractiveOmniConfig, vision_model=None, language_model=None, audio_model=None):
57
+ super().__init__(config)
58
+
59
+ image_size = config.force_image_size or config.vision_config.image_size
60
+ patch_size = config.vision_config.patch_size
61
+ self.patch_size = patch_size
62
+ self.select_layer = config.select_layer
63
+ self.template = config.template
64
+ self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio ** 2))
65
+ self.downsample_ratio = config.downsample_ratio
66
+ self.ps_version = config.ps_version
67
+ self.audio_feature_extractor = WhisperFeatureExtractor(**config.audio_preprocessor_config)
68
+ self.transform = self.build_transform(input_size=image_size)
69
+
70
+ self.campplus_session = None
71
+ self.default_speaker_embedding = None
72
+ self.default_wav_path = None
73
+
74
+ logger.info(f'num_image_token: {self.num_image_token}')
75
+ logger.info(f'ps_version: {self.ps_version}')
76
+ if vision_model is not None:
77
+ self.vision_model = vision_model
78
+ else:
79
+ self.vision_model = InternVisionModel(config.vision_config)
80
+ if audio_model is not None:
81
+ self.audio_model = audio_model
82
+ else:
83
+ self.audio_model = AudioWhisperModel(config.audio_config)
84
+ if language_model is not None:
85
+ self.language_model = language_model
86
+ else:
87
+ self.language_model = Qwen3ForCausalLM(config.llm_config)
88
+
89
+ self.voicelm_model = VoiceLM(config.voicelm_config)
90
+ self.flow_model = CausalMaskedDiffWithXvec(config.flow_config).float()
91
+ self.hifigan_model = HiFTGenerator(config.hifigan_config).float()
92
+
93
+ vit_hidden_size = config.vision_config.hidden_size
94
+ audio_hidden_size = config.audio_config.d_model
95
+ llm_hidden_size = config.llm_config.hidden_size
96
+
97
+ self.mlp1 = nn.Sequential(
98
+ nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2),
99
+ nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size),
100
+ nn.GELU(),
101
+ nn.Linear(llm_hidden_size, llm_hidden_size)
102
+ )
103
+ self.mlp2 = nn.Sequential(
104
+ nn.LayerNorm(audio_hidden_size),
105
+ nn.Linear(audio_hidden_size, llm_hidden_size),
106
+ nn.GELU(),
107
+ nn.Linear(llm_hidden_size, llm_hidden_size)
108
+ )
109
+
110
+ self.mlp_llm2voicelm = nn.Sequential(
111
+ nn.LayerNorm(llm_hidden_size),
112
+ nn.Linear(llm_hidden_size, config.voicelm_config.llm_input_size),
113
+ nn.GELU(),
114
+ nn.Linear(config.voicelm_config.llm_input_size, config.voicelm_config.llm_input_size)
115
+ )
116
+ self.gate = nn.Sequential(
117
+ nn.Linear(2 * llm_hidden_size, llm_hidden_size),
118
+ nn.Sigmoid()
119
+ )
120
+
121
+ self.img_context_token_id = None
122
+ self.audio_context_token_id = None
123
+ self.neftune_alpha = None
124
+
125
+ self.post_init()
126
+ pass
127
+
128
+ def fusion(self, rep, emb):
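+ # learned sigmoid gate blends the hidden representation with the corresponding token embedding element-wise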
129
+ gate = self.gate(torch.cat([rep, emb], dim=-1))
130
+ return rep * gate + emb * (1 - gate)
131
+
132
+ def __load_campplus_session(self, campplus_path:str):
133
+ '''load the CAM++ speaker-embedding ONNX session (CPU execution, single thread)'''
134
+ logger.info(f"load campplus session: {campplus_path}")
135
+ option = onnxruntime.SessionOptions()
136
+ option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
137
+ option.intra_op_num_threads = 1
138
+ campplus_session = onnxruntime.InferenceSession(
139
+ campplus_path,
140
+ sess_options=option,
141
+ providers=["CPUExecutionProvider"],
142
+ )
143
+ self.campplus_session = campplus_session
144
+ return campplus_session
145
+
146
+ def extract_speaker_embedding(self, prompt_wav:str):
147
+ '''extract speaker embedding tensor'''
148
+ logger.info(f"extract speaker embedding: {prompt_wav}")
149
+ target_sr = 16000
150
+ prompt_speech_16k, sample_rate = torchaudio.load(prompt_wav)
151
+ prompt_speech_16k = prompt_speech_16k.mean(dim=0, keepdim=True)
152
+ if sample_rate != target_sr:
153
+ assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr)
154
+ prompt_speech_16k = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(prompt_speech_16k)
155
+
156
+ feat = kaldi.fbank(
157
+ prompt_speech_16k,
158
+ num_mel_bins=80,
159
+ dither=0,
160
+ sample_frequency=target_sr,
161
+ )
162
+ feat = feat - feat.mean(dim=0, keepdim=True)
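+ # the CAM++ ONNX session maps the mean-normalized fbank features to a speaker embedding vector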
163
+ speaker_embedding = self.campplus_session.run(
164
+ None,
165
+ {self.campplus_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()},
166
+ )[0].flatten().tolist()
167
+ speaker_embedding = torch.tensor([speaker_embedding])
168
+ return speaker_embedding
169
+
170
+ def build_transform(self, input_size):
171
+ MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
172
+ transform = T.Compose([
173
+ T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
174
+ T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
175
+ T.ToTensor(),
176
+ T.Normalize(mean=MEAN, std=STD)
177
+ ])
178
+
179
+ return transform
180
+
181
+ def find_closest_aspect_ratio(self, image, min_num=1, max_num=6, image_size=448):
182
+ assert min_num == 1
183
+ original_width, original_height = image.size
184
+ log_ratio = math.log(original_width / original_height)
185
+ ratio = original_width * original_height / (image_size * image_size)
186
+ multiple = min(math.ceil(ratio), max_num)
187
+ if multiple <= 1:
188
+ return [1, 1]
189
+ candidate_split_grids_nums = []
190
+ for i in [multiple - 1, multiple, multiple + 1]:
191
+ if i > max_num:
192
+ continue
193
+ candidate_split_grids_nums.append(i)
194
+
195
+ candidate_grids = []
196
+ for split_grids_nums in candidate_split_grids_nums:
197
+ m = 1
198
+ while m <= split_grids_nums:
199
+ if split_grids_nums % m == 0:
200
+ candidate_grids.append([m, split_grids_nums // m])
201
+ m += 1
202
+ best_grid = [1, 1]
203
+ min_error = float("inf")
204
+ for grid in candidate_grids:
205
+ error = abs(log_ratio - math.log(grid[0] / grid[1]))
206
+ if error < min_error:
207
+ best_grid = grid
208
+ min_error = error
209
+
210
+ return best_grid
211
+
212
+ def dynamic_preprocess(self, image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
213
+ target_aspect_ratio = self.find_closest_aspect_ratio(image, min_num, max_num, image_size)
214
+ target_width = image_size * target_aspect_ratio[0]
215
+ target_height = image_size * target_aspect_ratio[1]
216
+ blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
217
+ # resize the image
218
+ resized_img = image.resize((target_width, target_height))
219
+ processed_images = []
220
+ for i in range(blocks):
221
+ box = (
222
+ (i % (target_width // image_size)) * image_size,
223
+ (i // (target_width // image_size)) * image_size,
224
+ ((i % (target_width // image_size)) + 1) * image_size,
225
+ ((i // (target_width // image_size)) + 1) * image_size
226
+ )
227
+ # split the image
228
+ split_img = resized_img.crop(box)
229
+ processed_images.append(split_img)
230
+ assert len(processed_images) == blocks
231
+ if use_thumbnail and len(processed_images) != 1:
232
+ thumbnail_img = image.resize((image_size, image_size))
233
+ processed_images.append(thumbnail_img)
234
+ return processed_images
235
+
236
+ def load_image(self, image, input_size=448, max_num=12):
237
+ if not isinstance(image, Image.Image):
238
+ image = Image.open(image).convert('RGB')
239
+ images = self.dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
240
+ return images
241
+
242
+ def pixel_shuffle(self, x, scale_factor=0.5):
243
+ n, w, h, c = x.size()
244
+ # N, W, H, C --> N, W, H * scale, C // scale
245
+ x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
246
+ # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
247
+ x = x.permute(0, 2, 1, 3).contiguous()
248
+ # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
249
+ x = x.view(n, int(h * scale_factor), int(w * scale_factor),
250
+ int(c / (scale_factor * scale_factor)))
251
+ if self.ps_version == 'v1':
252
+ warnings.warn("In ps_version 'v1', the height and width have not been swapped back, "
253
+ 'which results in a transposed image.')
254
+ else:
255
+ x = x.permute(0, 2, 1, 3).contiguous()
256
+ return x
257
+
258
+ def extract_feature(self, pixel_values):
259
+ if self.select_layer == -1:
260
+ vit_embeds = self.vision_model(
261
+ pixel_values=pixel_values,
262
+ output_hidden_states=False,
263
+ return_dict=True).last_hidden_state
264
+ else:
265
+ vit_embeds = self.vision_model(
266
+ pixel_values=pixel_values,
267
+ output_hidden_states=True,
268
+ return_dict=True).hidden_states[self.select_layer]
269
+ vit_embeds = vit_embeds[:, 1:, :]
270
+
271
+ if self.training and self.neftune_alpha is not None:
272
+ vit_embeds = self.noised_embed(vit_embeds, self.neftune_alpha)
273
+
274
+ h = w = int(vit_embeds.shape[1] ** 0.5)
275
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
276
+ vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
277
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
278
+ vit_embeds = self.mlp1(vit_embeds)#.to(pixel_values.device)
279
+ return vit_embeds
280
+
281
+ def get_T_after_cnn(self, L_in, dilation=1):
282
+ for (padding, kernel_size, stride) in [(1, 3, 1), (1, 3, 2)]:
283
+ L_out = L_in + 2 * padding - dilation * (kernel_size - 1) - 1
284
+ L_out = 1 + L_out // stride
285
+ L_in = L_out
286
+ return L_out
287
+
288
+ def process_audio(self, audio, return_tensors, sampling_rate=16000):
289
+ L = (audio.shape[0] if audio.shape[0] <= 480000 else 480000) # max_length < 30s
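+ # 160-sample hop at 16 kHz gives one mel frame per 10 ms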
290
+ mel_len = L // 160
291
+ audio_len_after_cnn = self.get_T_after_cnn(mel_len)
292
+ audio_token_num = (audio_len_after_cnn - 2) // 2 + 1
293
+ inputs = self.audio_feature_extractor(audio, return_tensors=return_tensors, sampling_rate=sampling_rate)
294
+ inputs['audio_len_after_cnn'] = torch.tensor(audio_len_after_cnn, dtype=torch.long)
295
+ inputs['audio_token_num'] = torch.tensor(audio_token_num, dtype=torch.long)
296
+ return inputs
297
+
298
+ def load_audio(self, audio_file, sampling_rate=16000):
299
+ audio_values, _ = librosa.load(audio_file, sr=sampling_rate) # sample rate should be 16000
300
+
301
+ audio_process_values = self.process_audio(audio_values, sampling_rate=sampling_rate, return_tensors="pt")
302
+ input_features = audio_process_values['input_features']
303
+ audio_len_after_cnn = audio_process_values['audio_len_after_cnn']
304
+ audio_token_num = audio_process_values['audio_token_num']
305
+
306
+ audio_input_dict = {'audio_values': input_features,
307
+ 'audio_len_after_cnn': audio_len_after_cnn,
308
+ 'audio_token_num': audio_token_num,
309
+ }
310
+ return audio_input_dict
311
+
312
+ def extract_audio_feature(self, audio_values, audio_len_after_cnn):
313
+
314
+ audio_values = audio_values.squeeze(1)
315
+ max_len_in_batch = int(torch.max(audio_len_after_cnn).item())
316
+ padding_mask = torch.ones([audio_values.size(0), max_len_in_batch]).to(dtype=audio_values.dtype, device=audio_values.device)
317
+ for index in range(len(audio_values)):
318
+ padding_mask[index, :int(audio_len_after_cnn[index].item())] = 0
319
+
320
+ last_hidden_state = self.audio_model(audio_values, padding_mask, audio_len_after_cnn) # (bs, max_token_num, 1280)
321
+
322
+ audio_embeds = self.mlp2(last_hidden_state)
323
+
324
+ return audio_embeds
325
+
326
+ def get_index(self, bound, fps, max_frame, first_idx=0, num_segments=32):
327
+ if bound:
328
+ start, end = bound[0], bound[1]
329
+ else:
330
+ start, end = -100000, 100000
331
+ start_idx = max(first_idx, round(start * fps))
332
+ end_idx = min(round(end * fps), max_frame)
333
+ seg_size = float(end_idx - start_idx) / num_segments
334
+ frame_indices = np.array([
335
+ int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
336
+ for idx in range(num_segments)
337
+ ])
338
+ return frame_indices
339
+
340
+ def load_video(self, video_path, bound=None, num_segments=32):
341
+ vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
342
+ max_frame = len(vr) - 1
343
+ fps = float(vr.get_avg_fps())
344
+ frame_indices = self.get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
345
+ frames = list()
346
+ for frame_index in frame_indices:
347
+ img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB')
348
+ frames.append(img)
349
+ return frames
350
+
351
+ def find_second_last_occurrence(self, input_ids_list, target_id):
352
+ '''find taget_id index'''
353
+ reversed_list = list(reversed(input_ids_list))
354
+ first_occurrence = -1
355
+ second_occurrence = -1
356
+ for idx, val in enumerate(reversed_list):
357
+ if val == target_id:
358
+ if first_occurrence == -1:
359
+ first_occurrence = idx # first index
360
+ elif second_occurrence == -1:
361
+ second_occurrence = idx # second index
362
+ break
363
+
364
+ if second_occurrence == -1:
365
+ return -1
366
+ return len(input_ids_list) - second_occurrence - 1
367
+
368
+ def decode_speech_tokens(
369
+ self,
370
+ speech_tokens,
371
+ speaker_embedding=None,
372
+ flow_prompt_speech_token=None,
373
+ prompt_speech_feat=None,
374
+ finalize=True,
375
+ token_offset=0,
376
+ ):
377
+ if speaker_embedding is None:
378
+ speaker_embedding = torch.zeros(1, 192)
379
+ pass
380
+ if flow_prompt_speech_token is None:
381
+ flow_prompt_speech_token = torch.zeros(1, 0, dtype=torch.int32)
382
+ pass
383
+ if prompt_speech_feat is None:
384
+ prompt_speech_feat = torch.zeros(1, 0, 80)
385
+ pass
386
+
387
+ self.flow_model.encoder.static_chunk_size = 2 * self.flow_model.input_frame_rate # 50
388
+ self.flow_model.decoder.estimator.static_chunk_size = 2 * self.flow_model.input_frame_rate * self.flow_model.token_mel_ratio # 100
389
+ device = speech_tokens.device
390
+
391
+ tts_mel, _ = self.flow_model.inference(
392
+ token=speech_tokens.to(device),
393
+ token_len=torch.tensor([speech_tokens.shape[1]], dtype=torch.int32).to(device),
394
+ prompt_token=flow_prompt_speech_token.to(device),
395
+ prompt_token_len=torch.tensor([flow_prompt_speech_token.shape[1]], dtype=torch.int32).to(device),
396
+ prompt_feat=prompt_speech_feat.to(device),
397
+ prompt_feat_len=torch.tensor([prompt_speech_feat.shape[1]], dtype=torch.int32).to(device),
398
+ embedding=speaker_embedding.to(device),
399
+ finalize=finalize,
400
+ )
401
+ tts_mel = tts_mel[:, :, token_offset * self.config.flow_config.token_mel_ratio:]
402
+
403
+ hift_cache_source = torch.zeros(1, 1, 0)
404
+ tts_speech, tts_source = self.hifigan_model.inference(speech_feat=tts_mel, cache_source=hift_cache_source) # [1, sampling point num]
405
+
406
+ return tts_speech
407
+
408
+ @torch.no_grad()
409
+ def generate(
410
+ self,
411
+ pixel_values: torch.FloatTensor,
412
+ input_ids: torch.FloatTensor,
413
+ attention_mask: torch.LongTensor,
414
+ visual_features: Optional[torch.FloatTensor] = None,
415
+ audio_values: Optional[torch.FloatTensor] = None,
416
+ audio_len_after_cnn: Optional[bool] = None,
417
+ audio_token_num: Optional[bool] = None,
418
+ generation_config: Optional[GenerationConfig] = None,
419
+ output_hidden_states: Optional[bool] = None,
420
+ start_token_id:int = 151644,
421
+ generate_audio:bool = False,
422
+ speaker_embedding:torch.Tensor = torch.zeros(1, 192),
423
+ mix_ratio:list=[5,25],
424
+ **generate_kwargs,
425
+ ) -> torch.LongTensor:
426
+ assert self.img_context_token_id is not None
427
+ assert self.audio_context_token_id is not None
428
+
429
+ vit_embeds = None
430
+ if visual_features is not None:
431
+ vit_embeds = visual_features
432
+ elif pixel_values is not None:
433
+ vit_embeds = self.extract_feature(pixel_values)
434
+ cur_conv_start_id = self.find_second_last_occurrence(input_ids.tolist()[0], start_token_id)
435
+
436
+ input_embeds = self.language_model.get_input_embeddings()(input_ids)
437
+ B, N, C = input_embeds.shape
438
+ input_embeds = input_embeds.reshape(B * N, C)
439
+
440
+ input_ids = input_ids.reshape(B * N)
441
+
442
+ if vit_embeds is not None:
443
+ selected = (input_ids == self.img_context_token_id)
444
+ input_embeds[selected] = vit_embeds.reshape(-1, C)
445
+
446
+ if audio_values is not None and audio_len_after_cnn is not None and audio_token_num is not None:
447
+ audio_embeds = self.extract_audio_feature(audio_values, audio_len_after_cnn)
448
+ output_audios = []
449
+ for i in range(len(audio_token_num)):
450
+ token_num = int(audio_token_num[i].item())
451
+ audio = audio_embeds[i][:token_num]
452
+ output_audios.append(audio)
453
+ output_audios = torch.cat(output_audios, dim=0)
454
+ selected = (input_ids == self.audio_context_token_id)
455
+ input_embeds[selected] = output_audios.reshape(-1, C)
456
+
457
+ input_embeds = input_embeds.reshape(B, N, C)
458
+
459
+ outputs = self.language_model.generate(
460
+ inputs_embeds=input_embeds,
461
+ attention_mask=attention_mask,
462
+ generation_config=generation_config,
463
+ output_hidden_states=output_hidden_states or generate_audio,
464
+ return_dict_in_generate=generate_audio,
465
+ use_cache=True,
466
+ **generate_kwargs,
467
+ )
468
+ if not generate_audio:
469
+ return outputs, None, None
470
+
471
+ hidden_states = torch.cat(
472
+ [outputs.hidden_states[0][-1][:, -1:, :]] + [outputs.hidden_states[i][-1] for i in range(1, len(outputs.hidden_states))],
473
+ dim=1,
474
+ )
475
+ sampled_token = outputs.sequences
476
+ if sampled_token.shape[1] == hidden_states.shape[1] + 1:
477
+ sampled_token = sampled_token[:, 1:]
478
+ sampled_token_embeddings = self.language_model.get_input_embeddings()(sampled_token)
479
+ target_text_token_hidden_states = self.fusion(hidden_states, sampled_token_embeddings)
480
+
481
+ input_token_hidden_states = outputs.hidden_states[0][-1][:, cur_conv_start_id:-1, :]
482
+ question_input_embeddings = input_embeds[:, cur_conv_start_id+1:, :]
483
+ input_token_hidden_states = self.fusion(input_token_hidden_states, question_input_embeddings)
484
+
485
+ input_feature = self.mlp_llm2voicelm(input_token_hidden_states)
486
+ target_text_feature = self.mlp_llm2voicelm(target_text_token_hidden_states)
487
+
488
+ try:
489
+ speech_tokens = self.voicelm_model.inference_bistream(input_feature, target_text_feature, mix_ratio=mix_ratio)
490
+ speech_tokens = torch.LongTensor([speech_tokens]).to(input_feature.device)
491
+ tts_speech = self.decode_speech_tokens(
492
+ speech_tokens,
493
+ speaker_embedding=speaker_embedding,
494
+ )
495
+ except Exception as e:
496
+ logger.warning(f"=========voice lm except:{e}")
497
+ return outputs.sequences,None, None
498
+ return outputs.sequences, speech_tokens, tts_speech
499
+
500
+ def chat(
501
+ self,
502
+ tokenizer,
503
+ generation_config,
504
+ messages,
505
+ max_patch_num=12,
506
+ frame=8,
507
+ generate_audio=False,
508
+ speaker_embedding=torch.zeros(1, 192),
509
+ print_flag=True,
510
+ ):
511
+ if self.flow_model.dtype != torch.float32 or self.hifigan_model.dtype != torch.float32:
512
+ logger.info("reset flow model and hifigan model dtype to float32")
513
+ self.reset_vocoder()
514
+ pass
515
+ if messages is None or len(messages) == 0:
516
+ raise RuntimeError('no messages')
517
+ role_transfer_dict = {
518
+ 'system': ['user'],
519
+ 'user': ['assistant'],
520
+ 'assistant': ['user'],
521
+ }
522
+
523
+ first_role = ['system', 'user']
524
+ last_role = ['user']
525
+ if messages[-1]['role'] not in last_role:
526
+ raise RuntimeError(f"last role error, expected {last_role}, but got {messages[-1]}")
527
+
528
+ current_role = None
529
+ dynamic_images = list()
530
+ dynamic_nums = list()
531
+ audio_values = list()
532
+ audio_len_after_cnn = list()
533
+ audio_token_num = list()
534
+ template = get_conv_template(self.template)
535
+ for index in range(len(messages)):
536
+ text = ''
537
+ audios = list()
538
+ images = list()
539
+ message = messages[index]
540
+ if index == 0:
541
+ if message['role'] not in first_role:
542
+ raise RuntimeError(f'first role error, expected {first_role}, but got {message}')
543
+ else:
544
+ if message['role'] not in current_role:
545
+ raise RuntimeError(f'role error, expected {current_role}, but got {message}')
546
+ current_role = message['role']
547
+ if isinstance(message["content"], list):
548
+ for item in message["content"]:
549
+ if item['type'] == 'text':
550
+ if item.get('text', None) is None:
551
+ continue
552
+ text += item['text']
553
+ elif item['type'] == 'audio':
554
+ if item.get('audio', None) is None:
555
+ continue
556
+ if type(item['audio']) is list:
557
+ assert len(item['audio']) == 1, f'only support 1 audio file in round, but got {item["audio"]}'
558
+ audio = item['audio'][0]
559
+ else:
560
+ audio = item['audio']
561
+ audios.append(audio)
562
+ elif item['type'] == 'image':
563
+ if item.get('image', None) is None:
564
+ continue
565
+ if type(item['image']) is not list:
566
+ images.append(item['image'])
567
+ else:
568
+ images.extend(item['image'])
569
+ elif item['type'] == 'video':
570
+ if item.get('video', None) is None:
571
+ continue
572
+ if type(item['video']) is list:
573
+ assert len(item['video']) == 1, f'only support 1 video file in round, but got {item["video"]}'
574
+ video = item['video'][0]
575
+ else:
576
+ video = item['video']
577
+ frames = self.load_video(video, num_segments=frame)
578
+ images.extend(frames)
579
+ else:
580
+ assert isinstance(message["content"], str), message["content"]
581
+ text = message["content"]
582
+
583
+ if len(audios) != 0:
584
+ assert len(audios) == 1, f'only support 1 audio file in round, but got {audios}'
585
+ if '<audio>' in text:
586
+ matches = re.findall(r"<audio>", text)
587
+ assert len(matches) == len(audios), f'<audio> error {text} {len(audios)}' + text
588
+ text = re.sub(r'(<audio>)(?!\n)', r'\1\n', text)
589
+ else:
590
+ text = '<audio>\n'*len(audios) + text
591
+
592
+ audio_path = audios[0]
593
+ audio_input_dict = self.load_audio(audio_path)
594
+ assert audio_input_dict['audio_token_num'].item() != 0, f'audio_token_num of {audio_path} is 0.'
595
+ audio_values.append(audio_input_dict['audio_values'])
596
+ audio_len_after_cnn.append(audio_input_dict['audio_len_after_cnn'])
597
+ audio_token_num.append(audio_input_dict['audio_token_num'])
598
+
599
+ if images is not None:
600
+ if '<image>' in text:
601
+ matches = re.findall(r"<image>", text)
602
+ assert len(matches) == len(images), f'<image> error {text} {len(images)}' + text
603
+ text = re.sub(r'(<image>)(?!\n)', r'\1\n', text)
604
+ else:
605
+ text = '<image>\n'*len(images) + text
606
+
607
+ for image in images:
608
+ dynamic_image = self.load_image(image, max_num=max_patch_num)
609
+ dynamic_images += dynamic_image
610
+ dynamic_nums.append(len(dynamic_image))
611
+
612
+ if message['role'] == 'system':
613
+ template.set_system_message(text)
614
+ elif message['role'] == 'user':
615
+ template.append_message(template.roles[0], text)
616
+ elif message['role'] == 'assistant':
617
+ template.append_message(template.roles[1], text)
618
+ else:
619
+ raise ValueError('unexpected role')
620
+
621
+ current_role = role_transfer_dict[current_role]
622
+
623
+ template.append_message(template.roles[1], None)
624
+
625
+ if len(audio_values) != 0:
626
+ audio_values = torch.cat(audio_values, dim=0).to(dtype=self.dtype).cuda() # [num_audio, 128, 3000]
627
+ audio_len_after_cnn = torch.stack(audio_len_after_cnn, dim=0) # [num_audio]
628
+ audio_token_num = torch.stack(audio_token_num, dim=0) # [num_audio]
629
+ else:
630
+ audio_values = None
631
+ audio_len_after_cnn = None
632
+ audio_token_num = None
633
+
634
+ if len(dynamic_images) != 0:
635
+ pixel_values = [self.transform(image) for image in dynamic_images]
636
+ pixel_values = torch.stack(pixel_values)
637
+ pixel_values = pixel_values.to(torch.bfloat16).cuda()
638
+ else:
639
+ pixel_values = None
640
+ dynamic_nums = None
641
+
642
+ img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
643
+ self.img_context_token_id = img_context_token_id
644
+ audio_context_token_id = tokenizer.convert_tokens_to_ids(AUDIO_CONTEXT_TOKEN)
645
+ self.audio_context_token_id = audio_context_token_id
646
+
647
+ # also add end-of-assistant token in eos token id to avoid unnecessary generation
648
+ eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]]
649
+ start_token_id = tokenizer.convert_tokens_to_ids(["<|im_start|>"])[0]
650
+
651
+ query = template.get_prompt()
652
+
653
+ if audio_values is not None:
654
+ if print_flag:
655
+ logger.info(f'audio num: {len(audio_token_num)}')
656
+ audio_tokens_list = list()
657
+ for index in range(len(audio_token_num)):
658
+ audio_token_num_i = audio_token_num[index]
659
+ if print_flag:
660
+ logger.info(f'audio_token_num: {audio_token_num_i}')
661
+ audio_tokens = AUDIO_START_TOKEN + AUDIO_CONTEXT_TOKEN * audio_token_num_i + AUDIO_END_TOKEN
662
+ audio_tokens_list.append(audio_tokens)
663
+
664
+ audio_tokens_iter = iter(audio_tokens_list)
665
+
666
+ query = re.sub(r"<audio>", lambda match:next(audio_tokens_iter), query)
667
+
668
+ if pixel_values is not None:
669
+ if print_flag:
670
+ logger.info(f'image num: {len(dynamic_nums)}')
671
+ image_tokens_list = list()
672
+ total_dynamic_num = 0
673
+ for index in range(len(dynamic_nums)):
674
+ dynamic_num = dynamic_nums[index]
675
+ total_dynamic_num += dynamic_num
676
+ if print_flag:
677
+ logger.info(f'dynamic ViT batch size: {dynamic_num}')
678
+ image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * dynamic_num + IMG_END_TOKEN
679
+ image_tokens_list.append(image_tokens)
680
+ assert total_dynamic_num == pixel_values.shape[0], f'dynamic num not equal, {total_dynamic_num}, {pixel_values.shape[0]}'
681
+
682
+ image_tokens_iter = iter(image_tokens_list)
683
+
684
+ query = re.sub(r"<image>", lambda match:next(image_tokens_iter), query)
685
+
686
+ model_inputs = tokenizer(query, return_tensors='pt', add_special_tokens=False)
687
+ input_ids = model_inputs['input_ids'].cuda()
688
+ attention_mask = model_inputs['attention_mask'].cuda()
689
+ generation_config['eos_token_id'] = eos_token_id
690
+ generation_output, speech_token, audio_bytes = self.generate(
691
+ pixel_values=pixel_values,
692
+ audio_values=audio_values,
693
+ audio_len_after_cnn=audio_len_after_cnn,
694
+ audio_token_num=audio_token_num,
695
+ input_ids=input_ids,
696
+ attention_mask=attention_mask,
697
+ generate_audio=generate_audio,
698
+ start_token_id=start_token_id,
699
+ speaker_embedding=speaker_embedding,
700
+ **generation_config
701
+ )
702
+ response = tokenizer.batch_decode(generation_output, skip_special_tokens=False)[0]
703
+ response = response.split("<|im_end|>")[0].replace('<|endoftext|>', '').strip()
704
+ query_to_print = query
705
+ if pixel_values is not None:
706
+ query_to_print = query_to_print.replace(IMG_CONTEXT_TOKEN, '')
707
+ query_to_print = query_to_print.replace(f'{IMG_START_TOKEN}{IMG_END_TOKEN}', '<image>')
708
+ if audio_values is not None:
709
+ query_to_print = query_to_print.replace(AUDIO_CONTEXT_TOKEN, '')
710
+ query_to_print = query_to_print.replace(f'{AUDIO_START_TOKEN}{AUDIO_END_TOKEN}', '<audio>')
711
+ if print_flag:
712
+ logger.info('query: ' + json.dumps(query_to_print, ensure_ascii=False))
713
+ logger.info('response: ' + response)
714
+
715
+ if generate_audio:
716
+ return response, audio_bytes
717
+ return response
718
+
719
+ def __cache_file(self, pretrained_model_name_or_path:str, filename:str, **kw):
720
+ '''resolve a file from the pretrained repo, downloading and caching it if needed'''
721
+ full_path = cached_file(
722
+ pretrained_model_name_or_path,
723
+ filename,
724
+ subfolder=kw.pop("subfolder", None),
725
+ cache_dir=kw.pop("cache_dir", None),
726
+ force_download=kw.pop("force_download", False),
727
+ proxies=kw.pop("proxies", None),
728
+ resume_download=kw.pop("resume_download", None),
729
+ local_files_only=kw.pop("local_files_only", False),
730
+ token=kw.pop("use_auth_token", None),
731
+ revision=kw.pop("revision", None),
732
+ )
733
+ if full_path is None:
734
+ raise ValueError(f"""{pretrained_model_name_or_path}/{filename} does not exist""")
735
+ return full_path
736
+
737
+ @classmethod
738
+ def from_pretrained(
739
+ cls,
740
+ pretrained_model_name_or_path,
741
+ *model_args,
742
+ config=None,
743
+ cache_dir=None,
744
+ ignore_mismatched_sizes=False,
745
+ force_download=False,
746
+ local_files_only=False,
747
+ token=None,
748
+ revision="main",
749
+ use_safetensors=None,
750
+ weights_only=True,
751
+ **kwargs,
752
+ ):
753
+ model = super().from_pretrained(
754
+ pretrained_model_name_or_path,
755
+ *model_args,
756
+ config=config,
757
+ cache_dir=cache_dir,
758
+ ignore_mismatched_sizes=ignore_mismatched_sizes,
759
+ force_download=force_download,
760
+ local_files_only=local_files_only,
761
+ token=token,
762
+ revision=revision,
763
+ use_safetensors=use_safetensors,
764
+ weights_only=weights_only,
765
+ **kwargs,
766
+ )
767
+ campplus_path = model.__cache_file(pretrained_model_name_or_path, "campplus.onnx", **kwargs)
768
+ model.__load_campplus_session(campplus_path)
769
+ default_wav_path = model.__cache_file(pretrained_model_name_or_path, "taozi.wav", **kwargs)
770
+ model.default_wav_path = default_wav_path
771
+ model.default_speaker_embedding = model.extract_speaker_embedding(default_wav_path)
772
+
773
+ return model
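A minimal usage sketch of the class above, assuming the repo's config maps AutoModel to InteractiveOmniModel via trust_remote_code; the checkpoint path, wav file, and generation settings are placeholders:

import torch
from transformers import AutoTokenizer, AutoModel

path = "path/to/InteractiveOmni"  # hypothetical local checkpoint directory or repo id
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModel.from_pretrained(path, trust_remote_code=True).cuda().eval()

# audio-only turn; chat() returns (text response, synthesized waveform) when generate_audio=True
messages = [{"role": "user", "content": [{"type": "audio", "audio": "question.wav"}]}]
generation_config = {"max_new_tokens": 256, "do_sample": False}
response, audio = model.chat(tokenizer, generation_config, messages, generate_audio=True, speaker_embedding=model.default_speaker_embedding)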
modeling_intern_vit.py ADDED
@@ -0,0 +1,427 @@
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2023 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ from typing import Optional, Tuple, Union
7
+
8
+ import torch
9
+ import torch.nn.functional as F
10
+ import torch.utils.checkpoint
11
+ from einops import rearrange
12
+ from timm.models.layers import DropPath
13
+ from torch import nn
14
+ from transformers.activations import ACT2FN
15
+ from transformers.modeling_outputs import (BaseModelOutput,
16
+ BaseModelOutputWithPooling)
17
+ from transformers.modeling_utils import PreTrainedModel
18
+ from transformers.utils import logging
19
+
20
+ from .configuration_intern_vit import InternVisionConfig
21
+
22
+ try:
23
+ from flash_attn.bert_padding import pad_input, unpad_input
24
+ from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func
25
+ has_flash_attn = True
26
+ except:
27
+ print('FlashAttention2 is not installed.')
28
+ has_flash_attn = False
29
+
30
+ logger = logging.get_logger(__name__)
31
+
32
+
33
+ class FlashAttention(nn.Module):
34
+ """Implement the scaled dot product attention with softmax.
35
+ Arguments
36
+ ---------
37
+ softmax_scale: The temperature to use for the softmax attention.
38
+ (default: 1/sqrt(d_keys) where d_keys is computed at
39
+ runtime)
40
+ attention_dropout: The dropout rate to apply to the attention
41
+ (default: 0.0)
42
+ """
43
+
44
+ def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None):
45
+ super().__init__()
46
+ self.softmax_scale = softmax_scale
47
+ self.dropout_p = attention_dropout
48
+
49
+ def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens=None,
50
+ max_s=None, need_weights=False):
51
+ """Implements the multihead softmax attention.
52
+ Arguments
53
+ ---------
54
+ qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None
55
+ if unpadded: (nnz, 3, h, d)
56
+ key_padding_mask: a bool tensor of shape (B, S)
57
+ """
58
+ assert not need_weights
59
+ assert qkv.dtype in [torch.float16, torch.bfloat16]
60
+ assert qkv.is_cuda
61
+
62
+ if cu_seqlens is None:
63
+ batch_size = qkv.shape[0]
64
+ seqlen = qkv.shape[1]
65
+ if key_padding_mask is None:
66
+ qkv = rearrange(qkv, 'b s ... -> (b s) ...')
67
+ max_s = seqlen
68
+ cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
69
+ device=qkv.device)
70
+ output = flash_attn_varlen_qkvpacked_func(
71
+ qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
72
+ softmax_scale=self.softmax_scale, causal=causal
73
+ )
74
+ output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
75
+ else:
76
+ nheads = qkv.shape[-2]
77
+ x = rearrange(qkv, 'b s three h d -> b s (three h d)')
78
+ x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask)
79
+ x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads)
80
+ output_unpad = flash_attn_varlen_qkvpacked_func(
81
+ x_unpad, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
82
+ softmax_scale=self.softmax_scale, causal=causal
83
+ )
84
+ output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'),
85
+ indices, batch_size, seqlen),
86
+ 'b s (h d) -> b s h d', h=nheads)
87
+ else:
88
+ assert max_s is not None
89
+ output = flash_attn_varlen_qkvpacked_func(
90
+ qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
91
+ softmax_scale=self.softmax_scale, causal=causal
92
+ )
93
+
94
+ return output, None
95
+
96
+
97
+ class InternRMSNorm(nn.Module):
98
+ def __init__(self, hidden_size, eps=1e-6):
99
+ super().__init__()
100
+ self.weight = nn.Parameter(torch.ones(hidden_size))
101
+ self.variance_epsilon = eps
102
+
103
+ def forward(self, hidden_states):
104
+ input_dtype = hidden_states.dtype
105
+ hidden_states = hidden_states.to(torch.float32)
106
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
107
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
108
+ return self.weight * hidden_states.to(input_dtype)
109
+
110
+
111
+ try:
112
+ from apex.normalization import FusedRMSNorm
113
+
114
+ InternRMSNorm = FusedRMSNorm # noqa
115
+
116
+ logger.info('Discovered apex.normalization.FusedRMSNorm - will use it instead of InternRMSNorm')
117
+ except ImportError:
118
+ # using the normal InternRMSNorm
119
+ pass
120
+ except Exception:
121
+ logger.warning('discovered apex but it failed to load, falling back to InternRMSNorm')
122
+ pass
123
+
124
+
125
+ NORM2FN = {
126
+ 'rms_norm': InternRMSNorm,
127
+ 'layer_norm': nn.LayerNorm,
128
+ }
129
+
130
+
131
+ class InternVisionEmbeddings(nn.Module):
132
+ def __init__(self, config: InternVisionConfig):
133
+ super().__init__()
134
+ self.config = config
135
+ self.embed_dim = config.hidden_size
136
+ self.image_size = config.image_size
137
+ self.patch_size = config.patch_size
138
+
139
+ self.class_embedding = nn.Parameter(
140
+ torch.randn(1, 1, self.embed_dim),
141
+ )
142
+
143
+ self.patch_embedding = nn.Conv2d(
144
+ in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
145
+ )
146
+
147
+ self.num_patches = (self.image_size // self.patch_size) ** 2
148
+ self.num_positions = self.num_patches + 1
149
+
150
+ self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))
151
+
152
+ def _get_pos_embed(self, pos_embed, H, W):
153
+ target_dtype = pos_embed.dtype
154
+ pos_embed = pos_embed.float().reshape(
155
+ 1, self.image_size // self.patch_size, self.image_size // self.patch_size, -1).permute(0, 3, 1, 2)
156
+ pos_embed = F.interpolate(pos_embed, size=(H, W), mode='bicubic', align_corners=False).\
157
+ reshape(1, -1, H * W).permute(0, 2, 1).to(target_dtype)
158
+ return pos_embed
159
+
160
+ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
161
+ target_dtype = self.patch_embedding.weight.dtype
162
+ patch_embeds = self.patch_embedding(pixel_values)  # shape = [*, embed_dim, height, width]
163
+ batch_size, _, height, width = patch_embeds.shape
164
+ patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
165
+ class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
166
+ embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
167
+ position_embedding = torch.cat([
168
+ self.position_embedding[:, :1, :],
169
+ self._get_pos_embed(self.position_embedding[:, 1:, :], height, width)
170
+ ], dim=1)
171
+ embeddings = embeddings + position_embedding.to(target_dtype)
172
+ return embeddings
173
+
174
+
175
+ class InternAttention(nn.Module):
176
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
177
+
178
+ def __init__(self, config: InternVisionConfig):
179
+ super().__init__()
180
+ self.config = config
181
+ self.embed_dim = config.hidden_size
182
+ self.num_heads = config.num_attention_heads
183
+ self.use_flash_attn = config.use_flash_attn and has_flash_attn
184
+ if config.use_flash_attn and not has_flash_attn:
185
+ print('Warning: Flash Attention is not available, use_flash_attn is set to False.')
186
+ self.head_dim = self.embed_dim // self.num_heads
187
+ if self.head_dim * self.num_heads != self.embed_dim:
188
+ raise ValueError(
189
+ f'embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:'
190
+ f' {self.num_heads}).'
191
+ )
192
+
193
+ self.scale = self.head_dim ** -0.5
194
+ self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=config.qkv_bias)
195
+ self.attn_drop = nn.Dropout(config.attention_dropout)
196
+ self.proj_drop = nn.Dropout(config.dropout)
197
+
198
+ self.qk_normalization = config.qk_normalization
199
+
200
+ if self.qk_normalization:
201
+ self.q_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
202
+ self.k_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
203
+
204
+ if self.use_flash_attn:
205
+ self.inner_attn = FlashAttention(attention_dropout=config.attention_dropout)
206
+ self.proj = nn.Linear(self.embed_dim, self.embed_dim)
207
+
208
+ def _naive_attn(self, x):
209
+ B, N, C = x.shape
210
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
211
+ q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple)
212
+
213
+ if self.qk_normalization:
214
+ B_, H_, N_, D_ = q.shape
215
+ q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
216
+ k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
217
+
218
+ attn = ((q * self.scale) @ k.transpose(-2, -1))
219
+ attn = attn.softmax(dim=-1)
220
+ attn = self.attn_drop(attn)
221
+
222
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
223
+ x = self.proj(x)
224
+ x = self.proj_drop(x)
225
+ return x
226
+
227
+ def _flash_attn(self, x, key_padding_mask=None, need_weights=False):
228
+ qkv = self.qkv(x)
229
+ qkv = rearrange(qkv, 'b s (three h d) -> b s three h d', three=3, h=self.num_heads)
230
+
231
+ if self.qk_normalization:
232
+ q, k, v = qkv.unbind(2)
233
+ q = self.q_norm(q.flatten(-2, -1)).view(q.shape)
234
+ k = self.k_norm(k.flatten(-2, -1)).view(k.shape)
235
+ qkv = torch.stack([q, k, v], dim=2)
236
+
237
+ context, _ = self.inner_attn(
238
+ qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=False
239
+ )
240
+ outs = self.proj(rearrange(context, 'b s h d -> b s (h d)'))
241
+ outs = self.proj_drop(outs)
242
+ return outs
243
+
244
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
245
+ x = self._naive_attn(hidden_states) if not self.use_flash_attn else self._flash_attn(hidden_states)
246
+ return x
247
+
248
+
249
+ class InternMLP(nn.Module):
250
+ def __init__(self, config: InternVisionConfig):
251
+ super().__init__()
252
+ self.config = config
253
+ self.act = ACT2FN[config.hidden_act]
254
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
255
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
256
+
257
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
258
+ hidden_states = self.fc1(hidden_states)
259
+ hidden_states = self.act(hidden_states)
260
+ hidden_states = self.fc2(hidden_states)
261
+ return hidden_states
262
+
263
+
264
+ class InternVisionEncoderLayer(nn.Module):
265
+ def __init__(self, config: InternVisionConfig, drop_path_rate: float):
266
+ super().__init__()
267
+ self.embed_dim = config.hidden_size
268
+ self.intermediate_size = config.intermediate_size
269
+ self.norm_type = config.norm_type
270
+
271
+ self.attn = InternAttention(config)
272
+ self.mlp = InternMLP(config)
273
+ self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
274
+ self.norm2 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
275
+
276
+ self.ls1 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
277
+ self.ls2 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
278
+ self.drop_path1 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
279
+ self.drop_path2 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
280
+
281
+ def forward(
282
+ self,
283
+ hidden_states: torch.Tensor,
284
+ ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor], Optional[Tuple[torch.FloatTensor]]]:
285
+ """
286
+ Args:
287
+ hidden_states (`Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]`): input to the layer of shape `(batch, seq_len, embed_dim)`
288
+ """
289
+ hidden_states = hidden_states + self.drop_path1(self.attn(self.norm1(hidden_states)) * self.ls1)
290
+
291
+ hidden_states = hidden_states + self.drop_path2(self.mlp(self.norm2(hidden_states)) * self.ls2)
292
+
293
+ return hidden_states
294
+
295
+
296
+ class InternVisionEncoder(nn.Module):
297
+ """
298
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
299
+ [`InternEncoderLayer`].
300
+
301
+ Args:
302
+ config (`InternConfig`):
303
+ The corresponding vision configuration for the `InternEncoder`.
304
+ """
305
+
306
+ def __init__(self, config: InternVisionConfig):
307
+ super().__init__()
308
+ self.config = config
309
+ # stochastic depth decay rule
310
+ dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
311
+ self.layers = nn.ModuleList([
312
+ InternVisionEncoderLayer(config, dpr[idx]) for idx in range(config.num_hidden_layers)])
313
+ self.gradient_checkpointing = True
314
+
315
+ def forward(
316
+ self,
317
+ inputs_embeds,
318
+ output_hidden_states: Optional[bool] = None,
319
+ return_dict: Optional[bool] = None,
320
+ ) -> Union[Tuple, BaseModelOutput]:
321
+ r"""
322
+ Args:
323
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
324
+ Embedded representation of the inputs. Should be float, not int tokens.
325
+ output_hidden_states (`bool`, *optional*):
326
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
327
+ for more detail.
328
+ return_dict (`bool`, *optional*):
329
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
330
+ """
331
+ output_hidden_states = (
332
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
333
+ )
334
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
335
+
336
+ encoder_states = () if output_hidden_states else None
337
+ hidden_states = inputs_embeds
338
+
339
+ for idx, encoder_layer in enumerate(self.layers):
340
+ if output_hidden_states:
341
+ encoder_states = encoder_states + (hidden_states,)
342
+ if self.gradient_checkpointing and self.training:
343
+ layer_outputs = torch.utils.checkpoint.checkpoint(
344
+ encoder_layer,
345
+ hidden_states)
346
+ else:
347
+ layer_outputs = encoder_layer(
348
+ hidden_states,
349
+ )
350
+ hidden_states = layer_outputs
351
+
352
+ if output_hidden_states:
353
+ encoder_states = encoder_states + (hidden_states,)
354
+
355
+ if not return_dict:
356
+ return tuple(v for v in [hidden_states, encoder_states] if v is not None)
357
+ return BaseModelOutput(
358
+ last_hidden_state=hidden_states, hidden_states=encoder_states
359
+ )
360
+
361
+
362
+ class InternVisionModel(PreTrainedModel):
363
+ main_input_name = 'pixel_values'
364
+ config_class = InternVisionConfig
365
+ _no_split_modules = ['InternVisionEncoderLayer']
366
+
367
+ def __init__(self, config: InternVisionConfig):
368
+ super().__init__(config)
369
+ self.config = config
370
+
371
+ self.embeddings = InternVisionEmbeddings(config)
372
+ self.encoder = InternVisionEncoder(config)
373
+
374
+ def resize_pos_embeddings(self, old_size, new_size, patch_size):
375
+ pos_emb = self.embeddings.position_embedding
376
+ _, num_positions, embed_dim = pos_emb.shape
377
+ cls_emb = pos_emb[:, :1, :]
378
+ pos_emb = pos_emb[:, 1:, :].reshape(1, old_size // patch_size, old_size // patch_size, -1).permute(0, 3, 1, 2)
379
+ pos_emb = F.interpolate(pos_emb.float(), size=new_size // patch_size, mode='bicubic', align_corners=False)
380
+ pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, -1).permute(0, 2, 1)
381
+ pos_emb = torch.cat([cls_emb, pos_emb], dim=1)
382
+ self.embeddings.position_embedding = nn.Parameter(pos_emb)
383
+ self.embeddings.image_size = new_size
384
+ logger.info('Resized position embeddings from {} to {}'.format(old_size, new_size))
385
+
386
+ def get_input_embeddings(self):
387
+ return self.embeddings
388
+
389
+ def forward(
390
+ self,
391
+ pixel_values: Optional[torch.FloatTensor] = None,
392
+ output_hidden_states: Optional[bool] = None,
393
+ return_dict: Optional[bool] = None,
394
+ pixel_embeds: Optional[torch.FloatTensor] = None,
395
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
396
+ output_hidden_states = (
397
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
398
+ )
399
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
400
+
401
+ if pixel_values is None and pixel_embeds is None:
402
+ raise ValueError('You have to specify pixel_values or pixel_embeds')
403
+
404
+ if pixel_embeds is not None:
405
+ hidden_states = pixel_embeds
406
+ else:
407
+ if len(pixel_values.shape) == 4:
408
+ hidden_states = self.embeddings(pixel_values)
409
+ else:
410
+ raise ValueError(f'wrong pixel_values size: {pixel_values.shape}')
411
+ encoder_outputs = self.encoder(
412
+ inputs_embeds=hidden_states,
413
+ output_hidden_states=output_hidden_states,
414
+ return_dict=return_dict,
415
+ )
416
+ last_hidden_state = encoder_outputs.last_hidden_state
417
+ pooled_output = last_hidden_state[:, 0, :]
418
+
419
+ if not return_dict:
420
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
421
+
422
+ return BaseModelOutputWithPooling(
423
+ last_hidden_state=last_hidden_state,
424
+ pooler_output=pooled_output,
425
+ hidden_states=encoder_outputs.hidden_states,
426
+ attentions=encoder_outputs.attentions,
427
+ )
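
A minimal sketch of exercising the vision encoder defined above, assuming it is run from a checkout of this repo (so `configuration_intern_vit.py` and `modeling_intern_vit.py` import directly) and that `InternVisionConfig` provides usable defaults; the random tensor stands in for a real preprocessed image batch, and the default config may describe the full-size tower, so treat this as illustrative rather than fast.

# Minimal sketch (not an official example): run the vision encoder above on a dummy batch.
import torch
from configuration_intern_vit import InternVisionConfig   # companion file in this repo
from modeling_intern_vit import InternVisionModel

config = InternVisionConfig()          # assumption: the config class ships sensible defaults
config.use_flash_attn = False          # keep the naive attention path so this also runs on CPU
model = InternVisionModel(config).eval()

pixel_values = torch.randn(1, 3, config.image_size, config.image_size)  # stand-in image batch
with torch.no_grad():
    out = model(pixel_values=pixel_values, return_dict=True)

print(out.pooler_output.shape)        # (1, hidden_size): CLS-pooled feature
print(out.last_hidden_state.shape)    # (1, 1 + num_patches, hidden_size)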
modeling_voicelm.py ADDED
@@ -0,0 +1,192 @@
1
+ # --------------------------------------------------------
2
+ # SenseTime
3
+ # Copyright (c) 2025 SenseTime
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ from typing import List
7
+ import math
8
+ import torch
9
+ from torch import nn
10
+ from transformers import Qwen2ForCausalLM
11
+ from transformers import PreTrainedModel
12
+ import logging
13
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
14
+ logger = logging.getLogger(__name__)
15
+
16
+ from .configuration_voicelm import VoiceLMConfig
17
+
18
+ class Qwen2Encoder(torch.nn.Module):
19
+ def __init__(self, config):
20
+ super().__init__()
21
+ self.model = Qwen2ForCausalLM(config)
22
+ pass
23
+
24
+ def forward_one_step(self, xs, masks, cache=None):
25
+ input_masks = masks[:, -1, :]
26
+ outs = self.model(
27
+ inputs_embeds=xs,
28
+ attention_mask=input_masks,
29
+ output_hidden_states=True,
30
+ return_dict=True,
31
+ use_cache=True,
32
+ past_key_values=cache,
33
+ )
34
+ xs = outs.hidden_states[-1]
35
+ new_cache = outs.past_key_values
36
+ return xs, new_cache
37
+
38
+ class VoiceLM(PreTrainedModel):
39
+ """
40
+ VoiceLM: autoregressive speech-token language model on a Qwen2 backbone.
41
+ """
42
+ def __init__(self, config: VoiceLMConfig):
43
+ super().__init__(config)
44
+ self.llm_input_size = config.llm_input_size
45
+ self.llm_output_size = config.llm_output_size
46
+ self.speech_token_size = config.speech_token_size # 6561
47
+ self.sampling_config = config.sampling_config
48
+
49
+ self.sos_eos = 0
50
+ self.task_id = 1
51
+ self.fill_token = 2
52
+
53
+ self.llm_embedding = torch.nn.Embedding(2, config.llm_input_size)
54
+ self.llm = Qwen2Encoder(config.llm_config)
55
+ self.llm_decoder = nn.Linear(config.llm_output_size, config.speech_token_size + 3)
56
+
57
+ # speech token embedding (6564, 896)
58
+ self.speech_embedding = torch.nn.Embedding(
59
+ config.speech_token_size + 3,
60
+ config.llm_input_size,
61
+ )
62
+ pass
63
+
64
+ # Repetition Aware Sampling in VALL-E 2
65
+ def ras_sampling(self, weighted_scores:torch.Tensor, decoded_tokens, sampling, top_p=0.8, top_k=25, win_size=10, tau_r=0.1):
66
+ top_ids = self.nucleus_sampling(weighted_scores, top_p=top_p, top_k=top_k)
67
+ rep_num = (torch.tensor(decoded_tokens[-win_size:]).to(weighted_scores.device) == top_ids).sum().item()
68
+ if rep_num >= win_size * tau_r:
69
+ top_ids = self.random_sampling(weighted_scores, decoded_tokens, sampling)
70
+ return top_ids
71
+
72
+ def nucleus_sampling(self, weighted_scores:torch.Tensor, top_p=0.8, top_k=25):
73
+ prob, indices = [], []
74
+ cum_prob = 0.0
75
+ sorted_value, sorted_idx = weighted_scores.softmax(dim=0).sort(descending=True, stable=True)
76
+ for i in range(len(sorted_idx)):
77
+ # sampling both top-p and numbers.
78
+ if cum_prob < top_p and len(prob) < top_k:
79
+ cum_prob += sorted_value[i]
80
+ prob.append(sorted_value[i])
81
+ indices.append(sorted_idx[i])
82
+ else:
83
+ break
84
+ prob = torch.tensor(prob).to(weighted_scores)
85
+ indices = torch.tensor(indices, dtype=torch.long).to(weighted_scores.device)
86
+ top_ids = indices[prob.multinomial(1, replacement=True)]
87
+ return top_ids
88
+
89
+ def random_sampling(self, weighted_scores:torch.Tensor, decoded_tokens, sampling):
90
+ top_ids = weighted_scores.softmax(dim=0).multinomial(1, replacement=True)
91
+ return top_ids
92
+
93
+ def sampling_ids(
94
+ self,
95
+ weighted_scores: torch.Tensor,
96
+ decoded_tokens: List,
97
+ sampling: int,
98
+ ignore_eos: bool = True,
99
+ ):
100
+ num_trials, max_trials = 0, 100
101
+ while True:
102
+ top_ids = self.ras_sampling(weighted_scores, decoded_tokens, sampling, **self.sampling_config)
103
+ if (not ignore_eos) or (self.speech_token_size not in top_ids):
104
+ break
105
+ num_trials += 1
106
+ if num_trials > max_trials:
107
+ raise RuntimeError('sampling reaches max_trials {} and still get eos when ignore_eos is True, check your input!'.format(max_trials))
108
+ return top_ids
109
+
110
+ @torch.inference_mode()
111
+ def inference_bistream(
112
+ self,
113
+ input_feature: torch.Tensor,
114
+ target_text_feature: torch.Tensor,
115
+ sampling: int = 25,
116
+ mix_ratio: List[int] = [5, 25],
117
+ ):
118
+ text_token_len = target_text_feature.size(1)
119
+ # 1. prepare input
120
+ sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
121
+ task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
122
+ lm_input = torch.concat([sos_eos_emb, input_feature], dim=1)
123
+
124
+ # 2. iterate text
125
+ out_tokens = []
126
+ return_out_tokens = []
127
+ cache = None
128
+
129
+ text_cache = target_text_feature
130
+ next_fill_index = -1
131
+
132
+ for j in range(int(math.floor((text_token_len) / mix_ratio[0] ))):
133
+ if (len(out_tokens) != 0 and out_tokens[-1] == self.speech_token_size + 2) or (len(out_tokens) == 0 and lm_input.size(1) == (1 + input_feature.size(1))):
134
+ logger.info('get fill token, need to append more text token')
135
+ if text_cache.size(1) >= mix_ratio[0]:
136
+ lm_input_text = text_cache[:, :mix_ratio[0]]
137
+ logger.info('append {} text token'.format(lm_input_text.size(1)))
138
+ if len(out_tokens) != 0 and out_tokens[-1] == self.speech_token_size + 2:
139
+ lm_input = lm_input_text
140
+ else:
141
+ lm_input = torch.concat([lm_input, lm_input_text], dim=1)
142
+ text_cache = text_cache[:, mix_ratio[0]:]
143
+ else:
144
+ logger.info('not enough text token to decode, wait for more')
145
+ continue
146
+ while True:
147
+ seq_len = lm_input.shape[1] if cache is None else lm_input.shape[1] + cache[0][0].size(2)
148
+ y_pred, cache = self.llm.forward_one_step(lm_input,
149
+ masks=torch.tril(torch.ones((1, seq_len, seq_len), device=lm_input.device)).to(torch.bool),
150
+ cache=cache)
151
+ logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
152
+ if next_fill_index != -1 and len(out_tokens) == next_fill_index:
153
+ top_ids = self.speech_token_size + 2
154
+ next_fill_index += (mix_ratio[1] + 1)
155
+ else:
156
+ top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True).item()
157
+ if top_ids == self.speech_token_size + 2:
158
+ next_fill_index = len(out_tokens) + mix_ratio[1] + 1
159
+ logger.info('fill_token index {} next fill_token index {}'.format(len(out_tokens), next_fill_index))
160
+ out_tokens.append(top_ids)
161
+ if top_ids >= self.speech_token_size:
162
+ if top_ids == self.speech_token_size + 2:
163
+ break
164
+ else:
165
+ raise ValueError('should not get token {}'.format(top_ids))
166
+ # yield top_ids
167
+
168
+ return_out_tokens.append(top_ids)
169
+ lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
170
+
171
+ # 3. final decode
172
+ lm_input = torch.concat([lm_input, text_cache, task_id_emb], dim=1)
173
+ logger.info('no more text token, decode until met eos')
174
+ while True:
175
+ seq_len = lm_input.shape[1] if cache is None else lm_input.shape[1] + cache[0][0].size(2)
176
+ y_pred, cache = self.llm.forward_one_step(lm_input,
177
+ masks=torch.tril(torch.ones((1, seq_len, seq_len), device=lm_input.device)).to(torch.bool),
178
+ cache=cache)
179
+ logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
180
+ top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=False).item()
181
+ out_tokens.append(top_ids)
182
+ if top_ids >= self.speech_token_size:
183
+ if top_ids == self.speech_token_size:
184
+ break
185
+ else:
186
+ raise ValueError('should not get token {}'.format(top_ids))
187
+ # in stream mode, yield token one by one
188
+ # yield top_ids
189
+
190
+ return_out_tokens.append(top_ids)
191
+ lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
192
+ return return_out_tokens
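
The decode loop above leans on nucleus sampling with a repetition-aware fallback (`ras_sampling`, after VALL-E 2). Below is a standalone sketch of that sampling logic only; the top_p/top_k/win_size/tau_r values are illustrative defaults, not this repo's `sampling_config`, and the random logits stand in for the scores produced by `llm_decoder`.

# Standalone sketch of repetition-aware nucleus sampling; thresholds are illustrative.
import torch

def nucleus_sample(logits: torch.Tensor, top_p: float = 0.8, top_k: int = 25) -> int:
    probs = logits.softmax(dim=0)
    sorted_p, sorted_idx = probs.sort(descending=True, stable=True)
    keep = min(top_k, int((sorted_p.cumsum(dim=0) < top_p).sum().item()) + 1)  # smallest top-p set
    kept_p, kept_idx = sorted_p[:keep], sorted_idx[:keep]
    return int(kept_idx[kept_p.multinomial(1)])

def ras_sample(logits: torch.Tensor, decoded: list, top_p=0.8, top_k=25, win_size=10, tau_r=0.1) -> int:
    tok = nucleus_sample(logits, top_p, top_k)
    if decoded[-win_size:].count(tok) >= win_size * tau_r:   # token repeats too often in the window
        tok = int(logits.softmax(dim=0).multinomial(1))       # fall back to the full distribution
    return tok

decoded = []
for _ in range(20):
    logits = torch.randn(6561 + 3)     # dummy scores with the speech vocab size used above
    decoded.append(ras_sample(logits, decoded))
print(decoded)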
modeling_whisper.py ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,330 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>",
16
+ "<IMG_CONTEXT>",
17
+ "<img>",
18
+ "</img>",
19
+ "<quad>",
20
+ "</quad>",
21
+ "<ref>",
22
+ "</ref>",
23
+ "<box>",
24
+ "</box>",
25
+ "<|action_start|>",
26
+ "<|action_end|>",
27
+ "<|plugin|>",
28
+ "<|interpreter|>",
29
+ "<FAKE_PAD_0>",
30
+ "<FAKE_PAD_1>",
31
+ "<FAKE_PAD_2>",
32
+ "<FAKE_PAD_3>",
33
+ "<FAKE_PAD_4>",
34
+ "<FAKE_PAD_5>",
35
+ "<FAKE_PAD_6>",
36
+ "<FAKE_PAD_7>",
37
+ "<FAKE_PAD_8>",
38
+ "<FAKE_PAD_9>",
39
+ "<FAKE_PAD_10>",
40
+ "<FAKE_PAD_11>",
41
+ "<FAKE_PAD_12>",
42
+ "<FAKE_PAD_13>",
43
+ "<FAKE_PAD_14>",
44
+ "<FAKE_PAD_15>",
45
+ "<FAKE_PAD_16>",
46
+ "<FAKE_PAD_17>",
47
+ "<FAKE_PAD_18>",
48
+ "<FAKE_PAD_19>",
49
+ "<FAKE_PAD_20>",
50
+ "<FAKE_PAD_21>",
51
+ "<FAKE_PAD_22>",
52
+ "<FAKE_PAD_23>",
53
+ "<FAKE_PAD_24>",
54
+ "<FAKE_PAD_25>",
55
+ "<FAKE_PAD_26>",
56
+ "<FAKE_PAD_27>",
57
+ "<FAKE_PAD_28>",
58
+ "<FAKE_PAD_29>",
59
+ "<FAKE_PAD_30>",
60
+ "<FAKE_PAD_31>",
61
+ "<FAKE_PAD_32>",
62
+ "<FAKE_PAD_33>",
63
+ "<FAKE_PAD_34>",
64
+ "<FAKE_PAD_35>",
65
+ "<FAKE_PAD_36>",
66
+ "<FAKE_PAD_37>",
67
+ "<FAKE_PAD_38>",
68
+ "<FAKE_PAD_39>",
69
+ "<FAKE_PAD_40>",
70
+ "<FAKE_PAD_41>",
71
+ "<FAKE_PAD_42>",
72
+ "<FAKE_PAD_43>",
73
+ "<FAKE_PAD_44>",
74
+ "<FAKE_PAD_45>",
75
+ "<FAKE_PAD_46>",
76
+ "<FAKE_PAD_47>",
77
+ "<FAKE_PAD_48>",
78
+ "<FAKE_PAD_49>",
79
+ "<FAKE_PAD_50>",
80
+ "<FAKE_PAD_51>",
81
+ "<FAKE_PAD_52>",
82
+ "<FAKE_PAD_53>",
83
+ "<FAKE_PAD_54>",
84
+ "<FAKE_PAD_55>",
85
+ "<FAKE_PAD_56>",
86
+ "<FAKE_PAD_57>",
87
+ "<FAKE_PAD_58>",
88
+ "<FAKE_PAD_59>",
89
+ "<FAKE_PAD_60>",
90
+ "<FAKE_PAD_61>",
91
+ "<FAKE_PAD_62>",
92
+ "<FAKE_PAD_63>",
93
+ "<FAKE_PAD_64>",
94
+ "<FAKE_PAD_65>",
95
+ "<FAKE_PAD_66>",
96
+ "<FAKE_PAD_67>",
97
+ "<FAKE_PAD_68>",
98
+ "<FAKE_PAD_69>",
99
+ "<FAKE_PAD_70>",
100
+ "<FAKE_PAD_71>",
101
+ "<FAKE_PAD_72>",
102
+ "<FAKE_PAD_73>",
103
+ "<FAKE_PAD_74>",
104
+ "<FAKE_PAD_75>",
105
+ "<FAKE_PAD_76>",
106
+ "<FAKE_PAD_77>",
107
+ "<FAKE_PAD_78>",
108
+ "<FAKE_PAD_79>",
109
+ "<FAKE_PAD_80>",
110
+ "<FAKE_PAD_81>",
111
+ "<FAKE_PAD_82>",
112
+ "<FAKE_PAD_83>",
113
+ "<FAKE_PAD_84>",
114
+ "<FAKE_PAD_85>",
115
+ "<FAKE_PAD_86>",
116
+ "<FAKE_PAD_87>",
117
+ "<FAKE_PAD_88>",
118
+ "<FAKE_PAD_89>",
119
+ "<FAKE_PAD_90>",
120
+ "<FAKE_PAD_91>",
121
+ "<FAKE_PAD_92>",
122
+ "<FAKE_PAD_93>",
123
+ "<FAKE_PAD_94>",
124
+ "<FAKE_PAD_95>",
125
+ "<FAKE_PAD_96>",
126
+ "<FAKE_PAD_97>",
127
+ "<FAKE_PAD_98>",
128
+ "<FAKE_PAD_99>",
129
+ "<FAKE_PAD_100>",
130
+ "<FAKE_PAD_101>",
131
+ "<FAKE_PAD_102>",
132
+ "<FAKE_PAD_103>",
133
+ "<FAKE_PAD_104>",
134
+ "<FAKE_PAD_105>",
135
+ "<FAKE_PAD_106>",
136
+ "<FAKE_PAD_107>",
137
+ "<FAKE_PAD_108>",
138
+ "<FAKE_PAD_109>",
139
+ "<FAKE_PAD_110>",
140
+ "<FAKE_PAD_111>",
141
+ "<FAKE_PAD_112>",
142
+ "<FAKE_PAD_113>",
143
+ "<FAKE_PAD_114>",
144
+ "<FAKE_PAD_115>",
145
+ "<FAKE_PAD_116>",
146
+ "<FAKE_PAD_117>",
147
+ "<FAKE_PAD_118>",
148
+ "<FAKE_PAD_119>",
149
+ "<FAKE_PAD_120>",
150
+ "<FAKE_PAD_121>",
151
+ "<FAKE_PAD_122>",
152
+ "<FAKE_PAD_123>",
153
+ "<FAKE_PAD_124>",
154
+ "<FAKE_PAD_125>",
155
+ "<FAKE_PAD_126>",
156
+ "<FAKE_PAD_127>",
157
+ "<FAKE_PAD_128>",
158
+ "<FAKE_PAD_129>",
159
+ "<FAKE_PAD_130>",
160
+ "<FAKE_PAD_131>",
161
+ "<FAKE_PAD_132>",
162
+ "<FAKE_PAD_133>",
163
+ "<FAKE_PAD_134>",
164
+ "<FAKE_PAD_135>",
165
+ "<FAKE_PAD_136>",
166
+ "<FAKE_PAD_137>",
167
+ "<FAKE_PAD_138>",
168
+ "<FAKE_PAD_139>",
169
+ "<FAKE_PAD_140>",
170
+ "<FAKE_PAD_141>",
171
+ "<FAKE_PAD_142>",
172
+ "<FAKE_PAD_143>",
173
+ "<FAKE_PAD_144>",
174
+ "<FAKE_PAD_145>",
175
+ "<FAKE_PAD_146>",
176
+ "<FAKE_PAD_147>",
177
+ "<FAKE_PAD_148>",
178
+ "<FAKE_PAD_149>",
179
+ "<FAKE_PAD_150>",
180
+ "<FAKE_PAD_151>",
181
+ "<FAKE_PAD_152>",
182
+ "<FAKE_PAD_153>",
183
+ "<FAKE_PAD_154>",
184
+ "<FAKE_PAD_155>",
185
+ "<FAKE_PAD_156>",
186
+ "<FAKE_PAD_157>",
187
+ "<FAKE_PAD_158>",
188
+ "<FAKE_PAD_159>",
189
+ "<FAKE_PAD_160>",
190
+ "<FAKE_PAD_161>",
191
+ "<FAKE_PAD_162>",
192
+ "<FAKE_PAD_163>",
193
+ "<FAKE_PAD_164>",
194
+ "<FAKE_PAD_165>",
195
+ "<FAKE_PAD_166>",
196
+ "<FAKE_PAD_167>",
197
+ "<FAKE_PAD_168>",
198
+ "<FAKE_PAD_169>",
199
+ "<FAKE_PAD_170>",
200
+ "<FAKE_PAD_171>",
201
+ "<FAKE_PAD_172>",
202
+ "<FAKE_PAD_173>",
203
+ "<FAKE_PAD_174>",
204
+ "<FAKE_PAD_175>",
205
+ "<FAKE_PAD_176>",
206
+ "<FAKE_PAD_177>",
207
+ "<FAKE_PAD_178>",
208
+ "<FAKE_PAD_179>",
209
+ "<FAKE_PAD_180>",
210
+ "<FAKE_PAD_181>",
211
+ "<FAKE_PAD_182>",
212
+ "<FAKE_PAD_183>",
213
+ "<FAKE_PAD_184>",
214
+ "<FAKE_PAD_185>",
215
+ "<FAKE_PAD_186>",
216
+ "<FAKE_PAD_187>",
217
+ "<FAKE_PAD_188>",
218
+ "<FAKE_PAD_189>",
219
+ "<FAKE_PAD_190>",
220
+ "<FAKE_PAD_191>",
221
+ "<FAKE_PAD_192>",
222
+ "<FAKE_PAD_193>",
223
+ "<FAKE_PAD_194>",
224
+ "<FAKE_PAD_195>",
225
+ "<FAKE_PAD_196>",
226
+ "<FAKE_PAD_197>",
227
+ "<FAKE_PAD_198>",
228
+ "<FAKE_PAD_199>",
229
+ "<FAKE_PAD_200>",
230
+ "<FAKE_PAD_201>",
231
+ "<FAKE_PAD_202>",
232
+ "<FAKE_PAD_203>",
233
+ "<FAKE_PAD_204>",
234
+ "<FAKE_PAD_205>",
235
+ "<FAKE_PAD_206>",
236
+ "<FAKE_PAD_207>",
237
+ "<FAKE_PAD_208>",
238
+ "<FAKE_PAD_209>",
239
+ "<FAKE_PAD_210>",
240
+ "<FAKE_PAD_211>",
241
+ "<FAKE_PAD_212>",
242
+ "<FAKE_PAD_213>",
243
+ "<FAKE_PAD_214>",
244
+ "<FAKE_PAD_215>",
245
+ "<FAKE_PAD_216>",
246
+ "<FAKE_PAD_217>",
247
+ "<FAKE_PAD_218>",
248
+ "<FAKE_PAD_219>",
249
+ "<FAKE_PAD_220>",
250
+ "<FAKE_PAD_221>",
251
+ "<FAKE_PAD_222>",
252
+ "<FAKE_PAD_223>",
253
+ "<FAKE_PAD_224>",
254
+ "<FAKE_PAD_225>",
255
+ "<FAKE_PAD_226>",
256
+ "<FAKE_PAD_227>",
257
+ "<FAKE_PAD_228>",
258
+ "<FAKE_PAD_229>",
259
+ "<FAKE_PAD_230>",
260
+ "<FAKE_PAD_231>",
261
+ "<FAKE_PAD_232>",
262
+ "<FAKE_PAD_233>",
263
+ "<FAKE_PAD_234>",
264
+ "<FAKE_PAD_235>",
265
+ "<FAKE_PAD_236>",
266
+ "<FAKE_PAD_237>",
267
+ "<FAKE_PAD_238>",
268
+ "<FAKE_PAD_239>",
269
+ "<FAKE_PAD_240>",
270
+ "<FAKE_PAD_241>",
271
+ "<FAKE_PAD_242>",
272
+ "<FAKE_PAD_243>",
273
+ "<FAKE_PAD_244>",
274
+ "<FAKE_PAD_245>",
275
+ "<FAKE_PAD_246>",
276
+ "<FAKE_PAD_247>",
277
+ "<FAKE_PAD_248>",
278
+ "<FAKE_PAD_249>",
279
+ "<FAKE_PAD_250>",
280
+ "<FAKE_PAD_251>",
281
+ "<FAKE_PAD_252>",
282
+ "<FAKE_PAD_253>",
283
+ "<audio>",
284
+ "</audio>",
285
+ "<AUDIO_CONTEXT>",
286
+ "<interrupt>",
287
+ "<FAKE_PAD_PAD_0>",
288
+ "<FAKE_PAD_PAD_1>",
289
+ "<FAKE_PAD_PAD_2>",
290
+ "<FAKE_PAD_PAD_3>",
291
+ "<FAKE_PAD_PAD_4>",
292
+ "<FAKE_PAD_PAD_5>",
293
+ "<FAKE_PAD_PAD_6>",
294
+ "<FAKE_PAD_PAD_7>",
295
+ "<FAKE_PAD_PAD_8>",
296
+ "<FAKE_PAD_PAD_9>",
297
+ "<FAKE_PAD_PAD_10>",
298
+ "<FAKE_PAD_PAD_11>",
299
+ "<FAKE_PAD_PAD_12>",
300
+ "<FAKE_PAD_PAD_13>",
301
+ "<FAKE_PAD_PAD_14>",
302
+ "<FAKE_PAD_PAD_15>",
303
+ "<FAKE_PAD_PAD_16>",
304
+ "<FAKE_PAD_PAD_17>",
305
+ "<FAKE_PAD_PAD_18>",
306
+ "<FAKE_PAD_PAD_19>",
307
+ "<FAKE_PAD_PAD_20>",
308
+ "<FAKE_PAD_PAD_21>",
309
+ "<FAKE_PAD_PAD_22>",
310
+ "<FAKE_PAD_PAD_23>",
311
+ "<FAKE_PAD_PAD_24>",
312
+ "<FAKE_PAD_PAD_25>",
313
+ "<FAKE_PAD_PAD_26>",
314
+ "<FAKE_PAD_PAD_27>"
315
+ ],
316
+ "eos_token": {
317
+ "content": "<|im_end|>",
318
+ "lstrip": false,
319
+ "normalized": false,
320
+ "rstrip": false,
321
+ "single_word": false
322
+ },
323
+ "pad_token": {
324
+ "content": "<|endoftext|>",
325
+ "lstrip": false,
326
+ "normalized": false,
327
+ "rstrip": false,
328
+ "single_word": false
329
+ }
330
+ }
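
A short sketch of consuming the token map above through `transformers`; the path below is a placeholder for wherever this repo is checked out, and only the standard `AutoTokenizer` API is assumed.

# Sketch: load the tokenizer shipped in this repo and inspect the special tokens above.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('/path/to/this/repo', trust_remote_code=True)  # placeholder path

print(tok.eos_token, tok.pad_token)                    # expected: <|im_end|> <|endoftext|>
for t in ('<audio>', '</audio>', '<AUDIO_CONTEXT>', '<interrupt>'):
    print(t, tok.convert_tokens_to_ids(t))             # each audio marker maps to a single id

print(tok.tokenize('<audio><AUDIO_CONTEXT></audio>'))  # special tokens are never split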
taozi.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3d286d93323ff1ed598503c40cf028dc3faa946c662fa8d509b201165d56356
3
+ size 807404
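
The three lines above are a Git LFS pointer, so a plain `git clone` leaves only this stub behind; after `git lfs pull`, the fetched audio can be checked against the pointer's digest and size, as in this small sketch.

# Sketch: verify the fetched taozi.wav against the LFS pointer above.
import hashlib

with open('taozi.wav', 'rb') as f:
    data = f.read()

print(len(data))                           # should equal the pointer's size (807404 bytes)
print(hashlib.sha256(data).hexdigest())    # should equal the pointer's sha256 oid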
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,2931 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "151643": {
7
+ "content": "<|endoftext|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "151644": {
15
+ "content": "<|im_start|>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "151645": {
23
+ "content": "<|im_end|>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "151646": {
31
+ "content": "<|object_ref_start|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "151647": {
39
+ "content": "<|object_ref_end|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ },
46
+ "151648": {
47
+ "content": "<|box_start|>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": true
53
+ },
54
+ "151649": {
55
+ "content": "<|box_end|>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": true
61
+ },
62
+ "151650": {
63
+ "content": "<|quad_start|>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": true
69
+ },
70
+ "151651": {
71
+ "content": "<|quad_end|>",
72
+ "lstrip": false,
73
+ "normalized": false,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": true
77
+ },
78
+ "151652": {
79
+ "content": "<|vision_start|>",
80
+ "lstrip": false,
81
+ "normalized": false,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": true
85
+ },
86
+ "151653": {
87
+ "content": "<|vision_end|>",
88
+ "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": true
93
+ },
94
+ "151654": {
95
+ "content": "<|vision_pad|>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": true
101
+ },
102
+ "151655": {
103
+ "content": "<|image_pad|>",
104
+ "lstrip": false,
105
+ "normalized": false,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": true
109
+ },
110
+ "151656": {
111
+ "content": "<|video_pad|>",
112
+ "lstrip": false,
113
+ "normalized": false,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": true
117
+ },
118
+ "151657": {
119
+ "content": "<tool_call>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": false
125
+ },
126
+ "151658": {
127
+ "content": "</tool_call>",
128
+ "lstrip": false,
129
+ "normalized": false,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": false
133
+ },
134
+ "151659": {
135
+ "content": "<|fim_prefix|>",
136
+ "lstrip": false,
137
+ "normalized": false,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": false
141
+ },
142
+ "151660": {
143
+ "content": "<|fim_middle|>",
144
+ "lstrip": false,
145
+ "normalized": false,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": false
149
+ },
150
+ "151661": {
151
+ "content": "<|fim_suffix|>",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": false
157
+ },
158
+ "151662": {
159
+ "content": "<|fim_pad|>",
160
+ "lstrip": false,
161
+ "normalized": false,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": false
165
+ },
166
+ "151663": {
167
+ "content": "<|repo_name|>",
168
+ "lstrip": false,
169
+ "normalized": false,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": false
173
+ },
174
+ "151664": {
175
+ "content": "<|file_sep|>",
176
+ "lstrip": false,
177
+ "normalized": false,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": false
181
+ },
182
+ "151665": {
183
+ "content": "<tool_response>",
184
+ "lstrip": false,
185
+ "normalized": false,
186
+ "rstrip": false,
187
+ "single_word": false,
188
+ "special": false
189
+ },
190
+ "151666": {
191
+ "content": "</tool_response>",
192
+ "lstrip": false,
193
+ "normalized": false,
194
+ "rstrip": false,
195
+ "single_word": false,
196
+ "special": false
197
+ },
198
+ "151667": {
199
+ "content": "<think>",
200
+ "lstrip": false,
201
+ "normalized": false,
202
+ "rstrip": false,
203
+ "single_word": false,
204
+ "special": false
205
+ },
206
+ "151668": {
207
+ "content": "</think>",
208
+ "lstrip": false,
209
+ "normalized": false,
210
+ "rstrip": false,
211
+ "single_word": false,
212
+ "special": false
213
+ },
214
+ "151669": {
215
+ "content": "<IMG_CONTEXT>",
216
+ "lstrip": false,
217
+ "normalized": false,
218
+ "rstrip": false,
219
+ "single_word": false,
220
+ "special": true
221
+ },
222
+ "151670": {
223
+ "content": "<img>",
224
+ "lstrip": false,
225
+ "normalized": false,
226
+ "rstrip": false,
227
+ "single_word": false,
228
+ "special": true
229
+ },
230
+ "151671": {
231
+ "content": "</img>",
232
+ "lstrip": false,
233
+ "normalized": false,
234
+ "rstrip": false,
235
+ "single_word": false,
236
+ "special": true
237
+ },
238
+ "151672": {
239
+ "content": "<quad>",
240
+ "lstrip": false,
241
+ "normalized": false,
242
+ "rstrip": false,
243
+ "single_word": false,
244
+ "special": true
245
+ },
246
+ "151673": {
247
+ "content": "</quad>",
248
+ "lstrip": false,
249
+ "normalized": false,
250
+ "rstrip": false,
251
+ "single_word": false,
252
+ "special": true
253
+ },
254
+ "151674": {
255
+ "content": "<ref>",
256
+ "lstrip": false,
257
+ "normalized": false,
258
+ "rstrip": false,
259
+ "single_word": false,
260
+ "special": true
261
+ },
262
+ "151675": {
263
+ "content": "</ref>",
264
+ "lstrip": false,
265
+ "normalized": false,
266
+ "rstrip": false,
267
+ "single_word": false,
268
+ "special": true
269
+ },
270
+ "151676": {
271
+ "content": "<box>",
272
+ "lstrip": false,
273
+ "normalized": false,
274
+ "rstrip": false,
275
+ "single_word": false,
276
+ "special": true
277
+ },
278
+ "151677": {
279
+ "content": "</box>",
280
+ "lstrip": false,
281
+ "normalized": false,
282
+ "rstrip": false,
283
+ "single_word": false,
284
+ "special": true
285
+ },
286
+ "151678": {
287
+ "content": "<|action_start|>",
288
+ "lstrip": false,
289
+ "normalized": false,
290
+ "rstrip": false,
291
+ "single_word": false,
292
+ "special": true
293
+ },
294
+ "151679": {
295
+ "content": "<|action_end|>",
296
+ "lstrip": false,
297
+ "normalized": false,
298
+ "rstrip": false,
299
+ "single_word": false,
300
+ "special": true
301
+ },
302
+ "151680": {
303
+ "content": "<|plugin|>",
304
+ "lstrip": false,
305
+ "normalized": false,
306
+ "rstrip": false,
307
+ "single_word": false,
308
+ "special": true
309
+ },
310
+ "151681": {
311
+ "content": "<|interpreter|>",
312
+ "lstrip": false,
313
+ "normalized": false,
314
+ "rstrip": false,
315
+ "single_word": false,
316
+ "special": true
317
+ },
318
+ "151682": {
319
+ "content": "<FAKE_PAD_0>",
320
+ "lstrip": false,
321
+ "normalized": false,
322
+ "rstrip": false,
323
+ "single_word": false,
324
+ "special": true
325
+ },
326
+ "151683": {
327
+ "content": "<FAKE_PAD_1>",
328
+ "lstrip": false,
329
+ "normalized": false,
330
+ "rstrip": false,
331
+ "single_word": false,
332
+ "special": true
333
+ },
334
+ "151684": {
335
+ "content": "<FAKE_PAD_2>",
336
+ "lstrip": false,
337
+ "normalized": false,
338
+ "rstrip": false,
339
+ "single_word": false,
340
+ "special": true
341
+ },
342
+ "151685": {
343
+ "content": "<FAKE_PAD_3>",
344
+ "lstrip": false,
345
+ "normalized": false,
346
+ "rstrip": false,
347
+ "single_word": false,
348
+ "special": true
349
+ },
350
+ "151686": {
351
+ "content": "<FAKE_PAD_4>",
352
+ "lstrip": false,
353
+ "normalized": false,
354
+ "rstrip": false,
355
+ "single_word": false,
356
+ "special": true
357
+ },
358
+ "151687": {
359
+ "content": "<FAKE_PAD_5>",
360
+ "lstrip": false,
361
+ "normalized": false,
362
+ "rstrip": false,
363
+ "single_word": false,
364
+ "special": true
365
+ },
366
+ "151688": {
367
+ "content": "<FAKE_PAD_6>",
368
+ "lstrip": false,
369
+ "normalized": false,
370
+ "rstrip": false,
371
+ "single_word": false,
372
+ "special": true
373
+ },
374
+ "151689": {
375
+ "content": "<FAKE_PAD_7>",
376
+ "lstrip": false,
377
+ "normalized": false,
378
+ "rstrip": false,
379
+ "single_word": false,
380
+ "special": true
381
+ },
382
+ "151690": {
383
+ "content": "<FAKE_PAD_8>",
384
+ "lstrip": false,
385
+ "normalized": false,
386
+ "rstrip": false,
387
+ "single_word": false,
388
+ "special": true
389
+ },
390
+ "151691": {
391
+ "content": "<FAKE_PAD_9>",
392
+ "lstrip": false,
393
+ "normalized": false,
394
+ "rstrip": false,
395
+ "single_word": false,
396
+ "special": true
397
+ },
398
+ "151692": {
399
+ "content": "<FAKE_PAD_10>",
400
+ "lstrip": false,
401
+ "normalized": false,
402
+ "rstrip": false,
403
+ "single_word": false,
404
+ "special": true
405
+ },
406
+ "151693": {
407
+ "content": "<FAKE_PAD_11>",
408
+ "lstrip": false,
409
+ "normalized": false,
410
+ "rstrip": false,
411
+ "single_word": false,
412
+ "special": true
413
+ },
414
+ "151694": {
415
+ "content": "<FAKE_PAD_12>",
416
+ "lstrip": false,
417
+ "normalized": false,
418
+ "rstrip": false,
419
+ "single_word": false,
420
+ "special": true
421
+ },
422
+ "151695": {
423
+ "content": "<FAKE_PAD_13>",
424
+ "lstrip": false,
425
+ "normalized": false,
426
+ "rstrip": false,
427
+ "single_word": false,
428
+ "special": true
429
+ },
430
+ "151696": {
431
+ "content": "<FAKE_PAD_14>",
432
+ "lstrip": false,
433
+ "normalized": false,
434
+ "rstrip": false,
435
+ "single_word": false,
436
+ "special": true
437
+ },
438
+ "151697": {
439
+ "content": "<FAKE_PAD_15>",
440
+ "lstrip": false,
441
+ "normalized": false,
442
+ "rstrip": false,
443
+ "single_word": false,
444
+ "special": true
445
+ },
446
+ "151698": {
447
+ "content": "<FAKE_PAD_16>",
448
+ "lstrip": false,
449
+ "normalized": false,
450
+ "rstrip": false,
451
+ "single_word": false,
452
+ "special": true
453
+ },
454
+ "151699": {
455
+ "content": "<FAKE_PAD_17>",
456
+ "lstrip": false,
457
+ "normalized": false,
458
+ "rstrip": false,
459
+ "single_word": false,
460
+ "special": true
461
+ },
462
+ "151700": {
463
+ "content": "<FAKE_PAD_18>",
464
+ "lstrip": false,
465
+ "normalized": false,
466
+ "rstrip": false,
467
+ "single_word": false,
468
+ "special": true
469
+ },
470
+ "151701": {
471
+ "content": "<FAKE_PAD_19>",
472
+ "lstrip": false,
473
+ "normalized": false,
474
+ "rstrip": false,
475
+ "single_word": false,
476
+ "special": true
477
+ },
478
+ "151702": {
479
+ "content": "<FAKE_PAD_20>",
480
+ "lstrip": false,
481
+ "normalized": false,
482
+ "rstrip": false,
483
+ "single_word": false,
484
+ "special": true
485
+ },
486
+ "151703": {
487
+ "content": "<FAKE_PAD_21>",
488
+ "lstrip": false,
489
+ "normalized": false,
490
+ "rstrip": false,
491
+ "single_word": false,
492
+ "special": true
493
+ },
494
+ "151704": {
495
+ "content": "<FAKE_PAD_22>",
496
+ "lstrip": false,
497
+ "normalized": false,
498
+ "rstrip": false,
499
+ "single_word": false,
500
+ "special": true
501
+ },
502
+ "151705": {
503
+ "content": "<FAKE_PAD_23>",
504
+ "lstrip": false,
505
+ "normalized": false,
506
+ "rstrip": false,
507
+ "single_word": false,
508
+ "special": true
509
+ },
510
+ "151706": {
511
+ "content": "<FAKE_PAD_24>",
512
+ "lstrip": false,
513
+ "normalized": false,
514
+ "rstrip": false,
515
+ "single_word": false,
516
+ "special": true
517
+ },
518
+ "151707": {
519
+ "content": "<FAKE_PAD_25>",
520
+ "lstrip": false,
521
+ "normalized": false,
522
+ "rstrip": false,
523
+ "single_word": false,
524
+ "special": true
525
+ },
526
+ "151708": {
527
+ "content": "<FAKE_PAD_26>",
528
+ "lstrip": false,
529
+ "normalized": false,
530
+ "rstrip": false,
531
+ "single_word": false,
532
+ "special": true
533
+ },
534
+ "151709": {
535
+ "content": "<FAKE_PAD_27>",
536
+ "lstrip": false,
537
+ "normalized": false,
538
+ "rstrip": false,
539
+ "single_word": false,
540
+ "special": true
541
+ },
542
+ "151710": {
543
+ "content": "<FAKE_PAD_28>",
544
+ "lstrip": false,
545
+ "normalized": false,
546
+ "rstrip": false,
547
+ "single_word": false,
548
+ "special": true
549
+ },
550
+ "151711": {
551
+ "content": "<FAKE_PAD_29>",
552
+ "lstrip": false,
553
+ "normalized": false,
554
+ "rstrip": false,
555
+ "single_word": false,
556
+ "special": true
557
+ },
558
+ "151712": {
559
+ "content": "<FAKE_PAD_30>",
560
+ "lstrip": false,
561
+ "normalized": false,
562
+ "rstrip": false,
563
+ "single_word": false,
564
+ "special": true
565
+ },
566
+ "151713": {
567
+ "content": "<FAKE_PAD_31>",
568
+ "lstrip": false,
569
+ "normalized": false,
570
+ "rstrip": false,
571
+ "single_word": false,
572
+ "special": true
573
+ },
574
+ "151714": {
575
+ "content": "<FAKE_PAD_32>",
576
+ "lstrip": false,
577
+ "normalized": false,
578
+ "rstrip": false,
579
+ "single_word": false,
580
+ "special": true
581
+ },
582
+ "151715": {
583
+ "content": "<FAKE_PAD_33>",
584
+ "lstrip": false,
585
+ "normalized": false,
586
+ "rstrip": false,
587
+ "single_word": false,
588
+ "special": true
589
+ },
590
+ "151716": {
591
+ "content": "<FAKE_PAD_34>",
592
+ "lstrip": false,
593
+ "normalized": false,
594
+ "rstrip": false,
595
+ "single_word": false,
596
+ "special": true
597
+ },
598
+ "151717": {
599
+ "content": "<FAKE_PAD_35>",
600
+ "lstrip": false,
601
+ "normalized": false,
602
+ "rstrip": false,
603
+ "single_word": false,
604
+ "special": true
605
+ },
606
+ "151718": {
607
+ "content": "<FAKE_PAD_36>",
608
+ "lstrip": false,
609
+ "normalized": false,
610
+ "rstrip": false,
611
+ "single_word": false,
612
+ "special": true
613
+ },
614
+ "151719": {
615
+ "content": "<FAKE_PAD_37>",
616
+ "lstrip": false,
617
+ "normalized": false,
618
+ "rstrip": false,
619
+ "single_word": false,
620
+ "special": true
621
+ },
622
+ "151720": {
623
+ "content": "<FAKE_PAD_38>",
624
+ "lstrip": false,
625
+ "normalized": false,
626
+ "rstrip": false,
627
+ "single_word": false,
628
+ "special": true
629
+ },
630
+ "151721": {
631
+ "content": "<FAKE_PAD_39>",
632
+ "lstrip": false,
633
+ "normalized": false,
634
+ "rstrip": false,
635
+ "single_word": false,
636
+ "special": true
637
+ },
638
+ "151722": {
639
+ "content": "<FAKE_PAD_40>",
640
+ "lstrip": false,
641
+ "normalized": false,
642
+ "rstrip": false,
643
+ "single_word": false,
644
+ "special": true
645
+ },
646
+ "151723": {
647
+ "content": "<FAKE_PAD_41>",
648
+ "lstrip": false,
649
+ "normalized": false,
650
+ "rstrip": false,
651
+ "single_word": false,
652
+ "special": true
653
+ },
654
+ "151724": {
655
+ "content": "<FAKE_PAD_42>",
656
+ "lstrip": false,
657
+ "normalized": false,
658
+ "rstrip": false,
659
+ "single_word": false,
660
+ "special": true
661
+ },
662
+ "151725": {
663
+ "content": "<FAKE_PAD_43>",
664
+ "lstrip": false,
665
+ "normalized": false,
666
+ "rstrip": false,
667
+ "single_word": false,
668
+ "special": true
669
+ },
670
+ "151726": {
671
+ "content": "<FAKE_PAD_44>",
672
+ "lstrip": false,
673
+ "normalized": false,
674
+ "rstrip": false,
675
+ "single_word": false,
676
+ "special": true
677
+ },
678
+ "151727": {
679
+ "content": "<FAKE_PAD_45>",
680
+ "lstrip": false,
681
+ "normalized": false,
682
+ "rstrip": false,
683
+ "single_word": false,
684
+ "special": true
685
+ },
686
+ "151728": {
687
+ "content": "<FAKE_PAD_46>",
688
+ "lstrip": false,
689
+ "normalized": false,
690
+ "rstrip": false,
691
+ "single_word": false,
692
+ "special": true
693
+ },
694
+ "151729": {
695
+ "content": "<FAKE_PAD_47>",
696
+ "lstrip": false,
697
+ "normalized": false,
698
+ "rstrip": false,
699
+ "single_word": false,
700
+ "special": true
701
+ },
702
+ "151730": {
703
+ "content": "<FAKE_PAD_48>",
704
+ "lstrip": false,
705
+ "normalized": false,
706
+ "rstrip": false,
707
+ "single_word": false,
708
+ "special": true
709
+ },
710
+ "151731": {
711
+ "content": "<FAKE_PAD_49>",
712
+ "lstrip": false,
713
+ "normalized": false,
714
+ "rstrip": false,
715
+ "single_word": false,
716
+ "special": true
717
+ },
718
+ "151732": {
719
+ "content": "<FAKE_PAD_50>",
720
+ "lstrip": false,
721
+ "normalized": false,
722
+ "rstrip": false,
723
+ "single_word": false,
724
+ "special": true
725
+ },
726
+ "151733": {
727
+ "content": "<FAKE_PAD_51>",
728
+ "lstrip": false,
729
+ "normalized": false,
730
+ "rstrip": false,
731
+ "single_word": false,
732
+ "special": true
733
+ },
734
+ "151734": {
735
+ "content": "<FAKE_PAD_52>",
736
+ "lstrip": false,
737
+ "normalized": false,
738
+ "rstrip": false,
739
+ "single_word": false,
740
+ "special": true
741
+ },
742
+ "151735": {
743
+ "content": "<FAKE_PAD_53>",
744
+ "lstrip": false,
745
+ "normalized": false,
746
+ "rstrip": false,
747
+ "single_word": false,
748
+ "special": true
749
+ },
750
+ "151736": {
751
+ "content": "<FAKE_PAD_54>",
752
+ "lstrip": false,
753
+ "normalized": false,
754
+ "rstrip": false,
755
+ "single_word": false,
756
+ "special": true
757
+ },
758
+ "151737": {
759
+ "content": "<FAKE_PAD_55>",
760
+ "lstrip": false,
761
+ "normalized": false,
762
+ "rstrip": false,
763
+ "single_word": false,
764
+ "special": true
765
+ },
766
+ "151738": {
767
+ "content": "<FAKE_PAD_56>",
768
+ "lstrip": false,
769
+ "normalized": false,
770
+ "rstrip": false,
771
+ "single_word": false,
772
+ "special": true
773
+ },
774
+ "151739": {
775
+ "content": "<FAKE_PAD_57>",
776
+ "lstrip": false,
777
+ "normalized": false,
778
+ "rstrip": false,
779
+ "single_word": false,
780
+ "special": true
781
+ },
782
+ "151740": {
783
+ "content": "<FAKE_PAD_58>",
784
+ "lstrip": false,
785
+ "normalized": false,
786
+ "rstrip": false,
787
+ "single_word": false,
788
+ "special": true
789
+ },
790
+ "151741": {
791
+ "content": "<FAKE_PAD_59>",
792
+ "lstrip": false,
793
+ "normalized": false,
794
+ "rstrip": false,
795
+ "single_word": false,
796
+ "special": true
797
+ },
798
+ "151742": {
799
+ "content": "<FAKE_PAD_60>",
800
+ "lstrip": false,
801
+ "normalized": false,
802
+ "rstrip": false,
803
+ "single_word": false,
804
+ "special": true
805
+ },
806
+ "151743": {
807
+ "content": "<FAKE_PAD_61>",
808
+ "lstrip": false,
809
+ "normalized": false,
810
+ "rstrip": false,
811
+ "single_word": false,
812
+ "special": true
813
+ },
814
+ "151744": {
815
+ "content": "<FAKE_PAD_62>",
816
+ "lstrip": false,
817
+ "normalized": false,
818
+ "rstrip": false,
819
+ "single_word": false,
820
+ "special": true
821
+ },
822
+ "151745": {
823
+ "content": "<FAKE_PAD_63>",
824
+ "lstrip": false,
825
+ "normalized": false,
826
+ "rstrip": false,
827
+ "single_word": false,
828
+ "special": true
829
+ },
830
+ "151746": {
831
+ "content": "<FAKE_PAD_64>",
832
+ "lstrip": false,
833
+ "normalized": false,
834
+ "rstrip": false,
835
+ "single_word": false,
836
+ "special": true
837
+ },
838
+ "151747": {
839
+ "content": "<FAKE_PAD_65>",
840
+ "lstrip": false,
841
+ "normalized": false,
842
+ "rstrip": false,
843
+ "single_word": false,
844
+ "special": true
845
+ },
846
+ "151748": {
847
+ "content": "<FAKE_PAD_66>",
848
+ "lstrip": false,
849
+ "normalized": false,
850
+ "rstrip": false,
851
+ "single_word": false,
852
+ "special": true
853
+ },
854
+ "151749": {
855
+ "content": "<FAKE_PAD_67>",
856
+ "lstrip": false,
857
+ "normalized": false,
858
+ "rstrip": false,
859
+ "single_word": false,
860
+ "special": true
861
+ },
862
+ "151750": {
863
+ "content": "<FAKE_PAD_68>",
864
+ "lstrip": false,
865
+ "normalized": false,
866
+ "rstrip": false,
867
+ "single_word": false,
868
+ "special": true
869
+ },
870
+ "151751": {
871
+ "content": "<FAKE_PAD_69>",
872
+ "lstrip": false,
873
+ "normalized": false,
874
+ "rstrip": false,
875
+ "single_word": false,
876
+ "special": true
877
+ },
878
+ "151752": {
879
+ "content": "<FAKE_PAD_70>",
880
+ "lstrip": false,
881
+ "normalized": false,
882
+ "rstrip": false,
883
+ "single_word": false,
884
+ "special": true
885
+ },
886
+ "151753": {
887
+ "content": "<FAKE_PAD_71>",
888
+ "lstrip": false,
889
+ "normalized": false,
890
+ "rstrip": false,
891
+ "single_word": false,
892
+ "special": true
893
+ },
894
+ "151754": {
895
+ "content": "<FAKE_PAD_72>",
896
+ "lstrip": false,
897
+ "normalized": false,
898
+ "rstrip": false,
899
+ "single_word": false,
900
+ "special": true
901
+ },
902
+ "151755": {
903
+ "content": "<FAKE_PAD_73>",
904
+ "lstrip": false,
905
+ "normalized": false,
906
+ "rstrip": false,
907
+ "single_word": false,
908
+ "special": true
909
+ },
910
+ "151756": {
911
+ "content": "<FAKE_PAD_74>",
912
+ "lstrip": false,
913
+ "normalized": false,
914
+ "rstrip": false,
915
+ "single_word": false,
916
+ "special": true
917
+ },
918
+ "151757": {
919
+ "content": "<FAKE_PAD_75>",
920
+ "lstrip": false,
921
+ "normalized": false,
922
+ "rstrip": false,
923
+ "single_word": false,
924
+ "special": true
925
+ },
926
+ "151758": {
927
+ "content": "<FAKE_PAD_76>",
928
+ "lstrip": false,
929
+ "normalized": false,
930
+ "rstrip": false,
931
+ "single_word": false,
932
+ "special": true
933
+ },
934
+ "151759": {
935
+ "content": "<FAKE_PAD_77>",
936
+ "lstrip": false,
937
+ "normalized": false,
938
+ "rstrip": false,
939
+ "single_word": false,
940
+ "special": true
941
+ },
942
+ "151760": {
943
+ "content": "<FAKE_PAD_78>",
944
+ "lstrip": false,
945
+ "normalized": false,
946
+ "rstrip": false,
947
+ "single_word": false,
948
+ "special": true
949
+ },
950
+ "151761": {
951
+ "content": "<FAKE_PAD_79>",
952
+ "lstrip": false,
953
+ "normalized": false,
954
+ "rstrip": false,
955
+ "single_word": false,
956
+ "special": true
957
+ },
958
+ "151762": {
959
+ "content": "<FAKE_PAD_80>",
960
+ "lstrip": false,
961
+ "normalized": false,
962
+ "rstrip": false,
963
+ "single_word": false,
964
+ "special": true
965
+ },
966
+ "151763": {
967
+ "content": "<FAKE_PAD_81>",
968
+ "lstrip": false,
969
+ "normalized": false,
970
+ "rstrip": false,
971
+ "single_word": false,
972
+ "special": true
973
+ },
974
+ "151764": {
975
+ "content": "<FAKE_PAD_82>",
976
+ "lstrip": false,
977
+ "normalized": false,
978
+ "rstrip": false,
979
+ "single_word": false,
980
+ "special": true
981
+ },
982
+ "151765": {
983
+ "content": "<FAKE_PAD_83>",
984
+ "lstrip": false,
985
+ "normalized": false,
986
+ "rstrip": false,
987
+ "single_word": false,
988
+ "special": true
989
+ },
990
+ "151766": {
991
+ "content": "<FAKE_PAD_84>",
992
+ "lstrip": false,
993
+ "normalized": false,
994
+ "rstrip": false,
995
+ "single_word": false,
996
+ "special": true
997
+ },
998
+ "151767": {
999
+ "content": "<FAKE_PAD_85>",
1000
+ "lstrip": false,
1001
+ "normalized": false,
1002
+ "rstrip": false,
1003
+ "single_word": false,
1004
+ "special": true
1005
+ },
1006
+ "151768": {
1007
+ "content": "<FAKE_PAD_86>",
1008
+ "lstrip": false,
1009
+ "normalized": false,
1010
+ "rstrip": false,
1011
+ "single_word": false,
1012
+ "special": true
1013
+ },
1014
+ "151769": {
1015
+ "content": "<FAKE_PAD_87>",
1016
+ "lstrip": false,
1017
+ "normalized": false,
1018
+ "rstrip": false,
1019
+ "single_word": false,
1020
+ "special": true
1021
+ },
1022
+ "151770": {
1023
+ "content": "<FAKE_PAD_88>",
1024
+ "lstrip": false,
1025
+ "normalized": false,
1026
+ "rstrip": false,
1027
+ "single_word": false,
1028
+ "special": true
1029
+ },
1030
+ "151771": {
1031
+ "content": "<FAKE_PAD_89>",
1032
+ "lstrip": false,
1033
+ "normalized": false,
1034
+ "rstrip": false,
1035
+ "single_word": false,
1036
+ "special": true
1037
+ },
1038
+ "151772": {
1039
+ "content": "<FAKE_PAD_90>",
1040
+ "lstrip": false,
1041
+ "normalized": false,
1042
+ "rstrip": false,
1043
+ "single_word": false,
1044
+ "special": true
1045
+ },
1046
+ "151773": {
1047
+ "content": "<FAKE_PAD_91>",
1048
+ "lstrip": false,
1049
+ "normalized": false,
1050
+ "rstrip": false,
1051
+ "single_word": false,
1052
+ "special": true
1053
+ },
1054
+ "151774": {
1055
+ "content": "<FAKE_PAD_92>",
1056
+ "lstrip": false,
1057
+ "normalized": false,
1058
+ "rstrip": false,
1059
+ "single_word": false,
1060
+ "special": true
1061
+ },
1062
+ "151775": {
1063
+ "content": "<FAKE_PAD_93>",
1064
+ "lstrip": false,
1065
+ "normalized": false,
1066
+ "rstrip": false,
1067
+ "single_word": false,
1068
+ "special": true
1069
+ },
1070
+ "151776": {
1071
+ "content": "<FAKE_PAD_94>",
1072
+ "lstrip": false,
1073
+ "normalized": false,
1074
+ "rstrip": false,
1075
+ "single_word": false,
1076
+ "special": true
1077
+ },
1078
+ "151777": {
1079
+ "content": "<FAKE_PAD_95>",
1080
+ "lstrip": false,
1081
+ "normalized": false,
1082
+ "rstrip": false,
1083
+ "single_word": false,
1084
+ "special": true
1085
+ },
1086
+ "151778": {
1087
+ "content": "<FAKE_PAD_96>",
1088
+ "lstrip": false,
1089
+ "normalized": false,
1090
+ "rstrip": false,
1091
+ "single_word": false,
1092
+ "special": true
1093
+ },
1094
+ "151779": {
1095
+ "content": "<FAKE_PAD_97>",
1096
+ "lstrip": false,
1097
+ "normalized": false,
1098
+ "rstrip": false,
1099
+ "single_word": false,
1100
+ "special": true
1101
+ },
1102
+ "151780": {
1103
+ "content": "<FAKE_PAD_98>",
1104
+ "lstrip": false,
1105
+ "normalized": false,
1106
+ "rstrip": false,
1107
+ "single_word": false,
1108
+ "special": true
1109
+ },
1110
+ "151781": {
1111
+ "content": "<FAKE_PAD_99>",
1112
+ "lstrip": false,
1113
+ "normalized": false,
1114
+ "rstrip": false,
1115
+ "single_word": false,
1116
+ "special": true
1117
+ },
1118
+ "151782": {
1119
+ "content": "<FAKE_PAD_100>",
1120
+ "lstrip": false,
1121
+ "normalized": false,
1122
+ "rstrip": false,
1123
+ "single_word": false,
1124
+ "special": true
1125
+ },
1126
+ "151783": {
1127
+ "content": "<FAKE_PAD_101>",
1128
+ "lstrip": false,
1129
+ "normalized": false,
1130
+ "rstrip": false,
1131
+ "single_word": false,
1132
+ "special": true
1133
+ },
1134
+ "151784": {
1135
+ "content": "<FAKE_PAD_102>",
1136
+ "lstrip": false,
1137
+ "normalized": false,
1138
+ "rstrip": false,
1139
+ "single_word": false,
1140
+ "special": true
1141
+ },
1142
+ "151785": {
1143
+ "content": "<FAKE_PAD_103>",
1144
+ "lstrip": false,
1145
+ "normalized": false,
1146
+ "rstrip": false,
1147
+ "single_word": false,
1148
+ "special": true
1149
+ },
1150
+ "151786": {
1151
+ "content": "<FAKE_PAD_104>",
1152
+ "lstrip": false,
1153
+ "normalized": false,
1154
+ "rstrip": false,
1155
+ "single_word": false,
1156
+ "special": true
1157
+ },
1158
+ "151787": {
1159
+ "content": "<FAKE_PAD_105>",
1160
+ "lstrip": false,
1161
+ "normalized": false,
1162
+ "rstrip": false,
1163
+ "single_word": false,
1164
+ "special": true
1165
+ },
1166
+ "151788": {
1167
+ "content": "<FAKE_PAD_106>",
1168
+ "lstrip": false,
1169
+ "normalized": false,
1170
+ "rstrip": false,
1171
+ "single_word": false,
1172
+ "special": true
1173
+ },
1174
+ "151789": {
1175
+ "content": "<FAKE_PAD_107>",
1176
+ "lstrip": false,
1177
+ "normalized": false,
1178
+ "rstrip": false,
1179
+ "single_word": false,
1180
+ "special": true
1181
+ },
1182
+ "151790": {
1183
+ "content": "<FAKE_PAD_108>",
1184
+ "lstrip": false,
1185
+ "normalized": false,
1186
+ "rstrip": false,
1187
+ "single_word": false,
1188
+ "special": true
1189
+ },
1190
+ "151791": {
1191
+ "content": "<FAKE_PAD_109>",
1192
+ "lstrip": false,
1193
+ "normalized": false,
1194
+ "rstrip": false,
1195
+ "single_word": false,
1196
+ "special": true
1197
+ },
1198
+ "151792": {
1199
+ "content": "<FAKE_PAD_110>",
1200
+ "lstrip": false,
1201
+ "normalized": false,
1202
+ "rstrip": false,
1203
+ "single_word": false,
1204
+ "special": true
1205
+ },
1206
+ "151793": {
1207
+ "content": "<FAKE_PAD_111>",
1208
+ "lstrip": false,
1209
+ "normalized": false,
1210
+ "rstrip": false,
1211
+ "single_word": false,
1212
+ "special": true
1213
+ },
1214
+ "151794": {
1215
+ "content": "<FAKE_PAD_112>",
1216
+ "lstrip": false,
1217
+ "normalized": false,
1218
+ "rstrip": false,
1219
+ "single_word": false,
1220
+ "special": true
1221
+ },
1222
+ "151795": {
1223
+ "content": "<FAKE_PAD_113>",
1224
+ "lstrip": false,
1225
+ "normalized": false,
1226
+ "rstrip": false,
1227
+ "single_word": false,
1228
+ "special": true
1229
+ },
1230
+ "151796": {
1231
+ "content": "<FAKE_PAD_114>",
1232
+ "lstrip": false,
1233
+ "normalized": false,
1234
+ "rstrip": false,
1235
+ "single_word": false,
1236
+ "special": true
1237
+ },
1238
+ "151797": {
1239
+ "content": "<FAKE_PAD_115>",
1240
+ "lstrip": false,
1241
+ "normalized": false,
1242
+ "rstrip": false,
1243
+ "single_word": false,
1244
+ "special": true
1245
+ },
1246
+ "151798": {
1247
+ "content": "<FAKE_PAD_116>",
1248
+ "lstrip": false,
1249
+ "normalized": false,
1250
+ "rstrip": false,
1251
+ "single_word": false,
1252
+ "special": true
1253
+ },
1254
+ "151799": {
1255
+ "content": "<FAKE_PAD_117>",
1256
+ "lstrip": false,
1257
+ "normalized": false,
1258
+ "rstrip": false,
1259
+ "single_word": false,
1260
+ "special": true
1261
+ },
1262
+ "151800": {
1263
+ "content": "<FAKE_PAD_118>",
1264
+ "lstrip": false,
1265
+ "normalized": false,
1266
+ "rstrip": false,
1267
+ "single_word": false,
1268
+ "special": true
1269
+ },
1270
+ "151801": {
1271
+ "content": "<FAKE_PAD_119>",
1272
+ "lstrip": false,
1273
+ "normalized": false,
1274
+ "rstrip": false,
1275
+ "single_word": false,
1276
+ "special": true
1277
+ },
1278
+ "151802": {
1279
+ "content": "<FAKE_PAD_120>",
1280
+ "lstrip": false,
1281
+ "normalized": false,
1282
+ "rstrip": false,
1283
+ "single_word": false,
1284
+ "special": true
1285
+ },
1286
+ "151803": {
1287
+ "content": "<FAKE_PAD_121>",
1288
+ "lstrip": false,
1289
+ "normalized": false,
1290
+ "rstrip": false,
1291
+ "single_word": false,
1292
+ "special": true
1293
+ },
1294
+ "151804": {
1295
+ "content": "<FAKE_PAD_122>",
1296
+ "lstrip": false,
1297
+ "normalized": false,
1298
+ "rstrip": false,
1299
+ "single_word": false,
1300
+ "special": true
1301
+ },
1302
+ "151805": {
1303
+ "content": "<FAKE_PAD_123>",
1304
+ "lstrip": false,
1305
+ "normalized": false,
1306
+ "rstrip": false,
1307
+ "single_word": false,
1308
+ "special": true
1309
+ },
1310
+ "151806": {
1311
+ "content": "<FAKE_PAD_124>",
1312
+ "lstrip": false,
1313
+ "normalized": false,
1314
+ "rstrip": false,
1315
+ "single_word": false,
1316
+ "special": true
1317
+ },
1318
+ "151807": {
1319
+ "content": "<FAKE_PAD_125>",
1320
+ "lstrip": false,
1321
+ "normalized": false,
1322
+ "rstrip": false,
1323
+ "single_word": false,
1324
+ "special": true
1325
+ },
1326
+ "151808": {
1327
+ "content": "<FAKE_PAD_126>",
1328
+ "lstrip": false,
1329
+ "normalized": false,
1330
+ "rstrip": false,
1331
+ "single_word": false,
1332
+ "special": true
1333
+ },
1334
+ "151809": {
1335
+ "content": "<FAKE_PAD_127>",
1336
+ "lstrip": false,
1337
+ "normalized": false,
1338
+ "rstrip": false,
1339
+ "single_word": false,
1340
+ "special": true
1341
+ },
1342
+ "151810": {
1343
+ "content": "<FAKE_PAD_128>",
1344
+ "lstrip": false,
1345
+ "normalized": false,
1346
+ "rstrip": false,
1347
+ "single_word": false,
1348
+ "special": true
1349
+ },
1350
+ "151811": {
1351
+ "content": "<FAKE_PAD_129>",
1352
+ "lstrip": false,
1353
+ "normalized": false,
1354
+ "rstrip": false,
1355
+ "single_word": false,
1356
+ "special": true
1357
+ },
1358
+ "151812": {
1359
+ "content": "<FAKE_PAD_130>",
1360
+ "lstrip": false,
1361
+ "normalized": false,
1362
+ "rstrip": false,
1363
+ "single_word": false,
1364
+ "special": true
1365
+ },
1366
+ "151813": {
1367
+ "content": "<FAKE_PAD_131>",
1368
+ "lstrip": false,
1369
+ "normalized": false,
1370
+ "rstrip": false,
1371
+ "single_word": false,
1372
+ "special": true
1373
+ },
1374
+ "151814": {
1375
+ "content": "<FAKE_PAD_132>",
1376
+ "lstrip": false,
1377
+ "normalized": false,
1378
+ "rstrip": false,
1379
+ "single_word": false,
1380
+ "special": true
1381
+ },
1382
+ "151815": {
1383
+ "content": "<FAKE_PAD_133>",
1384
+ "lstrip": false,
1385
+ "normalized": false,
1386
+ "rstrip": false,
1387
+ "single_word": false,
1388
+ "special": true
1389
+ },
1390
+ "151816": {
1391
+ "content": "<FAKE_PAD_134>",
1392
+ "lstrip": false,
1393
+ "normalized": false,
1394
+ "rstrip": false,
1395
+ "single_word": false,
1396
+ "special": true
1397
+ },
1398
+ "151817": {
1399
+ "content": "<FAKE_PAD_135>",
1400
+ "lstrip": false,
1401
+ "normalized": false,
1402
+ "rstrip": false,
1403
+ "single_word": false,
1404
+ "special": true
1405
+ },
1406
+ "151818": {
1407
+ "content": "<FAKE_PAD_136>",
1408
+ "lstrip": false,
1409
+ "normalized": false,
1410
+ "rstrip": false,
1411
+ "single_word": false,
1412
+ "special": true
1413
+ },
1414
+ "151819": {
1415
+ "content": "<FAKE_PAD_137>",
1416
+ "lstrip": false,
1417
+ "normalized": false,
1418
+ "rstrip": false,
1419
+ "single_word": false,
1420
+ "special": true
1421
+ },
1422
+ "151820": {
1423
+ "content": "<FAKE_PAD_138>",
1424
+ "lstrip": false,
1425
+ "normalized": false,
1426
+ "rstrip": false,
1427
+ "single_word": false,
1428
+ "special": true
1429
+ },
1430
+ "151821": {
1431
+ "content": "<FAKE_PAD_139>",
1432
+ "lstrip": false,
1433
+ "normalized": false,
1434
+ "rstrip": false,
1435
+ "single_word": false,
1436
+ "special": true
1437
+ },
1438
+ "151822": {
1439
+ "content": "<FAKE_PAD_140>",
1440
+ "lstrip": false,
1441
+ "normalized": false,
1442
+ "rstrip": false,
1443
+ "single_word": false,
1444
+ "special": true
1445
+ },
1446
+ "151823": {
1447
+ "content": "<FAKE_PAD_141>",
1448
+ "lstrip": false,
1449
+ "normalized": false,
1450
+ "rstrip": false,
1451
+ "single_word": false,
1452
+ "special": true
1453
+ },
1454
+ "151824": {
1455
+ "content": "<FAKE_PAD_142>",
1456
+ "lstrip": false,
1457
+ "normalized": false,
1458
+ "rstrip": false,
1459
+ "single_word": false,
1460
+ "special": true
1461
+ },
1462
+ "151825": {
1463
+ "content": "<FAKE_PAD_143>",
1464
+ "lstrip": false,
1465
+ "normalized": false,
1466
+ "rstrip": false,
1467
+ "single_word": false,
1468
+ "special": true
1469
+ },
1470
+ "151826": {
1471
+ "content": "<FAKE_PAD_144>",
1472
+ "lstrip": false,
1473
+ "normalized": false,
1474
+ "rstrip": false,
1475
+ "single_word": false,
1476
+ "special": true
1477
+ },
1478
+ "151827": {
1479
+ "content": "<FAKE_PAD_145>",
1480
+ "lstrip": false,
1481
+ "normalized": false,
1482
+ "rstrip": false,
1483
+ "single_word": false,
1484
+ "special": true
1485
+ },
1486
+ "151828": {
1487
+ "content": "<FAKE_PAD_146>",
1488
+ "lstrip": false,
1489
+ "normalized": false,
1490
+ "rstrip": false,
1491
+ "single_word": false,
1492
+ "special": true
1493
+ },
1494
+ "151829": {
1495
+ "content": "<FAKE_PAD_147>",
1496
+ "lstrip": false,
1497
+ "normalized": false,
1498
+ "rstrip": false,
1499
+ "single_word": false,
1500
+ "special": true
1501
+ },
1502
+ "151830": {
1503
+ "content": "<FAKE_PAD_148>",
1504
+ "lstrip": false,
1505
+ "normalized": false,
1506
+ "rstrip": false,
1507
+ "single_word": false,
1508
+ "special": true
1509
+ },
1510
+ "151831": {
1511
+ "content": "<FAKE_PAD_149>",
1512
+ "lstrip": false,
1513
+ "normalized": false,
1514
+ "rstrip": false,
1515
+ "single_word": false,
1516
+ "special": true
1517
+ },
1518
+ "151832": {
1519
+ "content": "<FAKE_PAD_150>",
1520
+ "lstrip": false,
1521
+ "normalized": false,
1522
+ "rstrip": false,
1523
+ "single_word": false,
1524
+ "special": true
1525
+ },
1526
+ "151833": {
1527
+ "content": "<FAKE_PAD_151>",
1528
+ "lstrip": false,
1529
+ "normalized": false,
1530
+ "rstrip": false,
1531
+ "single_word": false,
1532
+ "special": true
1533
+ },
1534
+ "151834": {
1535
+ "content": "<FAKE_PAD_152>",
1536
+ "lstrip": false,
1537
+ "normalized": false,
1538
+ "rstrip": false,
1539
+ "single_word": false,
1540
+ "special": true
1541
+ },
1542
+ "151835": {
1543
+ "content": "<FAKE_PAD_153>",
1544
+ "lstrip": false,
1545
+ "normalized": false,
1546
+ "rstrip": false,
1547
+ "single_word": false,
1548
+ "special": true
1549
+ },
1550
+ "151836": {
1551
+ "content": "<FAKE_PAD_154>",
1552
+ "lstrip": false,
1553
+ "normalized": false,
1554
+ "rstrip": false,
1555
+ "single_word": false,
1556
+ "special": true
1557
+ },
1558
+ "151837": {
1559
+ "content": "<FAKE_PAD_155>",
1560
+ "lstrip": false,
1561
+ "normalized": false,
1562
+ "rstrip": false,
1563
+ "single_word": false,
1564
+ "special": true
1565
+ },
1566
+ "151838": {
1567
+ "content": "<FAKE_PAD_156>",
1568
+ "lstrip": false,
1569
+ "normalized": false,
1570
+ "rstrip": false,
1571
+ "single_word": false,
1572
+ "special": true
1573
+ },
1574
+ "151839": {
1575
+ "content": "<FAKE_PAD_157>",
1576
+ "lstrip": false,
1577
+ "normalized": false,
1578
+ "rstrip": false,
1579
+ "single_word": false,
1580
+ "special": true
1581
+ },
1582
+ "151840": {
1583
+ "content": "<FAKE_PAD_158>",
1584
+ "lstrip": false,
1585
+ "normalized": false,
1586
+ "rstrip": false,
1587
+ "single_word": false,
1588
+ "special": true
1589
+ },
1590
+ "151841": {
1591
+ "content": "<FAKE_PAD_159>",
1592
+ "lstrip": false,
1593
+ "normalized": false,
1594
+ "rstrip": false,
1595
+ "single_word": false,
1596
+ "special": true
1597
+ },
1598
+ "151842": {
1599
+ "content": "<FAKE_PAD_160>",
1600
+ "lstrip": false,
1601
+ "normalized": false,
1602
+ "rstrip": false,
1603
+ "single_word": false,
1604
+ "special": true
1605
+ },
1606
+ "151843": {
1607
+ "content": "<FAKE_PAD_161>",
1608
+ "lstrip": false,
1609
+ "normalized": false,
1610
+ "rstrip": false,
1611
+ "single_word": false,
1612
+ "special": true
1613
+ },
1614
+ "151844": {
1615
+ "content": "<FAKE_PAD_162>",
1616
+ "lstrip": false,
1617
+ "normalized": false,
1618
+ "rstrip": false,
1619
+ "single_word": false,
1620
+ "special": true
1621
+ },
1622
+ "151845": {
1623
+ "content": "<FAKE_PAD_163>",
1624
+ "lstrip": false,
1625
+ "normalized": false,
1626
+ "rstrip": false,
1627
+ "single_word": false,
1628
+ "special": true
1629
+ },
1630
+ "151846": {
1631
+ "content": "<FAKE_PAD_164>",
1632
+ "lstrip": false,
1633
+ "normalized": false,
1634
+ "rstrip": false,
1635
+ "single_word": false,
1636
+ "special": true
1637
+ },
1638
+ "151847": {
1639
+ "content": "<FAKE_PAD_165>",
1640
+ "lstrip": false,
1641
+ "normalized": false,
1642
+ "rstrip": false,
1643
+ "single_word": false,
1644
+ "special": true
1645
+ },
1646
+ "151848": {
1647
+ "content": "<FAKE_PAD_166>",
1648
+ "lstrip": false,
1649
+ "normalized": false,
1650
+ "rstrip": false,
1651
+ "single_word": false,
1652
+ "special": true
1653
+ },
1654
+ "151849": {
1655
+ "content": "<FAKE_PAD_167>",
1656
+ "lstrip": false,
1657
+ "normalized": false,
1658
+ "rstrip": false,
1659
+ "single_word": false,
1660
+ "special": true
1661
+ },
1662
+ "151850": {
1663
+ "content": "<FAKE_PAD_168>",
1664
+ "lstrip": false,
1665
+ "normalized": false,
1666
+ "rstrip": false,
1667
+ "single_word": false,
1668
+ "special": true
1669
+ },
1670
+ "151851": {
1671
+ "content": "<FAKE_PAD_169>",
1672
+ "lstrip": false,
1673
+ "normalized": false,
1674
+ "rstrip": false,
1675
+ "single_word": false,
1676
+ "special": true
1677
+ },
1678
+ "151852": {
1679
+ "content": "<FAKE_PAD_170>",
1680
+ "lstrip": false,
1681
+ "normalized": false,
1682
+ "rstrip": false,
1683
+ "single_word": false,
1684
+ "special": true
1685
+ },
1686
+ "151853": {
1687
+ "content": "<FAKE_PAD_171>",
1688
+ "lstrip": false,
1689
+ "normalized": false,
1690
+ "rstrip": false,
1691
+ "single_word": false,
1692
+ "special": true
1693
+ },
1694
+ "151854": {
1695
+ "content": "<FAKE_PAD_172>",
1696
+ "lstrip": false,
1697
+ "normalized": false,
1698
+ "rstrip": false,
1699
+ "single_word": false,
1700
+ "special": true
1701
+ },
1702
+ "151855": {
1703
+ "content": "<FAKE_PAD_173>",
1704
+ "lstrip": false,
1705
+ "normalized": false,
1706
+ "rstrip": false,
1707
+ "single_word": false,
1708
+ "special": true
1709
+ },
1710
+ "151856": {
1711
+ "content": "<FAKE_PAD_174>",
1712
+ "lstrip": false,
1713
+ "normalized": false,
1714
+ "rstrip": false,
1715
+ "single_word": false,
1716
+ "special": true
1717
+ },
1718
+ "151857": {
1719
+ "content": "<FAKE_PAD_175>",
1720
+ "lstrip": false,
1721
+ "normalized": false,
1722
+ "rstrip": false,
1723
+ "single_word": false,
1724
+ "special": true
1725
+ },
1726
+ "151858": {
1727
+ "content": "<FAKE_PAD_176>",
1728
+ "lstrip": false,
1729
+ "normalized": false,
1730
+ "rstrip": false,
1731
+ "single_word": false,
1732
+ "special": true
1733
+ },
1734
+ "151859": {
1735
+ "content": "<FAKE_PAD_177>",
1736
+ "lstrip": false,
1737
+ "normalized": false,
1738
+ "rstrip": false,
1739
+ "single_word": false,
1740
+ "special": true
1741
+ },
1742
+ "151860": {
1743
+ "content": "<FAKE_PAD_178>",
1744
+ "lstrip": false,
1745
+ "normalized": false,
1746
+ "rstrip": false,
1747
+ "single_word": false,
1748
+ "special": true
1749
+ },
1750
+ "151861": {
1751
+ "content": "<FAKE_PAD_179>",
1752
+ "lstrip": false,
1753
+ "normalized": false,
1754
+ "rstrip": false,
1755
+ "single_word": false,
1756
+ "special": true
1757
+ },
1758
+ "151862": {
1759
+ "content": "<FAKE_PAD_180>",
1760
+ "lstrip": false,
1761
+ "normalized": false,
1762
+ "rstrip": false,
1763
+ "single_word": false,
1764
+ "special": true
1765
+ },
1766
+ "151863": {
1767
+ "content": "<FAKE_PAD_181>",
1768
+ "lstrip": false,
1769
+ "normalized": false,
1770
+ "rstrip": false,
1771
+ "single_word": false,
1772
+ "special": true
1773
+ },
1774
+ "151864": {
1775
+ "content": "<FAKE_PAD_182>",
1776
+ "lstrip": false,
1777
+ "normalized": false,
1778
+ "rstrip": false,
1779
+ "single_word": false,
1780
+ "special": true
1781
+ },
1782
+ "151865": {
1783
+ "content": "<FAKE_PAD_183>",
1784
+ "lstrip": false,
1785
+ "normalized": false,
1786
+ "rstrip": false,
1787
+ "single_word": false,
1788
+ "special": true
1789
+ },
1790
+ "151866": {
1791
+ "content": "<FAKE_PAD_184>",
1792
+ "lstrip": false,
1793
+ "normalized": false,
1794
+ "rstrip": false,
1795
+ "single_word": false,
1796
+ "special": true
1797
+ },
1798
+ "151867": {
1799
+ "content": "<FAKE_PAD_185>",
1800
+ "lstrip": false,
1801
+ "normalized": false,
1802
+ "rstrip": false,
1803
+ "single_word": false,
1804
+ "special": true
1805
+ },
1806
+ "151868": {
1807
+ "content": "<FAKE_PAD_186>",
1808
+ "lstrip": false,
1809
+ "normalized": false,
1810
+ "rstrip": false,
1811
+ "single_word": false,
1812
+ "special": true
1813
+ },
1814
+ "151869": {
1815
+ "content": "<FAKE_PAD_187>",
1816
+ "lstrip": false,
1817
+ "normalized": false,
1818
+ "rstrip": false,
1819
+ "single_word": false,
1820
+ "special": true
1821
+ },
1822
+ "151870": {
1823
+ "content": "<FAKE_PAD_188>",
1824
+ "lstrip": false,
1825
+ "normalized": false,
1826
+ "rstrip": false,
1827
+ "single_word": false,
1828
+ "special": true
1829
+ },
1830
+ "151871": {
1831
+ "content": "<FAKE_PAD_189>",
1832
+ "lstrip": false,
1833
+ "normalized": false,
1834
+ "rstrip": false,
1835
+ "single_word": false,
1836
+ "special": true
1837
+ },
1838
+ "151872": {
1839
+ "content": "<FAKE_PAD_190>",
1840
+ "lstrip": false,
1841
+ "normalized": false,
1842
+ "rstrip": false,
1843
+ "single_word": false,
1844
+ "special": true
1845
+ },
1846
+ "151873": {
1847
+ "content": "<FAKE_PAD_191>",
1848
+ "lstrip": false,
1849
+ "normalized": false,
1850
+ "rstrip": false,
1851
+ "single_word": false,
1852
+ "special": true
1853
+ },
1854
+ "151874": {
1855
+ "content": "<FAKE_PAD_192>",
1856
+ "lstrip": false,
1857
+ "normalized": false,
1858
+ "rstrip": false,
1859
+ "single_word": false,
1860
+ "special": true
1861
+ },
1862
+ "151875": {
1863
+ "content": "<FAKE_PAD_193>",
1864
+ "lstrip": false,
1865
+ "normalized": false,
1866
+ "rstrip": false,
1867
+ "single_word": false,
1868
+ "special": true
1869
+ },
1870
+ "151876": {
1871
+ "content": "<FAKE_PAD_194>",
1872
+ "lstrip": false,
1873
+ "normalized": false,
1874
+ "rstrip": false,
1875
+ "single_word": false,
1876
+ "special": true
1877
+ },
1878
+ "151877": {
1879
+ "content": "<FAKE_PAD_195>",
1880
+ "lstrip": false,
1881
+ "normalized": false,
1882
+ "rstrip": false,
1883
+ "single_word": false,
1884
+ "special": true
1885
+ },
1886
+ "151878": {
1887
+ "content": "<FAKE_PAD_196>",
1888
+ "lstrip": false,
1889
+ "normalized": false,
1890
+ "rstrip": false,
1891
+ "single_word": false,
1892
+ "special": true
1893
+ },
1894
+ "151879": {
1895
+ "content": "<FAKE_PAD_197>",
1896
+ "lstrip": false,
1897
+ "normalized": false,
1898
+ "rstrip": false,
1899
+ "single_word": false,
1900
+ "special": true
1901
+ },
1902
+ "151880": {
1903
+ "content": "<FAKE_PAD_198>",
1904
+ "lstrip": false,
1905
+ "normalized": false,
1906
+ "rstrip": false,
1907
+ "single_word": false,
1908
+ "special": true
1909
+ },
1910
+ "151881": {
1911
+ "content": "<FAKE_PAD_199>",
1912
+ "lstrip": false,
1913
+ "normalized": false,
1914
+ "rstrip": false,
1915
+ "single_word": false,
1916
+ "special": true
1917
+ },
1918
+ "151882": {
1919
+ "content": "<FAKE_PAD_200>",
1920
+ "lstrip": false,
1921
+ "normalized": false,
1922
+ "rstrip": false,
1923
+ "single_word": false,
1924
+ "special": true
1925
+ },
1926
+ "151883": {
1927
+ "content": "<FAKE_PAD_201>",
1928
+ "lstrip": false,
1929
+ "normalized": false,
1930
+ "rstrip": false,
1931
+ "single_word": false,
1932
+ "special": true
1933
+ },
1934
+ "151884": {
1935
+ "content": "<FAKE_PAD_202>",
1936
+ "lstrip": false,
1937
+ "normalized": false,
1938
+ "rstrip": false,
1939
+ "single_word": false,
1940
+ "special": true
1941
+ },
1942
+ "151885": {
1943
+ "content": "<FAKE_PAD_203>",
1944
+ "lstrip": false,
1945
+ "normalized": false,
1946
+ "rstrip": false,
1947
+ "single_word": false,
1948
+ "special": true
1949
+ },
1950
+ "151886": {
1951
+ "content": "<FAKE_PAD_204>",
1952
+ "lstrip": false,
1953
+ "normalized": false,
1954
+ "rstrip": false,
1955
+ "single_word": false,
1956
+ "special": true
1957
+ },
1958
+ "151887": {
1959
+ "content": "<FAKE_PAD_205>",
1960
+ "lstrip": false,
1961
+ "normalized": false,
1962
+ "rstrip": false,
1963
+ "single_word": false,
1964
+ "special": true
1965
+ },
1966
+ "151888": {
1967
+ "content": "<FAKE_PAD_206>",
1968
+ "lstrip": false,
1969
+ "normalized": false,
1970
+ "rstrip": false,
1971
+ "single_word": false,
1972
+ "special": true
1973
+ },
1974
+ "151889": {
1975
+ "content": "<FAKE_PAD_207>",
1976
+ "lstrip": false,
1977
+ "normalized": false,
1978
+ "rstrip": false,
1979
+ "single_word": false,
1980
+ "special": true
1981
+ },
1982
+ "151890": {
1983
+ "content": "<FAKE_PAD_208>",
1984
+ "lstrip": false,
1985
+ "normalized": false,
1986
+ "rstrip": false,
1987
+ "single_word": false,
1988
+ "special": true
1989
+ },
1990
+ "151891": {
1991
+ "content": "<FAKE_PAD_209>",
1992
+ "lstrip": false,
1993
+ "normalized": false,
1994
+ "rstrip": false,
1995
+ "single_word": false,
1996
+ "special": true
1997
+ },
1998
+ "151892": {
1999
+ "content": "<FAKE_PAD_210>",
2000
+ "lstrip": false,
2001
+ "normalized": false,
2002
+ "rstrip": false,
2003
+ "single_word": false,
2004
+ "special": true
2005
+ },
2006
+ "151893": {
2007
+ "content": "<FAKE_PAD_211>",
2008
+ "lstrip": false,
2009
+ "normalized": false,
2010
+ "rstrip": false,
2011
+ "single_word": false,
2012
+ "special": true
2013
+ },
2014
+ "151894": {
2015
+ "content": "<FAKE_PAD_212>",
2016
+ "lstrip": false,
2017
+ "normalized": false,
2018
+ "rstrip": false,
2019
+ "single_word": false,
2020
+ "special": true
2021
+ },
2022
+ "151895": {
2023
+ "content": "<FAKE_PAD_213>",
2024
+ "lstrip": false,
2025
+ "normalized": false,
2026
+ "rstrip": false,
2027
+ "single_word": false,
2028
+ "special": true
2029
+ },
2030
+ "151896": {
2031
+ "content": "<FAKE_PAD_214>",
2032
+ "lstrip": false,
2033
+ "normalized": false,
2034
+ "rstrip": false,
2035
+ "single_word": false,
2036
+ "special": true
2037
+ },
2038
+ "151897": {
2039
+ "content": "<FAKE_PAD_215>",
2040
+ "lstrip": false,
2041
+ "normalized": false,
2042
+ "rstrip": false,
2043
+ "single_word": false,
2044
+ "special": true
2045
+ },
2046
+ "151898": {
2047
+ "content": "<FAKE_PAD_216>",
2048
+ "lstrip": false,
2049
+ "normalized": false,
2050
+ "rstrip": false,
2051
+ "single_word": false,
2052
+ "special": true
2053
+ },
2054
+ "151899": {
2055
+ "content": "<FAKE_PAD_217>",
2056
+ "lstrip": false,
2057
+ "normalized": false,
2058
+ "rstrip": false,
2059
+ "single_word": false,
2060
+ "special": true
2061
+ },
2062
+ "151900": {
2063
+ "content": "<FAKE_PAD_218>",
2064
+ "lstrip": false,
2065
+ "normalized": false,
2066
+ "rstrip": false,
2067
+ "single_word": false,
2068
+ "special": true
2069
+ },
2070
+ "151901": {
2071
+ "content": "<FAKE_PAD_219>",
2072
+ "lstrip": false,
2073
+ "normalized": false,
2074
+ "rstrip": false,
2075
+ "single_word": false,
2076
+ "special": true
2077
+ },
2078
+ "151902": {
2079
+ "content": "<FAKE_PAD_220>",
2080
+ "lstrip": false,
2081
+ "normalized": false,
2082
+ "rstrip": false,
2083
+ "single_word": false,
2084
+ "special": true
2085
+ },
2086
+ "151903": {
2087
+ "content": "<FAKE_PAD_221>",
2088
+ "lstrip": false,
2089
+ "normalized": false,
2090
+ "rstrip": false,
2091
+ "single_word": false,
2092
+ "special": true
2093
+ },
2094
+ "151904": {
2095
+ "content": "<FAKE_PAD_222>",
2096
+ "lstrip": false,
2097
+ "normalized": false,
2098
+ "rstrip": false,
2099
+ "single_word": false,
2100
+ "special": true
2101
+ },
2102
+ "151905": {
2103
+ "content": "<FAKE_PAD_223>",
2104
+ "lstrip": false,
2105
+ "normalized": false,
2106
+ "rstrip": false,
2107
+ "single_word": false,
2108
+ "special": true
2109
+ },
2110
+ "151906": {
2111
+ "content": "<FAKE_PAD_224>",
2112
+ "lstrip": false,
2113
+ "normalized": false,
2114
+ "rstrip": false,
2115
+ "single_word": false,
2116
+ "special": true
2117
+ },
2118
+ "151907": {
2119
+ "content": "<FAKE_PAD_225>",
2120
+ "lstrip": false,
2121
+ "normalized": false,
2122
+ "rstrip": false,
2123
+ "single_word": false,
2124
+ "special": true
2125
+ },
2126
+ "151908": {
2127
+ "content": "<FAKE_PAD_226>",
2128
+ "lstrip": false,
2129
+ "normalized": false,
2130
+ "rstrip": false,
2131
+ "single_word": false,
2132
+ "special": true
2133
+ },
2134
+ "151909": {
2135
+ "content": "<FAKE_PAD_227>",
2136
+ "lstrip": false,
2137
+ "normalized": false,
2138
+ "rstrip": false,
2139
+ "single_word": false,
2140
+ "special": true
2141
+ },
2142
+ "151910": {
2143
+ "content": "<FAKE_PAD_228>",
2144
+ "lstrip": false,
2145
+ "normalized": false,
2146
+ "rstrip": false,
2147
+ "single_word": false,
2148
+ "special": true
2149
+ },
2150
+ "151911": {
2151
+ "content": "<FAKE_PAD_229>",
2152
+ "lstrip": false,
2153
+ "normalized": false,
2154
+ "rstrip": false,
2155
+ "single_word": false,
2156
+ "special": true
2157
+ },
2158
+ "151912": {
2159
+ "content": "<FAKE_PAD_230>",
2160
+ "lstrip": false,
2161
+ "normalized": false,
2162
+ "rstrip": false,
2163
+ "single_word": false,
2164
+ "special": true
2165
+ },
2166
+ "151913": {
2167
+ "content": "<FAKE_PAD_231>",
2168
+ "lstrip": false,
2169
+ "normalized": false,
2170
+ "rstrip": false,
2171
+ "single_word": false,
2172
+ "special": true
2173
+ },
2174
+ "151914": {
2175
+ "content": "<FAKE_PAD_232>",
2176
+ "lstrip": false,
2177
+ "normalized": false,
2178
+ "rstrip": false,
2179
+ "single_word": false,
2180
+ "special": true
2181
+ },
2182
+ "151915": {
2183
+ "content": "<FAKE_PAD_233>",
2184
+ "lstrip": false,
2185
+ "normalized": false,
2186
+ "rstrip": false,
2187
+ "single_word": false,
2188
+ "special": true
2189
+ },
2190
+ "151916": {
2191
+ "content": "<FAKE_PAD_234>",
2192
+ "lstrip": false,
2193
+ "normalized": false,
2194
+ "rstrip": false,
2195
+ "single_word": false,
2196
+ "special": true
2197
+ },
2198
+ "151917": {
2199
+ "content": "<FAKE_PAD_235>",
2200
+ "lstrip": false,
2201
+ "normalized": false,
2202
+ "rstrip": false,
2203
+ "single_word": false,
2204
+ "special": true
2205
+ },
2206
+ "151918": {
2207
+ "content": "<FAKE_PAD_236>",
2208
+ "lstrip": false,
2209
+ "normalized": false,
2210
+ "rstrip": false,
2211
+ "single_word": false,
2212
+ "special": true
2213
+ },
2214
+ "151919": {
2215
+ "content": "<FAKE_PAD_237>",
2216
+ "lstrip": false,
2217
+ "normalized": false,
2218
+ "rstrip": false,
2219
+ "single_word": false,
2220
+ "special": true
2221
+ },
2222
+ "151920": {
2223
+ "content": "<FAKE_PAD_238>",
2224
+ "lstrip": false,
2225
+ "normalized": false,
2226
+ "rstrip": false,
2227
+ "single_word": false,
2228
+ "special": true
2229
+ },
2230
+ "151921": {
2231
+ "content": "<FAKE_PAD_239>",
2232
+ "lstrip": false,
2233
+ "normalized": false,
2234
+ "rstrip": false,
2235
+ "single_word": false,
2236
+ "special": true
2237
+ },
2238
+ "151922": {
2239
+ "content": "<FAKE_PAD_240>",
2240
+ "lstrip": false,
2241
+ "normalized": false,
2242
+ "rstrip": false,
2243
+ "single_word": false,
2244
+ "special": true
2245
+ },
2246
+ "151923": {
2247
+ "content": "<FAKE_PAD_241>",
2248
+ "lstrip": false,
2249
+ "normalized": false,
2250
+ "rstrip": false,
2251
+ "single_word": false,
2252
+ "special": true
2253
+ },
2254
+ "151924": {
2255
+ "content": "<FAKE_PAD_242>",
2256
+ "lstrip": false,
2257
+ "normalized": false,
2258
+ "rstrip": false,
2259
+ "single_word": false,
2260
+ "special": true
2261
+ },
2262
+ "151925": {
2263
+ "content": "<FAKE_PAD_243>",
2264
+ "lstrip": false,
2265
+ "normalized": false,
2266
+ "rstrip": false,
2267
+ "single_word": false,
2268
+ "special": true
2269
+ },
2270
+ "151926": {
2271
+ "content": "<FAKE_PAD_244>",
2272
+ "lstrip": false,
2273
+ "normalized": false,
2274
+ "rstrip": false,
2275
+ "single_word": false,
2276
+ "special": true
2277
+ },
2278
+ "151927": {
2279
+ "content": "<FAKE_PAD_245>",
2280
+ "lstrip": false,
2281
+ "normalized": false,
2282
+ "rstrip": false,
2283
+ "single_word": false,
2284
+ "special": true
2285
+ },
2286
+ "151928": {
2287
+ "content": "<FAKE_PAD_246>",
2288
+ "lstrip": false,
2289
+ "normalized": false,
2290
+ "rstrip": false,
2291
+ "single_word": false,
2292
+ "special": true
2293
+ },
2294
+ "151929": {
2295
+ "content": "<FAKE_PAD_247>",
2296
+ "lstrip": false,
2297
+ "normalized": false,
2298
+ "rstrip": false,
2299
+ "single_word": false,
2300
+ "special": true
2301
+ },
2302
+ "151930": {
2303
+ "content": "<FAKE_PAD_248>",
2304
+ "lstrip": false,
2305
+ "normalized": false,
2306
+ "rstrip": false,
2307
+ "single_word": false,
2308
+ "special": true
2309
+ },
2310
+ "151931": {
2311
+ "content": "<FAKE_PAD_249>",
2312
+ "lstrip": false,
2313
+ "normalized": false,
2314
+ "rstrip": false,
2315
+ "single_word": false,
2316
+ "special": true
2317
+ },
2318
+ "151932": {
2319
+ "content": "<FAKE_PAD_250>",
2320
+ "lstrip": false,
2321
+ "normalized": false,
2322
+ "rstrip": false,
2323
+ "single_word": false,
2324
+ "special": true
2325
+ },
2326
+ "151933": {
2327
+ "content": "<FAKE_PAD_251>",
2328
+ "lstrip": false,
2329
+ "normalized": false,
2330
+ "rstrip": false,
2331
+ "single_word": false,
2332
+ "special": true
2333
+ },
2334
+ "151934": {
2335
+ "content": "<FAKE_PAD_252>",
2336
+ "lstrip": false,
2337
+ "normalized": false,
2338
+ "rstrip": false,
2339
+ "single_word": false,
2340
+ "special": true
2341
+ },
2342
+ "151935": {
2343
+ "content": "<FAKE_PAD_253>",
2344
+ "lstrip": false,
2345
+ "normalized": false,
2346
+ "rstrip": false,
2347
+ "single_word": false,
2348
+ "special": true
2349
+ },
2350
+ "151936": {
2351
+ "content": "<audio>",
2352
+ "lstrip": false,
2353
+ "normalized": false,
2354
+ "rstrip": false,
2355
+ "single_word": false,
2356
+ "special": true
2357
+ },
2358
+ "151937": {
2359
+ "content": "</audio>",
2360
+ "lstrip": false,
2361
+ "normalized": false,
2362
+ "rstrip": false,
2363
+ "single_word": false,
2364
+ "special": true
2365
+ },
2366
+ "151938": {
2367
+ "content": "<AUDIO_CONTEXT>",
2368
+ "lstrip": false,
2369
+ "normalized": false,
2370
+ "rstrip": false,
2371
+ "single_word": false,
2372
+ "special": true
2373
+ },
2374
+ "151939": {
2375
+ "content": "<interrupt>",
2376
+ "lstrip": false,
2377
+ "normalized": false,
2378
+ "rstrip": false,
2379
+ "single_word": false,
2380
+ "special": true
2381
+ },
2382
+ "151940": {
2383
+ "content": "<FAKE_PAD_PAD_0>",
2384
+ "lstrip": false,
2385
+ "normalized": false,
2386
+ "rstrip": false,
2387
+ "single_word": false,
2388
+ "special": true
2389
+ },
2390
+ "151941": {
2391
+ "content": "<FAKE_PAD_PAD_1>",
2392
+ "lstrip": false,
2393
+ "normalized": false,
2394
+ "rstrip": false,
2395
+ "single_word": false,
2396
+ "special": true
2397
+ },
2398
+ "151942": {
2399
+ "content": "<FAKE_PAD_PAD_2>",
2400
+ "lstrip": false,
2401
+ "normalized": false,
2402
+ "rstrip": false,
2403
+ "single_word": false,
2404
+ "special": true
2405
+ },
2406
+ "151943": {
2407
+ "content": "<FAKE_PAD_PAD_3>",
2408
+ "lstrip": false,
2409
+ "normalized": false,
2410
+ "rstrip": false,
2411
+ "single_word": false,
2412
+ "special": true
2413
+ },
2414
+ "151944": {
2415
+ "content": "<FAKE_PAD_PAD_4>",
2416
+ "lstrip": false,
2417
+ "normalized": false,
2418
+ "rstrip": false,
2419
+ "single_word": false,
2420
+ "special": true
2421
+ },
2422
+ "151945": {
2423
+ "content": "<FAKE_PAD_PAD_5>",
2424
+ "lstrip": false,
2425
+ "normalized": false,
2426
+ "rstrip": false,
2427
+ "single_word": false,
2428
+ "special": true
2429
+ },
2430
+ "151946": {
2431
+ "content": "<FAKE_PAD_PAD_6>",
2432
+ "lstrip": false,
2433
+ "normalized": false,
2434
+ "rstrip": false,
2435
+ "single_word": false,
2436
+ "special": true
2437
+ },
2438
+ "151947": {
2439
+ "content": "<FAKE_PAD_PAD_7>",
2440
+ "lstrip": false,
2441
+ "normalized": false,
2442
+ "rstrip": false,
2443
+ "single_word": false,
2444
+ "special": true
2445
+ },
2446
+ "151948": {
2447
+ "content": "<FAKE_PAD_PAD_8>",
2448
+ "lstrip": false,
2449
+ "normalized": false,
2450
+ "rstrip": false,
2451
+ "single_word": false,
2452
+ "special": true
2453
+ },
2454
+ "151949": {
2455
+ "content": "<FAKE_PAD_PAD_9>",
2456
+ "lstrip": false,
2457
+ "normalized": false,
2458
+ "rstrip": false,
2459
+ "single_word": false,
2460
+ "special": true
2461
+ },
2462
+ "151950": {
2463
+ "content": "<FAKE_PAD_PAD_10>",
2464
+ "lstrip": false,
2465
+ "normalized": false,
2466
+ "rstrip": false,
2467
+ "single_word": false,
2468
+ "special": true
2469
+ },
2470
+ "151951": {
2471
+ "content": "<FAKE_PAD_PAD_11>",
2472
+ "lstrip": false,
2473
+ "normalized": false,
2474
+ "rstrip": false,
2475
+ "single_word": false,
2476
+ "special": true
2477
+ },
2478
+ "151952": {
2479
+ "content": "<FAKE_PAD_PAD_12>",
2480
+ "lstrip": false,
2481
+ "normalized": false,
2482
+ "rstrip": false,
2483
+ "single_word": false,
2484
+ "special": true
2485
+ },
2486
+ "151953": {
2487
+ "content": "<FAKE_PAD_PAD_13>",
2488
+ "lstrip": false,
2489
+ "normalized": false,
2490
+ "rstrip": false,
2491
+ "single_word": false,
2492
+ "special": true
2493
+ },
2494
+ "151954": {
2495
+ "content": "<FAKE_PAD_PAD_14>",
2496
+ "lstrip": false,
2497
+ "normalized": false,
2498
+ "rstrip": false,
2499
+ "single_word": false,
2500
+ "special": true
2501
+ },
2502
+ "151955": {
2503
+ "content": "<FAKE_PAD_PAD_15>",
2504
+ "lstrip": false,
2505
+ "normalized": false,
2506
+ "rstrip": false,
2507
+ "single_word": false,
2508
+ "special": true
2509
+ },
2510
+ "151956": {
2511
+ "content": "<FAKE_PAD_PAD_16>",
2512
+ "lstrip": false,
2513
+ "normalized": false,
2514
+ "rstrip": false,
2515
+ "single_word": false,
2516
+ "special": true
2517
+ },
2518
+ "151957": {
2519
+ "content": "<FAKE_PAD_PAD_17>",
2520
+ "lstrip": false,
2521
+ "normalized": false,
2522
+ "rstrip": false,
2523
+ "single_word": false,
2524
+ "special": true
2525
+ },
2526
+ "151958": {
2527
+ "content": "<FAKE_PAD_PAD_18>",
2528
+ "lstrip": false,
2529
+ "normalized": false,
2530
+ "rstrip": false,
2531
+ "single_word": false,
2532
+ "special": true
2533
+ },
2534
+ "151959": {
2535
+ "content": "<FAKE_PAD_PAD_19>",
2536
+ "lstrip": false,
2537
+ "normalized": false,
2538
+ "rstrip": false,
2539
+ "single_word": false,
2540
+ "special": true
2541
+ },
2542
+ "151960": {
2543
+ "content": "<FAKE_PAD_PAD_20>",
2544
+ "lstrip": false,
2545
+ "normalized": false,
2546
+ "rstrip": false,
2547
+ "single_word": false,
2548
+ "special": true
2549
+ },
2550
+ "151961": {
2551
+ "content": "<FAKE_PAD_PAD_21>",
2552
+ "lstrip": false,
2553
+ "normalized": false,
2554
+ "rstrip": false,
2555
+ "single_word": false,
2556
+ "special": true
2557
+ },
2558
+ "151962": {
2559
+ "content": "<FAKE_PAD_PAD_22>",
2560
+ "lstrip": false,
2561
+ "normalized": false,
2562
+ "rstrip": false,
2563
+ "single_word": false,
2564
+ "special": true
2565
+ },
2566
+ "151963": {
2567
+ "content": "<FAKE_PAD_PAD_23>",
2568
+ "lstrip": false,
2569
+ "normalized": false,
2570
+ "rstrip": false,
2571
+ "single_word": false,
2572
+ "special": true
2573
+ },
2574
+ "151964": {
2575
+ "content": "<FAKE_PAD_PAD_24>",
2576
+ "lstrip": false,
2577
+ "normalized": false,
2578
+ "rstrip": false,
2579
+ "single_word": false,
2580
+ "special": true
2581
+ },
2582
+ "151965": {
2583
+ "content": "<FAKE_PAD_PAD_25>",
2584
+ "lstrip": false,
2585
+ "normalized": false,
2586
+ "rstrip": false,
2587
+ "single_word": false,
2588
+ "special": true
2589
+ },
2590
+ "151966": {
2591
+ "content": "<FAKE_PAD_PAD_26>",
2592
+ "lstrip": false,
2593
+ "normalized": false,
2594
+ "rstrip": false,
2595
+ "single_word": false,
2596
+ "special": true
2597
+ },
2598
+ "151967": {
2599
+ "content": "<FAKE_PAD_PAD_27>",
2600
+ "lstrip": false,
2601
+ "normalized": false,
2602
+ "rstrip": false,
2603
+ "single_word": false,
2604
+ "special": true
2605
+ }
2606
+ },
2607
+ "additional_special_tokens": [
2608
+ "<|im_start|>",
2609
+ "<|im_end|>",
2610
+ "<|object_ref_start|>",
2611
+ "<|object_ref_end|>",
2612
+ "<|box_start|>",
2613
+ "<|box_end|>",
2614
+ "<|quad_start|>",
2615
+ "<|quad_end|>",
2616
+ "<|vision_start|>",
2617
+ "<|vision_end|>",
2618
+ "<|vision_pad|>",
2619
+ "<|image_pad|>",
2620
+ "<|video_pad|>",
2621
+ "<IMG_CONTEXT>",
2622
+ "<img>",
2623
+ "</img>",
2624
+ "<quad>",
2625
+ "</quad>",
2626
+ "<ref>",
2627
+ "</ref>",
2628
+ "<box>",
2629
+ "</box>",
2630
+ "<|action_start|>",
2631
+ "<|action_end|>",
2632
+ "<|plugin|>",
2633
+ "<|interpreter|>",
2634
+ "<FAKE_PAD_0>",
2635
+ "<FAKE_PAD_1>",
2636
+ "<FAKE_PAD_2>",
2637
+ "<FAKE_PAD_3>",
2638
+ "<FAKE_PAD_4>",
2639
+ "<FAKE_PAD_5>",
2640
+ "<FAKE_PAD_6>",
2641
+ "<FAKE_PAD_7>",
2642
+ "<FAKE_PAD_8>",
2643
+ "<FAKE_PAD_9>",
2644
+ "<FAKE_PAD_10>",
2645
+ "<FAKE_PAD_11>",
2646
+ "<FAKE_PAD_12>",
2647
+ "<FAKE_PAD_13>",
2648
+ "<FAKE_PAD_14>",
2649
+ "<FAKE_PAD_15>",
2650
+ "<FAKE_PAD_16>",
2651
+ "<FAKE_PAD_17>",
2652
+ "<FAKE_PAD_18>",
2653
+ "<FAKE_PAD_19>",
2654
+ "<FAKE_PAD_20>",
2655
+ "<FAKE_PAD_21>",
2656
+ "<FAKE_PAD_22>",
2657
+ "<FAKE_PAD_23>",
2658
+ "<FAKE_PAD_24>",
2659
+ "<FAKE_PAD_25>",
2660
+ "<FAKE_PAD_26>",
2661
+ "<FAKE_PAD_27>",
2662
+ "<FAKE_PAD_28>",
2663
+ "<FAKE_PAD_29>",
2664
+ "<FAKE_PAD_30>",
2665
+ "<FAKE_PAD_31>",
2666
+ "<FAKE_PAD_32>",
2667
+ "<FAKE_PAD_33>",
2668
+ "<FAKE_PAD_34>",
2669
+ "<FAKE_PAD_35>",
2670
+ "<FAKE_PAD_36>",
2671
+ "<FAKE_PAD_37>",
2672
+ "<FAKE_PAD_38>",
2673
+ "<FAKE_PAD_39>",
2674
+ "<FAKE_PAD_40>",
2675
+ "<FAKE_PAD_41>",
2676
+ "<FAKE_PAD_42>",
2677
+ "<FAKE_PAD_43>",
2678
+ "<FAKE_PAD_44>",
2679
+ "<FAKE_PAD_45>",
2680
+ "<FAKE_PAD_46>",
2681
+ "<FAKE_PAD_47>",
2682
+ "<FAKE_PAD_48>",
2683
+ "<FAKE_PAD_49>",
2684
+ "<FAKE_PAD_50>",
2685
+ "<FAKE_PAD_51>",
2686
+ "<FAKE_PAD_52>",
2687
+ "<FAKE_PAD_53>",
2688
+ "<FAKE_PAD_54>",
2689
+ "<FAKE_PAD_55>",
2690
+ "<FAKE_PAD_56>",
2691
+ "<FAKE_PAD_57>",
2692
+ "<FAKE_PAD_58>",
2693
+ "<FAKE_PAD_59>",
2694
+ "<FAKE_PAD_60>",
2695
+ "<FAKE_PAD_61>",
2696
+ "<FAKE_PAD_62>",
2697
+ "<FAKE_PAD_63>",
2698
+ "<FAKE_PAD_64>",
2699
+ "<FAKE_PAD_65>",
2700
+ "<FAKE_PAD_66>",
2701
+ "<FAKE_PAD_67>",
2702
+ "<FAKE_PAD_68>",
2703
+ "<FAKE_PAD_69>",
2704
+ "<FAKE_PAD_70>",
2705
+ "<FAKE_PAD_71>",
2706
+ "<FAKE_PAD_72>",
2707
+ "<FAKE_PAD_73>",
2708
+ "<FAKE_PAD_74>",
2709
+ "<FAKE_PAD_75>",
2710
+ "<FAKE_PAD_76>",
2711
+ "<FAKE_PAD_77>",
2712
+ "<FAKE_PAD_78>",
2713
+ "<FAKE_PAD_79>",
2714
+ "<FAKE_PAD_80>",
2715
+ "<FAKE_PAD_81>",
2716
+ "<FAKE_PAD_82>",
2717
+ "<FAKE_PAD_83>",
2718
+ "<FAKE_PAD_84>",
2719
+ "<FAKE_PAD_85>",
2720
+ "<FAKE_PAD_86>",
2721
+ "<FAKE_PAD_87>",
2722
+ "<FAKE_PAD_88>",
2723
+ "<FAKE_PAD_89>",
2724
+ "<FAKE_PAD_90>",
2725
+ "<FAKE_PAD_91>",
2726
+ "<FAKE_PAD_92>",
2727
+ "<FAKE_PAD_93>",
2728
+ "<FAKE_PAD_94>",
2729
+ "<FAKE_PAD_95>",
2730
+ "<FAKE_PAD_96>",
2731
+ "<FAKE_PAD_97>",
2732
+ "<FAKE_PAD_98>",
2733
+ "<FAKE_PAD_99>",
2734
+ "<FAKE_PAD_100>",
2735
+ "<FAKE_PAD_101>",
2736
+ "<FAKE_PAD_102>",
2737
+ "<FAKE_PAD_103>",
2738
+ "<FAKE_PAD_104>",
2739
+ "<FAKE_PAD_105>",
2740
+ "<FAKE_PAD_106>",
2741
+ "<FAKE_PAD_107>",
2742
+ "<FAKE_PAD_108>",
2743
+ "<FAKE_PAD_109>",
2744
+ "<FAKE_PAD_110>",
2745
+ "<FAKE_PAD_111>",
2746
+ "<FAKE_PAD_112>",
2747
+ "<FAKE_PAD_113>",
2748
+ "<FAKE_PAD_114>",
2749
+ "<FAKE_PAD_115>",
2750
+ "<FAKE_PAD_116>",
2751
+ "<FAKE_PAD_117>",
2752
+ "<FAKE_PAD_118>",
2753
+ "<FAKE_PAD_119>",
2754
+ "<FAKE_PAD_120>",
2755
+ "<FAKE_PAD_121>",
2756
+ "<FAKE_PAD_122>",
2757
+ "<FAKE_PAD_123>",
2758
+ "<FAKE_PAD_124>",
2759
+ "<FAKE_PAD_125>",
2760
+ "<FAKE_PAD_126>",
2761
+ "<FAKE_PAD_127>",
2762
+ "<FAKE_PAD_128>",
2763
+ "<FAKE_PAD_129>",
2764
+ "<FAKE_PAD_130>",
2765
+ "<FAKE_PAD_131>",
2766
+ "<FAKE_PAD_132>",
2767
+ "<FAKE_PAD_133>",
2768
+ "<FAKE_PAD_134>",
2769
+ "<FAKE_PAD_135>",
2770
+ "<FAKE_PAD_136>",
2771
+ "<FAKE_PAD_137>",
2772
+ "<FAKE_PAD_138>",
2773
+ "<FAKE_PAD_139>",
2774
+ "<FAKE_PAD_140>",
2775
+ "<FAKE_PAD_141>",
2776
+ "<FAKE_PAD_142>",
2777
+ "<FAKE_PAD_143>",
2778
+ "<FAKE_PAD_144>",
2779
+ "<FAKE_PAD_145>",
2780
+ "<FAKE_PAD_146>",
2781
+ "<FAKE_PAD_147>",
2782
+ "<FAKE_PAD_148>",
2783
+ "<FAKE_PAD_149>",
2784
+ "<FAKE_PAD_150>",
2785
+ "<FAKE_PAD_151>",
2786
+ "<FAKE_PAD_152>",
2787
+ "<FAKE_PAD_153>",
2788
+ "<FAKE_PAD_154>",
2789
+ "<FAKE_PAD_155>",
2790
+ "<FAKE_PAD_156>",
2791
+ "<FAKE_PAD_157>",
2792
+ "<FAKE_PAD_158>",
2793
+ "<FAKE_PAD_159>",
2794
+ "<FAKE_PAD_160>",
2795
+ "<FAKE_PAD_161>",
2796
+ "<FAKE_PAD_162>",
2797
+ "<FAKE_PAD_163>",
2798
+ "<FAKE_PAD_164>",
2799
+ "<FAKE_PAD_165>",
2800
+ "<FAKE_PAD_166>",
2801
+ "<FAKE_PAD_167>",
2802
+ "<FAKE_PAD_168>",
2803
+ "<FAKE_PAD_169>",
2804
+ "<FAKE_PAD_170>",
2805
+ "<FAKE_PAD_171>",
2806
+ "<FAKE_PAD_172>",
2807
+ "<FAKE_PAD_173>",
2808
+ "<FAKE_PAD_174>",
2809
+ "<FAKE_PAD_175>",
2810
+ "<FAKE_PAD_176>",
2811
+ "<FAKE_PAD_177>",
2812
+ "<FAKE_PAD_178>",
2813
+ "<FAKE_PAD_179>",
2814
+ "<FAKE_PAD_180>",
2815
+ "<FAKE_PAD_181>",
2816
+ "<FAKE_PAD_182>",
2817
+ "<FAKE_PAD_183>",
2818
+ "<FAKE_PAD_184>",
2819
+ "<FAKE_PAD_185>",
2820
+ "<FAKE_PAD_186>",
2821
+ "<FAKE_PAD_187>",
2822
+ "<FAKE_PAD_188>",
2823
+ "<FAKE_PAD_189>",
2824
+ "<FAKE_PAD_190>",
2825
+ "<FAKE_PAD_191>",
2826
+ "<FAKE_PAD_192>",
2827
+ "<FAKE_PAD_193>",
2828
+ "<FAKE_PAD_194>",
2829
+ "<FAKE_PAD_195>",
2830
+ "<FAKE_PAD_196>",
2831
+ "<FAKE_PAD_197>",
2832
+ "<FAKE_PAD_198>",
2833
+ "<FAKE_PAD_199>",
2834
+ "<FAKE_PAD_200>",
2835
+ "<FAKE_PAD_201>",
2836
+ "<FAKE_PAD_202>",
2837
+ "<FAKE_PAD_203>",
2838
+ "<FAKE_PAD_204>",
2839
+ "<FAKE_PAD_205>",
2840
+ "<FAKE_PAD_206>",
2841
+ "<FAKE_PAD_207>",
2842
+ "<FAKE_PAD_208>",
2843
+ "<FAKE_PAD_209>",
2844
+ "<FAKE_PAD_210>",
2845
+ "<FAKE_PAD_211>",
2846
+ "<FAKE_PAD_212>",
2847
+ "<FAKE_PAD_213>",
2848
+ "<FAKE_PAD_214>",
2849
+ "<FAKE_PAD_215>",
2850
+ "<FAKE_PAD_216>",
2851
+ "<FAKE_PAD_217>",
2852
+ "<FAKE_PAD_218>",
2853
+ "<FAKE_PAD_219>",
2854
+ "<FAKE_PAD_220>",
2855
+ "<FAKE_PAD_221>",
2856
+ "<FAKE_PAD_222>",
2857
+ "<FAKE_PAD_223>",
2858
+ "<FAKE_PAD_224>",
2859
+ "<FAKE_PAD_225>",
2860
+ "<FAKE_PAD_226>",
2861
+ "<FAKE_PAD_227>",
2862
+ "<FAKE_PAD_228>",
2863
+ "<FAKE_PAD_229>",
2864
+ "<FAKE_PAD_230>",
2865
+ "<FAKE_PAD_231>",
2866
+ "<FAKE_PAD_232>",
2867
+ "<FAKE_PAD_233>",
2868
+ "<FAKE_PAD_234>",
2869
+ "<FAKE_PAD_235>",
2870
+ "<FAKE_PAD_236>",
2871
+ "<FAKE_PAD_237>",
2872
+ "<FAKE_PAD_238>",
2873
+ "<FAKE_PAD_239>",
2874
+ "<FAKE_PAD_240>",
2875
+ "<FAKE_PAD_241>",
2876
+ "<FAKE_PAD_242>",
2877
+ "<FAKE_PAD_243>",
2878
+ "<FAKE_PAD_244>",
2879
+ "<FAKE_PAD_245>",
2880
+ "<FAKE_PAD_246>",
2881
+ "<FAKE_PAD_247>",
2882
+ "<FAKE_PAD_248>",
2883
+ "<FAKE_PAD_249>",
2884
+ "<FAKE_PAD_250>",
2885
+ "<FAKE_PAD_251>",
2886
+ "<FAKE_PAD_252>",
2887
+ "<FAKE_PAD_253>",
2888
+ "<audio>",
2889
+ "</audio>",
2890
+ "<AUDIO_CONTEXT>",
2891
+ "<interrupt>",
2892
+ "<FAKE_PAD_PAD_0>",
2893
+ "<FAKE_PAD_PAD_1>",
2894
+ "<FAKE_PAD_PAD_2>",
2895
+ "<FAKE_PAD_PAD_3>",
2896
+ "<FAKE_PAD_PAD_4>",
2897
+ "<FAKE_PAD_PAD_5>",
2898
+ "<FAKE_PAD_PAD_6>",
2899
+ "<FAKE_PAD_PAD_7>",
2900
+ "<FAKE_PAD_PAD_8>",
2901
+ "<FAKE_PAD_PAD_9>",
2902
+ "<FAKE_PAD_PAD_10>",
2903
+ "<FAKE_PAD_PAD_11>",
2904
+ "<FAKE_PAD_PAD_12>",
2905
+ "<FAKE_PAD_PAD_13>",
2906
+ "<FAKE_PAD_PAD_14>",
2907
+ "<FAKE_PAD_PAD_15>",
2908
+ "<FAKE_PAD_PAD_16>",
2909
+ "<FAKE_PAD_PAD_17>",
2910
+ "<FAKE_PAD_PAD_18>",
2911
+ "<FAKE_PAD_PAD_19>",
2912
+ "<FAKE_PAD_PAD_20>",
2913
+ "<FAKE_PAD_PAD_21>",
2914
+ "<FAKE_PAD_PAD_22>",
2915
+ "<FAKE_PAD_PAD_23>",
2916
+ "<FAKE_PAD_PAD_24>",
2917
+ "<FAKE_PAD_PAD_25>",
2918
+ "<FAKE_PAD_PAD_26>",
2919
+ "<FAKE_PAD_PAD_27>"
2920
+ ],
2921
+ "bos_token": null,
2922
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in message.content %}\n {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and 
enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
2923
+ "clean_up_tokenization_spaces": false,
2924
+ "eos_token": "<|im_end|>",
2925
+ "errors": "replace",
2926
+ "model_max_length": 4096,
2927
+ "pad_token": "<|endoftext|>",
2928
+ "split_special_tokens": false,
2929
+ "tokenizer_class": "Qwen2Tokenizer",
2930
+ "unk_token": null
2931
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff
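Taken together, the tokenizer files in this commit extend a Qwen2Tokenizer with the audio control tokens (<audio>, </audio>, <AUDIO_CONTEXT>, <interrupt>) and the <FAKE_PAD_*> / <FAKE_PAD_PAD_*> placeholders, set <|im_end|> as the end-of-sequence token and <|endoftext|> as padding, and ship a Qwen-style chat template. Below is a minimal sketch of how the uploaded tokenizer could be inspected; the local path is a placeholder for wherever this repository is cloned, and the expected ids are the ones declared in added_tokens.json above.

# Minimal sketch, assuming the files in this commit are cloned to ./initial_model (placeholder path).
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./initial_model")

# Audio control tokens should resolve to the ids declared in added_tokens.json.
for t in ("<audio>", "</audio>", "<AUDIO_CONTEXT>", "<interrupt>"):
    print(t, tok.convert_tokens_to_ids(t))  # expected: 151936, 151937, 151938, 151939

# eos/pad come from tokenizer_config.json.
print(tok.eos_token, tok.pad_token)  # <|im_end|> <|endoftext|>

# The chat template wraps each turn in <|im_start|>/<|im_end|> markers.
msgs = [{"role": "user", "content": "hello"}]
print(tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True))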