---
# OSUM-EChat training configuration.
# ---- Model / LLM configuration ----
model: osum_echat
# HF hub path of the backbone LLM; anchored so tokenizer_conf can reuse it.
llm_path: &llm_path "Qwen/Qwen2.5-3B-Instruct"
# model config
downsample_rate: 4  # temporal downsampling of encoder features; choices: 1 2 4 8
adapter_type: osum_echat
if_instruct: true
input_dim: 80  # input feature dimension (matches num_mel_bins in dataset_conf)
# tokenizer config
tokenizer: huggingface
tokenizer_conf:
  llm_path: *llm_path  # reuse the LLM path anchored above
# LoRA config (only active when use_lora is true)
use_lora: false
lora_alpha: 32
lora_rank: 64  # rank 64 on the 3B model -> ~85M trainable LoRA params
lora_dropout: 0.1
# speech token generation config
speech_token_num: &token_num 4097  # anchored for reuse in dataset_conf
# Which modules are trained: link | encoder | llm | link_and_encoder |
# link_and_encoder_and_lora. 'llm' requires use_lora: true.
fire_module: link_and_encoder_and_lora
# ---- Training loop configuration ----
grad_clip: 5  # gradient-norm clipping threshold
accum_grad: 8  # gradient accumulation steps
log_interval: 10  # steps between log lines
save_interval: 1250  # checkpoint save interval, in steps
max_epoch: 1
init_step: true
# optimizer / scheduler config
optim: adamw
optim_conf:
  betas:
  - 0.9
  - 0.99
  eps: 1.0e-06
  lr: 1.0e-06
  weight_decay: 0.01
scheduler: warmuplr
scheduler_conf:
  warmup_steps: 2000
# ---- Dataset / dataloader configuration ----
dataset: asr
dataset_conf:
  speech_token_num: *token_num  # aliases speech_token_num defined above
  batch_conf:
    batch_size: 26
    batch_type: dynamic
    max_frames_in_batch: 28000000  # cap on total feature frames per dynamic batch
    max_seq_in_batch: 3700  # cap on total token-sequence length per dynamic batch
  feats_type: log_mel_spectrogram
  filter_conf:
    max_length: 20000
    min_length: 20
    token_max_length: 1200
    token_min_length: 1
    # Drop samples that lack task/lang metadata; intended for general
    # multi-task training. Should be turned OFF at inference time.
    filter_no_extra_info: true
    max_seq_len: 2000
    other_filter_conf:
      only_s2s: false  # filter applied only to the s2s dataloader
      only_s2t: false  # filter applied only to the s2t dataloader
      only_t2t: false  # filter applied only to the t2t dataloader
      only_t2s: false  # filter applied only to the t2s dataloader
  language_conf:
    limited_langs:
    - zh
  log_mel_spectrogram_conf:
    hop_length: 160
    n_fft: 400
    num_mel_bins: 80
    padding: 0
  resample_conf:
    resample_rate: 16000
  shuffle: true
  shuffle_conf:
    shuffle_size: 1500
  sort: true
  sort_conf:
    sort_size: 500
  spec_aug: true
  spec_aug_conf:
    max_f: 10
    max_t: 50
    num_f_mask: 2
    num_t_mask: 2
  spec_sub: true
  spec_sub_conf:
    max_t: 30
    num_t_sub: 3
  spec_trim: false
  speed_perturb: false
  eod_id: 151645  # end-of-dialog token id (Qwen <|im_end|>) — TODO confirm against tokenizer
  split_num: 1
  multi_num: 2
  prompt_conf_path: conf/prompt_config.yaml
  data_recover: false
  data_recover_conf:
    start_idx: 0  # skip the first start_idx items (tar shards) when resuming
  # Extra data-manipulation switches; these should generally all be false at test time.
  other_tokenze_conf:
    only_info:
      only_s2s: false  # restrict to the s2s dataloader only
      only_s2t: false  # restrict to the s2t dataloader only
      only_t2t: false  # restrict to the t2t dataloader only
      only_t2s: false  # restrict to the t2s dataloader only
      use_50_per_change_if_only_X: true  # randomly convert 50% of samples to their only-X variant
    use_s2s_streaming_random:
      enable: false
      rate: 0.5  # 1.0 = apply to 100% of samples
    natural_language_convert:
      enable: false
      rate: 0.00  # 1.0 = convert 100% of samples to natural-language mode
    use_s2s_convert_s2t:
      enable: false  # enable s2s conversion specifically for the s2t dataloader
      rate: 1.0  # 1.0 = apply to 100% of samples
    use_streaming_tts:
      enable: false
      rate: 0.5  # 1.0 = apply to 100% of samples
    use_think_mode:
      enable: false  # think mode: randomly replace samples with their think-mode form
      rate: 0.8
  other_filter_conf:
    # Drop audio samples whose text is "<NONE>" (some gender-labelled data
    # carries <NONE> tags). Only takes effect during training.
    # NOTE(review): key name 'fiter' (sic) is what the loader expects — do not "fix".
    fiter_txt_is_None: true
# ---- Speech encoder configuration ----
encoder: transformer
encoder_conf:
  activation_type: gelu
  attention_dropout_rate: 0.0
  attention_heads: 16
  dropout_rate: 0.1
  gradient_checkpointing: true  # trade recompute for lower training memory
  input_layer: conv1d2
  key_bias: false
  linear_units: 4096
  normalize_before: true  # pre-LayerNorm transformer blocks
  num_blocks: 24
  output_size: 1024
  pos_enc_layer_type: abs_pos_whisper  # Whisper-style absolute positional encoding
  positional_dropout_rate: 0.1
  static_chunk_size: -1  # presumably -1 disables static chunking — verify against encoder impl
  use_dynamic_chunk: false
  use_dynamic_left_chunk: false