model: osum_echat

# llm_path
llm_path: &llm_path "Qwen/Qwen2.5-3B-Instruct"

# model config
downsample_rate: 4 # options: 1, 2, 4, 8
adapter_type: osum_echat
if_instruct: true
input_dim: 80

# tokenizer config
tokenizer: huggingface
tokenizer_conf:
  llm_path: *llm_path

# lora config
use_lora: false
lora_alpha: 32
lora_rank: 64 # on the 3B LLM, rank-64 LoRA adds roughly 85M trainable parameters
lora_dropout: 0.1
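# Sketch of the scaling implied above, assuming standard LoRA semantics
# (as in the LoRA paper / HF PEFT): the low-rank update is multiplied by
# lora_alpha / lora_rank, i.e. 32 / 64 = 0.5 here.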

# speech generate config
speech_token_num: &token_num 4097
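# Assumption, not confirmed by this file: 4097 reads as 4096 speech/codec
# tokens plus one special token (e.g. end-of-speech).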


# Configuration of parameters for training
fire_module: link_and_encoder_and_lora  # options: link | encoder | llm | link_and_encoder | link_and_encoder_and_lora; the llm options require use_lora: true

# other config
grad_clip: 5
accum_grad: 8
log_interval: 10
save_interval: 1250
max_epoch: 1
init_step: true

# training config
optim: adamw
optim_conf:
  betas:
  - 0.9
  - 0.99
  eps: 1.0e-06
  lr: 1.0e-06
  weight_decay: 0.01
scheduler: warmuplr
scheduler_conf:
  warmup_steps: 2000
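# Shape of the schedule, assuming the WeNet/ESPnet-style WarmupLR:
#   lr(step) = lr * warmup_steps^0.5 * min(step^-0.5, step * warmup_steps^-1.5)
# so the rate climbs linearly to its peak of 1.0e-06 at step 2000 and then
# decays in proportion to step^-0.5.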


dataset: asr
dataset_conf:
  speech_token_num: *token_num
  batch_conf:
    batch_size: 26
    batch_type: dynamic
    max_frames_in_batch: 28000000
    max_seq_in_batch: 3700
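    # With batch_type dynamic, WeNet-style loaders grow each batch until
    # max_frames_in_batch or max_seq_in_batch would be exceeded; batch_size
    # is the static-batching knob and is likely ignored here (an assumption).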
  feats_type: log_mel_spectrogram
  filter_conf:
    max_length: 20000
    min_length: 20
    token_max_length: 1200
    token_min_length: 1
    filter_no_extra_info: true # drop samples missing task/lang info; intended for general multi-task training, should be disabled at inference
    max_seq_len: 2000
    other_filter_conf:
      only_s2s: false # filtering applied only to the s2s dataloader
      only_s2t: false # filtering applied only to the s2t dataloader
      only_t2t: false # filtering applied only to the t2t dataloader
      only_t2s: false # filtering applied only to the t2s dataloader
  language_conf:
    limited_langs:
    - zh
  log_mel_spectrogram_conf:
    hop_length: 160
    n_fft: 400
    num_mel_bins: 80
    padding: 0
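    # At the 16 kHz resample rate below, hop_length 160 is a 10 ms frame
    # shift (100 frames/s) and n_fft 400 a 25 ms window; if filter_conf's
    # max_length counts these frames, 20000 is roughly 200 s of audio.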
  resample_conf:
    resample_rate: 16000
  shuffle: true
  shuffle_conf:
    shuffle_size: 1500
  sort: true
  sort_conf:
    sort_size: 500
  spec_aug: true
  spec_aug_conf:
    max_f: 10
    max_t: 50
    num_f_mask: 2
    num_t_mask: 2
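    # SpecAugment as configured: up to 2 frequency masks of <= 10 mel bins
    # and up to 2 time masks of <= 50 frames per utterance.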
  spec_sub: true
  spec_sub_conf:
    max_t: 30
    num_t_sub: 3
  spec_trim: false
  speed_perturb: false
  eod_id: 151645
  split_num: 1
  multi_num: 2
  prompt_conf_path: conf/prompt_config.yaml
  data_recover: false
  data_recover_conf:
    start_idx: 0 # skip the first start_idx items (tar shards)
  other_tokenze_conf:  # switches for extra data-side operations; these should generally all be false at test time
    only_info:
      only_s2s: false # filtering applied only to the s2s dataloader
      only_s2t: false # filtering applied only to the s2t dataloader
      only_t2t: false # filtering applied only to the t2t dataloader
      only_t2s: false # filtering applied only to the t2s dataloader
    use_50_per_change_if_only_X: true # randomly replace 50% of sentences with their only-X variant
    use_s2s_streaming_random:
      enable: false
      rate: 0.5 # 1.0 means 100% of sentences are randomly replaced
    natural_language_convert:
      enable: false
      rate: 0.00 # 1.0 means 100% are converted to natural-language mode
    use_s2s_convert_s2t:
      enable: false # enable s2s convert separately for the s2t dataloader
      rate: 1.0 # 1.0 means 100% of sentences are randomly replaced
    use_streaming_tts:
      enable: false
      rate: 0.5 # 1.0 means 100% of sentences are randomly replaced
    use_think_mode:
      enable: false # enable think mode, i.e. randomly replace sentences with their think-mode form
      rate: 0.8
  other_filter_conf:
    fiter_txt_is_None: true # drop speech samples whose text is "<NONE>" (some gender data carries <NONE> labels); only active during training

# model config for encoder
encoder: transformer
encoder_conf:
  activation_type: gelu
  attention_dropout_rate: 0.0
  attention_heads: 16
  dropout_rate: 0.1
  gradient_checkpointing: true
  input_layer: conv1d2
  key_bias: false
  linear_units: 4096
  normalize_before: true
  num_blocks: 24
  output_size: 1024
  pos_enc_layer_type: abs_pos_whisper
  positional_dropout_rate: 0.1
  static_chunk_size: -1
  use_dynamic_chunk: false
  use_dynamic_left_chunk: false
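
# Geometry note: 24 blocks, 16 heads, d_model 1024 and 4096 FFN units with
# Whisper-style absolute positions over the 80-dim log-mel features above;
# this matches a Whisper-medium-sized encoder (a reading, not stated here).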