# Spaces: Running on L4  (page-status text captured along with this file; not part of the configuration)
# Configuration for fish_speech.models.dac.modded_dac.DAC.
#
# The `_target_` / `_partial_` keys follow the Hydra instantiate convention
# (presumably this file is consumed by hydra.utils.instantiate - confirm
# against the loader).
#
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been lost. Keys that repeat (`_target_`, `input_dim`,
# `block_size`, ...) cannot share one mapping, which fixes most of the
# structure; placements that are inferred rather than forced are marked.
_target_: fish_speech.models.dac.modded_dac.DAC

# Model setup
sample_rate: 44100
encoder_dim: 64
encoder_rates: [2, 4, 8, 8]
decoder_dim: 1536
decoder_rates: [8, 8, 4, 2]
# One entry per rate above; the decoder list mirrors the encoder list.
encoder_transformer_layers: [0, 0, 0, 4]
decoder_transformer_layers: [4, 0, 0, 0]
transformer_general_config:
  _target_: fish_speech.models.dac.modded_dac.ModelArgs
  _partial_: true
  block_size: 16384
  n_local_heads: -1
  head_dim: 64
  rope_base: 10000
  # NOTE(review): some YAML 1.1 loaders read `1e-5` (no decimal point) as a
  # string rather than a float - confirm the loader in use parses it as a float.
  norm_eps: 1e-5
  dropout_rate: 0.1
  attn_dropout_rate: 0.1
  channels_first: true

# Quantization
quantizer:
  _target_: fish_speech.models.dac.rvq.DownsampleResidualVectorQuantize
  input_dim: 1024
  n_codebooks: 9
  codebook_size: 1024
  codebook_dim: 8
  quantizer_dropout: 0.5
  downsample_factor: [2, 2]
  # NOTE(review): post_module, pre_module and semantic_codebook_size are
  # presumed to be quantizer arguments (they sit in the "Quantization"
  # section) - confirm against DownsampleResidualVectorQuantize.
  post_module:
    _target_: fish_speech.models.dac.modded_dac.WindowLimitedTransformer
    causal: true
    window_size: 128 # empirically this does not seem to matter
    input_dim: 1024
    config:
      _target_: fish_speech.models.dac.modded_dac.ModelArgs
      block_size: 4096
      n_layer: 8
      n_head: 16
      dim: 1024
      intermediate_size: 3072
      n_local_heads: -1
      head_dim: 64
      rope_base: 10000
      norm_eps: 1e-5
      dropout_rate: 0.1
      attn_dropout_rate: 0.1
      channels_first: true
  # NOTE(review): no value is visible for pre_module in this copy, so it is
  # left empty (YAML null). If it was meant to reuse the post_module
  # definition (e.g. through a YAML anchor/alias), restore that reference.
  pre_module:
  semantic_codebook_size: 4096