Stardust-minus's picture
Upload folder using huggingface_hub
a26769d verified
_target_: fish_speech.models.dac.modded_dac.DAC
# Model setup
sample_rate: 44100
encoder_dim: 64
encoder_rates: [2, 4, 8, 8]
decoder_dim: 1536
decoder_rates: [8, 8, 4, 2]
encoder_transformer_layers: [0, 0, 0, 4]
decoder_transformer_layers: [4, 0, 0, 0]
transformer_general_config:
_target_: fish_speech.models.dac.modded_dac.ModelArgs
_partial_: true
block_size: 16384
n_local_heads: -1
head_dim: 64
rope_base: 10000
norm_eps: 1e-5
dropout_rate: 0.1
attn_dropout_rate: 0.1
channels_first: true
# Quantization
quantizer:
_target_: fish_speech.models.dac.rvq.DownsampleResidualVectorQuantize
input_dim: 1024
n_codebooks: 9
codebook_size: 1024
codebook_dim: 8
quantizer_dropout: 0.5
downsample_factor: [2, 2]
post_module: &transformer_module
_target_: fish_speech.models.dac.modded_dac.WindowLimitedTransformer
causal: true
window_size: 128 # empirically this does not seem to matter
input_dim: 1024
config: &transformer_config
_target_: fish_speech.models.dac.modded_dac.ModelArgs
block_size: 4096
n_layer: 8
n_head: 16
dim: 1024
intermediate_size: 3072
n_local_heads: -1
head_dim: 64
rope_base: 10000
norm_eps: 1e-5
dropout_rate: 0.1
attn_dropout_rate: 0.1
channels_first: true
pre_module: *transformer_module
semantic_codebook_size: 4096