from transformers.configuration_utils import PretrainedConfig
from transformers.models.auto import CONFIG_MAPPING


class HiggsAudioEncoderConfig(PretrainedConfig):
    """Configuration of the audio encoder in Higgs-Audio."""

    model_type = "higgs_audio_encoder"

    def __init__(
        self,
        num_mel_bins=128,
        encoder_layers=32,
        encoder_attention_heads=20,
        encoder_ffn_dim=5120,
        encoder_layerdrop=0.0,
        d_model=1280,
        dropout=0.0,
        attention_dropout=0.0,
        activation_function="gelu",
        activation_dropout=0.0,
        scale_embedding=False,
        init_std=0.02,
        max_source_positions=1500,
        pad_token_id=128001,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.num_mel_bins = num_mel_bins
        self.d_model = d_model
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.encoder_ffn_dim = encoder_ffn_dim
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_function = activation_function
        self.activation_dropout = activation_dropout
        self.encoder_layerdrop = encoder_layerdrop
        self.num_hidden_layers = encoder_layers
        self.init_std = init_std
        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
        self.max_source_positions = max_source_positions
        self.pad_token_id = pad_token_id
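

# A minimal usage sketch (not part of the original file). The defaults above
# match a Whisper-large-v3-sized encoder: 128 mel bins, 32 layers, d_model 1280.
#
# >>> encoder_config = HiggsAudioEncoderConfig()
# >>> encoder_config.d_model
# 1280
# >>> encoder_config.num_hidden_layers  # mirrors `encoder_layers`
# 32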


class HiggsAudioConfig(PretrainedConfig):
    r"""
    This is the configuration class for the HiggsAudioModel.

    Args:
        text_config (`Union[AutoConfig, dict]`):
            The config object or dictionary of the text backbone.
        audio_encoder_config (`Union[AutoConfig, dict]`):
            The config object or dictionary of the Whisper encoder. The audio encoder will be
            bidirectional and will only be available for audio understanding.
        audio_tokenizer_config (`dict`, *optional*):
            The config object or dictionary of the audio tokenizer.
        audio_adapter_type (`str`, *optional*, defaults to `"stack"`):
            The type of audio adapter to use. We support three types of adapter:

            - stack: We stack additional Transformer layers after the main LLM backbone for audio
              generation.
            - dual_ffn: For selected layers of the LLM backbone, we replace the text FFN with a dual
              FFN architecture that contains an additional audio FFN. The audio FFN is triggered when
              the location is marked for audio tokens.
            - dual_ffn_fast_forward: We pick a few layers in the LLM backbone to plug in the audio
              FFN. For the remaining layers, the audio hidden states are fast-forwarded directly to
              the next layer. This reduces the computational cost of audio generation.
        audio_embed_avg (`bool`, *optional*, defaults to `False`):
            Whether to average the audio embeddings before sending them to the text attention layer.
        audio_ffn_hidden_size (`int`, *optional*, defaults to 4096):
            The hidden size of the audio feedforward network in the dual-path FFN.
        audio_ffn_intermediate_size (`int`, *optional*, defaults to 14336):
            The intermediate size of the audio feedforward network in the dual-path FFN.
        audio_dual_ffn_layers (`list[int]`, *optional*):
            The layers in the LLM backbone to plug in the dual FFN layer (mixture of audio FFN and
            text FFN). Must be specified when `audio_adapter_type` is `"dual_ffn"` or
            `"dual_ffn_fast_forward"`.
        audio_decoder_proj_num_layers (`int`, *optional*, defaults to 0):
            The number of layers in the audio decoder projection.
        encode_whisper_embed (`bool`, *optional*, defaults to `True`):
            Whether to encode input audio with the Whisper encoder.
        encode_audio_in_tokens (`bool`, *optional*, defaults to `False`):
            Whether to encode input audio as discrete audio tokens.
        use_delay_pattern (`bool`, *optional*, defaults to `False`):
            Whether to use a delay pattern in the audio decoder.
        skip_audio_tower (`bool`, *optional*, defaults to `False`):
            Whether to skip the audio tower in the audio encoder.
        use_audio_out_embed_projector (`bool`, *optional*, defaults to `False`):
            Whether to use an embedding projector to map audio-out embeddings.
        use_audio_out_self_attention (`bool`, *optional*, defaults to `False`):
            Whether to use self-attention to aggregate information from audio tokens before sending
            them to the text attention layer.
        use_rq_transformer (`bool`, *optional*, defaults to `False`):
            Whether to use an RQ-Transformer to predict the audio codebooks. Not compatible with
            `use_delay_pattern`.
        rq_transformer_hidden_size (`int`, *optional*):
            The hidden size of the RQ-Transformer. Defaults to the text model's hidden size.
        rq_transformer_intermediate_size (`int`, *optional*):
            The intermediate size of the RQ-Transformer. Defaults to the text model's intermediate
            size.
        rq_transformer_num_attention_heads (`int`, *optional*):
            The number of attention heads in the RQ-Transformer. Defaults to a head dimension of 128.
        rq_transformer_num_key_value_heads (`int`, *optional*):
            The number of key/value heads in the RQ-Transformer. Defaults to a quarter of the
            attention heads.
        rq_transformer_num_hidden_layers (`int`, *optional*, defaults to 3):
            The number of hidden layers in the RQ-Transformer.
        audio_num_codebooks (`int`, *optional*, defaults to 12):
            The number of codebooks in RVQGAN.
        audio_codebook_size (`int`, *optional*, defaults to 1024):
            The size of each codebook in RVQGAN.
        audio_stream_bos_id (`int`, *optional*, defaults to 1024):
            The id of the bos token in the audio stream.
        audio_stream_eos_id (`int`, *optional*, defaults to 1025):
            The id of the eos token in the audio stream.
        audio_bos_token (`str`, *optional*, defaults to `"<|audio_bos|>"`):
            The special `<|audio_bos|>` token. In Higgs-Audio, it is mapped to 128011, which is the
            index of `<|reserved_special_token_3|>` in Llama-3.1-8B-Instruct's tokenizer.
        audio_eos_token (`str`, *optional*, defaults to `"<|audio_eos|>"`):
            The special `<|audio_eos|>` token. We use 128012 as the default value, which is the index
            of `<|reserved_special_token_4|>` in Llama-3.1-8B-Instruct's tokenizer.
        audio_out_bos_token (`str`, *optional*, defaults to `"<|audio_out_bos|>"`):
            The special `<|audio_out_bos|>` token. We use 128013 as the default value, which is the
            index of `<|reserved_special_token_5|>` in Llama-3.1-8B-Instruct's tokenizer.
        audio_in_token (`str`, *optional*, defaults to `"<|AUDIO|>"`):
            The special `<|AUDIO|>` token. We use 128015 as the default value, which is the index of
            `<|reserved_special_token_7|>` in Llama-3.1-8B-Instruct's tokenizer. This token indicates
            that the location should be filled in with Whisper features.
        audio_out_token (`str`, *optional*, defaults to `"<|AUDIO_OUT|>"`):
            The special `<|AUDIO_OUT|>` token. We use 128016 as the default value, which is the index
            of `<|reserved_special_token_8|>` in Llama-3.1-8B-Instruct's tokenizer. This token
            indicates that the location should be filled in with audio tokens extracted via the audio
            tokenizer.
    """

    model_type = "higgs_audio"
    is_composition = True

    def __init__(
        self,
        text_config=None,
        audio_encoder_config=None,
        audio_tokenizer_config=None,
        audio_adapter_type="stack",
        audio_embed_avg=False,
        audio_ffn_hidden_size=4096,
        audio_ffn_intermediate_size=14336,
        audio_dual_ffn_layers=None,
        audio_decoder_proj_num_layers=0,
        encode_whisper_embed=True,
        encode_audio_in_tokens=False,
        use_delay_pattern=False,
        skip_audio_tower=False,
        use_audio_out_embed_projector=False,
        use_audio_out_self_attention=False,
        use_rq_transformer=False,
        rq_transformer_hidden_size=None,
        rq_transformer_intermediate_size=None,
        rq_transformer_num_attention_heads=None,
        rq_transformer_num_key_value_heads=None,
        rq_transformer_num_hidden_layers=3,
        audio_num_codebooks=12,
        audio_codebook_size=1024,
        audio_stream_bos_id=1024,
        audio_stream_eos_id=1025,
        audio_bos_token="<|audio_bos|>",
        audio_eos_token="<|audio_eos|>",
        audio_out_bos_token="<|audio_out_bos|>",
        audio_in_token="<|AUDIO|>",
        audio_out_token="<|AUDIO_OUT|>",
        audio_in_token_idx=128015,
        audio_out_token_idx=128016,
        pad_token_id=128001,
        audio_out_bos_token_id=128013,
        audio_eos_token_id=128012,
        **kwargs,
    ):
        if isinstance(audio_encoder_config, dict):
            audio_encoder_config["model_type"] = audio_encoder_config.get("model_type", "higgs_audio_encoder")
            audio_encoder_config = CONFIG_MAPPING[audio_encoder_config["model_type"]](**audio_encoder_config)
        elif audio_encoder_config is None:
            audio_encoder_config = HiggsAudioEncoderConfig()

        if isinstance(text_config, dict):
            text_config["model_type"] = text_config.get("model_type", "llama")
            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
        elif text_config is None:
            text_config = CONFIG_MAPPING["llama"]()
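
        # Note (added): the CONFIG_MAPPING lookup above assumes "higgs_audio_encoder" has been
        # registered with the auto classes (e.g. `AutoConfig.register("higgs_audio_encoder",
        # HiggsAudioEncoderConfig)`) elsewhere in the package. Illustratively, passing
        # audio_encoder_config={"model_type": "higgs_audio_encoder", "encoder_layers": 8}
        # yields a HiggsAudioEncoderConfig with 8 layers; this is how the composite config
        # round-trips through `to_dict()` / `from_pretrained`.
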
        assert audio_adapter_type in [
            "stack",
            "dual_ffn",
            "dual_ffn_fast_forward",
        ], f"Invalid audio adapter type: {audio_adapter_type}"
        if audio_adapter_type.startswith("dual_ffn"):
            assert audio_dual_ffn_layers is not None, (
                "audio_dual_ffn_layers must be specified when using the dual_ffn adapter."
            )

        self.text_config = text_config
        self.audio_encoder_config = audio_encoder_config
        self.audio_tokenizer_config = audio_tokenizer_config
        self.audio_adapter_type = audio_adapter_type
        self.audio_embed_avg = audio_embed_avg
        self.audio_ffn_hidden_size = audio_ffn_hidden_size
        self.audio_ffn_intermediate_size = audio_ffn_intermediate_size
        self.audio_dual_ffn_layers = audio_dual_ffn_layers
        self.audio_decoder_proj_num_layers = audio_decoder_proj_num_layers
        self.encode_whisper_embed = encode_whisper_embed
        self.encode_audio_in_tokens = encode_audio_in_tokens
        self.use_delay_pattern = use_delay_pattern
        self.skip_audio_tower = skip_audio_tower
        self.use_audio_out_embed_projector = use_audio_out_embed_projector
        self.use_audio_out_self_attention = use_audio_out_self_attention
        self.use_rq_transformer = use_rq_transformer
        if self.use_rq_transformer:
            assert not self.use_delay_pattern, "Delay pattern is not supported if you turned on RQ-Transformer!"
        self.rq_transformer_hidden_size = rq_transformer_hidden_size
        self.rq_transformer_intermediate_size = rq_transformer_intermediate_size
        self.rq_transformer_num_attention_heads = rq_transformer_num_attention_heads
        self.rq_transformer_num_key_value_heads = rq_transformer_num_key_value_heads
        self.rq_transformer_num_hidden_layers = rq_transformer_num_hidden_layers
        if use_rq_transformer:
            # For the RQ-Transformer, we set the hidden_size to the same as the text model's
            # hidden size if it is not specified. Head counts default to a head dimension
            # of 128, with a 4:1 ratio of attention heads to key/value heads.
            if self.rq_transformer_hidden_size is None:
                self.rq_transformer_hidden_size = text_config.hidden_size
            assert self.rq_transformer_hidden_size % 128 == 0
            if self.rq_transformer_intermediate_size is None:
                self.rq_transformer_intermediate_size = text_config.intermediate_size
            if self.rq_transformer_num_attention_heads is None:
                self.rq_transformer_num_attention_heads = self.rq_transformer_hidden_size // 128
            if self.rq_transformer_num_key_value_heads is None:
                self.rq_transformer_num_key_value_heads = self.rq_transformer_hidden_size // 128 // 4
            assert self.rq_transformer_hidden_size % self.rq_transformer_num_attention_heads == 0
            assert self.rq_transformer_hidden_size % self.rq_transformer_num_key_value_heads == 0

        self.audio_num_codebooks = audio_num_codebooks
        self.audio_codebook_size = audio_codebook_size
        self.audio_bos_token = audio_bos_token
        self.audio_eos_token = audio_eos_token
        self.audio_out_bos_token = audio_out_bos_token
        self.audio_in_token = audio_in_token
        self.audio_out_token = audio_out_token
        self.audio_in_token_idx = audio_in_token_idx
        self.audio_out_token_idx = audio_out_token_idx
        self.audio_stream_bos_id = audio_stream_bos_id
        self.audio_stream_eos_id = audio_stream_eos_id
        self.audio_out_bos_token_id = audio_out_bos_token_id
        self.audio_eos_token_id = audio_eos_token_id

        super().__init__(**kwargs)
        self.pad_token_id = pad_token_id
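

# A minimal usage sketch (not part of the original file). It shows default
# construction, the dual-FFN validation path, and the RQ-Transformer size
# defaults. The dual-FFN layer indices are illustrative values, not tuned
# settings; the printed numbers assume the default Llama text config
# (hidden_size=4096).
if __name__ == "__main__":
    # Default construction: Llama text backbone + Whisper-style audio encoder.
    config = HiggsAudioConfig()
    print(config.audio_adapter_type)  # "stack"
    print(config.audio_num_codebooks, config.audio_codebook_size)  # 12 1024

    # Dual-FFN adapters require explicit layer indices.
    dual_ffn_config = HiggsAudioConfig(
        audio_adapter_type="dual_ffn",
        audio_dual_ffn_layers=[0, 8, 16, 24],  # illustrative, not tuned
    )
    print(dual_ffn_config.audio_dual_ffn_layers)

    # With the RQ-Transformer, unspecified sizes fall back to the text config.
    rq_config = HiggsAudioConfig(use_rq_transformer=True)
    print(rq_config.rq_transformer_num_attention_heads)  # 32 (= 4096 / 128)
    print(rq_config.rq_transformer_num_key_value_heads)  # 8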