# higgs_audio_v2/higgs_audio/model/configuration_higgs_audio.py
from transformers.configuration_utils import PretrainedConfig
from transformers.models.auto import CONFIG_MAPPING


class HiggsAudioEncoderConfig(PretrainedConfig):
"""Configuration of the Audio encoder in Higgs-Audio."""
model_type = "higgs_audio_encoder"
def __init__(
self,
num_mel_bins=128,
encoder_layers=32,
encoder_attention_heads=20,
encoder_ffn_dim=5120,
encoder_layerdrop=0.0,
d_model=1280,
dropout=0.0,
attention_dropout=0.0,
activation_function="gelu",
activation_dropout=0.0,
scale_embedding=False,
init_std=0.02,
max_source_positions=1500,
pad_token_id=128001,
**kwargs,
):
super().__init__(**kwargs)
self.num_mel_bins = num_mel_bins
self.d_model = d_model
self.encoder_layers = encoder_layers
self.encoder_attention_heads = encoder_attention_heads
self.encoder_ffn_dim = encoder_ffn_dim
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_function = activation_function
self.activation_dropout = activation_dropout
self.encoder_layerdrop = encoder_layerdrop
self.num_hidden_layers = encoder_layers
self.init_std = init_std
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
self.max_source_positions = max_source_positions
self.pad_token_id = pad_token_id


class HiggsAudioConfig(PretrainedConfig):
r"""
This is the configuration class for the HiggsAudioModel.
Args:
text_config (`Union[AutoConfig, dict]`):
The config object or dictionary of the text backbone.
        audio_encoder_config (`Union[AutoConfig, dict]`):
            The config object or dictionary of the Whisper encoder.
            The audio encoder is bidirectional and is used only for audio understanding.
        audio_tokenizer_config (*optional*):
            The config object or dictionary of the audio tokenizer.
        audio_adapter_type (`str`, *optional*, defaults to "stack"):
            The type of audio adapter to use. We support three types of adapter:
            - stack:
              We stack additional Transformer layers after the main LLM backbone for audio generation.
            - dual_ffn:
              For selected layers of the LLM backbone, we replace the text FFN with a dual FFN architecture
              that contains an additional audio FFN. The audio FFN is triggered when the location is marked for audio tokens.
            - dual_ffn_fast_forward:
              We pick a few layers in the LLM backbone to plug in the audio FFN. For the remaining layers,
              the audio hidden states are directly fast-forwarded to the next layer.
              This reduces the computational cost of audio generation.
audio_embed_avg (`bool`, *optional*, defaults to False):
Whether to average the audio embeddings before sending them to the text attention layer.
        audio_ffn_hidden_size (`int`, *optional*, defaults to 4096):
            The hidden size of the audio feedforward network in the dual-path FFN.
        audio_ffn_intermediate_size (`int`, *optional*, defaults to 14336):
            The intermediate size of the audio feedforward network in the dual-path FFN.
        audio_dual_ffn_layers (`List[int]`, *optional*):
            The indices of the LLM backbone layers where the dual FFN (mixture of audio FFN and text FFN) is plugged in.
        audio_decoder_proj_num_layers (`int`, *optional*, defaults to 0):
            The number of layers in the audio decoder projection.
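        encode_whisper_embed (`bool`, *optional*, defaults to True):
            Whether to encode audio inputs with the Whisper encoder and feed the embeddings to the model.
        encode_audio_in_tokens (`bool`, *optional*, defaults to False):
            Whether to encode audio inputs as discrete audio-in tokens.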
        use_delay_pattern (`bool`, *optional*, defaults to False):
            Whether to use the delay pattern in the audio decoder, which offsets each codebook by one step
            relative to the previous one so that all codebooks can be generated in a single autoregressive pass.
        skip_audio_tower (`bool`, *optional*, defaults to False):
            Whether to skip the audio tower (the audio encoder).
        use_audio_out_embed_projector (`bool`, *optional*, defaults to False):
            Whether to use an embedding projector to map audio-out embeddings.
        use_audio_out_self_attention (`bool`, *optional*, defaults to False):
            Whether to use self-attention to aggregate information from audio tokens before sending them to the text attention layer.
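        use_rq_transformer (`bool`, *optional*, defaults to False):
            Whether to use an RQ-Transformer, a second, smaller transformer that predicts the residual
            codebook tokens. Cannot be combined with `use_delay_pattern`.
        rq_transformer_hidden_size (`int`, *optional*):
            The hidden size of the RQ-Transformer. Defaults to the text backbone's hidden size.
        rq_transformer_intermediate_size (`int`, *optional*):
            The intermediate size of the RQ-Transformer. Defaults to the text backbone's intermediate size.
        rq_transformer_num_attention_heads (`int`, *optional*):
            The number of attention heads in the RQ-Transformer. Defaults to `rq_transformer_hidden_size // 128`.
        rq_transformer_num_key_value_heads (`int`, *optional*):
            The number of key-value heads in the RQ-Transformer. Defaults to `rq_transformer_num_attention_heads // 4`.
        rq_transformer_num_hidden_layers (`int`, *optional*, defaults to 3):
            The number of hidden layers in the RQ-Transformer.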
audio_num_codebooks (`int`, *optional*, defaults to 12):
The number of codebooks in RVQGAN.
audio_codebook_size (`int`, *optional*, defaults to 1024):
The size of each codebook in RVQGAN.
        audio_stream_bos_id (`int`, *optional*, defaults to 1024):
            The id of the `bos` token in the audio stream.
        audio_stream_eos_id (`int`, *optional*, defaults to 1025):
            The id of the `eos` token in the audio stream.
audio_bos_token (`str`, *optional*, defaults to "<|audio_bos|>"):
The special `<|audio_bos|>` token. In Higgs-Audio, it is mapped to 128011,
which is the index of `<|reserved_special_token_3|>` in Llama-3.1-8B-Instruct's tokenizer.
audio_eos_token (`str`, *optional*, defaults to "<|audio_eos|>"):
The special `<|audio_eos|>` token. We use 128012 as the default value,
which is the index of `<|reserved_special_token_4|>` in Llama-3.1-8B-Instruct's tokenizer.
audio_out_bos_token (`str`, *optional*, defaults to "<|audio_out_bos|>"):
The special `<|audio_out_bos|>` token. We use 128013 as the default value,
which is the index of `<|reserved_special_token_5|>` in Llama-3.1-8B-Instruct's tokenizer.
        audio_in_token (`str`, *optional*, defaults to "<|AUDIO|>"):
            The special `<|AUDIO|>` token. We use 128015 as the default value,
            which is the index of `<|reserved_special_token_7|>` in Llama-3.1-8B-Instruct's tokenizer.
            This token indicates that the location should be filled in with Whisper features.
audio_out_token (`str`, *optional*, defaults to "<|AUDIO_OUT|>"):
The special `<|AUDIO_OUT|>` token. We use 128016 as the default value,
which is the index of `<|reserved_special_token_8|>` in Llama-3.1-8B-Instruct's tokenizer.
This token indicates that the location should be filled in with audio tokens extracted via audio tokenizer.
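        audio_in_token_idx (`int`, *optional*, defaults to 128015):
            The token id of `<|AUDIO|>` in the tokenizer.
        audio_out_token_idx (`int`, *optional*, defaults to 128016):
            The token id of `<|AUDIO_OUT|>` in the tokenizer.
        pad_token_id (`int`, *optional*, defaults to 128001):
            The id of the padding token.
        audio_out_bos_token_id (`int`, *optional*, defaults to 128013):
            The token id of `<|audio_out_bos|>` in the tokenizer.
        audio_eos_token_id (`int`, *optional*, defaults to 128012):
            The token id of `<|audio_eos|>` in the tokenizer.

    Example (a minimal sketch; the dual-FFN layer indices are illustrative, not released defaults):

    ```python
    >>> config = HiggsAudioConfig(
    ...     audio_adapter_type="dual_ffn",
    ...     audio_dual_ffn_layers=[8, 16, 24],
    ... )
    >>> config.audio_in_token_idx
    128015
    ```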
"""
model_type = "higgs_audio"
is_composition = True
def __init__(
self,
text_config=None,
audio_encoder_config=None,
audio_tokenizer_config=None,
audio_adapter_type="stack",
audio_embed_avg=False,
audio_ffn_hidden_size=4096,
audio_ffn_intermediate_size=14336,
audio_dual_ffn_layers=None,
audio_decoder_proj_num_layers=0,
encode_whisper_embed=True,
encode_audio_in_tokens=False,
use_delay_pattern=False,
skip_audio_tower=False,
use_audio_out_embed_projector=False,
use_audio_out_self_attention=False,
use_rq_transformer=False,
rq_transformer_hidden_size=None,
rq_transformer_intermediate_size=None,
rq_transformer_num_attention_heads=None,
rq_transformer_num_key_value_heads=None,
rq_transformer_num_hidden_layers=3,
audio_num_codebooks=12,
audio_codebook_size=1024,
audio_stream_bos_id=1024,
audio_stream_eos_id=1025,
audio_bos_token="<|audio_bos|>",
audio_eos_token="<|audio_eos|>",
audio_out_bos_token="<|audio_out_bos|>",
audio_in_token="<|AUDIO|>",
audio_out_token="<|AUDIO_OUT|>",
audio_in_token_idx=128015,
audio_out_token_idx=128016,
pad_token_id=128001,
audio_out_bos_token_id=128013,
audio_eos_token_id=128012,
**kwargs,
):
        if isinstance(audio_encoder_config, dict):
            audio_encoder_config.setdefault("model_type", "higgs_audio_encoder")
            audio_encoder_config = CONFIG_MAPPING[audio_encoder_config["model_type"]](**audio_encoder_config)
        elif audio_encoder_config is None:
            audio_encoder_config = HiggsAudioEncoderConfig()

        if isinstance(text_config, dict):
            text_config.setdefault("model_type", "llama")
            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
elif text_config is None:
text_config = CONFIG_MAPPING["llama"]()
assert audio_adapter_type in [
"stack",
"dual_ffn",
"dual_ffn_fast_forward",
], f"Invalid audio adapter type: {audio_adapter_type}"
if audio_adapter_type.startswith("dual_ffn"):
assert audio_dual_ffn_layers is not None, (
"audio_dual_ffn_layers must be specified when using dual_ffn adapter."
)
self.text_config = text_config
self.audio_encoder_config = audio_encoder_config
self.audio_tokenizer_config = audio_tokenizer_config
self.audio_adapter_type = audio_adapter_type
self.audio_embed_avg = audio_embed_avg
self.audio_ffn_hidden_size = audio_ffn_hidden_size
self.audio_ffn_intermediate_size = audio_ffn_intermediate_size
self.audio_dual_ffn_layers = audio_dual_ffn_layers
self.audio_decoder_proj_num_layers = audio_decoder_proj_num_layers
self.encode_whisper_embed = encode_whisper_embed
self.encode_audio_in_tokens = encode_audio_in_tokens
self.use_delay_pattern = use_delay_pattern
self.skip_audio_tower = skip_audio_tower
self.use_audio_out_embed_projector = use_audio_out_embed_projector
self.use_audio_out_self_attention = use_audio_out_self_attention
self.use_rq_transformer = use_rq_transformer
        self.rq_transformer_hidden_size = rq_transformer_hidden_size
        self.rq_transformer_intermediate_size = rq_transformer_intermediate_size
        self.rq_transformer_num_attention_heads = rq_transformer_num_attention_heads
        self.rq_transformer_num_key_value_heads = rq_transformer_num_key_value_heads
        self.rq_transformer_num_hidden_layers = rq_transformer_num_hidden_layers
        if self.use_rq_transformer:
            assert not self.use_delay_pattern, "Delay pattern is not supported when the RQ-Transformer is enabled!"
            # Default the RQ-Transformer dimensions to the text backbone's when they are not specified.
if self.rq_transformer_hidden_size is None:
self.rq_transformer_hidden_size = text_config.hidden_size
assert self.rq_transformer_hidden_size % 128 == 0
if self.rq_transformer_intermediate_size is None:
self.rq_transformer_intermediate_size = text_config.intermediate_size
if self.rq_transformer_num_attention_heads is None:
self.rq_transformer_num_attention_heads = self.rq_transformer_hidden_size // 128
if self.rq_transformer_num_key_value_heads is None:
self.rq_transformer_num_key_value_heads = self.rq_transformer_hidden_size // 128 // 4
assert self.rq_transformer_hidden_size % self.rq_transformer_num_attention_heads == 0
assert self.rq_transformer_hidden_size % self.rq_transformer_num_key_value_heads == 0
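            # Worked example (illustrative): with a Llama-3.1-8B-style backbone
            # (hidden_size=4096), the defaults above give 4096 // 128 = 32 attention
            # heads and 32 // 4 = 8 key-value heads.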
self.audio_num_codebooks = audio_num_codebooks
self.audio_codebook_size = audio_codebook_size
self.audio_bos_token = audio_bos_token
self.audio_eos_token = audio_eos_token
self.audio_out_bos_token = audio_out_bos_token
self.audio_in_token = audio_in_token
self.audio_out_token = audio_out_token
self.audio_in_token_idx = audio_in_token_idx
self.audio_out_token_idx = audio_out_token_idx
self.audio_stream_bos_id = audio_stream_bos_id
self.audio_stream_eos_id = audio_stream_eos_id
self.audio_out_bos_token_id = audio_out_bos_token_id
self.audio_eos_token_id = audio_eos_token_id
super().__init__(**kwargs)
self.pad_token_id = pad_token_id
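

# Usage sketch (illustrative; assumes a recent `transformers` release where
# `PretrainedConfig.to_dict` serializes nested sub-configs):
#
#   config = HiggsAudioConfig()
#   config_dict = config.to_dict()                      # nested text/audio configs become plain dicts
#   restored = HiggsAudioConfig.from_dict(config_dict)  # __init__ re-wraps the nested dicts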