# --------------------------------------------------------
# SenseTime
# Copyright (c) 2025 SenseTime
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import copy

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)

# Module-level default sub-configs. Keeping them here (instead of as mutable
# keyword defaults) means each FlowConfig instance receives its own deep copy,
# so mutating one instance's config can never corrupt the defaults shared by
# every subsequently-constructed instance.
_DEFAULT_ENCODER_CONFIG = {
    'output_size': 512,
    'attention_heads': 8,
    'linear_units': 2048,
    'num_blocks': 6,
    'dropout_rate': 0.1,
    'positional_dropout_rate': 0.1,
    'attention_dropout_rate': 0.1,
    'normalize_before': True,
    'input_layer': 'linear',
    'pos_enc_layer_type': 'rel_pos_espnet',
    'selfattention_layer_type': 'rel_selfattn',
    'input_size': 512,
    'use_cnn_module': False,
    'macaron_style': False,
}

_DEFAULT_DECODER_CONFIG = {
    'in_channels': 240,
    'n_spks': 1,
    'spk_emb_dim': 80,
    'cfm_params': {
        'sigma_min': 1e-06,
        'solver': 'euler',
        't_scheduler': 'cosine',
        'training_cfg_rate': 0.2,
        'inference_cfg_rate': 0.7,
        'reg_loss_type': 'l1',
    },
    'estimator_config': {
        'in_channels': 320,
        'out_channels': 80,
        'causal': True,
        'channels': [256],
        'dropout': 0.0,
        'attention_head_dim': 64,
        'n_blocks': 4,
        'num_mid_blocks': 12,
        'num_heads': 8,
        'act_fn': 'gelu',
    },
}


class FlowConfig(PretrainedConfig):
    """Configuration for a token-to-mel flow model.

    Stores the hyper-parameters of an encoder (conformer-style, per
    ``encoder_config``) and a conditional-flow-matching decoder (per
    ``decoder_config``), plus top-level I/O sizes.

    Args:
        input_size (`int`, defaults to 512): Dimension of the input features.
        output_size (`int`, defaults to 80): Dimension of the output
            (mel-spectrogram bins, given the default ``output_type``).
        spk_embed_dim (`int`, defaults to 192): Speaker-embedding dimension.
        output_type (`str`, defaults to `'mel'`): Kind of output produced.
        vocab_size (`int`, defaults to 6561): Size of the speech-token vocabulary.
        input_frame_rate (`int`, defaults to 25): Input token frame rate (Hz).
        only_mask_loss (`bool`, defaults to `True`): Whether the loss is computed
            only on masked positions.
        token_mel_ratio (`int`, defaults to 2): Number of mel frames per token.
        pre_lookahead_len (`int`, defaults to 3): Look-ahead length (tokens).
        encoder_config (`dict`, *optional*): Encoder hyper-parameters; when
            `None`, a deep copy of the module-level defaults is used.
        decoder_config (`dict`, *optional*): Decoder/CFM hyper-parameters; when
            `None`, a deep copy of the module-level defaults is used.
        kwargs: Forwarded to [`PretrainedConfig`].
    """

    def __init__(
        self,
        input_size=512,
        output_size=80,
        spk_embed_dim=192,
        output_type='mel',
        vocab_size=6561,
        input_frame_rate=25,
        only_mask_loss=True,
        token_mel_ratio=2,
        pre_lookahead_len=3,
        encoder_config=None,
        decoder_config=None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # None-sentinel + deepcopy fixes the original mutable-default-argument
        # bug: the dict literals used as defaults were shared across all
        # instances, so in-place edits leaked between configs.
        self.encoder_config = (
            copy.deepcopy(_DEFAULT_ENCODER_CONFIG) if encoder_config is None else encoder_config
        )
        self.decoder_config = (
            copy.deepcopy(_DEFAULT_DECODER_CONFIG) if decoder_config is None else decoder_config
        )
        self.input_size = input_size
        self.output_size = output_size
        self.spk_embed_dim = spk_embed_dim
        self.output_type = output_type
        self.vocab_size = vocab_size
        self.input_frame_rate = input_frame_rate
        self.only_mask_loss = only_mask_loss
        self.token_mel_ratio = token_mel_ratio
        self.pre_lookahead_len = pre_lookahead_len

    def to_dict(self):
        """Serializes this instance to a Python dictionary.

        Overrides the default [`~PretrainedConfig.to_dict`].

        Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up
            this configuration instance.
        """
        # The deep copy already contains every attribute assigned in __init__.
        # The original implementation then re-assigned live references
        # (e.g. ``output['encoder_config'] = self.encoder_config``) back into
        # the copy, which defeated the deepcopy for the nested config dicts:
        # callers mutating the returned dict would mutate the config itself.
        # Returning the pure deep copy keeps the serialization isolated.
        return copy.deepcopy(self.__dict__)