# --------------------------------------------------------
# SenseTime
# Copyright (c) 2025 SenseTime
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import copy

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)


class HiFiGanConfig(PretrainedConfig):
    """Configuration container for a HiFi-GAN style model.

    Every constructor argument is stored verbatim as an attribute of the same
    name; this class performs no validation or derivation. The meaning of each
    value is defined by the code that consumes the configuration.
    """

    def __init__(
        self,
        in_channels=80,
        base_channels=512,
        nb_harmonics=8,
        sampling_rate=24000,
        nsf_alpha=0.1,
        nsf_sigma=0.003,
        nsf_voiced_threshold=10,
        upsample_rates=None,
        upsample_kernel_sizes=None,
        istft_params=None,
        resblock_kernel_sizes=None,
        resblock_dilation_sizes=None,
        source_resblock_kernel_sizes=None,
        source_resblock_dilation_sizes=None,
        lrelu_slope=0.1,
        audio_limit=0.99,
        f0_predictor_config=None,
        **kwargs,
    ):
        """Store the given hyper-parameters on the instance.

        Container-valued arguments default to ``None`` and are replaced by a
        freshly built default, so that separate instances never share (and
        cannot accidentally mutate) the same list or dict object.

        Args:
            in_channels: Defaults to ``80``.
            base_channels: Defaults to ``512``.
            nb_harmonics: Defaults to ``8``.
            sampling_rate: Defaults to ``24000``.
            nsf_alpha: Defaults to ``0.1``.
            nsf_sigma: Defaults to ``0.003``.
            nsf_voiced_threshold: Defaults to ``10``.
            upsample_rates: Defaults to ``[8, 5, 3]``.
            upsample_kernel_sizes: Defaults to ``[16, 11, 7]``.
            istft_params: Defaults to ``{'n_fft': 16, 'hop_len': 4}``.
            resblock_kernel_sizes: Defaults to ``[3, 7, 11]``.
            resblock_dilation_sizes: Defaults to three copies of
                ``[1, 3, 5]``.
            source_resblock_kernel_sizes: Defaults to ``[7, 7, 11]``.
            source_resblock_dilation_sizes: Defaults to three copies of
                ``[1, 3, 5]``.
            lrelu_slope: Defaults to ``0.1``.
            audio_limit: Defaults to ``0.99``.
            f0_predictor_config: Defaults to
                ``{'num_class': 1, 'in_channels': 80, 'cond_channels': 512}``.
            **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
        """
        super().__init__(**kwargs)

        self.in_channels = in_channels
        self.base_channels = base_channels
        self.nb_harmonics = nb_harmonics
        self.sampling_rate = sampling_rate
        self.nsf_alpha = nsf_alpha
        self.nsf_sigma = nsf_sigma
        self.nsf_voiced_threshold = nsf_voiced_threshold

        # Mutable defaults are created per instance rather than in the
        # signature, where a single object would be shared by every call.
        self.upsample_rates = (
            [8, 5, 3] if upsample_rates is None else upsample_rates
        )
        self.upsample_kernel_sizes = (
            [16, 11, 7]
            if upsample_kernel_sizes is None
            else upsample_kernel_sizes
        )
        self.istft_params = (
            {'n_fft': 16, 'hop_len': 4}
            if istft_params is None
            else istft_params
        )
        self.resblock_kernel_sizes = (
            [3, 7, 11]
            if resblock_kernel_sizes is None
            else resblock_kernel_sizes
        )
        self.resblock_dilation_sizes = (
            [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
            if resblock_dilation_sizes is None
            else resblock_dilation_sizes
        )
        self.source_resblock_kernel_sizes = (
            [7, 7, 11]
            if source_resblock_kernel_sizes is None
            else source_resblock_kernel_sizes
        )
        self.source_resblock_dilation_sizes = (
            [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
            if source_resblock_dilation_sizes is None
            else source_resblock_dilation_sizes
        )

        self.lrelu_slope = lrelu_slope
        self.audio_limit = audio_limit

        self.f0_predictor_config = (
            {'num_class': 1, 'in_channels': 80, 'cond_channels': 512}
            if f0_predictor_config is None
            else f0_predictor_config
        )

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default
        [`~PretrainedConfig.to_dict`].

        Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up
            this configuration instance. The result is a deep copy, so
            mutating it does not affect the configuration object.
        """
        # A deep copy of ``__dict__`` already contains every attribute set in
        # ``__init__``; re-assigning the live attributes afterwards would only
        # re-introduce aliasing between the returned dict and this instance.
        return copy.deepcopy(self.__dict__)