# This is an example that demonstrates how to configure a model file.
# You can modify the configuration according to your own requirements.

# to print the register_table:
# from funasr.register import tables
# tables.print()

# NOTE(review): indentation was reconstructed from a flattened scrape (trailing
# "| |" table artifacts removed); nesting of modalities/audio/decoder inferred
# from the bare mapping headers — verify against the upstream config.

# network architecture
model: Emotion2vec
model_conf:
  loss_beta: 0.0
  loss_scale: null
  depth: 8
  start_drop_path_rate: 0.0
  end_drop_path_rate: 0.0
  num_heads: 12
  norm_eps: 1e-05
  norm_affine: true
  encoder_dropout: 0.1
  post_mlp_drop: 0.1
  attention_dropout: 0.1
  activation_dropout: 0.0
  dropout_input: 0.0
  layerdrop: 0.05
  embed_dim: 768
  mlp_ratio: 4.0
  layer_norm_first: false
  average_top_k_layers: 8
  end_of_block_targets: false
  clone_batch: 8
  layer_norm_target_layer: false
  batch_norm_target_layer: false
  instance_norm_target_layer: true
  instance_norm_targets: false
  layer_norm_targets: false
  ema_decay: 0.999
  ema_same_dtype: true
  log_norms: true
  ema_end_decay: 0.99999
  ema_anneal_end_step: 20000
  ema_encoder_only: false
  max_update: 100000
  extractor_mode: layer_norm
  shared_decoder: null
  min_target_var: 0.1
  min_pred_var: 0.01
  supported_modality: AUDIO
  mae_init: false
  seed: 1
  skip_ema: false
  cls_loss: 1.0
  recon_loss: 0.0
  d2v_loss: 1.0
  decoder_group: false
  adversarial_training: false
  adversarial_hidden_dim: 128
  adversarial_weight: 0.1
  cls_type: chunk
  normalize: true
  modalities:
    audio:
      type: AUDIO
      prenet_depth: 4
      prenet_layerdrop: 0.05
      prenet_dropout: 0.1
      start_drop_path_rate: 0.0
      end_drop_path_rate: 0.0
      num_extra_tokens: 10
      init_extra_token_zero: true
      mask_noise_std: 0.01
      mask_prob_min: null
      mask_prob: 0.5
      inverse_mask: false
      mask_prob_adjust: 0.05
      keep_masked_pct: 0.0
      mask_length: 5
      add_masks: false
      remove_masks: false
      mask_dropout: 0.0
      encoder_zero_mask: true
      mask_channel_prob: 0.0
      mask_channel_length: 64
      ema_local_encoder: false
      local_grad_mult: 1.0
      use_alibi_encoder: true
      alibi_scale: 1.0
      learned_alibi: false
      alibi_max_pos: null
      learned_alibi_scale: true
      learned_alibi_scale_per_head: true
      learned_alibi_scale_per_layer: false
      num_alibi_heads: 12
      model_depth: 8
      decoder:
        decoder_dim: 384
        decoder_groups: 16
        decoder_kernel: 7
        decoder_layers: 4
        input_dropout: 0.1
        add_positions_masked: false
        add_positions_all: false
        decoder_residual: true
        projection_layers: 1
        projection_ratio: 2.0
      extractor_mode: layer_norm
      feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]'
      conv_pos_width: 95
      conv_pos_groups: 16
      conv_pos_depth: 5
      conv_pos_pre_ln: false