---
# Training configuration: model selection, data preprocessing, optimisation
# and runtime environment settings.
#
# This file is written in block style on purpose. A flattened form such as
# `model: names: - ft_transformer ...` on one physical line is not valid
# YAML, because a nested mapping may not start on the same line as its
# parent key.
#
# NOTE(review): the nesting below was reconstructed from key order. The key
# set looks like an AutoGluon MultiModal config dump -- confirm against the
# consumer's schema before relying on it.

model:
  # One section per entry in `names` follows below.
  names:
    - ft_transformer
    - fusion_mlp
    - hf_text

  hf_text:
    # Quoted because the value contains `:`.
    checkpoint_name: 'local://hf_text'
    gradient_checkpointing: false
    pooling_mode: cls
    data_types:
      - text
    tokenizer_name: hf_auto
    use_fast: true
    max_text_len: 512
    insert_sep: true
    low_cpu_mem_usage: false
    text_segment_num: 2
    stochastic_chunk: false
    text_aug_detect_length: 10
    text_trivial_aug_maxscale: 0.1
    text_train_augment_types: null

  fusion_mlp:
    aux_loss_weight: null
    adapt_in_features: max
    hidden_sizes:
      - 128
    activation: leaky_relu
    dropout: 0.1
    normalization: layer_norm
    data_types: null

  ft_transformer:
    data_types:
      - numerical
    embedding_arch:
      - linear
    token_dim: 192
    hidden_size: 192
    num_blocks: 3
    attention_num_heads: 8
    attention_dropout: 0.2
    residual_dropout: 0.0
    ffn_dropout: 0.1
    ffn_hidden_size: 192
    ffn_activation: geglu
    head_activation: relu
    normalization: layer_norm
    merge: concat
    requires_all_dtypes: false
    additive_attention: false
    share_qv_weights: false
    pooling_mode: cls
    checkpoint_name: null

data:
  image:
    missing_value_strategy: zero
  text:
    normalize_text: false
  categorical:
    minimum_cat_count: 100
    maximum_num_cat: 20
    convert_to_text: false
    convert_to_text_template: latex
  numerical:
    convert_to_text: false
    scaler_with_mean: true
    scaler_with_std: true
  document:
    missing_value_strategy: zero
  label:
    numerical_preprocessing: standardscaler
  # NOTE(review): `pos_label`, `column_features_pooling_mode` and
  # `modality_dropout` are placed directly under `data`, not under the
  # preceding sub-section -- confirm.
  pos_label: null
  column_features_pooling_mode: concat
  mixup:
    turn_on: false
    mixup_alpha: 0.8
    cutmix_alpha: 1.0
    cutmix_minmax: null
    prob: 1.0
    switch_prob: 0.5
    mode: batch
    turn_off_epoch: 5
    # Separate key from `optim.label_smoothing` below.
    label_smoothing: 0.1
  modality_dropout: 0
  templates:
    turn_on: false
    num_templates: 30
    template_length: 2048
    preset_templates:
      - super_glue
      - rte
    custom_templates: null

optim:
  optim_type: adamw
  lr: 0.0001
  weight_decay: 0.001
  lr_choice: layerwise_decay
  lr_decay: 0.9
  lr_schedule: cosine_decay
  max_epochs: 20
  max_steps: -1
  warmup_steps: 0.1
  end_lr: 0
  lr_mult: 1
  patience: 10
  val_check_interval: 0.5
  check_val_every_n_epoch: 1
  skip_final_val: false
  gradient_clip_val: 1
  gradient_clip_algorithm: norm
  track_grad_norm: -1
  log_every_n_steps: 10
  label_smoothing: 0
  top_k: 3
  top_k_average_method: greedy_soup
  peft: null
  lora:
    module_filter: null
    # Quoted so the `^` / `$` characters are always read as literal text.
    filter:
      - query
      - value
      - '^q$'
      - '^v$'
      - '^k$'
      - '^o$'
    r: 8
    alpha: 8
    conv_lora_expert_num: 8
  loss_func: auto
  focal_loss:
    alpha: null
    gamma: 2.0
    reduction: mean
  mask2former_loss:
    loss_cross_entropy_weight: 10.0
    loss_mask_weight: 5.0
    loss_dice_weight: 5.0
  extra_trainable_params: []
  cross_modal_align: null
  cross_modal_align_weight: 0
  automatic_optimization: true
  lemda:
    turn_on: false
    arch_type: mlp_vae
    z_dim: 8
    num_layers: 6
    kld_weight: 0.1
    mse_weight: 0.1
    adv_weight: 0.0001
    consist_weight: 0.01
    consist_threshold: 0.5
    # These three keys belong to `lemda`: at the `optim` level they would
    # duplicate `lr`, `optim_type` and `weight_decay` defined above.
    lr: 0.0001
    optim_type: adamw
    weight_decay: 1.0e-05

env:
  num_gpus: 2
  num_nodes: 1
  batch_size: 128
  per_gpu_batch_size: 8
  inference_batch_size_ratio: 4
  # Quoted so the value is unambiguously a string, not arithmetic-looking.
  precision: '16-mixed'
  num_workers: 2
  num_workers_inference: 2
  accelerator: auto
  fast_dev_run: false
  deterministic: false
  auto_select_gpus: true
  strategy: ddp_fork_find_unused_parameters_true
  deepspeed_allgather_size: 1000000000.0
  deepspeed_allreduce_size: 1000000000.0
  compile:
    turn_on: false
    mode: default
    dynamic: true
    backend: inductor