# Repository page header captured along with this file (not configuration data):
#   path:    ml_dart_model/ml_model/config.yaml
#   user:    joeyaintjoking
#   commit:  initial commit (4aea73a)
# Model configuration.
# `names` lists the sub-models in use; each name has its own settings section
# below. Nesting is restored here so that keys shared by several sub-models
# (data_types, pooling_mode, checkpoint_name, normalization) no longer collide
# as duplicate top-level keys.
model:
  names:
    - ft_transformer
    - fusion_mlp
    - hf_text

  # Sub-model that consumes `text` data.
  hf_text:
    # NOTE(review): the `local://` prefix presumably points at a checkpoint
    # stored next to this config — confirm against the loader.
    checkpoint_name: local://hf_text
    gradient_checkpointing: false
    pooling_mode: cls
    data_types:
      - text
    tokenizer_name: hf_auto
    use_fast: true
    max_text_len: 512
    insert_sep: true
    low_cpu_mem_usage: false
    text_segment_num: 2
    stochastic_chunk: false
    text_aug_detect_length: 10
    text_trivial_aug_maxscale: 0.1
    text_train_augment_types: null

  # Fusion MLP; `data_types` is null, i.e. it is not bound to a raw modality
  # here (presumably it combines the other sub-models' outputs — confirm).
  fusion_mlp:
    aux_loss_weight: null
    adapt_in_features: max
    hidden_sizes:
      - 128
    activation: leaky_relu
    dropout: 0.1
    normalization: layer_norm
    data_types: null

  # Sub-model that consumes `numerical` data.
  ft_transformer:
    data_types:
      - numerical
    embedding_arch:
      - linear
    token_dim: 192
    hidden_size: 192
    num_blocks: 3
    attention_num_heads: 8
    attention_dropout: 0.2
    residual_dropout: 0.0
    ffn_dropout: 0.1
    ffn_hidden_size: 192
    ffn_activation: geglu
    head_activation: relu
    normalization: layer_norm
    merge: concat
    requires_all_dtypes: false
    additive_attention: false
    share_qv_weights: false
    pooling_mode: cls
    checkpoint_name: null
# Data preprocessing configuration.
# Nesting is restored so the per-modality sections keep their own
# `missing_value_strategy` / `convert_to_text` values and the `turn_on`
# switches of `mixup` and `templates` stay separate.
data:
  image:
    missing_value_strategy: zero
  text:
    normalize_text: false
  categorical:
    minimum_cat_count: 100
    maximum_num_cat: 20
    convert_to_text: false
    convert_to_text_template: latex
  numerical:
    convert_to_text: false
    scaler_with_mean: true
    scaler_with_std: true
  document:
    missing_value_strategy: zero
  label:
    numerical_preprocessing: standardscaler

  # NOTE(review): the three scalar keys `pos_label`,
  # `column_features_pooling_mode` and `modality_dropout` are placed directly
  # under `data` (siblings of the sections) — verify against the consumer's
  # schema.
  pos_label: null
  column_features_pooling_mode: concat

  # Mixup / cutmix augmentation; disabled in this file (turn_on: false).
  mixup:
    turn_on: false
    mixup_alpha: 0.8
    cutmix_alpha: 1.0
    cutmix_minmax: null
    prob: 1.0
    switch_prob: 0.5
    mode: batch
    turn_off_epoch: 5
    label_smoothing: 0.1

  modality_dropout: 0

  # Prompt templates; disabled in this file (turn_on: false).
  templates:
    turn_on: false
    num_templates: 30
    template_length: 2048
    preset_templates:
      - super_glue
      - rte
    custom_templates: null
# Optimization / training-loop configuration.
# Nesting is restored so the sub-sections (`lora`, `focal_loss`,
# `mask2former_loss`, `lemda`) keep their own `alpha`, `lr`, `optim_type`,
# `weight_decay` and `turn_on` values instead of overwriting the top-level
# optimizer settings as duplicate keys.
optim:
  optim_type: adamw
  lr: 0.0001
  weight_decay: 0.001
  lr_choice: layerwise_decay
  lr_decay: 0.9
  lr_schedule: cosine_decay
  max_epochs: 20
  # NOTE(review): -1 presumably means "no step limit" — confirm.
  max_steps: -1
  # NOTE(review): fractional value; presumably a ratio of total steps rather
  # than an absolute step count — confirm.
  warmup_steps: 0.1
  end_lr: 0
  lr_mult: 1
  patience: 10
  val_check_interval: 0.5
  check_val_every_n_epoch: 1
  skip_final_val: false
  gradient_clip_val: 1
  gradient_clip_algorithm: norm
  track_grad_norm: -1
  log_every_n_steps: 10
  label_smoothing: 0
  top_k: 3
  top_k_average_method: greedy_soup

  # `peft` is null in this file; the `lora` settings below are still recorded.
  peft: null
  lora:
    module_filter: null
    # Module-name patterns; the anchored entries are quoted so they are
    # unambiguously read as literal strings.
    filter:
      - query
      - value
      - '^q$'
      - '^v$'
      - '^k$'
      - '^o$'
    r: 8
    alpha: 8
    conv_lora_expert_num: 8

  loss_func: auto
  focal_loss:
    alpha: null
    gamma: 2.0
    reduction: mean
  mask2former_loss:
    loss_cross_entropy_weight: 10.0
    loss_mask_weight: 5.0
    loss_dice_weight: 5.0

  extra_trainable_params: []
  cross_modal_align: null
  cross_modal_align_weight: 0
  automatic_optimization: true

  # LeMDA settings; disabled in this file (turn_on: false). This section
  # carries its own optimizer keys (lr, optim_type, weight_decay), distinct
  # from the top-level ones above.
  lemda:
    turn_on: false
    arch_type: mlp_vae
    z_dim: 8
    num_layers: 6
    kld_weight: 0.1
    mse_weight: 0.1
    adv_weight: 0.0001
    consist_weight: 0.01
    consist_threshold: 0.5
    lr: 0.0001
    optim_type: adamw
    weight_decay: 1.0e-05
# Runtime environment configuration.
# Nesting is restored so `compile.turn_on` / `compile.mode` stay scoped to the
# `compile` sub-section.
env:
  num_gpus: 2
  num_nodes: 1
  # NOTE(review): per_gpu_batch_size (8) x num_gpus (2) = 16, which is below
  # batch_size (128); the difference is presumably covered by gradient
  # accumulation — confirm against the trainer.
  batch_size: 128
  per_gpu_batch_size: 8
  inference_batch_size_ratio: 4
  # Quoted: a string value that starts with digits.
  precision: '16-mixed'
  num_workers: 2
  num_workers_inference: 2
  accelerator: auto
  fast_dev_run: false
  deterministic: false
  auto_select_gpus: true
  strategy: ddp_fork_find_unused_parameters_true
  deepspeed_allgather_size: 1000000000.0
  deepspeed_allreduce_size: 1000000000.0

  # Model compilation options; disabled in this file (turn_on: false).
  compile:
    turn_on: false
    mode: default
    dynamic: true
    backend: inductor