## IO
save_data: data
overwrite: True
seed: 1234
report_every: 100
valid_metrics: ["BLEU"]
tensorboard: true
tensorboard_log_dir: tensorboard
### Vocab
src_vocab: en.eole.vocab
tgt_vocab: ja.eole.vocab
src_vocab_size: 20000
tgt_vocab_size: 20000
vocab_size_multiple: 8
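# (vocab sizes are padded up to a multiple of 8 so that the embedding and
# output-projection matrices have fp16 tensor-core-friendly dimensions)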
share_vocab: false
n_sample: 0
data:
    corpus_1:
        path_src: hf://quickmt/quickmt-train.ja-en/en
        path_tgt: hf://quickmt/quickmt-train.ja-en/ja
        path_sco: hf://quickmt/quickmt-train.ja-en/sco
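        # The hf:// paths point at columns of the Hugging Face dataset
        # quickmt/quickmt-train.ja-en; path_sco presumably holds per-pair
        # quality scores for the corpus.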
    valid:
        path_src: valid.en
        path_tgt: valid.ja
transforms: [sentencepiece, filtertoolong]
transforms_configs:
    sentencepiece:
        src_subword_model: "en.spm.model"
        tgt_subword_model: "ja.spm.model"
    filtertoolong:
        src_seq_length: 256
        tgt_seq_length: 256
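    # filtertoolong drops training pairs whose source or target side exceeds
    # 256 subword tokens after SentencePiece tokenization.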
training:
    # Run configuration
    model_path: quickmt-en-ja-eole-model
    keep_checkpoint: 4
    train_steps: 100_000
    save_checkpoint_steps: 5000
    valid_steps: 5000
    # Train on a single GPU
    world_size: 1
    gpu_ranks: [0]
    # Batching
    batch_type: "tokens"
    batch_size: 8000
    valid_batch_size: 4096
    batch_size_multiple: 8
    accum_count: [10]
    accum_steps: [0]
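    # batch_size is counted in tokens, so one optimizer update aggregates
    # roughly batch_size * accum_count = 8000 * 10 = 80,000 tokens.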
    # Optimizer & Compute
    compute_dtype: "fp16"
    optim: "adamw"
    learning_rate: 2.0
    warmup_steps: 4000
    decay_method: "noam"
    adam_beta2: 0.998
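    # Assuming eole follows the standard noam schedule from the original
    # Transformer paper, learning_rate acts as a scale factor rather than an
    # absolute step size:
    #   lr(step) = learning_rate * hidden_size^-0.5
    #              * min(step^-0.5, step * warmup_steps^-1.5)
    # With hidden_size 1024 and 4000 warmup steps, the peak learning rate is
    # roughly 1e-3.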
    # Data loading
    bucket_size: 128000
    num_workers: 4
    prefetch_factor: 32
    # Hyperparams
    dropout_steps: [0]
    dropout: [0.1]
    attention_dropout: [0.1]
    max_grad_norm: 0
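    # (max_grad_norm: 0 disables gradient-norm clipping)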
    label_smoothing: 0.1
    average_decay: 0.0001
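    # average_decay keeps an exponential moving average of the model
    # parameters (decay 1e-4).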
    param_init_method: xavier_uniform
    normalization: "tokens"
model:
    architecture: "transformer"
    share_embeddings: false
    share_decoder_embeddings: false
    hidden_size: 1024
    encoder:
        layers: 8
    decoder:
        layers: 2
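    # A deep-encoder/shallow-decoder layout (8 encoder layers, 2 decoder
    # layers) is a common choice for production MT: inference cost is
    # dominated by the autoregressive decoder, so a shallow decoder speeds up
    # translation with a modest quality cost.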
    heads: 8
    transformer_ff: 4096
    embeddings:
        word_vec_size: 1024
        position_encoding_type: "SinusoidalInterleaved"
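
# Assuming the standard eole CLI, vocab building and training would be run
# with something like `eole build_vocab -config en-ja.yaml` followed by
# `eole train -config en-ja.yaml` (the config filename here is illustrative).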