```yaml
## IO
save_data: data
overwrite: True
seed: 1234
report_every: 100
valid_metrics: ["BLEU"]
tensorboard: true
tensorboard_log_dir: tensorboard

### Vocab
src_vocab: en.eole.vocab
tgt_vocab: ja.eole.vocab
src_vocab_size: 20000
tgt_vocab_size: 20000
vocab_size_multiple: 8
share_vocab: false
n_sample: 0

data:
    corpus_1:
        path_src: hf://quickmt/quickmt-train.ja-en/en
        path_tgt: hf://quickmt/quickmt-train.ja-en/ja
        path_sco: hf://quickmt/quickmt-train.ja-en/sco
    valid:
        path_src: valid.en
        path_tgt: valid.ja

transforms: [sentencepiece, filtertoolong]
transforms_configs:
    sentencepiece:
        src_subword_model: "en.spm.model"
        tgt_subword_model: "ja.spm.model"
    filtertoolong:
        src_seq_length: 256
        tgt_seq_length: 256

training:
    # Run configuration
    model_path: quickmt-en-ja-eole-model
    keep_checkpoint: 4
    train_steps: 100_000
    save_checkpoint_steps: 5000
    valid_steps: 5000

    # Train on a single GPU
    world_size: 1
    gpu_ranks: [0]

    # Batching
    batch_type: "tokens"
    batch_size: 8000
    valid_batch_size: 4096
    batch_size_multiple: 8
    accum_count: [10]
    accum_steps: [0]

    # Optimizer & Compute
    compute_dtype: "fp16"
    optim: "adamw"
    learning_rate: 2.0
    warmup_steps: 4000
    decay_method: "noam"
    adam_beta2: 0.998

    # Data loading
    bucket_size: 128000
    num_workers: 4
    prefetch_factor: 32

    # Hyperparams
    dropout_steps: [0]
    dropout: [0.1]
    attention_dropout: [0.1]
    max_grad_norm: 0
    label_smoothing: 0.1
    average_decay: 0.0001
    #average_decay: 0
    param_init_method: xavier_uniform
    normalization: "tokens"

model:
    architecture: "transformer"
    share_embeddings: false
    share_decoder_embeddings: false
    hidden_size: 1024
    encoder:
        layers: 8
    decoder:
        layers: 2
    heads: 8
    transformer_ff: 4096
    embeddings:
        word_vec_size: 1024
        position_encoding_type: "SinusoidalInterleaved"
```
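The batching and learning-rate settings above can be sanity-checked with a few lines of Python. The sketch below is a rough approximation, assuming eole's `noam` decay follows the standard formulation from Vaswani et al. (2017), `learning_rate * hidden_size^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)`; eole's exact implementation may scale things slightly differently.

```python
# Back-of-the-envelope check of the schedule and batching implied by the config above.
# Assumes the standard Noam formulation; not necessarily eole's exact implementation.

def noam_lr(step: int, learning_rate: float = 2.0,
            hidden_size: int = 1024, warmup_steps: int = 4000) -> float:
    """Learning rate at a given optimizer step under the Noam schedule."""
    step = max(step, 1)
    return (learning_rate
            * hidden_size ** -0.5
            * min(step ** -0.5, step * warmup_steps ** -1.5))

# Effective tokens per optimizer step: batch_size tokens per forward pass,
# accumulated over accum_count passes, across world_size GPUs.
tokens_per_step = 8000 * 10 * 1  # ~80,000 tokens per update

print(f"tokens per optimizer step: {tokens_per_step:,}")
for step in (1000, 4000, 20000, 100000):
    print(f"lr at step {step:>6}: {noam_lr(step):.6f}")
```

With these values the learning rate ramps up linearly to a peak of roughly 1e-3 at step 4,000, then decays with the inverse square root of the step count, and each optimizer update sees about 80k target tokens.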