```yaml
model_name: tangled-alpha-0.13-base
model_config:
  name: tangled-alpha-0.13-base
  hf_config: {}
  block_size: 131072
  n_layer: 32
  n_embd: 576
  vocab_size: 65536
  padding_multiple: 512
  padded_vocab_size: 65536
  norm_class_name: RMSNorm
  norm_eps: 1.0e-05
  norm_qk: false
  post_attention_norm: false
  post_mlp_norm: false
  parallel_residual: false
  shared_attention_norm: false
  n_head: 9
  head_size: 64
  n_query_groups: 3
  attn_bias: false
  rope_base: 84000
  rotary_percentage: 1.0
  rope_condense_ratio: 1
  intermediate_size: 1536
  bias: false
  mlp_class_name: LLaMAMLP
  gelu_approximate: none
  n_expert: 0
  n_expert_per_token: 0
  scale_embeddings: false
  lm_head_bias: false
out_dir: ../out/pretrain-base-0
precision: bf16-true
resume: auto
data:
  class_path: litgpt.data.LitData
  init_args:
    data_path: ../base-data-0-0-1073741824-8193-2000/
    seed: 42
    num_workers: 32
train:
  save_interval: 50
  log_interval: 1
  global_batch_size: 512
  micro_batch_size: 2
  lr_warmup_steps: 100
  max_tokens: 12261897783
  max_seq_length: 8193
  tie_embeddings: false
  max_norm: 1.0
  min_lr: 1.0e-05
eval:
  interval: 50
  max_iters: 100
  initial_validation: false
  final_validation: true
  evaluate_example: first
optimizer:
  class_path: sophia_opt.SophiaG
  init_args:
    lr: 0.001
    betas:
    - 0.965
    - 0.99
    rho: 0.04
    weight_decay: 0.1
devices: auto
num_nodes: 1
tokenizer_dir: ../tokenizer
logger_name: wandb
seed: 23
```
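The config above follows LitGPT's `litgpt pretrain` YAML layout, so the run would normally be launched with `litgpt pretrain --config <this file>`, provided the custom `sophia_opt.SophiaG` optimizer module is importable on the training machine. As a quick sanity check of the architecture, the core `model_config` fields can be fed into litgpt's `Config` and `GPT` classes to inspect the resulting parameter count. The sketch below is illustrative only: it is not part of the original run script, it assumes an installed `litgpt` version whose `Config` accepts these field names, and it passes just the core hyperparameters rather than the full config.

```python
# Minimal sketch (not part of the original run script): rebuild the model
# architecture from the core model_config fields above and count parameters.
# Assumes `pip install litgpt`; field names mirror the YAML config.
from litgpt import Config, GPT

config = Config(
    block_size=131072,
    vocab_size=65536,
    padded_vocab_size=65536,
    n_layer=32,
    n_embd=576,
    n_head=9,
    head_size=64,
    n_query_groups=3,        # grouped-query attention: 3 KV groups shared by 9 heads
    norm_class_name="RMSNorm",
    norm_eps=1e-5,
    rope_base=84000,
    rotary_percentage=1.0,
    intermediate_size=1536,
    bias=False,
    parallel_residual=False,
    mlp_class_name="LLaMAMLP",
)

model = GPT(config)
n_params = sum(p.numel() for p in model.parameters())
print(f"layers={config.n_layer}  d_model={config.n_embd}  params={n_params:,}")
```

Note that embeddings are not tied (`tie_embeddings: false`), so the token embedding and the LM head each contribute their own 65536 x 576 weight matrix to that count.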