File size: 13,056 Bytes
3b448ed |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
2025-08-28 22:36:54 - pico-train - INFO - Step 0 -- ๐ Evaluation Results
2025-08-28 22:36:54 - pico-train - INFO - โโโ paloma: inf
2025-08-28 22:36:55 - pico-train - INFO - ==================================================
2025-08-28 22:36:55 - pico-train - INFO - โจ Training Configuration
2025-08-28 22:36:55 - pico-train - INFO - ==================================================
2025-08-28 22:36:55 - pico-train - INFO - โญโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฎ
2025-08-28 22:36:55 - pico-train - INFO - โ checkpointing: โ
2025-08-28 22:36:55 - pico-train - INFO - โ checkpoints_dir: checkpoints โ
2025-08-28 22:36:55 - pico-train - INFO - โ evaluation: โ
2025-08-28 22:36:55 - pico-train - INFO - โ eval_results_dir: eval_results โ
2025-08-28 22:36:55 - pico-train - INFO - โ fabric_checkpoint_dir: fabric_state โ
2025-08-28 22:36:55 - pico-train - INFO - โ fabric_checkpoint_filename: checkpoint.pt โ
2025-08-28 22:36:55 - pico-train - INFO - โ hf_checkpoint: โ
2025-08-28 22:36:55 - pico-train - INFO - โ collection_slug: null โ
2025-08-28 22:36:55 - pico-train - INFO - โ repo_id: ThomasTheMaker/pico-decoder-tiny โ
2025-08-28 22:36:55 - pico-train - INFO - โ learning_dynamics: โ
2025-08-28 22:36:55 - pico-train - INFO - โ batch_size: 1 โ
2025-08-28 22:36:55 - pico-train - INFO - โ eval_data: null โ
2025-08-28 22:36:55 - pico-train - INFO - โ layer_suffixes: โ
2025-08-28 22:36:55 - pico-train - INFO - โ - attention.v_proj โ
2025-08-28 22:36:55 - pico-train - INFO - โ - attention.o_proj โ
2025-08-28 22:36:55 - pico-train - INFO - โ - swiglu.w_2 โ
2025-08-28 22:36:55 - pico-train - INFO - โ sequence_idx: -1 โ
2025-08-28 22:36:55 - pico-train - INFO - โ learning_dynamics_dir: learning_dynamics โ
2025-08-28 22:36:55 - pico-train - INFO - โ logs_dir: logs โ
2025-08-28 22:36:55 - pico-train - INFO - โ run_name: pico-decoder-tiny-dolma29k โ
2025-08-28 22:36:55 - pico-train - INFO - โ runs_dir: runs โ
2025-08-28 22:36:55 - pico-train - INFO - โ save_every_n_steps: 1000 โ
2025-08-28 22:36:55 - pico-train - INFO - โ save_to_hf: true โ
2025-08-28 22:36:55 - pico-train - INFO - โ training: โ
2025-08-28 22:36:55 - pico-train - INFO - โ auto_resume: true โ
2025-08-28 22:36:55 - pico-train - INFO - โ data: โ
2025-08-28 22:36:55 - pico-train - INFO - โ dataloader: โ
2025-08-28 22:36:55 - pico-train - INFO - โ batch_size: 4 โ
2025-08-28 22:36:55 - pico-train - INFO - โ dataset: โ
2025-08-28 22:36:55 - pico-train - INFO - โ name: pico-lm/pretokenized-dolma โ
2025-08-28 22:36:55 - pico-train - INFO - โ tokenizer: โ
2025-08-28 22:36:55 - pico-train - INFO - โ name: allenai/OLMo-7B-0724-hf โ
2025-08-28 22:36:55 - pico-train - INFO - โ vocab_size: 50304 โ
2025-08-28 22:36:55 - pico-train - INFO - โ evaluation: โ
2025-08-28 22:36:55 - pico-train - INFO - โ metrics: โ
2025-08-28 22:36:55 - pico-train - INFO - โ - paloma โ
2025-08-28 22:36:55 - pico-train - INFO - โ paloma: โ
2025-08-28 22:36:55 - pico-train - INFO - โ batch_size: 1 โ
2025-08-28 22:36:55 - pico-train - INFO - โ dataset_name: pico-lm/pretokenized-paloma-tinsy โ
2025-08-28 22:36:55 - pico-train - INFO - โ dataset_split: val โ
2025-08-28 22:36:55 - pico-train - INFO - โ max_length: 2048 โ
2025-08-28 22:36:55 - pico-train - INFO - โ model: โ
2025-08-28 22:36:55 - pico-train - INFO - โ activation_hidden_dim: 384 โ
2025-08-28 22:36:55 - pico-train - INFO - โ attention_n_heads: 12 โ
2025-08-28 22:36:55 - pico-train - INFO - โ attention_n_kv_heads: 4 โ
2025-08-28 22:36:55 - pico-train - INFO - โ batch_size: 1024 โ
2025-08-28 22:36:55 - pico-train - INFO - โ d_model: 96 โ
2025-08-28 22:36:55 - pico-train - INFO - โ max_seq_len: 2048 โ
2025-08-28 22:36:55 - pico-train - INFO - โ model_type: pico_decoder โ
2025-08-28 22:36:55 - pico-train - INFO - โ n_layers: 12 โ
2025-08-28 22:36:55 - pico-train - INFO - โ norm_eps: 1.0e-06 โ
2025-08-28 22:36:55 - pico-train - INFO - โ position_emb_theta: 10000.0 โ
2025-08-28 22:36:55 - pico-train - INFO - โ vocab_size: 50304 โ
2025-08-28 22:36:55 - pico-train - INFO - โ monitoring: โ
2025-08-28 22:36:55 - pico-train - INFO - โ logging: โ
2025-08-28 22:36:55 - pico-train - INFO - โ log_every_n_steps: 100 โ
2025-08-28 22:36:55 - pico-train - INFO - โ log_level: INFO โ
2025-08-28 22:36:55 - pico-train - INFO - โ save_to_wandb: false โ
2025-08-28 22:36:55 - pico-train - INFO - โ wandb: โ
2025-08-28 22:36:55 - pico-train - INFO - โ entity: boymyc โ
2025-08-28 22:36:55 - pico-train - INFO - โ project: pico-decoder-tiny โ
2025-08-28 22:36:55 - pico-train - INFO - โ training: โ
2025-08-28 22:36:55 - pico-train - INFO - โ fabric: โ
2025-08-28 22:36:55 - pico-train - INFO - โ accelerator: cuda โ
2025-08-28 22:36:55 - pico-train - INFO - โ num_devices: 1 โ
2025-08-28 22:36:55 - pico-train - INFO - โ num_nodes: 1 โ
2025-08-28 22:36:55 - pico-train - INFO - โ precision: bf16-mixed โ
2025-08-28 22:36:55 - pico-train - INFO - โ max_steps: 200000 โ
2025-08-28 22:36:55 - pico-train - INFO - โ optimization: โ
2025-08-28 22:36:55 - pico-train - INFO - โ gradient_accumulation_steps: 4 โ
2025-08-28 22:36:55 - pico-train - INFO - โ lr: 0.0003 โ
2025-08-28 22:36:55 - pico-train - INFO - โ lr_scheduler: linear_with_warmup โ
2025-08-28 22:36:55 - pico-train - INFO - โ lr_warmup_steps: 2500 โ
2025-08-28 22:36:55 - pico-train - INFO - โ optimizer: adamw โ
2025-08-28 22:36:55 - pico-train - INFO - โ โ
2025-08-28 22:36:55 - pico-train - INFO - โฐโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฏ
2025-08-28 22:36:55 - pico-train - INFO - ==================================================
2025-08-28 22:36:55 - pico-train - INFO - โญ Runtime Summary:
2025-08-28 22:36:55 - pico-train - INFO - ==================================================
2025-08-28 22:36:55 - pico-train - INFO - Starting from step: 0
2025-08-28 22:36:55 - pico-train - INFO - Model Setup:
2025-08-28 22:36:55 - pico-train - INFO - โโ Total Parameters: 11,282,784
2025-08-28 22:36:55 - pico-train - INFO - โโ Trainable Parameters: 11,282,784
2025-08-28 22:36:55 - pico-train - INFO - Distributed Setup:
2025-08-28 22:36:55 - pico-train - INFO - โโ Number of Devices: 1
2025-08-28 22:36:55 - pico-train - INFO - โโ Device Type: NVIDIA GeForce RTX 5090
2025-08-28 22:36:55 - pico-train - INFO - โโ Available Memory: 33.68 GB
2025-08-28 22:36:55 - pico-train - INFO - Software Setup:
2025-08-28 22:36:55 - pico-train - INFO - โโ Python Version: 3.10.12
2025-08-28 22:36:55 - pico-train - INFO - โโ PyTorch Version: 2.8.0+cu128
2025-08-28 22:36:55 - pico-train - INFO - โโ CUDA Version: 12.8
2025-08-28 22:36:55 - pico-train - INFO - โโ Operating System: Linux 6.8.0-63-generic
2025-08-28 22:36:55 - pico-train - INFO - Batch Size Configuration:
2025-08-28 22:36:55 - pico-train - INFO - โโ Global Batch Size: 4
2025-08-28 22:36:55 - pico-train - INFO - โโ Per Device Batch Size: 1
2025-08-28 22:36:55 - pico-train - INFO - โโ Gradient Accumulation Steps: 4
2025-08-28 22:36:55 - pico-train - INFO - ==================================================
2025-08-28 22:36:56 - pico-train - INFO - Step 0 -- ๐ Training Metrics
2025-08-28 22:36:56 - pico-train - INFO - โโโ Loss: 10.9975
2025-08-28 22:36:56 - pico-train - INFO - โโโ Learning Rate: 0.00e+00
2025-08-28 22:36:56 - pico-train - INFO - โโโ Inf/NaN count: 0
2025-08-28 22:36:56 - pico-train - INFO - Step 0 -- ๐ Saving Learning Dynamics
2025-08-28 22:37:50 - pico-train - INFO - Step 100 -- ๐ Training Metrics
2025-08-28 22:37:50 - pico-train - INFO - โโโ Loss: 10.9763
2025-08-28 22:37:50 - pico-train - INFO - โโโ Learning Rate: 1.20e-05
2025-08-28 22:37:50 - pico-train - INFO - โโโ Inf/NaN count: 0
2025-08-28 22:38:41 - pico-train - INFO - Step 200 -- ๐ Training Metrics
2025-08-28 22:38:41 - pico-train - INFO - โโโ Loss: 10.7900
2025-08-28 22:38:41 - pico-train - INFO - โโโ Learning Rate: 2.40e-05
2025-08-28 22:38:41 - pico-train - INFO - โโโ Inf/NaN count: 0
2025-08-28 22:39:32 - pico-train - INFO - Step 300 -- ๐ Training Metrics
2025-08-28 22:39:32 - pico-train - INFO - โโโ Loss: 10.2971
2025-08-28 22:39:32 - pico-train - INFO - โโโ Learning Rate: 3.60e-05
2025-08-28 22:39:32 - pico-train - INFO - โโโ Inf/NaN count: 0
2025-08-28 22:40:22 - pico-train - INFO - Step 400 -- ๐ Training Metrics
2025-08-28 22:40:22 - pico-train - INFO - โโโ Loss: 9.8307
2025-08-28 22:40:22 - pico-train - INFO - โโโ Learning Rate: 4.80e-05
2025-08-28 22:40:22 - pico-train - INFO - โโโ Inf/NaN count: 0
2025-08-28 22:41:14 - pico-train - INFO - Step 500 -- ๐ Training Metrics
2025-08-28 22:41:14 - pico-train - INFO - โโโ Loss: 9.3733
2025-08-28 22:41:14 - pico-train - INFO - โโโ Learning Rate: 6.00e-05
2025-08-28 22:41:14 - pico-train - INFO - โโโ Inf/NaN count: 0
2025-08-28 22:42:05 - pico-train - INFO - Step 600 -- ๐ Training Metrics
2025-08-28 22:42:05 - pico-train - INFO - โโโ Loss: 8.8910
2025-08-28 22:42:05 - pico-train - INFO - โโโ Learning Rate: 7.20e-05
2025-08-28 22:42:05 - pico-train - INFO - โโโ Inf/NaN count: 0
2025-08-28 22:42:56 - pico-train - INFO - Step 700 -- ๐ Training Metrics
2025-08-28 22:42:56 - pico-train - INFO - โโโ Loss: 8.4162
2025-08-28 22:42:56 - pico-train - INFO - โโโ Learning Rate: 8.40e-05
2025-08-28 22:42:56 - pico-train - INFO - โโโ Inf/NaN count: 0
2025-08-28 22:43:47 - pico-train - INFO - Step 800 -- ๐ Training Metrics
2025-08-28 22:43:47 - pico-train - INFO - โโโ Loss: 8.0678
2025-08-28 22:43:47 - pico-train - INFO - โโโ Learning Rate: 9.60e-05
2025-08-28 22:43:47 - pico-train - INFO - โโโ Inf/NaN count: 0
2025-08-28 22:44:38 - pico-train - INFO - Step 900 -- ๐ Training Metrics
2025-08-28 22:44:38 - pico-train - INFO - โโโ Loss: 7.8578
2025-08-28 22:44:38 - pico-train - INFO - โโโ Learning Rate: 1.08e-04
2025-08-28 22:44:38 - pico-train - INFO - โโโ Inf/NaN count: 0
2025-08-28 22:45:28 - pico-train - INFO - Step 1000 -- ๐พ Saving Checkpoint
|