{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 30, "global_step": 192, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010452961672473868, "grad_norm": 1.415148138999939, "learning_rate": 0.0, "loss": 0.737, "step": 1 }, { "epoch": 0.05226480836236934, "grad_norm": 1.0576598644256592, "learning_rate": 6.666666666666667e-05, "loss": 0.6984, "step": 5 }, { "epoch": 0.10452961672473868, "grad_norm": 0.21559220552444458, "learning_rate": 9.993582535855263e-05, "loss": 0.5093, "step": 10 }, { "epoch": 0.156794425087108, "grad_norm": 0.20353108644485474, "learning_rate": 9.954424340791196e-05, "loss": 0.4536, "step": 15 }, { "epoch": 0.20905923344947736, "grad_norm": 0.3409803807735443, "learning_rate": 9.879951981385578e-05, "loss": 0.4662, "step": 20 }, { "epoch": 0.2613240418118467, "grad_norm": 0.11676070839166641, "learning_rate": 9.770696282000244e-05, "loss": 0.4243, "step": 25 }, { "epoch": 0.313588850174216, "grad_norm": 0.09078264981508255, "learning_rate": 9.627435995817799e-05, "loss": 0.2723, "step": 30 }, { "epoch": 0.313588850174216, "eval_loss": 0.31834542751312256, "eval_runtime": 1748.9904, "eval_samples_per_second": 0.292, "eval_steps_per_second": 0.073, "step": 30 }, { "epoch": 0.36585365853658536, "grad_norm": 0.08448098599910736, "learning_rate": 9.451192254041758e-05, "loss": 0.285, "step": 35 }, { "epoch": 0.4181184668989547, "grad_norm": 0.0991150364279747, "learning_rate": 9.243221287473756e-05, "loss": 0.3197, "step": 40 }, { "epoch": 0.47038327526132406, "grad_norm": 0.17393162846565247, "learning_rate": 9.005005472346924e-05, "loss": 0.3749, "step": 45 }, { "epoch": 0.5226480836236934, "grad_norm": 0.0953838899731636, "learning_rate": 8.738242764239046e-05, "loss": 0.2699, "step": 50 }, { "epoch": 0.5749128919860628, "grad_norm": 0.07958400994539261, "learning_rate": 8.444834595378434e-05, "loss": 0.2421, "step": 55 }, { "epoch": 0.627177700348432, "grad_norm": 0.08708363026380539, "learning_rate": 8.126872321608184e-05, "loss": 0.2645, "step": 60 }, { "epoch": 0.627177700348432, "eval_loss": 0.26565828919410706, "eval_runtime": 1749.0443, "eval_samples_per_second": 0.292, "eval_steps_per_second": 0.073, "step": 60 }, { "epoch": 0.6794425087108014, "grad_norm": 0.10611709207296371, "learning_rate": 7.786622315612183e-05, "loss": 0.3069, "step": 65 }, { "epoch": 0.7317073170731707, "grad_norm": 0.093358114361763, "learning_rate": 7.426509812655406e-05, "loss": 0.3514, "step": 70 }, { "epoch": 0.7839721254355401, "grad_norm": 0.0683087483048439, "learning_rate": 7.049101623982937e-05, "loss": 0.2057, "step": 75 }, { "epoch": 0.8362369337979094, "grad_norm": 0.0792144313454628, "learning_rate": 6.65708784109318e-05, "loss": 0.2101, "step": 80 }, { "epoch": 0.8885017421602788, "grad_norm": 0.09680237621068954, "learning_rate": 6.253262661293604e-05, "loss": 0.2638, "step": 85 }, { "epoch": 0.9407665505226481, "grad_norm": 0.1257801651954651, "learning_rate": 5.840504471210742e-05, "loss": 0.3075, "step": 90 }, { "epoch": 0.9407665505226481, "eval_loss": 0.2471843957901001, "eval_runtime": 1748.4786, "eval_samples_per_second": 0.292, "eval_steps_per_second": 0.073, "step": 90 }, { "epoch": 0.9930313588850174, "grad_norm": 0.10834779590368271, "learning_rate": 5.4217553302152237e-05, "loss": 0.2944, "step": 95 }, { "epoch": 1.0418118466898956, "grad_norm": 0.0633026584982872, "learning_rate": 5e-05, "loss": 0.2095, "step": 100 }, { "epoch": 1.0940766550522647, "grad_norm": 0.07635607570409775, "learning_rate": 4.578244669784777e-05, "loss": 0.2058, "step": 105 }, { "epoch": 1.146341463414634, "grad_norm": 0.0802493542432785, "learning_rate": 4.15949552878926e-05, "loss": 0.2232, "step": 110 }, { "epoch": 1.1986062717770034, "grad_norm": 0.12653642892837524, "learning_rate": 3.746737338706397e-05, "loss": 0.2532, "step": 115 }, { "epoch": 1.2508710801393728, "grad_norm": 0.07813160121440887, "learning_rate": 3.3429121589068215e-05, "loss": 0.2893, "step": 120 }, { "epoch": 1.2508710801393728, "eval_loss": 0.239236518740654, "eval_runtime": 1751.4721, "eval_samples_per_second": 0.291, "eval_steps_per_second": 0.073, "step": 120 }, { "epoch": 1.3031358885017421, "grad_norm": 0.07795720547437668, "learning_rate": 2.950898376017064e-05, "loss": 0.1842, "step": 125 }, { "epoch": 1.3554006968641115, "grad_norm": 0.07542526721954346, "learning_rate": 2.573490187344596e-05, "loss": 0.2031, "step": 130 }, { "epoch": 1.4076655052264808, "grad_norm": 0.10047340393066406, "learning_rate": 2.2133776843878186e-05, "loss": 0.24, "step": 135 }, { "epoch": 1.4599303135888502, "grad_norm": 0.13595731556415558, "learning_rate": 1.873127678391816e-05, "loss": 0.2808, "step": 140 }, { "epoch": 1.5121951219512195, "grad_norm": 0.06210995092988014, "learning_rate": 1.555165404621567e-05, "loss": 0.235, "step": 145 }, { "epoch": 1.5644599303135889, "grad_norm": 0.08401988446712494, "learning_rate": 1.2617572357609564e-05, "loss": 0.1849, "step": 150 }, { "epoch": 1.5644599303135889, "eval_loss": 0.23435795307159424, "eval_runtime": 1753.006, "eval_samples_per_second": 0.291, "eval_steps_per_second": 0.073, "step": 150 }, { "epoch": 1.6167247386759582, "grad_norm": 0.07571443915367126, "learning_rate": 9.949945276530781e-06, "loss": 0.205, "step": 155 }, { "epoch": 1.6689895470383276, "grad_norm": 0.08948186039924622, "learning_rate": 7.5677871252624485e-06, "loss": 0.2501, "step": 160 }, { "epoch": 1.721254355400697, "grad_norm": 0.185760036110878, "learning_rate": 5.488077459582425e-06, "loss": 0.3175, "step": 165 }, { "epoch": 1.773519163763066, "grad_norm": 0.055869363248348236, "learning_rate": 3.7256400418220262e-06, "loss": 0.1723, "step": 170 }, { "epoch": 1.8257839721254356, "grad_norm": 0.0660533756017685, "learning_rate": 2.2930371799975594e-06, "loss": 0.1959, "step": 175 }, { "epoch": 1.8780487804878048, "grad_norm": 0.07585973292589188, "learning_rate": 1.2004801861442371e-06, "loss": 0.2145, "step": 180 }, { "epoch": 1.8780487804878048, "eval_loss": 0.23282098770141602, "eval_runtime": 1752.5559, "eval_samples_per_second": 0.291, "eval_steps_per_second": 0.073, "step": 180 }, { "epoch": 1.9303135888501743, "grad_norm": 0.10959440469741821, "learning_rate": 4.55756592088058e-07, "loss": 0.2618, "step": 185 }, { "epoch": 1.9825783972125435, "grad_norm": 0.07039328664541245, "learning_rate": 6.417464144736208e-08, "loss": 0.2559, "step": 190 } ], "logging_steps": 5, "max_steps": 192, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 30, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2340088273817969e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }