Krish356's picture
Training in progress, step 192, checkpoint
b55dfaf verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 30,
"global_step": 192,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010452961672473868,
"grad_norm": 1.415148138999939,
"learning_rate": 0.0,
"loss": 0.737,
"step": 1
},
{
"epoch": 0.05226480836236934,
"grad_norm": 1.0576598644256592,
"learning_rate": 6.666666666666667e-05,
"loss": 0.6984,
"step": 5
},
{
"epoch": 0.10452961672473868,
"grad_norm": 0.21559220552444458,
"learning_rate": 9.993582535855263e-05,
"loss": 0.5093,
"step": 10
},
{
"epoch": 0.156794425087108,
"grad_norm": 0.20353108644485474,
"learning_rate": 9.954424340791196e-05,
"loss": 0.4536,
"step": 15
},
{
"epoch": 0.20905923344947736,
"grad_norm": 0.3409803807735443,
"learning_rate": 9.879951981385578e-05,
"loss": 0.4662,
"step": 20
},
{
"epoch": 0.2613240418118467,
"grad_norm": 0.11676070839166641,
"learning_rate": 9.770696282000244e-05,
"loss": 0.4243,
"step": 25
},
{
"epoch": 0.313588850174216,
"grad_norm": 0.09078264981508255,
"learning_rate": 9.627435995817799e-05,
"loss": 0.2723,
"step": 30
},
{
"epoch": 0.313588850174216,
"eval_loss": 0.31834542751312256,
"eval_runtime": 1748.9904,
"eval_samples_per_second": 0.292,
"eval_steps_per_second": 0.073,
"step": 30
},
{
"epoch": 0.36585365853658536,
"grad_norm": 0.08448098599910736,
"learning_rate": 9.451192254041758e-05,
"loss": 0.285,
"step": 35
},
{
"epoch": 0.4181184668989547,
"grad_norm": 0.0991150364279747,
"learning_rate": 9.243221287473756e-05,
"loss": 0.3197,
"step": 40
},
{
"epoch": 0.47038327526132406,
"grad_norm": 0.17393162846565247,
"learning_rate": 9.005005472346924e-05,
"loss": 0.3749,
"step": 45
},
{
"epoch": 0.5226480836236934,
"grad_norm": 0.0953838899731636,
"learning_rate": 8.738242764239046e-05,
"loss": 0.2699,
"step": 50
},
{
"epoch": 0.5749128919860628,
"grad_norm": 0.07958400994539261,
"learning_rate": 8.444834595378434e-05,
"loss": 0.2421,
"step": 55
},
{
"epoch": 0.627177700348432,
"grad_norm": 0.08708363026380539,
"learning_rate": 8.126872321608184e-05,
"loss": 0.2645,
"step": 60
},
{
"epoch": 0.627177700348432,
"eval_loss": 0.26565828919410706,
"eval_runtime": 1749.0443,
"eval_samples_per_second": 0.292,
"eval_steps_per_second": 0.073,
"step": 60
},
{
"epoch": 0.6794425087108014,
"grad_norm": 0.10611709207296371,
"learning_rate": 7.786622315612183e-05,
"loss": 0.3069,
"step": 65
},
{
"epoch": 0.7317073170731707,
"grad_norm": 0.093358114361763,
"learning_rate": 7.426509812655406e-05,
"loss": 0.3514,
"step": 70
},
{
"epoch": 0.7839721254355401,
"grad_norm": 0.0683087483048439,
"learning_rate": 7.049101623982937e-05,
"loss": 0.2057,
"step": 75
},
{
"epoch": 0.8362369337979094,
"grad_norm": 0.0792144313454628,
"learning_rate": 6.65708784109318e-05,
"loss": 0.2101,
"step": 80
},
{
"epoch": 0.8885017421602788,
"grad_norm": 0.09680237621068954,
"learning_rate": 6.253262661293604e-05,
"loss": 0.2638,
"step": 85
},
{
"epoch": 0.9407665505226481,
"grad_norm": 0.1257801651954651,
"learning_rate": 5.840504471210742e-05,
"loss": 0.3075,
"step": 90
},
{
"epoch": 0.9407665505226481,
"eval_loss": 0.2471843957901001,
"eval_runtime": 1748.4786,
"eval_samples_per_second": 0.292,
"eval_steps_per_second": 0.073,
"step": 90
},
{
"epoch": 0.9930313588850174,
"grad_norm": 0.10834779590368271,
"learning_rate": 5.4217553302152237e-05,
"loss": 0.2944,
"step": 95
},
{
"epoch": 1.0418118466898956,
"grad_norm": 0.0633026584982872,
"learning_rate": 5e-05,
"loss": 0.2095,
"step": 100
},
{
"epoch": 1.0940766550522647,
"grad_norm": 0.07635607570409775,
"learning_rate": 4.578244669784777e-05,
"loss": 0.2058,
"step": 105
},
{
"epoch": 1.146341463414634,
"grad_norm": 0.0802493542432785,
"learning_rate": 4.15949552878926e-05,
"loss": 0.2232,
"step": 110
},
{
"epoch": 1.1986062717770034,
"grad_norm": 0.12653642892837524,
"learning_rate": 3.746737338706397e-05,
"loss": 0.2532,
"step": 115
},
{
"epoch": 1.2508710801393728,
"grad_norm": 0.07813160121440887,
"learning_rate": 3.3429121589068215e-05,
"loss": 0.2893,
"step": 120
},
{
"epoch": 1.2508710801393728,
"eval_loss": 0.239236518740654,
"eval_runtime": 1751.4721,
"eval_samples_per_second": 0.291,
"eval_steps_per_second": 0.073,
"step": 120
},
{
"epoch": 1.3031358885017421,
"grad_norm": 0.07795720547437668,
"learning_rate": 2.950898376017064e-05,
"loss": 0.1842,
"step": 125
},
{
"epoch": 1.3554006968641115,
"grad_norm": 0.07542526721954346,
"learning_rate": 2.573490187344596e-05,
"loss": 0.2031,
"step": 130
},
{
"epoch": 1.4076655052264808,
"grad_norm": 0.10047340393066406,
"learning_rate": 2.2133776843878186e-05,
"loss": 0.24,
"step": 135
},
{
"epoch": 1.4599303135888502,
"grad_norm": 0.13595731556415558,
"learning_rate": 1.873127678391816e-05,
"loss": 0.2808,
"step": 140
},
{
"epoch": 1.5121951219512195,
"grad_norm": 0.06210995092988014,
"learning_rate": 1.555165404621567e-05,
"loss": 0.235,
"step": 145
},
{
"epoch": 1.5644599303135889,
"grad_norm": 0.08401988446712494,
"learning_rate": 1.2617572357609564e-05,
"loss": 0.1849,
"step": 150
},
{
"epoch": 1.5644599303135889,
"eval_loss": 0.23435795307159424,
"eval_runtime": 1753.006,
"eval_samples_per_second": 0.291,
"eval_steps_per_second": 0.073,
"step": 150
},
{
"epoch": 1.6167247386759582,
"grad_norm": 0.07571443915367126,
"learning_rate": 9.949945276530781e-06,
"loss": 0.205,
"step": 155
},
{
"epoch": 1.6689895470383276,
"grad_norm": 0.08948186039924622,
"learning_rate": 7.5677871252624485e-06,
"loss": 0.2501,
"step": 160
},
{
"epoch": 1.721254355400697,
"grad_norm": 0.185760036110878,
"learning_rate": 5.488077459582425e-06,
"loss": 0.3175,
"step": 165
},
{
"epoch": 1.773519163763066,
"grad_norm": 0.055869363248348236,
"learning_rate": 3.7256400418220262e-06,
"loss": 0.1723,
"step": 170
},
{
"epoch": 1.8257839721254356,
"grad_norm": 0.0660533756017685,
"learning_rate": 2.2930371799975594e-06,
"loss": 0.1959,
"step": 175
},
{
"epoch": 1.8780487804878048,
"grad_norm": 0.07585973292589188,
"learning_rate": 1.2004801861442371e-06,
"loss": 0.2145,
"step": 180
},
{
"epoch": 1.8780487804878048,
"eval_loss": 0.23282098770141602,
"eval_runtime": 1752.5559,
"eval_samples_per_second": 0.291,
"eval_steps_per_second": 0.073,
"step": 180
},
{
"epoch": 1.9303135888501743,
"grad_norm": 0.10959440469741821,
"learning_rate": 4.55756592088058e-07,
"loss": 0.2618,
"step": 185
},
{
"epoch": 1.9825783972125435,
"grad_norm": 0.07039328664541245,
"learning_rate": 6.417464144736208e-08,
"loss": 0.2559,
"step": 190
}
],
"logging_steps": 5,
"max_steps": 192,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 30,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2340088273817969e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}