|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 8.61333046193604, |
|
"eval_steps": 500, |
|
"global_step": 10000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.008614191881124151, |
|
"grad_norm": 3.921875, |
|
"learning_rate": 1.9982000000000003e-05, |
|
"loss": 1.3429, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.017228383762248303, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 1.9962000000000003e-05, |
|
"loss": 0.7212, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.025842575643372456, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 1.9942e-05, |
|
"loss": 0.6892, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.034456767524496605, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 1.9922e-05, |
|
"loss": 0.6611, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.04307095940562076, |
|
"grad_norm": 2.875, |
|
"learning_rate": 1.9902e-05, |
|
"loss": 0.6514, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.05168515128674491, |
|
"grad_norm": 3.0, |
|
"learning_rate": 1.9882e-05, |
|
"loss": 0.6437, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.060299343167869064, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 1.9862e-05, |
|
"loss": 0.6161, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.06891353504899321, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 1.9842e-05, |
|
"loss": 0.6083, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.07752772693011736, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 1.9822e-05, |
|
"loss": 0.5967, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.08614191881124152, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 1.9802e-05, |
|
"loss": 0.5773, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.09475611069236567, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 1.9782e-05, |
|
"loss": 0.5722, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.10337030257348982, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 1.9762e-05, |
|
"loss": 0.5528, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.11198449445461398, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 1.9742000000000002e-05, |
|
"loss": 0.5952, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.12059868633573813, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 1.9722000000000002e-05, |
|
"loss": 0.5309, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.12921287821686228, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.9702000000000002e-05, |
|
"loss": 0.5353, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.13782707009798642, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.9682000000000002e-05, |
|
"loss": 0.5447, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.1464412619791106, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 1.9662000000000003e-05, |
|
"loss": 0.4998, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.15505545386023473, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 1.9642000000000003e-05, |
|
"loss": 0.5191, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.1636696457413589, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 1.9622e-05, |
|
"loss": 0.5358, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.17228383762248303, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.9602e-05, |
|
"loss": 0.4914, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.1808980295036072, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 1.9582e-05, |
|
"loss": 0.4943, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.18951222138473134, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.9562e-05, |
|
"loss": 0.4731, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.1981264132658555, |
|
"grad_norm": 2.625, |
|
"learning_rate": 1.9542e-05, |
|
"loss": 0.497, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.20674060514697964, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.9522e-05, |
|
"loss": 0.4746, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.2153547970281038, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.9502e-05, |
|
"loss": 0.4763, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.22396898890922795, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.9482e-05, |
|
"loss": 0.4759, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.23258318079035212, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.9462e-05, |
|
"loss": 0.5039, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.24119737267147626, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.9442e-05, |
|
"loss": 0.4799, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.2498115645526004, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 1.9422e-05, |
|
"loss": 0.4446, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.25842575643372456, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 1.9402e-05, |
|
"loss": 0.4727, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.26703994831484873, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 1.9382000000000002e-05, |
|
"loss": 0.4189, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.27565414019597284, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 1.9362000000000002e-05, |
|
"loss": 0.4409, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.284268332077097, |
|
"grad_norm": 3.25, |
|
"learning_rate": 1.9342000000000002e-05, |
|
"loss": 0.4656, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.2928825239582212, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 1.9322000000000002e-05, |
|
"loss": 0.4713, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.30149671583934534, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.9302e-05, |
|
"loss": 0.4282, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.31011090772046945, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 1.9282e-05, |
|
"loss": 0.4565, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.3187250996015936, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 1.9262e-05, |
|
"loss": 0.4346, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.3273392914827178, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 1.9242e-05, |
|
"loss": 0.426, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.33595348336384195, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 1.9222e-05, |
|
"loss": 0.42, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.34456767524496607, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 1.9202e-05, |
|
"loss": 0.4317, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.35318186712609023, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 1.9182e-05, |
|
"loss": 0.4311, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.3617960590072144, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.9162e-05, |
|
"loss": 0.4056, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.3704102508883385, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 1.9142e-05, |
|
"loss": 0.4029, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.3790244427694627, |
|
"grad_norm": 3.125, |
|
"learning_rate": 1.9122e-05, |
|
"loss": 0.4337, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.38763863465058684, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.9102e-05, |
|
"loss": 0.4381, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.396252826531711, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 1.9082e-05, |
|
"loss": 0.4174, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.4048670184128351, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.9062e-05, |
|
"loss": 0.3928, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.4134812102939593, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.9042e-05, |
|
"loss": 0.4051, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.42209540217508346, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.9022000000000002e-05, |
|
"loss": 0.3992, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.4307095940562076, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 1.9002000000000002e-05, |
|
"loss": 0.4194, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.43932378593733173, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 1.8982000000000002e-05, |
|
"loss": 0.3951, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.4479379778184559, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.8962000000000002e-05, |
|
"loss": 0.3918, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.45655216969958007, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 1.8942000000000003e-05, |
|
"loss": 0.3854, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.46516636158070424, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.8922000000000003e-05, |
|
"loss": 0.3836, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.47378055346182835, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 1.8902000000000003e-05, |
|
"loss": 0.3824, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.4823947453429525, |
|
"grad_norm": 2.625, |
|
"learning_rate": 1.8882000000000003e-05, |
|
"loss": 0.3913, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.4910089372240767, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 1.8862000000000003e-05, |
|
"loss": 0.3834, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.4996231291052008, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.8842000000000004e-05, |
|
"loss": 0.3848, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.508237320986325, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 1.8822000000000004e-05, |
|
"loss": 0.3845, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.5168515128674491, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.8802000000000004e-05, |
|
"loss": 0.3836, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.5254657047485732, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.8782e-05, |
|
"loss": 0.3799, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.5340798966296975, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 1.8762e-05, |
|
"loss": 0.3715, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.5426940885108216, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.8742e-05, |
|
"loss": 0.3825, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.5513082803919457, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.8722e-05, |
|
"loss": 0.364, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.5599224722730699, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 1.8702e-05, |
|
"loss": 0.3765, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.568536664154194, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 1.8682000000000002e-05, |
|
"loss": 0.3748, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.5771508560353182, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.8662000000000002e-05, |
|
"loss": 0.3751, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.5857650479164423, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.8642000000000002e-05, |
|
"loss": 0.3778, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.5943792397975665, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 1.8622000000000002e-05, |
|
"loss": 0.3798, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.6029934316786907, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.8602000000000002e-05, |
|
"loss": 0.3682, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.6116076235598148, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.8582000000000003e-05, |
|
"loss": 0.3652, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.6202218154409389, |
|
"grad_norm": 2.25, |
|
"learning_rate": 1.8562000000000003e-05, |
|
"loss": 0.3658, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.6288360073220631, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.8542000000000003e-05, |
|
"loss": 0.389, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.6374501992031872, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 1.8522000000000003e-05, |
|
"loss": 0.375, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.6460643910843114, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.8502000000000003e-05, |
|
"loss": 0.3617, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.6546785829654356, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.8482000000000004e-05, |
|
"loss": 0.3777, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.6632927748465597, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 1.8462000000000004e-05, |
|
"loss": 0.3599, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.6719069667276839, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.8442e-05, |
|
"loss": 0.3495, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.680521158608808, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.8422e-05, |
|
"loss": 0.3461, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.6891353504899321, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.8402e-05, |
|
"loss": 0.3661, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.6977495423710564, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.8382e-05, |
|
"loss": 0.3594, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.7063637342521805, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.8362e-05, |
|
"loss": 0.3512, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.7149779261333046, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.8342e-05, |
|
"loss": 0.3616, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.7235921180144288, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.8322000000000002e-05, |
|
"loss": 0.3575, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.7322063098955529, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 1.8302000000000002e-05, |
|
"loss": 0.3712, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.740820501776677, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.8282000000000002e-05, |
|
"loss": 0.3724, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.7494346936578012, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.8262000000000002e-05, |
|
"loss": 0.3524, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.7580488855389254, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.8242000000000003e-05, |
|
"loss": 0.3543, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.7666630774200496, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.8222000000000003e-05, |
|
"loss": 0.3697, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.7752772693011737, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.8202000000000003e-05, |
|
"loss": 0.3583, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.7838914611822978, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 1.8182000000000003e-05, |
|
"loss": 0.3753, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.792505653063422, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 1.8162000000000003e-05, |
|
"loss": 0.3581, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.8011198449445461, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 1.8142000000000004e-05, |
|
"loss": 0.3534, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.8097340368256702, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 1.8122e-05, |
|
"loss": 0.3654, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.8183482287067945, |
|
"grad_norm": 1.875, |
|
"learning_rate": 1.8102e-05, |
|
"loss": 0.3663, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.8269624205879186, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 1.8082e-05, |
|
"loss": 0.3599, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.8355766124690428, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 1.8062e-05, |
|
"loss": 0.3511, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.8441908043501669, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.8042e-05, |
|
"loss": 0.3615, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.852804996231291, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.8022e-05, |
|
"loss": 0.3523, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.8614191881124152, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.8002e-05, |
|
"loss": 0.3591, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.8700333799935394, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.7982e-05, |
|
"loss": 0.3567, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.8786475718746635, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.7962000000000002e-05, |
|
"loss": 0.3568, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.8872617637557877, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.7942000000000002e-05, |
|
"loss": 0.3492, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.8958759556369118, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.7922000000000002e-05, |
|
"loss": 0.3386, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.9044901475180359, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.7902000000000002e-05, |
|
"loss": 0.3496, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.9131043393991601, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 1.7882000000000003e-05, |
|
"loss": 0.3278, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.9217185312802842, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.7862000000000003e-05, |
|
"loss": 0.3343, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.9303327231614085, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.7842000000000003e-05, |
|
"loss": 0.3389, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.9389469150425326, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 1.7822000000000003e-05, |
|
"loss": 0.351, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.9475611069236567, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.7802e-05, |
|
"loss": 0.3625, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.9561752988047809, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.7782e-05, |
|
"loss": 0.3597, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.964789490685905, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.7762e-05, |
|
"loss": 0.3631, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.9734036825670291, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 1.7742e-05, |
|
"loss": 0.3378, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.9820178744481534, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 1.7722e-05, |
|
"loss": 0.3461, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.9906320663292775, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.7702e-05, |
|
"loss": 0.3691, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.9992462582104016, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.7682e-05, |
|
"loss": 0.3332, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.0077527726930118, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 1.7662e-05, |
|
"loss": 0.2853, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.0163669645741358, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.7642e-05, |
|
"loss": 0.284, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.02498115645526, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.7622000000000002e-05, |
|
"loss": 0.2768, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.0335953483363842, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 1.7602000000000002e-05, |
|
"loss": 0.2734, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.0422095402175084, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 1.7582000000000002e-05, |
|
"loss": 0.2697, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.0508237320986324, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.7562000000000002e-05, |
|
"loss": 0.2799, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.0594379239797567, |
|
"grad_norm": 1.875, |
|
"learning_rate": 1.7542000000000002e-05, |
|
"loss": 0.2766, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.068052115860881, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 1.7522000000000003e-05, |
|
"loss": 0.2742, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.076666307742005, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.7502000000000003e-05, |
|
"loss": 0.2882, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.0852804996231291, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 1.7482e-05, |
|
"loss": 0.2882, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.0938946915042533, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.7462e-05, |
|
"loss": 0.2828, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.1025088833853773, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 1.7442e-05, |
|
"loss": 0.2732, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.1111230752665016, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.7422e-05, |
|
"loss": 0.2798, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.1197372671476258, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.7402e-05, |
|
"loss": 0.293, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.1283514590287498, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.7382e-05, |
|
"loss": 0.2971, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.136965650909874, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.7362e-05, |
|
"loss": 0.2673, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.1455798427909982, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.7342e-05, |
|
"loss": 0.2875, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 1.1541940346721222, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.7322e-05, |
|
"loss": 0.2723, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 1.1628082265532464, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 1.7302e-05, |
|
"loss": 0.3001, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.1714224184343707, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 1.7282e-05, |
|
"loss": 0.2878, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.180036610315495, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 1.7262000000000002e-05, |
|
"loss": 0.2663, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.188650802196619, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 1.7242000000000002e-05, |
|
"loss": 0.3061, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.1972649940777431, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 1.7222000000000002e-05, |
|
"loss": 0.3015, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.2058791859588673, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.7202000000000002e-05, |
|
"loss": 0.2825, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.2144933778399913, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 1.7182000000000003e-05, |
|
"loss": 0.2818, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.2231075697211156, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 1.7162e-05, |
|
"loss": 0.2755, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 1.2317217616022398, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 1.7142e-05, |
|
"loss": 0.2897, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 1.2403359534833638, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 1.7122e-05, |
|
"loss": 0.2816, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.248950145364488, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.7102e-05, |
|
"loss": 0.2798, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.2575643372456122, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.7082e-05, |
|
"loss": 0.2799, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.2661785291267362, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.7062e-05, |
|
"loss": 0.2843, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.2747927210078605, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.7042e-05, |
|
"loss": 0.2876, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.2834069128889847, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.7022e-05, |
|
"loss": 0.2912, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.2920211047701087, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.7002e-05, |
|
"loss": 0.2811, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.300635296651233, |
|
"grad_norm": 1.875, |
|
"learning_rate": 1.6982e-05, |
|
"loss": 0.2944, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.3092494885323571, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 1.6962e-05, |
|
"loss": 0.2796, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.3178636804134811, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.6942e-05, |
|
"loss": 0.2857, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.3264778722946053, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.6922e-05, |
|
"loss": 0.278, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.3350920641757296, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.6902000000000002e-05, |
|
"loss": 0.2711, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.3437062560568536, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 1.6882000000000002e-05, |
|
"loss": 0.2894, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.3523204479379778, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 1.6862000000000002e-05, |
|
"loss": 0.2834, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.360934639819102, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 1.6842e-05, |
|
"loss": 0.2748, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 1.369548831700226, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.6822e-05, |
|
"loss": 0.2904, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 1.3781630235813502, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.6802e-05, |
|
"loss": 0.2816, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.3867772154624745, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 1.6782e-05, |
|
"loss": 0.2993, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 1.3953914073435985, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 1.6762e-05, |
|
"loss": 0.2747, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.4040055992247227, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 1.6742e-05, |
|
"loss": 0.2814, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 1.412619791105847, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 1.6722e-05, |
|
"loss": 0.2777, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 1.421233982986971, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.6702e-05, |
|
"loss": 0.2789, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.4298481748680951, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 1.6682e-05, |
|
"loss": 0.2876, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 1.4384623667492193, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 1.6662e-05, |
|
"loss": 0.2855, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 1.4470765586303436, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.6642e-05, |
|
"loss": 0.2823, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 1.4556907505114676, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 1.6622e-05, |
|
"loss": 0.2759, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 1.4643049423925918, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.6602e-05, |
|
"loss": 0.2804, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.472919134273716, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 1.6582e-05, |
|
"loss": 0.2772, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 1.4815333261548402, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.6562e-05, |
|
"loss": 0.2834, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 1.4901475180359642, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.6542000000000002e-05, |
|
"loss": 0.2788, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 1.4987617099170885, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.6522e-05, |
|
"loss": 0.2796, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.5073759017982127, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.6502e-05, |
|
"loss": 0.2764, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.5159900936793367, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 1.6482000000000002e-05, |
|
"loss": 0.2893, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 1.524604285560461, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.6462000000000003e-05, |
|
"loss": 0.2868, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 1.5332184774415851, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.6442000000000003e-05, |
|
"loss": 0.2801, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 1.5418326693227091, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.6422000000000003e-05, |
|
"loss": 0.2845, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 1.5504468612038333, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 1.6402000000000003e-05, |
|
"loss": 0.2814, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.5590610530849576, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.6382000000000003e-05, |
|
"loss": 0.2707, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 1.5676752449660816, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 1.6362000000000004e-05, |
|
"loss": 0.2809, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 1.5762894368472058, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.6342000000000004e-05, |
|
"loss": 0.2875, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.58490362872833, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.6322e-05, |
|
"loss": 0.292, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 1.593517820609454, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 1.6302e-05, |
|
"loss": 0.2954, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.6021320124905782, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 1.6282e-05, |
|
"loss": 0.2823, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.6107462043717025, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.6262e-05, |
|
"loss": 0.2822, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.6193603962528265, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 1.6242e-05, |
|
"loss": 0.2776, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.6279745881339507, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 1.6222e-05, |
|
"loss": 0.2798, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.636588780015075, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.6202000000000002e-05, |
|
"loss": 0.2873, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.645202971896199, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.6182000000000002e-05, |
|
"loss": 0.2783, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 1.6538171637773231, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 1.6162000000000002e-05, |
|
"loss": 0.2847, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 1.6624313556584474, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.6142000000000002e-05, |
|
"loss": 0.2917, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 1.6710455475395714, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 1.6122000000000003e-05, |
|
"loss": 0.2682, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 1.6796597394206956, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.6102000000000003e-05, |
|
"loss": 0.2837, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.6882739313018198, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.6082000000000003e-05, |
|
"loss": 0.2852, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 1.6968881231829438, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 1.6062000000000003e-05, |
|
"loss": 0.2896, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 1.705502315064068, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.6042000000000003e-05, |
|
"loss": 0.2827, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 1.7141165069451922, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.6022000000000003e-05, |
|
"loss": 0.2725, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 1.7227306988263162, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 1.6002000000000004e-05, |
|
"loss": 0.2835, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.7313448907074405, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 1.5982e-05, |
|
"loss": 0.2779, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 1.7399590825885647, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 1.5962e-05, |
|
"loss": 0.2856, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 1.7485732744696887, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 1.5942e-05, |
|
"loss": 0.2835, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 1.757187466350813, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 1.5922e-05, |
|
"loss": 0.2777, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 1.7658016582319371, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.5902e-05, |
|
"loss": 0.2787, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.7744158501130611, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 1.5882e-05, |
|
"loss": 0.2842, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 1.7830300419941856, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.5862e-05, |
|
"loss": 0.2654, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 1.7916442338753096, |
|
"grad_norm": 1.875, |
|
"learning_rate": 1.5842000000000002e-05, |
|
"loss": 0.2701, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 1.8002584257564336, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.5822000000000002e-05, |
|
"loss": 0.2709, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 1.808872617637558, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 1.5802000000000002e-05, |
|
"loss": 0.2781, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.817486809518682, |
|
"grad_norm": 1.875, |
|
"learning_rate": 1.5782000000000002e-05, |
|
"loss": 0.2823, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 1.826101001399806, |
|
"grad_norm": 1.875, |
|
"learning_rate": 1.5762000000000003e-05, |
|
"loss": 0.2747, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 1.8347151932809305, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 1.5742000000000003e-05, |
|
"loss": 0.2731, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 1.8433293851620545, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 1.5722000000000003e-05, |
|
"loss": 0.2738, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 1.8519435770431787, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.5702000000000003e-05, |
|
"loss": 0.2887, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.860557768924303, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.5682000000000003e-05, |
|
"loss": 0.2841, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 1.869171960805427, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.5662e-05, |
|
"loss": 0.2916, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 1.8777861526865511, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 1.5642e-05, |
|
"loss": 0.2747, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 1.8864003445676754, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 1.5622e-05, |
|
"loss": 0.2769, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 1.8950145364487994, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 1.5602e-05, |
|
"loss": 0.2798, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.9036287283299236, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.5582e-05, |
|
"loss": 0.2791, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 1.9122429202110478, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 1.5562e-05, |
|
"loss": 0.2721, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 1.9208571120921718, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.5542e-05, |
|
"loss": 0.2778, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 1.929471303973296, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.5522e-05, |
|
"loss": 0.2792, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 1.9380854958544202, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.5502e-05, |
|
"loss": 0.2819, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.9466996877355442, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 1.5482000000000002e-05, |
|
"loss": 0.2691, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 1.9553138796166685, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 1.5462000000000002e-05, |
|
"loss": 0.2731, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 1.9639280714977927, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.5442000000000002e-05, |
|
"loss": 0.2709, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 1.9725422633789167, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.5422000000000002e-05, |
|
"loss": 0.288, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 1.981156455260041, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 1.5402000000000003e-05, |
|
"loss": 0.2807, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.9897706471411651, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.5382000000000003e-05, |
|
"loss": 0.2769, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 1.9983848390222891, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 1.5362000000000003e-05, |
|
"loss": 0.2855, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 2.006891353504899, |
|
"grad_norm": 1.875, |
|
"learning_rate": 1.5342e-05, |
|
"loss": 0.2429, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 2.0155055453860236, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.5322e-05, |
|
"loss": 0.2166, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 2.0241197372671476, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.5302e-05, |
|
"loss": 0.2022, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 2.0327339291482716, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.5282e-05, |
|
"loss": 0.1995, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 2.041348121029396, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 1.5262e-05, |
|
"loss": 0.1997, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 2.04996231291052, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.5242e-05, |
|
"loss": 0.2031, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 2.0585765047916444, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.5222000000000001e-05, |
|
"loss": 0.21, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 2.0671906966727684, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 1.5202000000000001e-05, |
|
"loss": 0.2159, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.0758048885538924, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.5182000000000001e-05, |
|
"loss": 0.1967, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 2.084419080435017, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.5162000000000002e-05, |
|
"loss": 0.2168, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 2.093033272316141, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 1.5142000000000002e-05, |
|
"loss": 0.2028, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 2.101647464197265, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.5122000000000002e-05, |
|
"loss": 0.1958, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 2.1102616560783893, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.5102e-05, |
|
"loss": 0.2105, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 2.1188758479595133, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.5082e-05, |
|
"loss": 0.2027, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 2.1274900398406373, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.5062e-05, |
|
"loss": 0.207, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 2.136104231721762, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.5042000000000001e-05, |
|
"loss": 0.2125, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 2.144718423602886, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.5022000000000001e-05, |
|
"loss": 0.2052, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 2.15333261548401, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 1.5002000000000001e-05, |
|
"loss": 0.2057, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.1619468073651342, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.4982000000000002e-05, |
|
"loss": 0.2159, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 2.1705609992462582, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 1.4962000000000002e-05, |
|
"loss": 0.2088, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 2.1791751911273822, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.4942e-05, |
|
"loss": 0.2128, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 2.1877893830085067, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.4922e-05, |
|
"loss": 0.2143, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 2.1964035748896307, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.4902e-05, |
|
"loss": 0.2098, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 2.2050177667707547, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 1.4882e-05, |
|
"loss": 0.2027, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 2.213631958651879, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.4862000000000001e-05, |
|
"loss": 0.2082, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 2.222246150533003, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.4842000000000001e-05, |
|
"loss": 0.2132, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 2.230860342414127, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.4822000000000001e-05, |
|
"loss": 0.2008, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 2.2394745342952516, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 1.4802000000000002e-05, |
|
"loss": 0.207, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.2480887261763756, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.4782e-05, |
|
"loss": 0.2129, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 2.2567029180574996, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.4762e-05, |
|
"loss": 0.2085, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 2.265317109938624, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.4742e-05, |
|
"loss": 0.216, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 2.273931301819748, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.4722e-05, |
|
"loss": 0.2133, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 2.282545493700872, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 1.4702000000000001e-05, |
|
"loss": 0.2032, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 2.2911596855819965, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.4682000000000001e-05, |
|
"loss": 0.2248, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 2.2997738774631205, |
|
"grad_norm": 2.25, |
|
"learning_rate": 1.4662000000000001e-05, |
|
"loss": 0.2056, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 2.3083880693442445, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.4642000000000001e-05, |
|
"loss": 0.2115, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 2.317002261225369, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 1.4622e-05, |
|
"loss": 0.1984, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 2.325616453106493, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.4602e-05, |
|
"loss": 0.2122, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.334230644987617, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.4582e-05, |
|
"loss": 0.2094, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 2.3428448368687413, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.4562e-05, |
|
"loss": 0.2225, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 2.3514590287498653, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.4542e-05, |
|
"loss": 0.2131, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 2.36007322063099, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.4522000000000001e-05, |
|
"loss": 0.2118, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 2.368687412512114, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.4502000000000001e-05, |
|
"loss": 0.2148, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 2.377301604393238, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.4482000000000001e-05, |
|
"loss": 0.214, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 2.385915796274362, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.4462e-05, |
|
"loss": 0.2028, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 2.3945299881554862, |
|
"grad_norm": 2.25, |
|
"learning_rate": 1.4442e-05, |
|
"loss": 0.2073, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 2.4031441800366102, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.4422e-05, |
|
"loss": 0.2145, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 2.4117583719177347, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 1.4402e-05, |
|
"loss": 0.2069, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.4203725637988587, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.4382e-05, |
|
"loss": 0.2216, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 2.4289867556799827, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.4362e-05, |
|
"loss": 0.2117, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 2.4376009475611067, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.4342000000000001e-05, |
|
"loss": 0.219, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 2.446215139442231, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 1.4322000000000001e-05, |
|
"loss": 0.2061, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 2.454829331323355, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.4302e-05, |
|
"loss": 0.2182, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 2.4634435232044796, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 1.4282e-05, |
|
"loss": 0.2135, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 2.4720577150856036, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.4262e-05, |
|
"loss": 0.215, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 2.4806719069667276, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.4242e-05, |
|
"loss": 0.2105, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 2.489286098847852, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.4222e-05, |
|
"loss": 0.2067, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 2.497900290728976, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.4202e-05, |
|
"loss": 0.2145, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.5065144826101, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 1.4182e-05, |
|
"loss": 0.2191, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 2.5151286744912245, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.4162000000000001e-05, |
|
"loss": 0.2068, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 2.5237428663723485, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.4142e-05, |
|
"loss": 0.2025, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 2.5323570582534725, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.4122e-05, |
|
"loss": 0.2089, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 2.540971250134597, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 1.4102e-05, |
|
"loss": 0.2167, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 2.549585442015721, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.4082e-05, |
|
"loss": 0.2049, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 2.558199633896845, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.4062e-05, |
|
"loss": 0.2034, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 2.5668138257779693, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 1.4042e-05, |
|
"loss": 0.2095, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 2.5754280176590933, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 1.4022e-05, |
|
"loss": 0.2047, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 2.5840422095402173, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.4002e-05, |
|
"loss": 0.2055, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.592656401421342, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.3982000000000003e-05, |
|
"loss": 0.211, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 2.601270593302466, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 1.3962000000000003e-05, |
|
"loss": 0.2009, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 2.6098847851835902, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 1.3942000000000001e-05, |
|
"loss": 0.2173, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 2.6184989770647142, |
|
"grad_norm": 2.25, |
|
"learning_rate": 1.3922000000000002e-05, |
|
"loss": 0.2102, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 2.6271131689458382, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.3902000000000002e-05, |
|
"loss": 0.2102, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 2.6357273608269622, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 1.3882000000000002e-05, |
|
"loss": 0.213, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 2.6443415527080867, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.3862000000000002e-05, |
|
"loss": 0.2117, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 2.6529557445892107, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 1.3842000000000002e-05, |
|
"loss": 0.2142, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 2.661569936470335, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.3822000000000003e-05, |
|
"loss": 0.202, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 2.670184128351459, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 1.3802000000000003e-05, |
|
"loss": 0.2133, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.678798320232583, |
|
"grad_norm": 2.25, |
|
"learning_rate": 1.3782000000000001e-05, |
|
"loss": 0.2086, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 2.687412512113707, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.3762000000000001e-05, |
|
"loss": 0.2032, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 2.6960267039948316, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.3742000000000002e-05, |
|
"loss": 0.2157, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 2.7046408958759556, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 1.3722000000000002e-05, |
|
"loss": 0.2139, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 2.71325508775708, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.3702000000000002e-05, |
|
"loss": 0.2177, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 2.721869279638204, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 1.3682000000000002e-05, |
|
"loss": 0.2082, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 2.730483471519328, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.3662000000000002e-05, |
|
"loss": 0.2093, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 2.739097663400452, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.3642000000000003e-05, |
|
"loss": 0.2054, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 2.7477118552815765, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 1.3622000000000003e-05, |
|
"loss": 0.2007, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 2.7563260471627005, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 1.3602000000000001e-05, |
|
"loss": 0.2109, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 2.764940239043825, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.3582000000000001e-05, |
|
"loss": 0.2106, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 2.773554430924949, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.3562000000000002e-05, |
|
"loss": 0.2101, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 2.782168622806073, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.3542000000000002e-05, |
|
"loss": 0.216, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 2.790782814687197, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.3522000000000002e-05, |
|
"loss": 0.2103, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 2.7993970065683214, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.3502000000000002e-05, |
|
"loss": 0.2083, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 2.8080111984494454, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.3482000000000002e-05, |
|
"loss": 0.2124, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 2.81662539033057, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.3462000000000003e-05, |
|
"loss": 0.2114, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 2.825239582211694, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.3442000000000001e-05, |
|
"loss": 0.2122, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 2.833853774092818, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.3422000000000001e-05, |
|
"loss": 0.217, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 2.842467965973942, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 1.3402000000000001e-05, |
|
"loss": 0.2137, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 2.8510821578550662, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.3382000000000002e-05, |
|
"loss": 0.2171, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 2.8596963497361902, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.3362000000000002e-05, |
|
"loss": 0.21, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 2.8683105416173147, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.3342000000000002e-05, |
|
"loss": 0.2083, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 2.8769247334984387, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 1.3322000000000002e-05, |
|
"loss": 0.2178, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 2.8855389253795627, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.3302000000000002e-05, |
|
"loss": 0.2086, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 2.894153117260687, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.3282000000000001e-05, |
|
"loss": 0.2144, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 2.902767309141811, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.3262000000000001e-05, |
|
"loss": 0.2067, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 2.911381501022935, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.3242000000000001e-05, |
|
"loss": 0.2104, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 2.9199956929040596, |
|
"grad_norm": 2.25, |
|
"learning_rate": 1.3222000000000001e-05, |
|
"loss": 0.2135, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 2.9286098847851836, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 1.3202000000000002e-05, |
|
"loss": 0.2095, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 2.9372240766663076, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.3182000000000002e-05, |
|
"loss": 0.2147, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 2.945838268547432, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.3162000000000002e-05, |
|
"loss": 0.2146, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 2.954452460428556, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 1.3142000000000002e-05, |
|
"loss": 0.2137, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 2.9630666523096805, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.3122e-05, |
|
"loss": 0.2111, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 2.9716808441908045, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 1.3102000000000001e-05, |
|
"loss": 0.2194, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 2.9802950360719285, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 1.3082000000000001e-05, |
|
"loss": 0.21, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 2.9889092279530525, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.3062000000000001e-05, |
|
"loss": 0.2004, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 2.997523419834177, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 1.3042000000000002e-05, |
|
"loss": 0.212, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 3.006029934316787, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 1.3022000000000002e-05, |
|
"loss": 0.1743, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 3.014644126197911, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 1.3002000000000002e-05, |
|
"loss": 0.1524, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 3.0232583180790353, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 1.2982000000000002e-05, |
|
"loss": 0.1476, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 3.0318725099601593, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.2962e-05, |
|
"loss": 0.1408, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 3.0404867018412833, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 1.2942e-05, |
|
"loss": 0.1495, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 3.049100893722408, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 1.2922000000000001e-05, |
|
"loss": 0.1457, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 3.057715085603532, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.2902000000000001e-05, |
|
"loss": 0.1483, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 3.066329277484656, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.2882000000000001e-05, |
|
"loss": 0.1451, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 3.0749434693657802, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.2862000000000002e-05, |
|
"loss": 0.1378, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 3.0835576612469042, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.2842000000000002e-05, |
|
"loss": 0.1539, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 3.0921718531280282, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.2822000000000002e-05, |
|
"loss": 0.1503, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 3.1007860450091527, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 1.2802e-05, |
|
"loss": 0.147, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 3.1094002368902767, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.2782e-05, |
|
"loss": 0.1447, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 3.1180144287714007, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.2762e-05, |
|
"loss": 0.1568, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 3.126628620652525, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.2742000000000001e-05, |
|
"loss": 0.1397, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 3.135242812533649, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.2722000000000001e-05, |
|
"loss": 0.1347, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 3.143857004414773, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 1.2702000000000001e-05, |
|
"loss": 0.1487, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 3.1524711962958976, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 1.2682000000000002e-05, |
|
"loss": 0.1458, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 3.1610853881770216, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.2662000000000002e-05, |
|
"loss": 0.139, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 3.169699580058146, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.2642e-05, |
|
"loss": 0.141, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 3.17831377193927, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.2622e-05, |
|
"loss": 0.1355, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 3.186927963820394, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.2602e-05, |
|
"loss": 0.149, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 3.1955421557015184, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.2582e-05, |
|
"loss": 0.1485, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 3.2041563475826425, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.2562000000000001e-05, |
|
"loss": 0.1475, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 3.2127705394637665, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 1.2542000000000001e-05, |
|
"loss": 0.1469, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 3.221384731344891, |
|
"grad_norm": 2.625, |
|
"learning_rate": 1.2522000000000001e-05, |
|
"loss": 0.1448, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 3.229998923226015, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 1.2502000000000002e-05, |
|
"loss": 0.1454, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 3.238613115107139, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.2482e-05, |
|
"loss": 0.1437, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 3.2472273069882633, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.2462e-05, |
|
"loss": 0.1521, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 3.2558414988693873, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.2442e-05, |
|
"loss": 0.1462, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 3.2644556907505113, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 1.2422e-05, |
|
"loss": 0.1528, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 3.273069882631636, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.2402000000000001e-05, |
|
"loss": 0.1438, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 3.28168407451276, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.2382000000000001e-05, |
|
"loss": 0.1481, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 3.290298266393884, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 1.2362000000000001e-05, |
|
"loss": 0.1521, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 3.2989124582750082, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.2342000000000001e-05, |
|
"loss": 0.1453, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 3.3075266501561322, |
|
"grad_norm": 2.75, |
|
"learning_rate": 1.2322e-05, |
|
"loss": 0.1453, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 3.3161408420372562, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 1.2302e-05, |
|
"loss": 0.1512, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 3.3247550339183807, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.2282e-05, |
|
"loss": 0.1484, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 3.3333692257995047, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 1.2262e-05, |
|
"loss": 0.1374, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 3.3419834176806287, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.2242e-05, |
|
"loss": 0.149, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 3.350597609561753, |
|
"grad_norm": 2.75, |
|
"learning_rate": 1.2222000000000001e-05, |
|
"loss": 0.151, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 3.359211801442877, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 1.2202000000000001e-05, |
|
"loss": 0.143, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 3.367825993324001, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 1.2182000000000001e-05, |
|
"loss": 0.1484, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 3.3764401852051256, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.2162e-05, |
|
"loss": 0.1532, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 3.3850543770862496, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.2142e-05, |
|
"loss": 0.1472, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 3.3936685689673736, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.2122e-05, |
|
"loss": 0.1448, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 3.402282760848498, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.2102e-05, |
|
"loss": 0.1548, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 3.410896952729622, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 1.2082e-05, |
|
"loss": 0.1372, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 3.419511144610746, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 1.2062e-05, |
|
"loss": 0.1466, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 3.4281253364918705, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.2042000000000001e-05, |
|
"loss": 0.1406, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 3.4367395283729945, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 1.2022000000000001e-05, |
|
"loss": 0.1516, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 3.4453537202541185, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 1.2002e-05, |
|
"loss": 0.1491, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 3.453967912135243, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 1.1982e-05, |
|
"loss": 0.146, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 3.462582104016367, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.1962e-05, |
|
"loss": 0.1438, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 3.4711962958974913, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 1.1942e-05, |
|
"loss": 0.1372, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 3.4798104877786153, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.1922e-05, |
|
"loss": 0.1501, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 3.4884246796597393, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.1902e-05, |
|
"loss": 0.1515, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 3.4970388715408633, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 1.1882e-05, |
|
"loss": 0.151, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 3.505653063421988, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.1862000000000001e-05, |
|
"loss": 0.1468, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 3.514267255303112, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 1.1842e-05, |
|
"loss": 0.1477, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 3.5228814471842362, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.1822e-05, |
|
"loss": 0.1516, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 3.5314956390653602, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.1802e-05, |
|
"loss": 0.145, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 3.5401098309464842, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 1.1782e-05, |
|
"loss": 0.1434, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 3.5487240228276082, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.1762e-05, |
|
"loss": 0.1424, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 3.5573382147087327, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.1742e-05, |
|
"loss": 0.1433, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 3.5659524065898567, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 1.1722e-05, |
|
"loss": 0.1511, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 3.574566598470981, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 1.1702e-05, |
|
"loss": 0.149, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 3.583180790352105, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 1.1682e-05, |
|
"loss": 0.151, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 3.591794982233229, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 1.1662e-05, |
|
"loss": 0.1412, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 3.600409174114353, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.1642e-05, |
|
"loss": 0.1461, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 3.6090233659954776, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.1622e-05, |
|
"loss": 0.1481, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 3.6176375578766016, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.1602e-05, |
|
"loss": 0.1535, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 3.626251749757726, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.1582e-05, |
|
"loss": 0.1579, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 3.63486594163885, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 1.1562e-05, |
|
"loss": 0.1528, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 3.643480133519974, |
|
"grad_norm": 2.625, |
|
"learning_rate": 1.1542e-05, |
|
"loss": 0.1436, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 3.6520943254010985, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 1.1521999999999999e-05, |
|
"loss": 0.1488, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 3.6607085172822225, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.1502e-05, |
|
"loss": 0.1404, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 3.6693227091633465, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.1482000000000001e-05, |
|
"loss": 0.1584, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 3.677936901044471, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 1.1462000000000001e-05, |
|
"loss": 0.1429, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 3.686551092925595, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 1.1442000000000002e-05, |
|
"loss": 0.1478, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 3.695165284806719, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 1.1422000000000002e-05, |
|
"loss": 0.1495, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 3.7037794766878434, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 1.1402000000000002e-05, |
|
"loss": 0.1487, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 3.7123936685689674, |
|
"grad_norm": 2.75, |
|
"learning_rate": 1.1382000000000002e-05, |
|
"loss": 0.1515, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 3.721007860450092, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.1362000000000002e-05, |
|
"loss": 0.1524, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 3.729622052331216, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 1.1342000000000003e-05, |
|
"loss": 0.1506, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 3.73823624421234, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.1322000000000001e-05, |
|
"loss": 0.1448, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 3.746850436093464, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.1302000000000001e-05, |
|
"loss": 0.1444, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 3.7554646279745882, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.1282000000000001e-05, |
|
"loss": 0.1483, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 3.7640788198557122, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 1.1262000000000002e-05, |
|
"loss": 0.1524, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 3.7726930117368367, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.1242000000000002e-05, |
|
"loss": 0.1553, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 3.7813072036179607, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.1222000000000002e-05, |
|
"loss": 0.156, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 3.7899213954990847, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 1.1202000000000002e-05, |
|
"loss": 0.1543, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 3.7985355873802087, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.1182000000000002e-05, |
|
"loss": 0.1521, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 3.807149779261333, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 1.1162000000000003e-05, |
|
"loss": 0.1515, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 3.815763971142457, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.1142000000000001e-05, |
|
"loss": 0.1527, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 3.8243781630235816, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 1.1122000000000001e-05, |
|
"loss": 0.1543, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 3.8329923549047056, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.1102000000000001e-05, |
|
"loss": 0.1461, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 3.8416065467858296, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 1.1082000000000002e-05, |
|
"loss": 0.1448, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 3.8502207386669536, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.1062000000000002e-05, |
|
"loss": 0.1462, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 3.858834930548078, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 1.1042000000000002e-05, |
|
"loss": 0.146, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 3.867449122429202, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 1.1022000000000002e-05, |
|
"loss": 0.1416, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 3.8760633143103265, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.1002000000000002e-05, |
|
"loss": 0.156, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 3.8846775061914505, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 1.0982000000000001e-05, |
|
"loss": 0.1462, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 3.8932916980725745, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 1.0962000000000001e-05, |
|
"loss": 0.1447, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 3.9019058899536985, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 1.0942000000000001e-05, |
|
"loss": 0.1467, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 3.910520081834823, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.0922000000000001e-05, |
|
"loss": 0.1452, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 3.919134273715947, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.0902000000000002e-05, |
|
"loss": 0.1519, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 3.9277484655970714, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 1.0882000000000002e-05, |
|
"loss": 0.1485, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 3.9363626574781954, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 1.0862000000000002e-05, |
|
"loss": 0.1473, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 3.9449768493593194, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 1.0842000000000002e-05, |
|
"loss": 0.1462, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 3.9535910412404434, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.0822e-05, |
|
"loss": 0.1472, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 3.962205233121568, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.0802000000000001e-05, |
|
"loss": 0.1497, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 3.970819425002692, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.0782000000000001e-05, |
|
"loss": 0.1474, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 3.9794336168838162, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.0762000000000001e-05, |
|
"loss": 0.1477, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 3.9880478087649402, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 1.0742000000000002e-05, |
|
"loss": 0.1508, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 3.9966620006460643, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.0722000000000002e-05, |
|
"loss": 0.1449, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 4.005168515128674, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 1.0702000000000002e-05, |
|
"loss": 0.1182, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 4.013782707009798, |
|
"grad_norm": 2.75, |
|
"learning_rate": 1.0682000000000002e-05, |
|
"loss": 0.1047, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 4.022396898890923, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 1.0662e-05, |
|
"loss": 0.0972, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 4.031011090772047, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 1.0642e-05, |
|
"loss": 0.099, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 4.039625282653171, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.0622000000000001e-05, |
|
"loss": 0.0977, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 4.048239474534295, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.0602000000000001e-05, |
|
"loss": 0.1003, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 4.056853666415419, |
|
"grad_norm": 2.625, |
|
"learning_rate": 1.0582000000000001e-05, |
|
"loss": 0.0964, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 4.065467858296543, |
|
"grad_norm": 2.625, |
|
"learning_rate": 1.0562000000000002e-05, |
|
"loss": 0.0969, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 4.074082050177668, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 1.0542000000000002e-05, |
|
"loss": 0.1009, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 4.082696242058792, |
|
"grad_norm": 2.625, |
|
"learning_rate": 1.0522000000000002e-05, |
|
"loss": 0.1039, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 4.091310433939916, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 1.0502e-05, |
|
"loss": 0.102, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 4.09992462582104, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.0482e-05, |
|
"loss": 0.0998, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 4.108538817702164, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.0462e-05, |
|
"loss": 0.1001, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 4.117153009583289, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.0442000000000001e-05, |
|
"loss": 0.0929, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 4.125767201464413, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 1.0422000000000001e-05, |
|
"loss": 0.0971, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 4.134381393345537, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 1.0402000000000001e-05, |
|
"loss": 0.0997, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 4.142995585226661, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.0382000000000002e-05, |
|
"loss": 0.1048, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 4.151609777107785, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 1.0362000000000002e-05, |
|
"loss": 0.1024, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 4.160223968988909, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 1.0342e-05, |
|
"loss": 0.1015, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 4.168838160870034, |
|
"grad_norm": 2.875, |
|
"learning_rate": 1.0322e-05, |
|
"loss": 0.1015, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 4.177452352751158, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 1.0302e-05, |
|
"loss": 0.0962, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 4.186066544632282, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 1.0282e-05, |
|
"loss": 0.1004, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 4.194680736513406, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 1.0262000000000001e-05, |
|
"loss": 0.0913, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 4.20329492839453, |
|
"grad_norm": 4.0, |
|
"learning_rate": 1.0242000000000001e-05, |
|
"loss": 0.0974, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 4.211909120275654, |
|
"grad_norm": 2.875, |
|
"learning_rate": 1.0222000000000001e-05, |
|
"loss": 0.1002, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 4.220523312156779, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.0202000000000002e-05, |
|
"loss": 0.1032, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 4.229137504037903, |
|
"grad_norm": 3.0, |
|
"learning_rate": 1.0182e-05, |
|
"loss": 0.0937, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 4.237751695919027, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 1.0162e-05, |
|
"loss": 0.0974, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 4.246365887800151, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.0142e-05, |
|
"loss": 0.1001, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 4.254980079681275, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 1.0122e-05, |
|
"loss": 0.1008, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 4.263594271562399, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 1.0102000000000001e-05, |
|
"loss": 0.0994, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 4.272208463443524, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.0082000000000001e-05, |
|
"loss": 0.1013, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 4.280822655324648, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 1.0062000000000001e-05, |
|
"loss": 0.0959, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 4.289436847205772, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 1.0042000000000001e-05, |
|
"loss": 0.0992, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 4.298051039086896, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 1.0022e-05, |
|
"loss": 0.0962, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 4.30666523096802, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 1.0002e-05, |
|
"loss": 0.1062, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 4.315279422849144, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 9.982e-06, |
|
"loss": 0.0995, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 4.3238936147302685, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 9.962e-06, |
|
"loss": 0.1022, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 4.3325078066113925, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 9.942e-06, |
|
"loss": 0.0996, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 4.3411219984925165, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 9.922000000000001e-06, |
|
"loss": 0.0993, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 4.3497361903736405, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 9.902000000000001e-06, |
|
"loss": 0.1029, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 4.3583503822547645, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 9.882000000000001e-06, |
|
"loss": 0.1003, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 4.3669645741358885, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 9.862e-06, |
|
"loss": 0.0979, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 4.375578766017013, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 9.842e-06, |
|
"loss": 0.1003, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 4.384192957898137, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 9.822e-06, |
|
"loss": 0.0981, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 4.392807149779261, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 9.802e-06, |
|
"loss": 0.098, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 4.401421341660385, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 9.782e-06, |
|
"loss": 0.1007, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 4.410035533541509, |
|
"grad_norm": 2.75, |
|
"learning_rate": 9.762e-06, |
|
"loss": 0.0968, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 4.418649725422634, |
|
"grad_norm": 2.75, |
|
"learning_rate": 9.742000000000001e-06, |
|
"loss": 0.1031, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 4.427263917303758, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 9.722000000000001e-06, |
|
"loss": 0.0996, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 4.435878109184882, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 9.702e-06, |
|
"loss": 0.0982, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 4.444492301066006, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 9.682e-06, |
|
"loss": 0.0991, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 4.45310649294713, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 9.662e-06, |
|
"loss": 0.0999, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 4.461720684828254, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 9.642e-06, |
|
"loss": 0.1042, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 4.470334876709378, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 9.622000000000002e-06, |
|
"loss": 0.1016, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 4.478949068590503, |
|
"grad_norm": 2.5, |
|
"learning_rate": 9.602e-06, |
|
"loss": 0.0992, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 4.487563260471627, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 9.582e-06, |
|
"loss": 0.0986, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 4.496177452352751, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 9.562000000000001e-06, |
|
"loss": 0.0995, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 4.504791644233875, |
|
"grad_norm": 3.25, |
|
"learning_rate": 9.542000000000001e-06, |
|
"loss": 0.1039, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 4.513405836114999, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 9.522000000000001e-06, |
|
"loss": 0.1017, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 4.522020027996124, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 9.502000000000002e-06, |
|
"loss": 0.1018, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 4.530634219877248, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 9.482000000000002e-06, |
|
"loss": 0.1021, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 4.539248411758372, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 9.462000000000002e-06, |
|
"loss": 0.1007, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 4.547862603639496, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 9.442e-06, |
|
"loss": 0.0974, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 4.55647679552062, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 9.422e-06, |
|
"loss": 0.0994, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 4.565090987401744, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 9.402e-06, |
|
"loss": 0.1027, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 4.573705179282869, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 9.382000000000001e-06, |
|
"loss": 0.1081, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 4.582319371163993, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 9.362000000000001e-06, |
|
"loss": 0.0977, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 4.590933563045117, |
|
"grad_norm": 2.75, |
|
"learning_rate": 9.342000000000001e-06, |
|
"loss": 0.1057, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 4.599547754926241, |
|
"grad_norm": 3.0, |
|
"learning_rate": 9.322000000000002e-06, |
|
"loss": 0.1008, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 4.608161946807365, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 9.302000000000002e-06, |
|
"loss": 0.102, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 4.616776138688489, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 9.282e-06, |
|
"loss": 0.106, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 4.625390330569614, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 9.262e-06, |
|
"loss": 0.1025, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 4.634004522450738, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 9.242e-06, |
|
"loss": 0.0968, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 4.642618714331862, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 9.222e-06, |
|
"loss": 0.0951, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 4.651232906212986, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 9.202000000000001e-06, |
|
"loss": 0.1027, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 4.65984709809411, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 9.182000000000001e-06, |
|
"loss": 0.1014, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 4.668461289975234, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 9.162000000000001e-06, |
|
"loss": 0.1017, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 4.677075481856359, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 9.142000000000002e-06, |
|
"loss": 0.1019, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 4.685689673737483, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 9.122e-06, |
|
"loss": 0.1034, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 4.694303865618607, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 9.102e-06, |
|
"loss": 0.098, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 4.702918057499731, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 9.082e-06, |
|
"loss": 0.0971, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 4.711532249380855, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 9.062e-06, |
|
"loss": 0.0989, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 4.72014644126198, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 9.042e-06, |
|
"loss": 0.0987, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 4.728760633143104, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 9.022000000000001e-06, |
|
"loss": 0.0983, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 4.737374825024228, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 9.002000000000001e-06, |
|
"loss": 0.102, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 4.745989016905352, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 8.982000000000001e-06, |
|
"loss": 0.1022, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 4.754603208786476, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 8.962e-06, |
|
"loss": 0.0997, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 4.7632174006676, |
|
"grad_norm": 2.625, |
|
"learning_rate": 8.942e-06, |
|
"loss": 0.1032, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 4.771831592548724, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 8.922e-06, |
|
"loss": 0.0995, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 4.7804457844298485, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 8.902e-06, |
|
"loss": 0.1053, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 4.7890599763109725, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 8.882e-06, |
|
"loss": 0.0929, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 4.7976741681920965, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 8.862000000000001e-06, |
|
"loss": 0.1015, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 4.8062883600732205, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 8.842000000000001e-06, |
|
"loss": 0.0987, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 4.8149025519543445, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 8.822000000000001e-06, |
|
"loss": 0.1039, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 4.823516743835469, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 8.802e-06, |
|
"loss": 0.102, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 4.832130935716593, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 8.782e-06, |
|
"loss": 0.1002, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 4.840745127597717, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 8.762e-06, |
|
"loss": 0.1023, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 4.849359319478841, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 8.742e-06, |
|
"loss": 0.1007, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 4.857973511359965, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 8.722e-06, |
|
"loss": 0.1036, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 4.866587703241089, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 8.702e-06, |
|
"loss": 0.1044, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 4.875201895122213, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 8.682000000000001e-06, |
|
"loss": 0.0943, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 4.883816087003338, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 8.662000000000001e-06, |
|
"loss": 0.0982, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 4.892430278884462, |
|
"grad_norm": 3.125, |
|
"learning_rate": 8.642e-06, |
|
"loss": 0.1033, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 4.901044470765586, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 8.622e-06, |
|
"loss": 0.1034, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 4.90965866264671, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 8.602e-06, |
|
"loss": 0.0999, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 4.918272854527834, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 8.582e-06, |
|
"loss": 0.1048, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 4.926887046408959, |
|
"grad_norm": 3.75, |
|
"learning_rate": 8.562e-06, |
|
"loss": 0.0956, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 4.935501238290083, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 8.542e-06, |
|
"loss": 0.1041, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 4.944115430171207, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 8.522e-06, |
|
"loss": 0.1029, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 4.952729622052331, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 8.502000000000001e-06, |
|
"loss": 0.1009, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 4.961343813933455, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 8.482e-06, |
|
"loss": 0.1065, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 4.969958005814579, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 8.462e-06, |
|
"loss": 0.0968, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 4.978572197695704, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 8.442e-06, |
|
"loss": 0.1061, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 4.987186389576828, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 8.422e-06, |
|
"loss": 0.0998, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 4.995800581457952, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 8.402e-06, |
|
"loss": 0.1004, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 5.004307095940562, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 8.382e-06, |
|
"loss": 0.0826, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 5.012921287821686, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 8.362e-06, |
|
"loss": 0.0723, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 5.02153547970281, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 8.342e-06, |
|
"loss": 0.0668, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 5.030149671583935, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 8.322000000000001e-06, |
|
"loss": 0.072, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 5.038763863465059, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 8.302000000000001e-06, |
|
"loss": 0.0673, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 5.047378055346183, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 8.282000000000001e-06, |
|
"loss": 0.0656, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 5.055992247227307, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 8.262000000000002e-06, |
|
"loss": 0.0721, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 5.064606439108431, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 8.242000000000002e-06, |
|
"loss": 0.0737, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 5.073220630989555, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 8.222000000000002e-06, |
|
"loss": 0.068, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 5.08183482287068, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 8.202e-06, |
|
"loss": 0.0682, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 5.090449014751804, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 8.182e-06, |
|
"loss": 0.0677, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 5.099063206632928, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 8.162e-06, |
|
"loss": 0.0686, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 5.107677398514052, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 8.142000000000001e-06, |
|
"loss": 0.0688, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 5.116291590395176, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 8.122000000000001e-06, |
|
"loss": 0.072, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 5.1249057822763, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 8.102000000000001e-06, |
|
"loss": 0.0707, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 5.133519974157425, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 8.082000000000002e-06, |
|
"loss": 0.0733, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 5.142134166038549, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 8.062000000000002e-06, |
|
"loss": 0.0726, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 5.150748357919673, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 8.042e-06, |
|
"loss": 0.069, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 5.159362549800797, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 8.022e-06, |
|
"loss": 0.0699, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 5.167976741681921, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 8.002e-06, |
|
"loss": 0.0704, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 5.176590933563045, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 7.982e-06, |
|
"loss": 0.0676, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 5.18520512544417, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 7.962000000000001e-06, |
|
"loss": 0.0674, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 5.193819317325294, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 7.942000000000001e-06, |
|
"loss": 0.0721, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 5.202433509206418, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 7.922000000000001e-06, |
|
"loss": 0.0727, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 5.211047701087542, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 7.902000000000002e-06, |
|
"loss": 0.071, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 5.219661892968666, |
|
"grad_norm": 3.5, |
|
"learning_rate": 7.882e-06, |
|
"loss": 0.0682, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 5.2282760848497905, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 7.862e-06, |
|
"loss": 0.0708, |
|
"step": 6070 |
|
}, |
|
{ |
|
"epoch": 5.2368902767309145, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 7.842e-06, |
|
"loss": 0.0695, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 5.2455044686120385, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 7.822e-06, |
|
"loss": 0.0731, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 5.2541186604931625, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 7.802000000000001e-06, |
|
"loss": 0.0695, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 5.2627328523742865, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 7.782000000000001e-06, |
|
"loss": 0.0765, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 5.2713470442554105, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 7.762000000000001e-06, |
|
"loss": 0.0702, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 5.279961236136535, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 7.742000000000001e-06, |
|
"loss": 0.0698, |
|
"step": 6130 |
|
}, |
|
{ |
|
"epoch": 5.288575428017659, |
|
"grad_norm": 2.75, |
|
"learning_rate": 7.722e-06, |
|
"loss": 0.0724, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 5.297189619898783, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 7.702e-06, |
|
"loss": 0.0708, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 5.305803811779907, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 7.682e-06, |
|
"loss": 0.0703, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 5.314418003661031, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 7.662e-06, |
|
"loss": 0.0704, |
|
"step": 6170 |
|
}, |
|
{ |
|
"epoch": 5.323032195542155, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 7.642e-06, |
|
"loss": 0.0674, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 5.33164638742328, |
|
"grad_norm": 4.5, |
|
"learning_rate": 7.622000000000001e-06, |
|
"loss": 0.077, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 5.340260579304404, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 7.602e-06, |
|
"loss": 0.0722, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 5.348874771185528, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 7.582e-06, |
|
"loss": 0.0728, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 5.357488963066652, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 7.562000000000001e-06, |
|
"loss": 0.0709, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 5.366103154947776, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 7.542000000000001e-06, |
|
"loss": 0.069, |
|
"step": 6230 |
|
}, |
|
{ |
|
"epoch": 5.3747173468289, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 7.522e-06, |
|
"loss": 0.0717, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 5.383331538710025, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 7.502e-06, |
|
"loss": 0.0677, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 5.391945730591149, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 7.4820000000000005e-06, |
|
"loss": 0.0706, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 5.400559922472273, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 7.462000000000001e-06, |
|
"loss": 0.0753, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 5.409174114353397, |
|
"grad_norm": 3.3125, |
|
"learning_rate": 7.442e-06, |
|
"loss": 0.0698, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 5.417788306234521, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 7.422e-06, |
|
"loss": 0.0733, |
|
"step": 6290 |
|
}, |
|
{ |
|
"epoch": 5.426402498115645, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 7.4020000000000005e-06, |
|
"loss": 0.0672, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 5.43501668999677, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 7.382000000000001e-06, |
|
"loss": 0.0701, |
|
"step": 6310 |
|
}, |
|
{ |
|
"epoch": 5.443630881877894, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 7.362e-06, |
|
"loss": 0.0677, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 5.452245073759018, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 7.342e-06, |
|
"loss": 0.0715, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 5.460859265640142, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 7.322e-06, |
|
"loss": 0.0713, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 5.469473457521266, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 7.3020000000000006e-06, |
|
"loss": 0.0697, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 5.47808764940239, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 7.282e-06, |
|
"loss": 0.0686, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 5.486701841283515, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 7.262e-06, |
|
"loss": 0.0685, |
|
"step": 6370 |
|
}, |
|
{ |
|
"epoch": 5.495316033164639, |
|
"grad_norm": 2.625, |
|
"learning_rate": 7.242e-06, |
|
"loss": 0.0697, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 5.503930225045763, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 7.2220000000000005e-06, |
|
"loss": 0.0684, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 5.512544416926887, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 7.202e-06, |
|
"loss": 0.0674, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 5.521158608808011, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 7.182e-06, |
|
"loss": 0.0728, |
|
"step": 6410 |
|
}, |
|
{ |
|
"epoch": 5.529772800689136, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 7.162e-06, |
|
"loss": 0.0706, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 5.53838699257026, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 7.142e-06, |
|
"loss": 0.0725, |
|
"step": 6430 |
|
}, |
|
{ |
|
"epoch": 5.547001184451384, |
|
"grad_norm": 2.375, |
|
"learning_rate": 7.1220000000000014e-06, |
|
"loss": 0.07, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 5.555615376332508, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 7.102000000000001e-06, |
|
"loss": 0.0682, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 5.564229568213632, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 7.082000000000001e-06, |
|
"loss": 0.0715, |
|
"step": 6460 |
|
}, |
|
{ |
|
"epoch": 5.572843760094756, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 7.062000000000001e-06, |
|
"loss": 0.0715, |
|
"step": 6470 |
|
}, |
|
{ |
|
"epoch": 5.58145795197588, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 7.042000000000001e-06, |
|
"loss": 0.0702, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 5.590072143857005, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 7.022000000000001e-06, |
|
"loss": 0.0729, |
|
"step": 6490 |
|
}, |
|
{ |
|
"epoch": 5.598686335738129, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 7.002000000000001e-06, |
|
"loss": 0.0734, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 5.607300527619253, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 6.982000000000001e-06, |
|
"loss": 0.0785, |
|
"step": 6510 |
|
}, |
|
{ |
|
"epoch": 5.615914719500377, |
|
"grad_norm": 2.875, |
|
"learning_rate": 6.962000000000001e-06, |
|
"loss": 0.0727, |
|
"step": 6520 |
|
}, |
|
{ |
|
"epoch": 5.624528911381501, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 6.942000000000001e-06, |
|
"loss": 0.0703, |
|
"step": 6530 |
|
}, |
|
{ |
|
"epoch": 5.633143103262626, |
|
"grad_norm": 3.453125, |
|
"learning_rate": 6.922000000000001e-06, |
|
"loss": 0.0698, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 5.64175729514375, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 6.902000000000001e-06, |
|
"loss": 0.0667, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 5.650371487024874, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 6.882000000000001e-06, |
|
"loss": 0.0707, |
|
"step": 6560 |
|
}, |
|
{ |
|
"epoch": 5.658985678905998, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 6.8620000000000005e-06, |
|
"loss": 0.0696, |
|
"step": 6570 |
|
}, |
|
{ |
|
"epoch": 5.667599870787122, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 6.842000000000001e-06, |
|
"loss": 0.0726, |
|
"step": 6580 |
|
}, |
|
{ |
|
"epoch": 5.676214062668246, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 6.822000000000001e-06, |
|
"loss": 0.0737, |
|
"step": 6590 |
|
}, |
|
{ |
|
"epoch": 5.6848282545493705, |
|
"grad_norm": 2.875, |
|
"learning_rate": 6.802000000000001e-06, |
|
"loss": 0.0752, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 5.6934424464304945, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 6.7820000000000005e-06, |
|
"loss": 0.0711, |
|
"step": 6610 |
|
}, |
|
{ |
|
"epoch": 5.7020566383116185, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 6.762000000000001e-06, |
|
"loss": 0.0715, |
|
"step": 6620 |
|
}, |
|
{ |
|
"epoch": 5.7106708301927425, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 6.742000000000001e-06, |
|
"loss": 0.0715, |
|
"step": 6630 |
|
}, |
|
{ |
|
"epoch": 5.7192850220738665, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 6.722000000000001e-06, |
|
"loss": 0.0706, |
|
"step": 6640 |
|
}, |
|
{ |
|
"epoch": 5.7278992139549905, |
|
"grad_norm": 2.875, |
|
"learning_rate": 6.702e-06, |
|
"loss": 0.0671, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 5.736513405836115, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 6.6820000000000006e-06, |
|
"loss": 0.0754, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 5.745127597717239, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 6.662000000000001e-06, |
|
"loss": 0.0702, |
|
"step": 6670 |
|
}, |
|
{ |
|
"epoch": 5.753741789598363, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 6.642000000000001e-06, |
|
"loss": 0.071, |
|
"step": 6680 |
|
}, |
|
{ |
|
"epoch": 5.762355981479487, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 6.622e-06, |
|
"loss": 0.0724, |
|
"step": 6690 |
|
}, |
|
{ |
|
"epoch": 5.770970173360611, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 6.6020000000000005e-06, |
|
"loss": 0.0739, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 5.779584365241735, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 6.582000000000001e-06, |
|
"loss": 0.0713, |
|
"step": 6710 |
|
}, |
|
{ |
|
"epoch": 5.78819855712286, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 6.562000000000001e-06, |
|
"loss": 0.0691, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 5.796812749003984, |
|
"grad_norm": 2.75, |
|
"learning_rate": 6.542e-06, |
|
"loss": 0.0713, |
|
"step": 6730 |
|
}, |
|
{ |
|
"epoch": 5.805426940885108, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 6.522e-06, |
|
"loss": 0.0685, |
|
"step": 6740 |
|
}, |
|
{ |
|
"epoch": 5.814041132766232, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 6.502000000000001e-06, |
|
"loss": 0.0728, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 5.822655324647356, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 6.482000000000001e-06, |
|
"loss": 0.0722, |
|
"step": 6760 |
|
}, |
|
{ |
|
"epoch": 5.831269516528481, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 6.462e-06, |
|
"loss": 0.0687, |
|
"step": 6770 |
|
}, |
|
{ |
|
"epoch": 5.839883708409605, |
|
"grad_norm": 3.0, |
|
"learning_rate": 6.442e-06, |
|
"loss": 0.0682, |
|
"step": 6780 |
|
}, |
|
{ |
|
"epoch": 5.848497900290729, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 6.4220000000000005e-06, |
|
"loss": 0.0674, |
|
"step": 6790 |
|
}, |
|
{ |
|
"epoch": 5.857112092171853, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 6.402000000000001e-06, |
|
"loss": 0.0743, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 5.865726284052977, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 6.382e-06, |
|
"loss": 0.071, |
|
"step": 6810 |
|
}, |
|
{ |
|
"epoch": 5.874340475934101, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 6.362e-06, |
|
"loss": 0.0688, |
|
"step": 6820 |
|
}, |
|
{ |
|
"epoch": 5.882954667815225, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 6.3420000000000004e-06, |
|
"loss": 0.0696, |
|
"step": 6830 |
|
}, |
|
{ |
|
"epoch": 5.89156885969635, |
|
"grad_norm": 2.875, |
|
"learning_rate": 6.322000000000001e-06, |
|
"loss": 0.0698, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 5.900183051577474, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 6.302e-06, |
|
"loss": 0.0688, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 5.908797243458598, |
|
"grad_norm": 3.0, |
|
"learning_rate": 6.282e-06, |
|
"loss": 0.0704, |
|
"step": 6860 |
|
}, |
|
{ |
|
"epoch": 5.917411435339722, |
|
"grad_norm": 3.0, |
|
"learning_rate": 6.262e-06, |
|
"loss": 0.0742, |
|
"step": 6870 |
|
}, |
|
{ |
|
"epoch": 5.926025627220846, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 6.2420000000000005e-06, |
|
"loss": 0.0656, |
|
"step": 6880 |
|
}, |
|
{ |
|
"epoch": 5.934639819101971, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 6.222e-06, |
|
"loss": 0.072, |
|
"step": 6890 |
|
}, |
|
{ |
|
"epoch": 5.943254010983095, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 6.202e-06, |
|
"loss": 0.0668, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 5.951868202864219, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 6.182e-06, |
|
"loss": 0.0706, |
|
"step": 6910 |
|
}, |
|
{ |
|
"epoch": 5.960482394745343, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 6.1620000000000005e-06, |
|
"loss": 0.0685, |
|
"step": 6920 |
|
}, |
|
{ |
|
"epoch": 5.969096586626467, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 6.142e-06, |
|
"loss": 0.074, |
|
"step": 6930 |
|
}, |
|
{ |
|
"epoch": 5.977710778507591, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 6.122e-06, |
|
"loss": 0.0713, |
|
"step": 6940 |
|
}, |
|
{ |
|
"epoch": 5.986324970388715, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 6.102e-06, |
|
"loss": 0.0719, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 5.99493916226984, |
|
"grad_norm": 3.0, |
|
"learning_rate": 6.082e-06, |
|
"loss": 0.0704, |
|
"step": 6960 |
|
}, |
|
{ |
|
"epoch": 6.00344567675245, |
|
"grad_norm": 2.0, |
|
"learning_rate": 6.062e-06, |
|
"loss": 0.0591, |
|
"step": 6970 |
|
}, |
|
{ |
|
"epoch": 6.012059868633574, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 6.042e-06, |
|
"loss": 0.0506, |
|
"step": 6980 |
|
}, |
|
{ |
|
"epoch": 6.020674060514698, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 6.022e-06, |
|
"loss": 0.055, |
|
"step": 6990 |
|
}, |
|
{ |
|
"epoch": 6.029288252395822, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 6.002e-06, |
|
"loss": 0.0564, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 6.037902444276947, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 5.982e-06, |
|
"loss": 0.0567, |
|
"step": 7010 |
|
}, |
|
{ |
|
"epoch": 6.046516636158071, |
|
"grad_norm": 2.5, |
|
"learning_rate": 5.962e-06, |
|
"loss": 0.056, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 6.055130828039195, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 5.942e-06, |
|
"loss": 0.0564, |
|
"step": 7030 |
|
}, |
|
{ |
|
"epoch": 6.063745019920319, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 5.922e-06, |
|
"loss": 0.0511, |
|
"step": 7040 |
|
}, |
|
{ |
|
"epoch": 6.072359211801443, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 5.9019999999999996e-06, |
|
"loss": 0.0535, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 6.080973403682567, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 5.882e-06, |
|
"loss": 0.0558, |
|
"step": 7060 |
|
}, |
|
{ |
|
"epoch": 6.089587595563692, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 5.862000000000001e-06, |
|
"loss": 0.0526, |
|
"step": 7070 |
|
}, |
|
{ |
|
"epoch": 6.098201787444816, |
|
"grad_norm": 2.25, |
|
"learning_rate": 5.842000000000001e-06, |
|
"loss": 0.0541, |
|
"step": 7080 |
|
}, |
|
{ |
|
"epoch": 6.10681597932594, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 5.822000000000001e-06, |
|
"loss": 0.0552, |
|
"step": 7090 |
|
}, |
|
{ |
|
"epoch": 6.115430171207064, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 5.802000000000001e-06, |
|
"loss": 0.0551, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 6.124044363088188, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 5.782000000000001e-06, |
|
"loss": 0.0528, |
|
"step": 7110 |
|
}, |
|
{ |
|
"epoch": 6.132658554969312, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 5.762000000000001e-06, |
|
"loss": 0.0528, |
|
"step": 7120 |
|
}, |
|
{ |
|
"epoch": 6.1412727468504364, |
|
"grad_norm": 2.125, |
|
"learning_rate": 5.742000000000001e-06, |
|
"loss": 0.0538, |
|
"step": 7130 |
|
}, |
|
{ |
|
"epoch": 6.1498869387315604, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 5.722000000000001e-06, |
|
"loss": 0.0521, |
|
"step": 7140 |
|
}, |
|
{ |
|
"epoch": 6.1585011306126844, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 5.702000000000001e-06, |
|
"loss": 0.0557, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 6.1671153224938084, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 5.682000000000001e-06, |
|
"loss": 0.0536, |
|
"step": 7160 |
|
}, |
|
{ |
|
"epoch": 6.1757295143749324, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 5.662000000000001e-06, |
|
"loss": 0.0526, |
|
"step": 7170 |
|
}, |
|
{ |
|
"epoch": 6.1843437062560564, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 5.642000000000001e-06, |
|
"loss": 0.0547, |
|
"step": 7180 |
|
}, |
|
{ |
|
"epoch": 6.192957898137181, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 5.6220000000000006e-06, |
|
"loss": 0.0508, |
|
"step": 7190 |
|
}, |
|
{ |
|
"epoch": 6.201572090018305, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 5.602000000000001e-06, |
|
"loss": 0.0541, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 6.210186281899429, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 5.582000000000001e-06, |
|
"loss": 0.0547, |
|
"step": 7210 |
|
}, |
|
{ |
|
"epoch": 6.218800473780553, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 5.562000000000001e-06, |
|
"loss": 0.056, |
|
"step": 7220 |
|
}, |
|
{ |
|
"epoch": 6.227414665661677, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 5.5420000000000005e-06, |
|
"loss": 0.0538, |
|
"step": 7230 |
|
}, |
|
{ |
|
"epoch": 6.236028857542801, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 5.522000000000001e-06, |
|
"loss": 0.0491, |
|
"step": 7240 |
|
}, |
|
{ |
|
"epoch": 6.244643049423926, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 5.502000000000001e-06, |
|
"loss": 0.0532, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 6.25325724130505, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 5.482000000000001e-06, |
|
"loss": 0.0496, |
|
"step": 7260 |
|
}, |
|
{ |
|
"epoch": 6.261871433186174, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 5.462e-06, |
|
"loss": 0.0587, |
|
"step": 7270 |
|
}, |
|
{ |
|
"epoch": 6.270485625067298, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 5.442000000000001e-06, |
|
"loss": 0.055, |
|
"step": 7280 |
|
}, |
|
{ |
|
"epoch": 6.279099816948422, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 5.422000000000001e-06, |
|
"loss": 0.0559, |
|
"step": 7290 |
|
}, |
|
{ |
|
"epoch": 6.287714008829546, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 5.402000000000001e-06, |
|
"loss": 0.0572, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 6.296328200710671, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 5.382e-06, |
|
"loss": 0.0516, |
|
"step": 7310 |
|
}, |
|
{ |
|
"epoch": 6.304942392591795, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 5.3620000000000005e-06, |
|
"loss": 0.05, |
|
"step": 7320 |
|
}, |
|
{ |
|
"epoch": 6.313556584472919, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 5.342000000000001e-06, |
|
"loss": 0.0534, |
|
"step": 7330 |
|
}, |
|
{ |
|
"epoch": 6.322170776354043, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 5.322000000000001e-06, |
|
"loss": 0.056, |
|
"step": 7340 |
|
}, |
|
{ |
|
"epoch": 6.330784968235167, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 5.302e-06, |
|
"loss": 0.0542, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 6.339399160116292, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 5.282e-06, |
|
"loss": 0.0573, |
|
"step": 7360 |
|
}, |
|
{ |
|
"epoch": 6.348013351997416, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 5.262000000000001e-06, |
|
"loss": 0.0518, |
|
"step": 7370 |
|
}, |
|
{ |
|
"epoch": 6.35662754387854, |
|
"grad_norm": 2.5, |
|
"learning_rate": 5.242000000000001e-06, |
|
"loss": 0.0567, |
|
"step": 7380 |
|
}, |
|
{ |
|
"epoch": 6.365241735759664, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 5.222e-06, |
|
"loss": 0.0563, |
|
"step": 7390 |
|
}, |
|
{ |
|
"epoch": 6.373855927640788, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 5.202e-06, |
|
"loss": 0.0534, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 6.382470119521912, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 5.1820000000000005e-06, |
|
"loss": 0.0515, |
|
"step": 7410 |
|
}, |
|
{ |
|
"epoch": 6.391084311403037, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 5.162000000000001e-06, |
|
"loss": 0.0568, |
|
"step": 7420 |
|
}, |
|
{ |
|
"epoch": 6.399698503284161, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 5.142e-06, |
|
"loss": 0.0527, |
|
"step": 7430 |
|
}, |
|
{ |
|
"epoch": 6.408312695165285, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 5.122e-06, |
|
"loss": 0.0528, |
|
"step": 7440 |
|
}, |
|
{ |
|
"epoch": 6.416926887046409, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 5.1020000000000004e-06, |
|
"loss": 0.0553, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 6.425541078927533, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 5.082000000000001e-06, |
|
"loss": 0.0531, |
|
"step": 7460 |
|
}, |
|
{ |
|
"epoch": 6.434155270808657, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 5.062e-06, |
|
"loss": 0.05, |
|
"step": 7470 |
|
}, |
|
{ |
|
"epoch": 6.442769462689782, |
|
"grad_norm": 3.25, |
|
"learning_rate": 5.042e-06, |
|
"loss": 0.0562, |
|
"step": 7480 |
|
}, |
|
{ |
|
"epoch": 6.451383654570906, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 5.022e-06, |
|
"loss": 0.0531, |
|
"step": 7490 |
|
}, |
|
{ |
|
"epoch": 6.45999784645203, |
|
"grad_norm": 2.625, |
|
"learning_rate": 5.0020000000000006e-06, |
|
"loss": 0.0545, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 6.468612038333154, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 4.982e-06, |
|
"loss": 0.0528, |
|
"step": 7510 |
|
}, |
|
{ |
|
"epoch": 6.477226230214278, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 4.962e-06, |
|
"loss": 0.0519, |
|
"step": 7520 |
|
}, |
|
{ |
|
"epoch": 6.485840422095402, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 4.942e-06, |
|
"loss": 0.0548, |
|
"step": 7530 |
|
}, |
|
{ |
|
"epoch": 6.494454613976527, |
|
"grad_norm": 3.390625, |
|
"learning_rate": 4.9220000000000005e-06, |
|
"loss": 0.0534, |
|
"step": 7540 |
|
}, |
|
{ |
|
"epoch": 6.503068805857651, |
|
"grad_norm": 2.625, |
|
"learning_rate": 4.902000000000001e-06, |
|
"loss": 0.0516, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 6.511682997738775, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 4.882000000000001e-06, |
|
"loss": 0.055, |
|
"step": 7560 |
|
}, |
|
{ |
|
"epoch": 6.520297189619899, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 4.862e-06, |
|
"loss": 0.0508, |
|
"step": 7570 |
|
}, |
|
{ |
|
"epoch": 6.528911381501023, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 4.842e-06, |
|
"loss": 0.0557, |
|
"step": 7580 |
|
}, |
|
{ |
|
"epoch": 6.537525573382148, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 4.822000000000001e-06, |
|
"loss": 0.0564, |
|
"step": 7590 |
|
}, |
|
{ |
|
"epoch": 6.546139765263272, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 4.802000000000001e-06, |
|
"loss": 0.0519, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 6.554753957144396, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 4.782e-06, |
|
"loss": 0.056, |
|
"step": 7610 |
|
}, |
|
{ |
|
"epoch": 6.56336814902552, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 4.762e-06, |
|
"loss": 0.0554, |
|
"step": 7620 |
|
}, |
|
{ |
|
"epoch": 6.571982340906644, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 4.7420000000000005e-06, |
|
"loss": 0.0538, |
|
"step": 7630 |
|
}, |
|
{ |
|
"epoch": 6.580596532787768, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 4.722000000000001e-06, |
|
"loss": 0.0527, |
|
"step": 7640 |
|
}, |
|
{ |
|
"epoch": 6.589210724668892, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 4.702e-06, |
|
"loss": 0.0558, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 6.5978249165500165, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 4.682e-06, |
|
"loss": 0.0506, |
|
"step": 7660 |
|
}, |
|
{ |
|
"epoch": 6.6064391084311405, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 4.6620000000000004e-06, |
|
"loss": 0.055, |
|
"step": 7670 |
|
}, |
|
{ |
|
"epoch": 6.6150533003122645, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 4.642000000000001e-06, |
|
"loss": 0.0535, |
|
"step": 7680 |
|
}, |
|
{ |
|
"epoch": 6.6236674921933885, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 4.622e-06, |
|
"loss": 0.0564, |
|
"step": 7690 |
|
}, |
|
{ |
|
"epoch": 6.6322816840745125, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 4.602e-06, |
|
"loss": 0.0567, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 6.640895875955637, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 4.582e-06, |
|
"loss": 0.0557, |
|
"step": 7710 |
|
}, |
|
{ |
|
"epoch": 6.649510067836761, |
|
"grad_norm": 2.625, |
|
"learning_rate": 4.5620000000000005e-06, |
|
"loss": 0.0533, |
|
"step": 7720 |
|
}, |
|
{ |
|
"epoch": 6.658124259717885, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 4.542e-06, |
|
"loss": 0.0548, |
|
"step": 7730 |
|
}, |
|
{ |
|
"epoch": 6.666738451599009, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 4.522e-06, |
|
"loss": 0.0543, |
|
"step": 7740 |
|
}, |
|
{ |
|
"epoch": 6.675352643480133, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 4.502e-06, |
|
"loss": 0.0498, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 6.683966835361257, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 4.4820000000000005e-06, |
|
"loss": 0.0559, |
|
"step": 7760 |
|
}, |
|
{ |
|
"epoch": 6.692581027242381, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 4.462e-06, |
|
"loss": 0.0556, |
|
"step": 7770 |
|
}, |
|
{ |
|
"epoch": 6.701195219123506, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 4.442e-06, |
|
"loss": 0.051, |
|
"step": 7780 |
|
}, |
|
{ |
|
"epoch": 6.70980941100463, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 4.422e-06, |
|
"loss": 0.0555, |
|
"step": 7790 |
|
}, |
|
{ |
|
"epoch": 6.718423602885754, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 4.402e-06, |
|
"loss": 0.0522, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 6.727037794766878, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 4.382e-06, |
|
"loss": 0.0555, |
|
"step": 7810 |
|
}, |
|
{ |
|
"epoch": 6.735651986648002, |
|
"grad_norm": 4.125, |
|
"learning_rate": 4.362e-06, |
|
"loss": 0.0519, |
|
"step": 7820 |
|
}, |
|
{ |
|
"epoch": 6.744266178529127, |
|
"grad_norm": 3.25, |
|
"learning_rate": 4.342e-06, |
|
"loss": 0.0538, |
|
"step": 7830 |
|
}, |
|
{ |
|
"epoch": 6.752880370410251, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 4.322e-06, |
|
"loss": 0.056, |
|
"step": 7840 |
|
}, |
|
{ |
|
"epoch": 6.761494562291375, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 4.3020000000000005e-06, |
|
"loss": 0.0538, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 6.770108754172499, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 4.282000000000001e-06, |
|
"loss": 0.0513, |
|
"step": 7860 |
|
}, |
|
{ |
|
"epoch": 6.778722946053623, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 4.262000000000001e-06, |
|
"loss": 0.0548, |
|
"step": 7870 |
|
}, |
|
{ |
|
"epoch": 6.787337137934747, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 4.242e-06, |
|
"loss": 0.0538, |
|
"step": 7880 |
|
}, |
|
{ |
|
"epoch": 6.795951329815872, |
|
"grad_norm": 2.75, |
|
"learning_rate": 4.222e-06, |
|
"loss": 0.0537, |
|
"step": 7890 |
|
}, |
|
{ |
|
"epoch": 6.804565521696996, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 4.202000000000001e-06, |
|
"loss": 0.0582, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 6.81317971357812, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 4.182000000000001e-06, |
|
"loss": 0.0577, |
|
"step": 7910 |
|
}, |
|
{ |
|
"epoch": 6.821793905459244, |
|
"grad_norm": 2.75, |
|
"learning_rate": 4.162e-06, |
|
"loss": 0.0564, |
|
"step": 7920 |
|
}, |
|
{ |
|
"epoch": 6.830408097340368, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 4.142e-06, |
|
"loss": 0.0551, |
|
"step": 7930 |
|
}, |
|
{ |
|
"epoch": 6.839022289221492, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 4.1220000000000005e-06, |
|
"loss": 0.0501, |
|
"step": 7940 |
|
}, |
|
{ |
|
"epoch": 6.847636481102617, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 4.102000000000001e-06, |
|
"loss": 0.0498, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 6.856250672983741, |
|
"grad_norm": 3.125, |
|
"learning_rate": 4.082e-06, |
|
"loss": 0.0512, |
|
"step": 7960 |
|
}, |
|
{ |
|
"epoch": 6.864864864864865, |
|
"grad_norm": 2.5, |
|
"learning_rate": 4.062e-06, |
|
"loss": 0.052, |
|
"step": 7970 |
|
}, |
|
{ |
|
"epoch": 6.873479056745989, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 4.0420000000000004e-06, |
|
"loss": 0.0563, |
|
"step": 7980 |
|
}, |
|
{ |
|
"epoch": 6.882093248627113, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 4.022000000000001e-06, |
|
"loss": 0.0526, |
|
"step": 7990 |
|
}, |
|
{ |
|
"epoch": 6.890707440508237, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 4.002e-06, |
|
"loss": 0.0556, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 6.899321632389362, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 3.982e-06, |
|
"loss": 0.0553, |
|
"step": 8010 |
|
}, |
|
{ |
|
"epoch": 6.907935824270486, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 3.962e-06, |
|
"loss": 0.0538, |
|
"step": 8020 |
|
}, |
|
{ |
|
"epoch": 6.91655001615161, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 3.9420000000000005e-06, |
|
"loss": 0.056, |
|
"step": 8030 |
|
}, |
|
{ |
|
"epoch": 6.925164208032734, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 3.922e-06, |
|
"loss": 0.0553, |
|
"step": 8040 |
|
}, |
|
{ |
|
"epoch": 6.933778399913858, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 3.902e-06, |
|
"loss": 0.054, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 6.942392591794983, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 3.882e-06, |
|
"loss": 0.056, |
|
"step": 8060 |
|
}, |
|
{ |
|
"epoch": 6.951006783676107, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 3.8620000000000005e-06, |
|
"loss": 0.055, |
|
"step": 8070 |
|
}, |
|
{ |
|
"epoch": 6.959620975557231, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 3.842e-06, |
|
"loss": 0.0542, |
|
"step": 8080 |
|
}, |
|
{ |
|
"epoch": 6.968235167438355, |
|
"grad_norm": 2.5, |
|
"learning_rate": 3.822e-06, |
|
"loss": 0.052, |
|
"step": 8090 |
|
}, |
|
{ |
|
"epoch": 6.976849359319479, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 3.802e-06, |
|
"loss": 0.0528, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 6.985463551200603, |
|
"grad_norm": 2.875, |
|
"learning_rate": 3.782e-06, |
|
"loss": 0.0551, |
|
"step": 8110 |
|
}, |
|
{ |
|
"epoch": 6.994077743081727, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 3.762e-06, |
|
"loss": 0.0545, |
|
"step": 8120 |
|
}, |
|
{ |
|
"epoch": 7.0025842575643376, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 3.742e-06, |
|
"loss": 0.0536, |
|
"step": 8130 |
|
}, |
|
{ |
|
"epoch": 7.0111984494454616, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 3.722e-06, |
|
"loss": 0.0441, |
|
"step": 8140 |
|
}, |
|
{ |
|
"epoch": 7.019812641326586, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 3.702e-06, |
|
"loss": 0.0494, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 7.02842683320771, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 3.6820000000000005e-06, |
|
"loss": 0.0473, |
|
"step": 8160 |
|
}, |
|
{ |
|
"epoch": 7.037041025088834, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 3.6620000000000007e-06, |
|
"loss": 0.0443, |
|
"step": 8170 |
|
}, |
|
{ |
|
"epoch": 7.0456552169699584, |
|
"grad_norm": 2.25, |
|
"learning_rate": 3.6420000000000005e-06, |
|
"loss": 0.0454, |
|
"step": 8180 |
|
}, |
|
{ |
|
"epoch": 7.0542694088510824, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 3.6220000000000006e-06, |
|
"loss": 0.0465, |
|
"step": 8190 |
|
}, |
|
{ |
|
"epoch": 7.0628836007322064, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 3.6020000000000004e-06, |
|
"loss": 0.0463, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 7.0714977926133304, |
|
"grad_norm": 2.375, |
|
"learning_rate": 3.5820000000000006e-06, |
|
"loss": 0.0454, |
|
"step": 8210 |
|
}, |
|
{ |
|
"epoch": 7.0801119844944544, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 3.5620000000000004e-06, |
|
"loss": 0.0453, |
|
"step": 8220 |
|
}, |
|
{ |
|
"epoch": 7.0887261763755784, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 3.5420000000000006e-06, |
|
"loss": 0.0459, |
|
"step": 8230 |
|
}, |
|
{ |
|
"epoch": 7.097340368256703, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 3.5220000000000003e-06, |
|
"loss": 0.0462, |
|
"step": 8240 |
|
}, |
|
{ |
|
"epoch": 7.105954560137827, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 3.5020000000000005e-06, |
|
"loss": 0.0486, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 7.114568752018951, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 3.4820000000000003e-06, |
|
"loss": 0.0436, |
|
"step": 8260 |
|
}, |
|
{ |
|
"epoch": 7.123182943900075, |
|
"grad_norm": 2.375, |
|
"learning_rate": 3.4620000000000005e-06, |
|
"loss": 0.0462, |
|
"step": 8270 |
|
}, |
|
{ |
|
"epoch": 7.131797135781199, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 3.4420000000000002e-06, |
|
"loss": 0.045, |
|
"step": 8280 |
|
}, |
|
{ |
|
"epoch": 7.140411327662323, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 3.4220000000000004e-06, |
|
"loss": 0.0461, |
|
"step": 8290 |
|
}, |
|
{ |
|
"epoch": 7.149025519543448, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 3.402e-06, |
|
"loss": 0.0453, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 7.157639711424572, |
|
"grad_norm": 2.375, |
|
"learning_rate": 3.3820000000000004e-06, |
|
"loss": 0.045, |
|
"step": 8310 |
|
}, |
|
{ |
|
"epoch": 7.166253903305696, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 3.362e-06, |
|
"loss": 0.0483, |
|
"step": 8320 |
|
}, |
|
{ |
|
"epoch": 7.17486809518682, |
|
"grad_norm": 2.25, |
|
"learning_rate": 3.3420000000000004e-06, |
|
"loss": 0.0452, |
|
"step": 8330 |
|
}, |
|
{ |
|
"epoch": 7.183482287067944, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 3.322e-06, |
|
"loss": 0.0485, |
|
"step": 8340 |
|
}, |
|
{ |
|
"epoch": 7.192096478949068, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 3.3020000000000003e-06, |
|
"loss": 0.0485, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 7.200710670830193, |
|
"grad_norm": 2.125, |
|
"learning_rate": 3.282e-06, |
|
"loss": 0.0461, |
|
"step": 8360 |
|
}, |
|
{ |
|
"epoch": 7.209324862711317, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 3.2620000000000003e-06, |
|
"loss": 0.0498, |
|
"step": 8370 |
|
}, |
|
{ |
|
"epoch": 7.217939054592441, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 3.242e-06, |
|
"loss": 0.0469, |
|
"step": 8380 |
|
}, |
|
{ |
|
"epoch": 7.226553246473565, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 3.2220000000000002e-06, |
|
"loss": 0.0428, |
|
"step": 8390 |
|
}, |
|
{ |
|
"epoch": 7.235167438354689, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 3.202e-06, |
|
"loss": 0.0449, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 7.243781630235813, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 3.182e-06, |
|
"loss": 0.0441, |
|
"step": 8410 |
|
}, |
|
{ |
|
"epoch": 7.252395822116938, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 3.162e-06, |
|
"loss": 0.0452, |
|
"step": 8420 |
|
}, |
|
{ |
|
"epoch": 7.261010013998062, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 3.142e-06, |
|
"loss": 0.0458, |
|
"step": 8430 |
|
}, |
|
{ |
|
"epoch": 7.269624205879186, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 3.122e-06, |
|
"loss": 0.0421, |
|
"step": 8440 |
|
}, |
|
{ |
|
"epoch": 7.27823839776031, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 3.102e-06, |
|
"loss": 0.0443, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 7.286852589641434, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 3.082e-06, |
|
"loss": 0.0446, |
|
"step": 8460 |
|
}, |
|
{ |
|
"epoch": 7.295466781522558, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 3.0620000000000005e-06, |
|
"loss": 0.0449, |
|
"step": 8470 |
|
}, |
|
{ |
|
"epoch": 7.304080973403683, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 3.0420000000000007e-06, |
|
"loss": 0.0413, |
|
"step": 8480 |
|
}, |
|
{ |
|
"epoch": 7.312695165284807, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 3.0220000000000005e-06, |
|
"loss": 0.0456, |
|
"step": 8490 |
|
}, |
|
{ |
|
"epoch": 7.321309357165931, |
|
"grad_norm": 2.5, |
|
"learning_rate": 3.0020000000000006e-06, |
|
"loss": 0.0459, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 7.329923549047055, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 2.9820000000000004e-06, |
|
"loss": 0.0477, |
|
"step": 8510 |
|
}, |
|
{ |
|
"epoch": 7.338537740928179, |
|
"grad_norm": 2.375, |
|
"learning_rate": 2.9620000000000006e-06, |
|
"loss": 0.0462, |
|
"step": 8520 |
|
}, |
|
{ |
|
"epoch": 7.347151932809304, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 2.9420000000000004e-06, |
|
"loss": 0.0428, |
|
"step": 8530 |
|
}, |
|
{ |
|
"epoch": 7.355766124690428, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 2.9220000000000006e-06, |
|
"loss": 0.0438, |
|
"step": 8540 |
|
}, |
|
{ |
|
"epoch": 7.364380316571552, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 2.9020000000000003e-06, |
|
"loss": 0.0451, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 7.372994508452676, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 2.8820000000000005e-06, |
|
"loss": 0.0468, |
|
"step": 8560 |
|
}, |
|
{ |
|
"epoch": 7.3816087003338, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 2.8620000000000003e-06, |
|
"loss": 0.0462, |
|
"step": 8570 |
|
}, |
|
{ |
|
"epoch": 7.390222892214924, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 2.8420000000000005e-06, |
|
"loss": 0.0465, |
|
"step": 8580 |
|
}, |
|
{ |
|
"epoch": 7.398837084096048, |
|
"grad_norm": 2.375, |
|
"learning_rate": 2.8220000000000003e-06, |
|
"loss": 0.0471, |
|
"step": 8590 |
|
}, |
|
{ |
|
"epoch": 7.407451275977173, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 2.8020000000000004e-06, |
|
"loss": 0.0473, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 7.416065467858297, |
|
"grad_norm": 2.625, |
|
"learning_rate": 2.7820000000000002e-06, |
|
"loss": 0.0486, |
|
"step": 8610 |
|
}, |
|
{ |
|
"epoch": 7.424679659739421, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 2.7620000000000004e-06, |
|
"loss": 0.0457, |
|
"step": 8620 |
|
}, |
|
{ |
|
"epoch": 7.433293851620545, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 2.742e-06, |
|
"loss": 0.0481, |
|
"step": 8630 |
|
}, |
|
{ |
|
"epoch": 7.441908043501669, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 2.7220000000000004e-06, |
|
"loss": 0.0452, |
|
"step": 8640 |
|
}, |
|
{ |
|
"epoch": 7.450522235382794, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 2.702e-06, |
|
"loss": 0.0428, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 7.459136427263918, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 2.6820000000000003e-06, |
|
"loss": 0.0508, |
|
"step": 8660 |
|
}, |
|
{ |
|
"epoch": 7.467750619145042, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 2.662e-06, |
|
"loss": 0.0443, |
|
"step": 8670 |
|
}, |
|
{ |
|
"epoch": 7.476364811026166, |
|
"grad_norm": 2.5, |
|
"learning_rate": 2.6420000000000003e-06, |
|
"loss": 0.047, |
|
"step": 8680 |
|
}, |
|
{ |
|
"epoch": 7.48497900290729, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 2.622e-06, |
|
"loss": 0.0457, |
|
"step": 8690 |
|
}, |
|
{ |
|
"epoch": 7.493593194788414, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 2.6020000000000002e-06, |
|
"loss": 0.0467, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 7.502207386669538, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 2.582e-06, |
|
"loss": 0.0468, |
|
"step": 8710 |
|
}, |
|
{ |
|
"epoch": 7.5108215785506625, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 2.562e-06, |
|
"loss": 0.0447, |
|
"step": 8720 |
|
}, |
|
{ |
|
"epoch": 7.5194357704317865, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 2.542e-06, |
|
"loss": 0.0449, |
|
"step": 8730 |
|
}, |
|
{ |
|
"epoch": 7.5280499623129105, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 2.522e-06, |
|
"loss": 0.0466, |
|
"step": 8740 |
|
}, |
|
{ |
|
"epoch": 7.5366641541940345, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 2.502e-06, |
|
"loss": 0.0429, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 7.5452783460751585, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 2.482e-06, |
|
"loss": 0.0474, |
|
"step": 8760 |
|
}, |
|
{ |
|
"epoch": 7.553892537956283, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 2.4620000000000003e-06, |
|
"loss": 0.0469, |
|
"step": 8770 |
|
}, |
|
{ |
|
"epoch": 7.562506729837407, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 2.442e-06, |
|
"loss": 0.0467, |
|
"step": 8780 |
|
}, |
|
{ |
|
"epoch": 7.571120921718531, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 2.4220000000000003e-06, |
|
"loss": 0.0497, |
|
"step": 8790 |
|
}, |
|
{ |
|
"epoch": 7.579735113599655, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 2.402e-06, |
|
"loss": 0.045, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 7.588349305480779, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 2.3820000000000002e-06, |
|
"loss": 0.0472, |
|
"step": 8810 |
|
}, |
|
{ |
|
"epoch": 7.596963497361903, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 2.362e-06, |
|
"loss": 0.0495, |
|
"step": 8820 |
|
}, |
|
{ |
|
"epoch": 7.605577689243028, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 2.342e-06, |
|
"loss": 0.0441, |
|
"step": 8830 |
|
}, |
|
{ |
|
"epoch": 7.614191881124152, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 2.322e-06, |
|
"loss": 0.0466, |
|
"step": 8840 |
|
}, |
|
{ |
|
"epoch": 7.622806073005276, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 2.302e-06, |
|
"loss": 0.0447, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 7.6314202648864, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 2.282e-06, |
|
"loss": 0.0469, |
|
"step": 8860 |
|
}, |
|
{ |
|
"epoch": 7.640034456767524, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 2.262e-06, |
|
"loss": 0.0475, |
|
"step": 8870 |
|
}, |
|
{ |
|
"epoch": 7.648648648648649, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 2.2420000000000003e-06, |
|
"loss": 0.0457, |
|
"step": 8880 |
|
}, |
|
{ |
|
"epoch": 7.657262840529773, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 2.222e-06, |
|
"loss": 0.0437, |
|
"step": 8890 |
|
}, |
|
{ |
|
"epoch": 7.665877032410897, |
|
"grad_norm": 2.25, |
|
"learning_rate": 2.2020000000000003e-06, |
|
"loss": 0.0467, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 7.674491224292021, |
|
"grad_norm": 2.75, |
|
"learning_rate": 2.182e-06, |
|
"loss": 0.0472, |
|
"step": 8910 |
|
}, |
|
{ |
|
"epoch": 7.683105416173145, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 2.1620000000000002e-06, |
|
"loss": 0.0477, |
|
"step": 8920 |
|
}, |
|
{ |
|
"epoch": 7.691719608054269, |
|
"grad_norm": 2.875, |
|
"learning_rate": 2.142e-06, |
|
"loss": 0.0469, |
|
"step": 8930 |
|
}, |
|
{ |
|
"epoch": 7.700333799935393, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 2.122e-06, |
|
"loss": 0.0481, |
|
"step": 8940 |
|
}, |
|
{ |
|
"epoch": 7.708947991816518, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 2.102e-06, |
|
"loss": 0.0464, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 7.717562183697642, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 2.082e-06, |
|
"loss": 0.0469, |
|
"step": 8960 |
|
}, |
|
{ |
|
"epoch": 7.726176375578766, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 2.062e-06, |
|
"loss": 0.045, |
|
"step": 8970 |
|
}, |
|
{ |
|
"epoch": 7.73479056745989, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 2.042e-06, |
|
"loss": 0.0434, |
|
"step": 8980 |
|
}, |
|
{ |
|
"epoch": 7.743404759341014, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 2.022e-06, |
|
"loss": 0.0437, |
|
"step": 8990 |
|
}, |
|
{ |
|
"epoch": 7.752018951222139, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 2.002e-06, |
|
"loss": 0.0469, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 7.760633143103263, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.982e-06, |
|
"loss": 0.0458, |
|
"step": 9010 |
|
}, |
|
{ |
|
"epoch": 7.769247334984387, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.9620000000000004e-06, |
|
"loss": 0.049, |
|
"step": 9020 |
|
}, |
|
{ |
|
"epoch": 7.777861526865511, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.942e-06, |
|
"loss": 0.0472, |
|
"step": 9030 |
|
}, |
|
{ |
|
"epoch": 7.786475718746635, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 1.9220000000000004e-06, |
|
"loss": 0.0474, |
|
"step": 9040 |
|
}, |
|
{ |
|
"epoch": 7.795089910627759, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.9020000000000002e-06, |
|
"loss": 0.0456, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 7.803704102508883, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 1.8820000000000001e-06, |
|
"loss": 0.0427, |
|
"step": 9060 |
|
}, |
|
{ |
|
"epoch": 7.812318294390008, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.8620000000000001e-06, |
|
"loss": 0.0441, |
|
"step": 9070 |
|
}, |
|
{ |
|
"epoch": 7.820932486271132, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 1.8420000000000001e-06, |
|
"loss": 0.0435, |
|
"step": 9080 |
|
}, |
|
{ |
|
"epoch": 7.829546678152256, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.822e-06, |
|
"loss": 0.0489, |
|
"step": 9090 |
|
}, |
|
{ |
|
"epoch": 7.83816087003338, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.802e-06, |
|
"loss": 0.0443, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 7.846775061914504, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 1.782e-06, |
|
"loss": 0.045, |
|
"step": 9110 |
|
}, |
|
{ |
|
"epoch": 7.855389253795629, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 1.762e-06, |
|
"loss": 0.0422, |
|
"step": 9120 |
|
}, |
|
{ |
|
"epoch": 7.864003445676753, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.742e-06, |
|
"loss": 0.0454, |
|
"step": 9130 |
|
}, |
|
{ |
|
"epoch": 7.872617637557877, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.722e-06, |
|
"loss": 0.0441, |
|
"step": 9140 |
|
}, |
|
{ |
|
"epoch": 7.881231829439001, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.702e-06, |
|
"loss": 0.046, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 7.889846021320125, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.682e-06, |
|
"loss": 0.0467, |
|
"step": 9160 |
|
}, |
|
{ |
|
"epoch": 7.898460213201249, |
|
"grad_norm": 2.75, |
|
"learning_rate": 1.662e-06, |
|
"loss": 0.0486, |
|
"step": 9170 |
|
}, |
|
{ |
|
"epoch": 7.907074405082374, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.6420000000000003e-06, |
|
"loss": 0.0475, |
|
"step": 9180 |
|
}, |
|
{ |
|
"epoch": 7.915688596963498, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.6220000000000003e-06, |
|
"loss": 0.0476, |
|
"step": 9190 |
|
}, |
|
{ |
|
"epoch": 7.924302788844622, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.6020000000000003e-06, |
|
"loss": 0.0425, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 7.932916980725746, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 1.5820000000000003e-06, |
|
"loss": 0.0447, |
|
"step": 9210 |
|
}, |
|
{ |
|
"epoch": 7.94153117260687, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.5620000000000002e-06, |
|
"loss": 0.0484, |
|
"step": 9220 |
|
}, |
|
{ |
|
"epoch": 7.950145364487994, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.5420000000000002e-06, |
|
"loss": 0.0455, |
|
"step": 9230 |
|
}, |
|
{ |
|
"epoch": 7.9587595563691185, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.5220000000000002e-06, |
|
"loss": 0.0462, |
|
"step": 9240 |
|
}, |
|
{ |
|
"epoch": 7.9673737482502425, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 1.5020000000000002e-06, |
|
"loss": 0.045, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 7.9759879401313665, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 1.4820000000000002e-06, |
|
"loss": 0.0447, |
|
"step": 9260 |
|
}, |
|
{ |
|
"epoch": 7.9846021320124905, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 1.4620000000000001e-06, |
|
"loss": 0.0472, |
|
"step": 9270 |
|
}, |
|
{ |
|
"epoch": 7.9932163238936145, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 1.4420000000000001e-06, |
|
"loss": 0.047, |
|
"step": 9280 |
|
}, |
|
{ |
|
"epoch": 8.001722838376224, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.4220000000000001e-06, |
|
"loss": 0.0444, |
|
"step": 9290 |
|
}, |
|
{ |
|
"epoch": 8.010337030257348, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 1.402e-06, |
|
"loss": 0.0444, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 8.018951222138472, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.382e-06, |
|
"loss": 0.0397, |
|
"step": 9310 |
|
}, |
|
{ |
|
"epoch": 8.027565414019596, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.362e-06, |
|
"loss": 0.0425, |
|
"step": 9320 |
|
}, |
|
{ |
|
"epoch": 8.036179605900722, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 1.3420000000000002e-06, |
|
"loss": 0.043, |
|
"step": 9330 |
|
}, |
|
{ |
|
"epoch": 8.044793797781846, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.3220000000000002e-06, |
|
"loss": 0.0451, |
|
"step": 9340 |
|
}, |
|
{ |
|
"epoch": 8.05340798966297, |
|
"grad_norm": 2.25, |
|
"learning_rate": 1.3020000000000002e-06, |
|
"loss": 0.0442, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 8.062022181544094, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 1.2820000000000002e-06, |
|
"loss": 0.0428, |
|
"step": 9360 |
|
}, |
|
{ |
|
"epoch": 8.070636373425218, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 1.2620000000000002e-06, |
|
"loss": 0.0441, |
|
"step": 9370 |
|
}, |
|
{ |
|
"epoch": 8.079250565306342, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.2420000000000001e-06, |
|
"loss": 0.045, |
|
"step": 9380 |
|
}, |
|
{ |
|
"epoch": 8.087864757187466, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.2220000000000001e-06, |
|
"loss": 0.0443, |
|
"step": 9390 |
|
}, |
|
{ |
|
"epoch": 8.09647894906859, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.202e-06, |
|
"loss": 0.0422, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 8.105093140949714, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 1.182e-06, |
|
"loss": 0.0417, |
|
"step": 9410 |
|
}, |
|
{ |
|
"epoch": 8.113707332830838, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 1.162e-06, |
|
"loss": 0.0432, |
|
"step": 9420 |
|
}, |
|
{ |
|
"epoch": 8.122321524711962, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.142e-06, |
|
"loss": 0.0419, |
|
"step": 9430 |
|
}, |
|
{ |
|
"epoch": 8.130935716593086, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.122e-06, |
|
"loss": 0.0416, |
|
"step": 9440 |
|
}, |
|
{ |
|
"epoch": 8.139549908474212, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.1020000000000002e-06, |
|
"loss": 0.0422, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 8.148164100355336, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.0820000000000002e-06, |
|
"loss": 0.0455, |
|
"step": 9460 |
|
}, |
|
{ |
|
"epoch": 8.15677829223646, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.0620000000000002e-06, |
|
"loss": 0.0455, |
|
"step": 9470 |
|
}, |
|
{ |
|
"epoch": 8.165392484117584, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 1.0420000000000001e-06, |
|
"loss": 0.0441, |
|
"step": 9480 |
|
}, |
|
{ |
|
"epoch": 8.174006675998708, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.0220000000000001e-06, |
|
"loss": 0.0433, |
|
"step": 9490 |
|
}, |
|
{ |
|
"epoch": 8.182620867879832, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.002e-06, |
|
"loss": 0.045, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 8.191235059760956, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 9.82e-07, |
|
"loss": 0.0443, |
|
"step": 9510 |
|
}, |
|
{ |
|
"epoch": 8.19984925164208, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 9.62e-07, |
|
"loss": 0.0427, |
|
"step": 9520 |
|
}, |
|
{ |
|
"epoch": 8.208463443523204, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 9.420000000000002e-07, |
|
"loss": 0.0412, |
|
"step": 9530 |
|
}, |
|
{ |
|
"epoch": 8.217077635404328, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 9.220000000000001e-07, |
|
"loss": 0.0431, |
|
"step": 9540 |
|
}, |
|
{ |
|
"epoch": 8.225691827285452, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 9.020000000000001e-07, |
|
"loss": 0.0414, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 8.234306019166578, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 8.820000000000001e-07, |
|
"loss": 0.0454, |
|
"step": 9560 |
|
}, |
|
{ |
|
"epoch": 8.242920211047702, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 8.620000000000001e-07, |
|
"loss": 0.0417, |
|
"step": 9570 |
|
}, |
|
{ |
|
"epoch": 8.251534402928826, |
|
"grad_norm": 2.25, |
|
"learning_rate": 8.42e-07, |
|
"loss": 0.0446, |
|
"step": 9580 |
|
}, |
|
{ |
|
"epoch": 8.26014859480995, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 8.22e-07, |
|
"loss": 0.0434, |
|
"step": 9590 |
|
}, |
|
{ |
|
"epoch": 8.268762786691074, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 8.02e-07, |
|
"loss": 0.0397, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 8.277376978572198, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 7.820000000000001e-07, |
|
"loss": 0.0429, |
|
"step": 9610 |
|
}, |
|
{ |
|
"epoch": 8.285991170453322, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 7.620000000000001e-07, |
|
"loss": 0.0421, |
|
"step": 9620 |
|
}, |
|
{ |
|
"epoch": 8.294605362334446, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 7.420000000000001e-07, |
|
"loss": 0.0433, |
|
"step": 9630 |
|
}, |
|
{ |
|
"epoch": 8.30321955421557, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 7.22e-07, |
|
"loss": 0.0449, |
|
"step": 9640 |
|
}, |
|
{ |
|
"epoch": 8.311833746096694, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 7.02e-07, |
|
"loss": 0.043, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 8.320447937977818, |
|
"grad_norm": 2.25, |
|
"learning_rate": 6.82e-07, |
|
"loss": 0.0442, |
|
"step": 9660 |
|
}, |
|
{ |
|
"epoch": 8.329062129858942, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 6.62e-07, |
|
"loss": 0.045, |
|
"step": 9670 |
|
}, |
|
{ |
|
"epoch": 8.337676321740068, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 6.42e-07, |
|
"loss": 0.0457, |
|
"step": 9680 |
|
}, |
|
{ |
|
"epoch": 8.346290513621192, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 6.22e-07, |
|
"loss": 0.0431, |
|
"step": 9690 |
|
}, |
|
{ |
|
"epoch": 8.354904705502316, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 6.02e-07, |
|
"loss": 0.0465, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 8.36351889738344, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 5.820000000000001e-07, |
|
"loss": 0.0423, |
|
"step": 9710 |
|
}, |
|
{ |
|
"epoch": 8.372133089264564, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 5.620000000000001e-07, |
|
"loss": 0.0428, |
|
"step": 9720 |
|
}, |
|
{ |
|
"epoch": 8.380747281145688, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 5.420000000000001e-07, |
|
"loss": 0.041, |
|
"step": 9730 |
|
}, |
|
{ |
|
"epoch": 8.389361473026812, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 5.22e-07, |
|
"loss": 0.0434, |
|
"step": 9740 |
|
}, |
|
{ |
|
"epoch": 8.397975664907936, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 5.02e-07, |
|
"loss": 0.0451, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 8.40658985678906, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 4.82e-07, |
|
"loss": 0.0422, |
|
"step": 9760 |
|
}, |
|
{ |
|
"epoch": 8.415204048670184, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 4.6200000000000003e-07, |
|
"loss": 0.0443, |
|
"step": 9770 |
|
}, |
|
{ |
|
"epoch": 8.423818240551308, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 4.4200000000000007e-07, |
|
"loss": 0.043, |
|
"step": 9780 |
|
}, |
|
{ |
|
"epoch": 8.432432432432432, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 4.2200000000000005e-07, |
|
"loss": 0.043, |
|
"step": 9790 |
|
}, |
|
{ |
|
"epoch": 8.441046624313557, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 4.02e-07, |
|
"loss": 0.0437, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 8.449660816194681, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 3.82e-07, |
|
"loss": 0.0428, |
|
"step": 9810 |
|
}, |
|
{ |
|
"epoch": 8.458275008075805, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 3.6200000000000004e-07, |
|
"loss": 0.0453, |
|
"step": 9820 |
|
}, |
|
{ |
|
"epoch": 8.46688919995693, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 3.42e-07, |
|
"loss": 0.0444, |
|
"step": 9830 |
|
}, |
|
{ |
|
"epoch": 8.475503391838053, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 3.22e-07, |
|
"loss": 0.0433, |
|
"step": 9840 |
|
}, |
|
{ |
|
"epoch": 8.484117583719177, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 3.0200000000000003e-07, |
|
"loss": 0.0442, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 8.492731775600301, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 2.82e-07, |
|
"loss": 0.0434, |
|
"step": 9860 |
|
}, |
|
{ |
|
"epoch": 8.501345967481425, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 2.6200000000000004e-07, |
|
"loss": 0.0454, |
|
"step": 9870 |
|
}, |
|
{ |
|
"epoch": 8.50996015936255, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 2.42e-07, |
|
"loss": 0.0412, |
|
"step": 9880 |
|
}, |
|
{ |
|
"epoch": 8.518574351243673, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 2.2200000000000003e-07, |
|
"loss": 0.043, |
|
"step": 9890 |
|
}, |
|
{ |
|
"epoch": 8.527188543124797, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 2.02e-07, |
|
"loss": 0.0412, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 8.535802735005923, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.8200000000000002e-07, |
|
"loss": 0.0431, |
|
"step": 9910 |
|
}, |
|
{ |
|
"epoch": 8.544416926887047, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 1.62e-07, |
|
"loss": 0.0413, |
|
"step": 9920 |
|
}, |
|
{ |
|
"epoch": 8.553031118768171, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 1.4200000000000003e-07, |
|
"loss": 0.0458, |
|
"step": 9930 |
|
}, |
|
{ |
|
"epoch": 8.561645310649295, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.22e-07, |
|
"loss": 0.0445, |
|
"step": 9940 |
|
}, |
|
{ |
|
"epoch": 8.57025950253042, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.0200000000000001e-07, |
|
"loss": 0.0454, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 8.578873694411543, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 8.200000000000002e-08, |
|
"loss": 0.0428, |
|
"step": 9960 |
|
}, |
|
{ |
|
"epoch": 8.587487886292667, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 6.2e-08, |
|
"loss": 0.0419, |
|
"step": 9970 |
|
}, |
|
{ |
|
"epoch": 8.596102078173791, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 4.2e-08, |
|
"loss": 0.044, |
|
"step": 9980 |
|
}, |
|
{ |
|
"epoch": 8.604716270054915, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 2.2000000000000002e-08, |
|
"loss": 0.0411, |
|
"step": 9990 |
|
}, |
|
{ |
|
"epoch": 8.61333046193604, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 2e-09, |
|
"loss": 0.0433, |
|
"step": 10000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 10000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 9, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.840766623509979e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|