|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.5197568389057752, |
|
"eval_steps": 500, |
|
"global_step": 2000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.007598784194528876, |
|
"grad_norm": 21.75, |
|
"learning_rate": 9e-06, |
|
"loss": 3.4313, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.015197568389057751, |
|
"grad_norm": 81.0, |
|
"learning_rate": 1.9e-05, |
|
"loss": 2.2326, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.022796352583586626, |
|
"grad_norm": 85.5, |
|
"learning_rate": 2.9e-05, |
|
"loss": 1.4425, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.030395136778115502, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 3.9000000000000006e-05, |
|
"loss": 0.9761, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.037993920972644375, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 4.9e-05, |
|
"loss": 0.8592, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.04559270516717325, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 5.9e-05, |
|
"loss": 0.8049, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.05319148936170213, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 6.9e-05, |
|
"loss": 0.9155, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.060790273556231005, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 7.900000000000001e-05, |
|
"loss": 0.8197, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.06838905775075987, |
|
"grad_norm": 2.0, |
|
"learning_rate": 8.900000000000001e-05, |
|
"loss": 0.7834, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.07598784194528875, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 9.900000000000001e-05, |
|
"loss": 0.8179, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.08358662613981763, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 9.958139534883721e-05, |
|
"loss": 0.7661, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.0911854103343465, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 9.911627906976745e-05, |
|
"loss": 0.8447, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.09878419452887538, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 9.865116279069768e-05, |
|
"loss": 0.8758, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.10638297872340426, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 9.818604651162792e-05, |
|
"loss": 0.752, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.11398176291793313, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 9.772093023255814e-05, |
|
"loss": 0.788, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.12158054711246201, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 9.725581395348837e-05, |
|
"loss": 0.7352, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.12917933130699089, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 9.67906976744186e-05, |
|
"loss": 0.7905, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.13677811550151975, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 9.632558139534884e-05, |
|
"loss": 0.7821, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.14437689969604864, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 9.586046511627908e-05, |
|
"loss": 0.6274, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.1519756838905775, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 9.539534883720931e-05, |
|
"loss": 0.7823, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.1595744680851064, |
|
"grad_norm": 1.75, |
|
"learning_rate": 9.493023255813955e-05, |
|
"loss": 0.725, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.16717325227963525, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 9.446511627906977e-05, |
|
"loss": 0.7733, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.17477203647416414, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 9.4e-05, |
|
"loss": 0.7945, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.182370820668693, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 9.353488372093023e-05, |
|
"loss": 0.7283, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.1899696048632219, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 9.306976744186047e-05, |
|
"loss": 0.6244, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.19756838905775076, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 9.26046511627907e-05, |
|
"loss": 0.6776, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.20516717325227962, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 9.213953488372094e-05, |
|
"loss": 0.6386, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.2127659574468085, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 9.167441860465116e-05, |
|
"loss": 0.6425, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.22036474164133737, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 9.12093023255814e-05, |
|
"loss": 0.5976, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.22796352583586627, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 9.074418604651164e-05, |
|
"loss": 0.6518, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.23556231003039513, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 9.027906976744186e-05, |
|
"loss": 0.6024, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.24316109422492402, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 8.98139534883721e-05, |
|
"loss": 0.6896, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.2507598784194529, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 8.934883720930233e-05, |
|
"loss": 0.5574, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.25835866261398177, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 8.888372093023257e-05, |
|
"loss": 0.5795, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.26595744680851063, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 8.841860465116279e-05, |
|
"loss": 0.6064, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.2735562310030395, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 8.795348837209303e-05, |
|
"loss": 0.527, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.2811550151975684, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 8.748837209302326e-05, |
|
"loss": 0.5996, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.2887537993920973, |
|
"grad_norm": 1.375, |
|
"learning_rate": 8.70232558139535e-05, |
|
"loss": 0.5588, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.29635258358662614, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 8.655813953488372e-05, |
|
"loss": 0.5675, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.303951367781155, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 8.609302325581396e-05, |
|
"loss": 0.5708, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.31155015197568386, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 8.562790697674418e-05, |
|
"loss": 0.593, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.3191489361702128, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 8.516279069767442e-05, |
|
"loss": 0.5349, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.32674772036474165, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 8.469767441860465e-05, |
|
"loss": 0.4981, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.3343465045592705, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 8.423255813953489e-05, |
|
"loss": 0.5477, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.34194528875379937, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 8.376744186046513e-05, |
|
"loss": 0.5359, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.3495440729483283, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 8.330232558139536e-05, |
|
"loss": 0.5612, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.35714285714285715, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 8.283720930232559e-05, |
|
"loss": 0.5386, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.364741641337386, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 8.237209302325581e-05, |
|
"loss": 0.4443, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.3723404255319149, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 8.190697674418605e-05, |
|
"loss": 0.5386, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.3799392097264438, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 8.144186046511628e-05, |
|
"loss": 0.4538, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.38753799392097266, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 8.097674418604652e-05, |
|
"loss": 0.4705, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.3951367781155015, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 8.051162790697675e-05, |
|
"loss": 0.5122, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.4027355623100304, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 8.004651162790698e-05, |
|
"loss": 0.4675, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.41033434650455924, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 7.958139534883721e-05, |
|
"loss": 0.5141, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.41793313069908816, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 7.911627906976744e-05, |
|
"loss": 0.492, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.425531914893617, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 7.865116279069767e-05, |
|
"loss": 0.4708, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.4331306990881459, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 7.818604651162791e-05, |
|
"loss": 0.4668, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.44072948328267475, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 7.772093023255815e-05, |
|
"loss": 0.4105, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.44832826747720367, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 7.725581395348838e-05, |
|
"loss": 0.4586, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.45592705167173253, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 7.67906976744186e-05, |
|
"loss": 0.4625, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.4635258358662614, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 7.632558139534884e-05, |
|
"loss": 0.4149, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.47112462006079026, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 7.586046511627908e-05, |
|
"loss": 0.4588, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.4787234042553192, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 7.53953488372093e-05, |
|
"loss": 0.4162, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.48632218844984804, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 7.493023255813954e-05, |
|
"loss": 0.3976, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.4939209726443769, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 7.446511627906977e-05, |
|
"loss": 0.4161, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.5015197568389058, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 7.4e-05, |
|
"loss": 0.3689, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.5091185410334347, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 7.353488372093023e-05, |
|
"loss": 0.427, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.5167173252279635, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 7.306976744186047e-05, |
|
"loss": 0.3842, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.5243161094224924, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 7.26046511627907e-05, |
|
"loss": 0.3853, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.5319148936170213, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 7.213953488372094e-05, |
|
"loss": 0.3235, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.5395136778115501, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 7.167441860465116e-05, |
|
"loss": 0.3763, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.547112462006079, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 7.12093023255814e-05, |
|
"loss": 0.3651, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.5547112462006079, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 7.074418604651162e-05, |
|
"loss": 0.3187, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.5623100303951368, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 7.027906976744186e-05, |
|
"loss": 0.3752, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.5699088145896657, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 6.98139534883721e-05, |
|
"loss": 0.3639, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.5775075987841946, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 6.934883720930233e-05, |
|
"loss": 0.3682, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.5851063829787234, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 6.888372093023257e-05, |
|
"loss": 0.3418, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.5927051671732523, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 6.841860465116279e-05, |
|
"loss": 0.3567, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.6003039513677811, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 6.795348837209301e-05, |
|
"loss": 0.371, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.60790273556231, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 6.748837209302325e-05, |
|
"loss": 0.334, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.6155015197568389, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 6.702325581395349e-05, |
|
"loss": 0.3043, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.6231003039513677, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 6.655813953488372e-05, |
|
"loss": 0.3049, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.6306990881458967, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 6.609302325581396e-05, |
|
"loss": 0.2673, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.6382978723404256, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 6.56279069767442e-05, |
|
"loss": 0.3203, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.6458966565349544, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 6.516279069767442e-05, |
|
"loss": 0.3552, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.6534954407294833, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 6.469767441860466e-05, |
|
"loss": 0.2805, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.6610942249240122, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 6.423255813953488e-05, |
|
"loss": 0.3399, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.668693009118541, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 6.376744186046512e-05, |
|
"loss": 0.2798, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.6762917933130699, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 6.330232558139535e-05, |
|
"loss": 0.3096, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.6838905775075987, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 6.283720930232559e-05, |
|
"loss": 0.3162, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.6914893617021277, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 6.237209302325581e-05, |
|
"loss": 0.2926, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.6990881458966566, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 6.190697674418605e-05, |
|
"loss": 0.3684, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.7066869300911854, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 6.144186046511628e-05, |
|
"loss": 0.3586, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.7142857142857143, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 6.097674418604652e-05, |
|
"loss": 0.3382, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.7218844984802432, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 6.051162790697674e-05, |
|
"loss": 0.3272, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.729483282674772, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 6.004651162790698e-05, |
|
"loss": 0.2808, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.7370820668693009, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 5.958139534883721e-05, |
|
"loss": 0.2965, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.7446808510638298, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 5.9116279069767445e-05, |
|
"loss": 0.2778, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.7522796352583586, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 5.8651162790697675e-05, |
|
"loss": 0.3103, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.7598784194528876, |
|
"grad_norm": 1.125, |
|
"learning_rate": 5.818604651162791e-05, |
|
"loss": 0.2904, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.7674772036474165, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 5.772093023255815e-05, |
|
"loss": 0.3073, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.7750759878419453, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 5.725581395348838e-05, |
|
"loss": 0.3001, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.7826747720364742, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 5.67906976744186e-05, |
|
"loss": 0.2669, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.790273556231003, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 5.6325581395348836e-05, |
|
"loss": 0.2502, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.7978723404255319, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 5.586046511627907e-05, |
|
"loss": 0.306, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.8054711246200608, |
|
"grad_norm": 1.0, |
|
"learning_rate": 5.53953488372093e-05, |
|
"loss": 0.2415, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.8130699088145896, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 5.493023255813954e-05, |
|
"loss": 0.2257, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.8206686930091185, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 5.4465116279069775e-05, |
|
"loss": 0.2491, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.8282674772036475, |
|
"grad_norm": 1.375, |
|
"learning_rate": 5.4000000000000005e-05, |
|
"loss": 0.2659, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.8358662613981763, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 5.353488372093024e-05, |
|
"loss": 0.2436, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.8434650455927052, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 5.3069767441860464e-05, |
|
"loss": 0.2539, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.851063829787234, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 5.2604651162790694e-05, |
|
"loss": 0.2449, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.8586626139817629, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 5.213953488372093e-05, |
|
"loss": 0.2424, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.8662613981762918, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 5.1674418604651166e-05, |
|
"loss": 0.2565, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.8738601823708206, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 5.1209302325581396e-05, |
|
"loss": 0.242, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.8814589665653495, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 5.074418604651163e-05, |
|
"loss": 0.2415, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.8890577507598785, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 5.027906976744187e-05, |
|
"loss": 0.2519, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.8966565349544073, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 4.981395348837209e-05, |
|
"loss": 0.2545, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.9042553191489362, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 4.934883720930233e-05, |
|
"loss": 0.2403, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.9118541033434651, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 4.8883720930232564e-05, |
|
"loss": 0.2075, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.9194528875379939, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 4.8418604651162794e-05, |
|
"loss": 0.198, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.9270516717325228, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 4.7953488372093023e-05, |
|
"loss": 0.216, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.9346504559270516, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 4.748837209302326e-05, |
|
"loss": 0.1838, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.9422492401215805, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 4.7023255813953496e-05, |
|
"loss": 0.2367, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.9498480243161094, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 4.655813953488372e-05, |
|
"loss": 0.1958, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.9574468085106383, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 4.6093023255813955e-05, |
|
"loss": 0.1862, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.9650455927051672, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 4.562790697674419e-05, |
|
"loss": 0.2328, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.9726443768996961, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 4.516279069767442e-05, |
|
"loss": 0.2483, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.9802431610942249, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 4.469767441860465e-05, |
|
"loss": 0.1677, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.9878419452887538, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 4.423255813953489e-05, |
|
"loss": 0.2192, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.9954407294832827, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 4.376744186046512e-05, |
|
"loss": 0.2357, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.0030395136778116, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 4.3302325581395353e-05, |
|
"loss": 0.1562, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.0106382978723405, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 4.283720930232558e-05, |
|
"loss": 0.1043, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 1.0182370820668694, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 4.237209302325581e-05, |
|
"loss": 0.106, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 1.0258358662613982, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 4.190697674418605e-05, |
|
"loss": 0.1041, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.033434650455927, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 4.1441860465116285e-05, |
|
"loss": 0.1001, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.041033434650456, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 4.0976744186046515e-05, |
|
"loss": 0.0867, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.0486322188449848, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 4.0511627906976745e-05, |
|
"loss": 0.1042, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.0562310030395137, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 4.004651162790698e-05, |
|
"loss": 0.1049, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.0638297872340425, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 3.958139534883721e-05, |
|
"loss": 0.0948, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.0714285714285714, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 3.911627906976744e-05, |
|
"loss": 0.0974, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.0790273556231003, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 3.8651162790697677e-05, |
|
"loss": 0.1062, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 1.0866261398176291, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 3.818604651162791e-05, |
|
"loss": 0.0995, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 1.094224924012158, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 3.772093023255814e-05, |
|
"loss": 0.1188, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.1018237082066868, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 3.725581395348837e-05, |
|
"loss": 0.0929, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.1094224924012157, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 3.679069767441861e-05, |
|
"loss": 0.098, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.1170212765957448, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 3.632558139534884e-05, |
|
"loss": 0.0958, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.1246200607902737, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 3.5860465116279075e-05, |
|
"loss": 0.1008, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.1322188449848025, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 3.5395348837209304e-05, |
|
"loss": 0.1111, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.1398176291793314, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 3.4930232558139534e-05, |
|
"loss": 0.0917, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.1474164133738602, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 3.446511627906977e-05, |
|
"loss": 0.0912, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.155015197568389, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": 0.0948, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.162613981762918, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 3.353488372093023e-05, |
|
"loss": 0.0749, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.1702127659574468, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 3.3069767441860466e-05, |
|
"loss": 0.1059, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.1778115501519757, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 3.26046511627907e-05, |
|
"loss": 0.0942, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.1854103343465046, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 3.213953488372093e-05, |
|
"loss": 0.0739, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.1930091185410334, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 3.167441860465116e-05, |
|
"loss": 0.0931, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.2006079027355623, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 3.12093023255814e-05, |
|
"loss": 0.0955, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 1.2082066869300911, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 3.074418604651163e-05, |
|
"loss": 0.0842, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 1.21580547112462, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 3.0279069767441864e-05, |
|
"loss": 0.0871, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.2234042553191489, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 2.9813953488372093e-05, |
|
"loss": 0.0972, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 1.2310030395136777, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 2.9348837209302326e-05, |
|
"loss": 0.0752, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.2386018237082066, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 2.888372093023256e-05, |
|
"loss": 0.0881, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 1.2462006079027357, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 2.8418604651162796e-05, |
|
"loss": 0.063, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 1.2537993920972643, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 2.7953488372093022e-05, |
|
"loss": 0.0964, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.2613981762917934, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 2.7488372093023258e-05, |
|
"loss": 0.0963, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 1.2689969604863223, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 2.702325581395349e-05, |
|
"loss": 0.0675, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 1.2765957446808511, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 2.6558139534883724e-05, |
|
"loss": 0.0826, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 1.28419452887538, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 2.6093023255813954e-05, |
|
"loss": 0.0659, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 1.2917933130699089, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 2.5627906976744187e-05, |
|
"loss": 0.0869, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.2993920972644377, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 2.516279069767442e-05, |
|
"loss": 0.0793, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 1.3069908814589666, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 2.4697674418604653e-05, |
|
"loss": 0.0979, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 1.3145896656534954, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 2.4232558139534886e-05, |
|
"loss": 0.0848, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 1.3221884498480243, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 2.376744186046512e-05, |
|
"loss": 0.0944, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.3297872340425532, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 2.3302325581395352e-05, |
|
"loss": 0.0668, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.337386018237082, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 2.283720930232558e-05, |
|
"loss": 0.0787, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 1.344984802431611, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 2.2372093023255818e-05, |
|
"loss": 0.0708, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 1.3525835866261398, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 2.1906976744186047e-05, |
|
"loss": 0.0571, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 1.3601823708206686, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 2.144186046511628e-05, |
|
"loss": 0.0626, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 1.3677811550151975, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 2.0976744186046513e-05, |
|
"loss": 0.0687, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.3753799392097266, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 2.0511627906976746e-05, |
|
"loss": 0.0714, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 1.3829787234042552, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 2.0046511627906976e-05, |
|
"loss": 0.0639, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 1.3905775075987843, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 1.9581395348837212e-05, |
|
"loss": 0.0674, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.3981762917933132, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 1.9116279069767442e-05, |
|
"loss": 0.0664, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 1.405775075987842, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 1.8651162790697675e-05, |
|
"loss": 0.0593, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.4133738601823709, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 1.8186046511627908e-05, |
|
"loss": 0.0686, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.4209726443768997, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 1.772093023255814e-05, |
|
"loss": 0.0844, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.4285714285714286, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 1.7255813953488374e-05, |
|
"loss": 0.0662, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.4361702127659575, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 1.6790697674418607e-05, |
|
"loss": 0.0768, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.4437689969604863, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 1.6325581395348837e-05, |
|
"loss": 0.0686, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.4513677811550152, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 1.5860465116279073e-05, |
|
"loss": 0.0604, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 1.458966565349544, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 1.5395348837209303e-05, |
|
"loss": 0.0613, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 1.466565349544073, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 1.4930232558139537e-05, |
|
"loss": 0.0807, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 1.4741641337386018, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 1.4465116279069768e-05, |
|
"loss": 0.0923, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 1.4817629179331306, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 0.0629, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.4893617021276595, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 1.3534883720930233e-05, |
|
"loss": 0.0709, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 1.4969604863221884, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 1.3069767441860467e-05, |
|
"loss": 0.0765, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 1.5045592705167175, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 1.2604651162790699e-05, |
|
"loss": 0.07, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 1.512158054711246, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 1.213953488372093e-05, |
|
"loss": 0.0614, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 1.5197568389057752, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 1.1674418604651163e-05, |
|
"loss": 0.0509, |
|
"step": 2000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2250, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.361342517248e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|