{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.5197568389057752, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007598784194528876, "grad_norm": 21.75, "learning_rate": 9e-06, "loss": 3.4313, "step": 10 }, { "epoch": 0.015197568389057751, "grad_norm": 81.0, "learning_rate": 1.9e-05, "loss": 2.2326, "step": 20 }, { "epoch": 0.022796352583586626, "grad_norm": 85.5, "learning_rate": 2.9e-05, "loss": 1.4425, "step": 30 }, { "epoch": 0.030395136778115502, "grad_norm": 2.609375, "learning_rate": 3.9000000000000006e-05, "loss": 0.9761, "step": 40 }, { "epoch": 0.037993920972644375, "grad_norm": 2.890625, "learning_rate": 4.9e-05, "loss": 0.8592, "step": 50 }, { "epoch": 0.04559270516717325, "grad_norm": 2.640625, "learning_rate": 5.9e-05, "loss": 0.8049, "step": 60 }, { "epoch": 0.05319148936170213, "grad_norm": 2.9375, "learning_rate": 6.9e-05, "loss": 0.9155, "step": 70 }, { "epoch": 0.060790273556231005, "grad_norm": 1.9453125, "learning_rate": 7.900000000000001e-05, "loss": 0.8197, "step": 80 }, { "epoch": 0.06838905775075987, "grad_norm": 2.0, "learning_rate": 8.900000000000001e-05, "loss": 0.7834, "step": 90 }, { "epoch": 0.07598784194528875, "grad_norm": 2.203125, "learning_rate": 9.900000000000001e-05, "loss": 0.8179, "step": 100 }, { "epoch": 0.08358662613981763, "grad_norm": 1.984375, "learning_rate": 9.958139534883721e-05, "loss": 0.7661, "step": 110 }, { "epoch": 0.0911854103343465, "grad_norm": 1.9140625, "learning_rate": 9.911627906976745e-05, "loss": 0.8447, "step": 120 }, { "epoch": 0.09878419452887538, "grad_norm": 1.6484375, "learning_rate": 9.865116279069768e-05, "loss": 0.8758, "step": 130 }, { "epoch": 0.10638297872340426, "grad_norm": 2.171875, "learning_rate": 9.818604651162792e-05, "loss": 0.752, "step": 140 }, { "epoch": 0.11398176291793313, "grad_norm": 1.671875, "learning_rate": 9.772093023255814e-05, "loss": 0.788, "step": 150 }, { "epoch": 0.12158054711246201, "grad_norm": 1.390625, "learning_rate": 9.725581395348837e-05, "loss": 0.7352, "step": 160 }, { "epoch": 0.12917933130699089, "grad_norm": 1.5546875, "learning_rate": 9.67906976744186e-05, "loss": 0.7905, "step": 170 }, { "epoch": 0.13677811550151975, "grad_norm": 1.6953125, "learning_rate": 9.632558139534884e-05, "loss": 0.7821, "step": 180 }, { "epoch": 0.14437689969604864, "grad_norm": 1.359375, "learning_rate": 9.586046511627908e-05, "loss": 0.6274, "step": 190 }, { "epoch": 0.1519756838905775, "grad_norm": 1.6875, "learning_rate": 9.539534883720931e-05, "loss": 0.7823, "step": 200 }, { "epoch": 0.1595744680851064, "grad_norm": 1.75, "learning_rate": 9.493023255813955e-05, "loss": 0.725, "step": 210 }, { "epoch": 0.16717325227963525, "grad_norm": 1.3828125, "learning_rate": 9.446511627906977e-05, "loss": 0.7733, "step": 220 }, { "epoch": 0.17477203647416414, "grad_norm": 1.5390625, "learning_rate": 9.4e-05, "loss": 0.7945, "step": 230 }, { "epoch": 0.182370820668693, "grad_norm": 1.5625, "learning_rate": 9.353488372093023e-05, "loss": 0.7283, "step": 240 }, { "epoch": 0.1899696048632219, "grad_norm": 1.578125, "learning_rate": 9.306976744186047e-05, "loss": 0.6244, "step": 250 }, { "epoch": 0.19756838905775076, "grad_norm": 1.546875, "learning_rate": 9.26046511627907e-05, "loss": 0.6776, "step": 260 }, { "epoch": 0.20516717325227962, "grad_norm": 1.234375, "learning_rate": 9.213953488372094e-05, "loss": 0.6386, "step": 270 }, { "epoch": 0.2127659574468085, "grad_norm": 1.53125, "learning_rate": 9.167441860465116e-05, "loss": 0.6425, "step": 280 }, { "epoch": 0.22036474164133737, "grad_norm": 1.453125, "learning_rate": 9.12093023255814e-05, "loss": 0.5976, "step": 290 }, { "epoch": 0.22796352583586627, "grad_norm": 1.40625, "learning_rate": 9.074418604651164e-05, "loss": 0.6518, "step": 300 }, { "epoch": 0.23556231003039513, "grad_norm": 1.71875, "learning_rate": 9.027906976744186e-05, "loss": 0.6024, "step": 310 }, { "epoch": 0.24316109422492402, "grad_norm": 1.3515625, "learning_rate": 8.98139534883721e-05, "loss": 0.6896, "step": 320 }, { "epoch": 0.2507598784194529, "grad_norm": 1.59375, "learning_rate": 8.934883720930233e-05, "loss": 0.5574, "step": 330 }, { "epoch": 0.25835866261398177, "grad_norm": 1.0625, "learning_rate": 8.888372093023257e-05, "loss": 0.5795, "step": 340 }, { "epoch": 0.26595744680851063, "grad_norm": 1.109375, "learning_rate": 8.841860465116279e-05, "loss": 0.6064, "step": 350 }, { "epoch": 0.2735562310030395, "grad_norm": 1.015625, "learning_rate": 8.795348837209303e-05, "loss": 0.527, "step": 360 }, { "epoch": 0.2811550151975684, "grad_norm": 0.92578125, "learning_rate": 8.748837209302326e-05, "loss": 0.5996, "step": 370 }, { "epoch": 0.2887537993920973, "grad_norm": 1.375, "learning_rate": 8.70232558139535e-05, "loss": 0.5588, "step": 380 }, { "epoch": 0.29635258358662614, "grad_norm": 1.546875, "learning_rate": 8.655813953488372e-05, "loss": 0.5675, "step": 390 }, { "epoch": 0.303951367781155, "grad_norm": 1.28125, "learning_rate": 8.609302325581396e-05, "loss": 0.5708, "step": 400 }, { "epoch": 0.31155015197568386, "grad_norm": 1.265625, "learning_rate": 8.562790697674418e-05, "loss": 0.593, "step": 410 }, { "epoch": 0.3191489361702128, "grad_norm": 1.3125, "learning_rate": 8.516279069767442e-05, "loss": 0.5349, "step": 420 }, { "epoch": 0.32674772036474165, "grad_norm": 1.46875, "learning_rate": 8.469767441860465e-05, "loss": 0.4981, "step": 430 }, { "epoch": 0.3343465045592705, "grad_norm": 1.3046875, "learning_rate": 8.423255813953489e-05, "loss": 0.5477, "step": 440 }, { "epoch": 0.34194528875379937, "grad_norm": 1.109375, "learning_rate": 8.376744186046513e-05, "loss": 0.5359, "step": 450 }, { "epoch": 0.3495440729483283, "grad_norm": 1.15625, "learning_rate": 8.330232558139536e-05, "loss": 0.5612, "step": 460 }, { "epoch": 0.35714285714285715, "grad_norm": 1.1796875, "learning_rate": 8.283720930232559e-05, "loss": 0.5386, "step": 470 }, { "epoch": 0.364741641337386, "grad_norm": 1.0625, "learning_rate": 8.237209302325581e-05, "loss": 0.4443, "step": 480 }, { "epoch": 0.3723404255319149, "grad_norm": 1.0703125, "learning_rate": 8.190697674418605e-05, "loss": 0.5386, "step": 490 }, { "epoch": 0.3799392097264438, "grad_norm": 1.140625, "learning_rate": 8.144186046511628e-05, "loss": 0.4538, "step": 500 }, { "epoch": 0.38753799392097266, "grad_norm": 1.390625, "learning_rate": 8.097674418604652e-05, "loss": 0.4705, "step": 510 }, { "epoch": 0.3951367781155015, "grad_norm": 1.15625, "learning_rate": 8.051162790697675e-05, "loss": 0.5122, "step": 520 }, { "epoch": 0.4027355623100304, "grad_norm": 1.2578125, "learning_rate": 8.004651162790698e-05, "loss": 0.4675, "step": 530 }, { "epoch": 0.41033434650455924, "grad_norm": 1.3046875, "learning_rate": 7.958139534883721e-05, "loss": 0.5141, "step": 540 }, { "epoch": 0.41793313069908816, "grad_norm": 1.0625, "learning_rate": 7.911627906976744e-05, "loss": 0.492, "step": 550 }, { "epoch": 0.425531914893617, "grad_norm": 1.21875, "learning_rate": 7.865116279069767e-05, "loss": 0.4708, "step": 560 }, { "epoch": 0.4331306990881459, "grad_norm": 1.296875, "learning_rate": 7.818604651162791e-05, "loss": 0.4668, "step": 570 }, { "epoch": 0.44072948328267475, "grad_norm": 1.2109375, "learning_rate": 7.772093023255815e-05, "loss": 0.4105, "step": 580 }, { "epoch": 0.44832826747720367, "grad_norm": 1.0859375, "learning_rate": 7.725581395348838e-05, "loss": 0.4586, "step": 590 }, { "epoch": 0.45592705167173253, "grad_norm": 1.2890625, "learning_rate": 7.67906976744186e-05, "loss": 0.4625, "step": 600 }, { "epoch": 0.4635258358662614, "grad_norm": 0.83984375, "learning_rate": 7.632558139534884e-05, "loss": 0.4149, "step": 610 }, { "epoch": 0.47112462006079026, "grad_norm": 0.97265625, "learning_rate": 7.586046511627908e-05, "loss": 0.4588, "step": 620 }, { "epoch": 0.4787234042553192, "grad_norm": 1.21875, "learning_rate": 7.53953488372093e-05, "loss": 0.4162, "step": 630 }, { "epoch": 0.48632218844984804, "grad_norm": 0.9296875, "learning_rate": 7.493023255813954e-05, "loss": 0.3976, "step": 640 }, { "epoch": 0.4939209726443769, "grad_norm": 1.3046875, "learning_rate": 7.446511627906977e-05, "loss": 0.4161, "step": 650 }, { "epoch": 0.5015197568389058, "grad_norm": 1.1640625, "learning_rate": 7.4e-05, "loss": 0.3689, "step": 660 }, { "epoch": 0.5091185410334347, "grad_norm": 0.8828125, "learning_rate": 7.353488372093023e-05, "loss": 0.427, "step": 670 }, { "epoch": 0.5167173252279635, "grad_norm": 0.9921875, "learning_rate": 7.306976744186047e-05, "loss": 0.3842, "step": 680 }, { "epoch": 0.5243161094224924, "grad_norm": 0.96484375, "learning_rate": 7.26046511627907e-05, "loss": 0.3853, "step": 690 }, { "epoch": 0.5319148936170213, "grad_norm": 1.0546875, "learning_rate": 7.213953488372094e-05, "loss": 0.3235, "step": 700 }, { "epoch": 0.5395136778115501, "grad_norm": 1.0546875, "learning_rate": 7.167441860465116e-05, "loss": 0.3763, "step": 710 }, { "epoch": 0.547112462006079, "grad_norm": 0.86328125, "learning_rate": 7.12093023255814e-05, "loss": 0.3651, "step": 720 }, { "epoch": 0.5547112462006079, "grad_norm": 0.97265625, "learning_rate": 7.074418604651162e-05, "loss": 0.3187, "step": 730 }, { "epoch": 0.5623100303951368, "grad_norm": 1.0859375, "learning_rate": 7.027906976744186e-05, "loss": 0.3752, "step": 740 }, { "epoch": 0.5699088145896657, "grad_norm": 1.1328125, "learning_rate": 6.98139534883721e-05, "loss": 0.3639, "step": 750 }, { "epoch": 0.5775075987841946, "grad_norm": 0.9609375, "learning_rate": 6.934883720930233e-05, "loss": 0.3682, "step": 760 }, { "epoch": 0.5851063829787234, "grad_norm": 0.7109375, "learning_rate": 6.888372093023257e-05, "loss": 0.3418, "step": 770 }, { "epoch": 0.5927051671732523, "grad_norm": 0.96875, "learning_rate": 6.841860465116279e-05, "loss": 0.3567, "step": 780 }, { "epoch": 0.6003039513677811, "grad_norm": 0.9375, "learning_rate": 6.795348837209301e-05, "loss": 0.371, "step": 790 }, { "epoch": 0.60790273556231, "grad_norm": 1.15625, "learning_rate": 6.748837209302325e-05, "loss": 0.334, "step": 800 }, { "epoch": 0.6155015197568389, "grad_norm": 0.921875, "learning_rate": 6.702325581395349e-05, "loss": 0.3043, "step": 810 }, { "epoch": 0.6231003039513677, "grad_norm": 1.0390625, "learning_rate": 6.655813953488372e-05, "loss": 0.3049, "step": 820 }, { "epoch": 0.6306990881458967, "grad_norm": 0.9453125, "learning_rate": 6.609302325581396e-05, "loss": 0.2673, "step": 830 }, { "epoch": 0.6382978723404256, "grad_norm": 1.03125, "learning_rate": 6.56279069767442e-05, "loss": 0.3203, "step": 840 }, { "epoch": 0.6458966565349544, "grad_norm": 1.078125, "learning_rate": 6.516279069767442e-05, "loss": 0.3552, "step": 850 }, { "epoch": 0.6534954407294833, "grad_norm": 0.62890625, "learning_rate": 6.469767441860466e-05, "loss": 0.2805, "step": 860 }, { "epoch": 0.6610942249240122, "grad_norm": 1.0703125, "learning_rate": 6.423255813953488e-05, "loss": 0.3399, "step": 870 }, { "epoch": 0.668693009118541, "grad_norm": 0.96875, "learning_rate": 6.376744186046512e-05, "loss": 0.2798, "step": 880 }, { "epoch": 0.6762917933130699, "grad_norm": 1.0234375, "learning_rate": 6.330232558139535e-05, "loss": 0.3096, "step": 890 }, { "epoch": 0.6838905775075987, "grad_norm": 0.8984375, "learning_rate": 6.283720930232559e-05, "loss": 0.3162, "step": 900 }, { "epoch": 0.6914893617021277, "grad_norm": 1.03125, "learning_rate": 6.237209302325581e-05, "loss": 0.2926, "step": 910 }, { "epoch": 0.6990881458966566, "grad_norm": 1.0703125, "learning_rate": 6.190697674418605e-05, "loss": 0.3684, "step": 920 }, { "epoch": 0.7066869300911854, "grad_norm": 0.96875, "learning_rate": 6.144186046511628e-05, "loss": 0.3586, "step": 930 }, { "epoch": 0.7142857142857143, "grad_norm": 1.03125, "learning_rate": 6.097674418604652e-05, "loss": 0.3382, "step": 940 }, { "epoch": 0.7218844984802432, "grad_norm": 1.234375, "learning_rate": 6.051162790697674e-05, "loss": 0.3272, "step": 950 }, { "epoch": 0.729483282674772, "grad_norm": 1.015625, "learning_rate": 6.004651162790698e-05, "loss": 0.2808, "step": 960 }, { "epoch": 0.7370820668693009, "grad_norm": 1.1796875, "learning_rate": 5.958139534883721e-05, "loss": 0.2965, "step": 970 }, { "epoch": 0.7446808510638298, "grad_norm": 0.90234375, "learning_rate": 5.9116279069767445e-05, "loss": 0.2778, "step": 980 }, { "epoch": 0.7522796352583586, "grad_norm": 1.1015625, "learning_rate": 5.8651162790697675e-05, "loss": 0.3103, "step": 990 }, { "epoch": 0.7598784194528876, "grad_norm": 1.125, "learning_rate": 5.818604651162791e-05, "loss": 0.2904, "step": 1000 }, { "epoch": 0.7674772036474165, "grad_norm": 0.76171875, "learning_rate": 5.772093023255815e-05, "loss": 0.3073, "step": 1010 }, { "epoch": 0.7750759878419453, "grad_norm": 0.99609375, "learning_rate": 5.725581395348838e-05, "loss": 0.3001, "step": 1020 }, { "epoch": 0.7826747720364742, "grad_norm": 1.0703125, "learning_rate": 5.67906976744186e-05, "loss": 0.2669, "step": 1030 }, { "epoch": 0.790273556231003, "grad_norm": 0.85546875, "learning_rate": 5.6325581395348836e-05, "loss": 0.2502, "step": 1040 }, { "epoch": 0.7978723404255319, "grad_norm": 0.91015625, "learning_rate": 5.586046511627907e-05, "loss": 0.306, "step": 1050 }, { "epoch": 0.8054711246200608, "grad_norm": 1.0, "learning_rate": 5.53953488372093e-05, "loss": 0.2415, "step": 1060 }, { "epoch": 0.8130699088145896, "grad_norm": 1.09375, "learning_rate": 5.493023255813954e-05, "loss": 0.2257, "step": 1070 }, { "epoch": 0.8206686930091185, "grad_norm": 0.84765625, "learning_rate": 5.4465116279069775e-05, "loss": 0.2491, "step": 1080 }, { "epoch": 0.8282674772036475, "grad_norm": 1.375, "learning_rate": 5.4000000000000005e-05, "loss": 0.2659, "step": 1090 }, { "epoch": 0.8358662613981763, "grad_norm": 0.953125, "learning_rate": 5.353488372093024e-05, "loss": 0.2436, "step": 1100 }, { "epoch": 0.8434650455927052, "grad_norm": 0.91015625, "learning_rate": 5.3069767441860464e-05, "loss": 0.2539, "step": 1110 }, { "epoch": 0.851063829787234, "grad_norm": 0.7265625, "learning_rate": 5.2604651162790694e-05, "loss": 0.2449, "step": 1120 }, { "epoch": 0.8586626139817629, "grad_norm": 1.03125, "learning_rate": 5.213953488372093e-05, "loss": 0.2424, "step": 1130 }, { "epoch": 0.8662613981762918, "grad_norm": 0.82421875, "learning_rate": 5.1674418604651166e-05, "loss": 0.2565, "step": 1140 }, { "epoch": 0.8738601823708206, "grad_norm": 1.0859375, "learning_rate": 5.1209302325581396e-05, "loss": 0.242, "step": 1150 }, { "epoch": 0.8814589665653495, "grad_norm": 0.97265625, "learning_rate": 5.074418604651163e-05, "loss": 0.2415, "step": 1160 }, { "epoch": 0.8890577507598785, "grad_norm": 1.2578125, "learning_rate": 5.027906976744187e-05, "loss": 0.2519, "step": 1170 }, { "epoch": 0.8966565349544073, "grad_norm": 0.8515625, "learning_rate": 4.981395348837209e-05, "loss": 0.2545, "step": 1180 }, { "epoch": 0.9042553191489362, "grad_norm": 0.69921875, "learning_rate": 4.934883720930233e-05, "loss": 0.2403, "step": 1190 }, { "epoch": 0.9118541033434651, "grad_norm": 0.73046875, "learning_rate": 4.8883720930232564e-05, "loss": 0.2075, "step": 1200 }, { "epoch": 0.9194528875379939, "grad_norm": 0.6875, "learning_rate": 4.8418604651162794e-05, "loss": 0.198, "step": 1210 }, { "epoch": 0.9270516717325228, "grad_norm": 0.88671875, "learning_rate": 4.7953488372093023e-05, "loss": 0.216, "step": 1220 }, { "epoch": 0.9346504559270516, "grad_norm": 0.66015625, "learning_rate": 4.748837209302326e-05, "loss": 0.1838, "step": 1230 }, { "epoch": 0.9422492401215805, "grad_norm": 0.578125, "learning_rate": 4.7023255813953496e-05, "loss": 0.2367, "step": 1240 }, { "epoch": 0.9498480243161094, "grad_norm": 1.1015625, "learning_rate": 4.655813953488372e-05, "loss": 0.1958, "step": 1250 }, { "epoch": 0.9574468085106383, "grad_norm": 0.94921875, "learning_rate": 4.6093023255813955e-05, "loss": 0.1862, "step": 1260 }, { "epoch": 0.9650455927051672, "grad_norm": 0.91015625, "learning_rate": 4.562790697674419e-05, "loss": 0.2328, "step": 1270 }, { "epoch": 0.9726443768996961, "grad_norm": 0.85546875, "learning_rate": 4.516279069767442e-05, "loss": 0.2483, "step": 1280 }, { "epoch": 0.9802431610942249, "grad_norm": 0.9453125, "learning_rate": 4.469767441860465e-05, "loss": 0.1677, "step": 1290 }, { "epoch": 0.9878419452887538, "grad_norm": 0.9375, "learning_rate": 4.423255813953489e-05, "loss": 0.2192, "step": 1300 }, { "epoch": 0.9954407294832827, "grad_norm": 0.859375, "learning_rate": 4.376744186046512e-05, "loss": 0.2357, "step": 1310 }, { "epoch": 1.0030395136778116, "grad_norm": 0.59375, "learning_rate": 4.3302325581395353e-05, "loss": 0.1562, "step": 1320 }, { "epoch": 1.0106382978723405, "grad_norm": 0.73828125, "learning_rate": 4.283720930232558e-05, "loss": 0.1043, "step": 1330 }, { "epoch": 1.0182370820668694, "grad_norm": 0.75390625, "learning_rate": 4.237209302325581e-05, "loss": 0.106, "step": 1340 }, { "epoch": 1.0258358662613982, "grad_norm": 0.859375, "learning_rate": 4.190697674418605e-05, "loss": 0.1041, "step": 1350 }, { "epoch": 1.033434650455927, "grad_norm": 0.98046875, "learning_rate": 4.1441860465116285e-05, "loss": 0.1001, "step": 1360 }, { "epoch": 1.041033434650456, "grad_norm": 0.65625, "learning_rate": 4.0976744186046515e-05, "loss": 0.0867, "step": 1370 }, { "epoch": 1.0486322188449848, "grad_norm": 0.78515625, "learning_rate": 4.0511627906976745e-05, "loss": 0.1042, "step": 1380 }, { "epoch": 1.0562310030395137, "grad_norm": 0.87109375, "learning_rate": 4.004651162790698e-05, "loss": 0.1049, "step": 1390 }, { "epoch": 1.0638297872340425, "grad_norm": 0.578125, "learning_rate": 3.958139534883721e-05, "loss": 0.0948, "step": 1400 }, { "epoch": 1.0714285714285714, "grad_norm": 0.9140625, "learning_rate": 3.911627906976744e-05, "loss": 0.0974, "step": 1410 }, { "epoch": 1.0790273556231003, "grad_norm": 0.703125, "learning_rate": 3.8651162790697677e-05, "loss": 0.1062, "step": 1420 }, { "epoch": 1.0866261398176291, "grad_norm": 0.8671875, "learning_rate": 3.818604651162791e-05, "loss": 0.0995, "step": 1430 }, { "epoch": 1.094224924012158, "grad_norm": 1.0078125, "learning_rate": 3.772093023255814e-05, "loss": 0.1188, "step": 1440 }, { "epoch": 1.1018237082066868, "grad_norm": 0.70703125, "learning_rate": 3.725581395348837e-05, "loss": 0.0929, "step": 1450 }, { "epoch": 1.1094224924012157, "grad_norm": 0.77734375, "learning_rate": 3.679069767441861e-05, "loss": 0.098, "step": 1460 }, { "epoch": 1.1170212765957448, "grad_norm": 0.6875, "learning_rate": 3.632558139534884e-05, "loss": 0.0958, "step": 1470 }, { "epoch": 1.1246200607902737, "grad_norm": 0.953125, "learning_rate": 3.5860465116279075e-05, "loss": 0.1008, "step": 1480 }, { "epoch": 1.1322188449848025, "grad_norm": 0.8984375, "learning_rate": 3.5395348837209304e-05, "loss": 0.1111, "step": 1490 }, { "epoch": 1.1398176291793314, "grad_norm": 0.68359375, "learning_rate": 3.4930232558139534e-05, "loss": 0.0917, "step": 1500 }, { "epoch": 1.1474164133738602, "grad_norm": 0.48828125, "learning_rate": 3.446511627906977e-05, "loss": 0.0912, "step": 1510 }, { "epoch": 1.155015197568389, "grad_norm": 0.94140625, "learning_rate": 3.4000000000000007e-05, "loss": 0.0948, "step": 1520 }, { "epoch": 1.162613981762918, "grad_norm": 0.83203125, "learning_rate": 3.353488372093023e-05, "loss": 0.0749, "step": 1530 }, { "epoch": 1.1702127659574468, "grad_norm": 0.953125, "learning_rate": 3.3069767441860466e-05, "loss": 0.1059, "step": 1540 }, { "epoch": 1.1778115501519757, "grad_norm": 0.6171875, "learning_rate": 3.26046511627907e-05, "loss": 0.0942, "step": 1550 }, { "epoch": 1.1854103343465046, "grad_norm": 0.48046875, "learning_rate": 3.213953488372093e-05, "loss": 0.0739, "step": 1560 }, { "epoch": 1.1930091185410334, "grad_norm": 1.078125, "learning_rate": 3.167441860465116e-05, "loss": 0.0931, "step": 1570 }, { "epoch": 1.2006079027355623, "grad_norm": 0.7265625, "learning_rate": 3.12093023255814e-05, "loss": 0.0955, "step": 1580 }, { "epoch": 1.2082066869300911, "grad_norm": 1.109375, "learning_rate": 3.074418604651163e-05, "loss": 0.0842, "step": 1590 }, { "epoch": 1.21580547112462, "grad_norm": 0.83203125, "learning_rate": 3.0279069767441864e-05, "loss": 0.0871, "step": 1600 }, { "epoch": 1.2234042553191489, "grad_norm": 1.171875, "learning_rate": 2.9813953488372093e-05, "loss": 0.0972, "step": 1610 }, { "epoch": 1.2310030395136777, "grad_norm": 0.5859375, "learning_rate": 2.9348837209302326e-05, "loss": 0.0752, "step": 1620 }, { "epoch": 1.2386018237082066, "grad_norm": 1.046875, "learning_rate": 2.888372093023256e-05, "loss": 0.0881, "step": 1630 }, { "epoch": 1.2462006079027357, "grad_norm": 0.5390625, "learning_rate": 2.8418604651162796e-05, "loss": 0.063, "step": 1640 }, { "epoch": 1.2537993920972643, "grad_norm": 0.7265625, "learning_rate": 2.7953488372093022e-05, "loss": 0.0964, "step": 1650 }, { "epoch": 1.2613981762917934, "grad_norm": 1.0234375, "learning_rate": 2.7488372093023258e-05, "loss": 0.0963, "step": 1660 }, { "epoch": 1.2689969604863223, "grad_norm": 0.8125, "learning_rate": 2.702325581395349e-05, "loss": 0.0675, "step": 1670 }, { "epoch": 1.2765957446808511, "grad_norm": 0.86328125, "learning_rate": 2.6558139534883724e-05, "loss": 0.0826, "step": 1680 }, { "epoch": 1.28419452887538, "grad_norm": 0.3359375, "learning_rate": 2.6093023255813954e-05, "loss": 0.0659, "step": 1690 }, { "epoch": 1.2917933130699089, "grad_norm": 0.494140625, "learning_rate": 2.5627906976744187e-05, "loss": 0.0869, "step": 1700 }, { "epoch": 1.2993920972644377, "grad_norm": 0.34375, "learning_rate": 2.516279069767442e-05, "loss": 0.0793, "step": 1710 }, { "epoch": 1.3069908814589666, "grad_norm": 0.9375, "learning_rate": 2.4697674418604653e-05, "loss": 0.0979, "step": 1720 }, { "epoch": 1.3145896656534954, "grad_norm": 0.65234375, "learning_rate": 2.4232558139534886e-05, "loss": 0.0848, "step": 1730 }, { "epoch": 1.3221884498480243, "grad_norm": 0.76171875, "learning_rate": 2.376744186046512e-05, "loss": 0.0944, "step": 1740 }, { "epoch": 1.3297872340425532, "grad_norm": 0.74609375, "learning_rate": 2.3302325581395352e-05, "loss": 0.0668, "step": 1750 }, { "epoch": 1.337386018237082, "grad_norm": 0.6328125, "learning_rate": 2.283720930232558e-05, "loss": 0.0787, "step": 1760 }, { "epoch": 1.344984802431611, "grad_norm": 0.79296875, "learning_rate": 2.2372093023255818e-05, "loss": 0.0708, "step": 1770 }, { "epoch": 1.3525835866261398, "grad_norm": 0.69140625, "learning_rate": 2.1906976744186047e-05, "loss": 0.0571, "step": 1780 }, { "epoch": 1.3601823708206686, "grad_norm": 0.734375, "learning_rate": 2.144186046511628e-05, "loss": 0.0626, "step": 1790 }, { "epoch": 1.3677811550151975, "grad_norm": 0.78125, "learning_rate": 2.0976744186046513e-05, "loss": 0.0687, "step": 1800 }, { "epoch": 1.3753799392097266, "grad_norm": 0.82421875, "learning_rate": 2.0511627906976746e-05, "loss": 0.0714, "step": 1810 }, { "epoch": 1.3829787234042552, "grad_norm": 0.74609375, "learning_rate": 2.0046511627906976e-05, "loss": 0.0639, "step": 1820 }, { "epoch": 1.3905775075987843, "grad_norm": 0.37109375, "learning_rate": 1.9581395348837212e-05, "loss": 0.0674, "step": 1830 }, { "epoch": 1.3981762917933132, "grad_norm": 0.4921875, "learning_rate": 1.9116279069767442e-05, "loss": 0.0664, "step": 1840 }, { "epoch": 1.405775075987842, "grad_norm": 0.78515625, "learning_rate": 1.8651162790697675e-05, "loss": 0.0593, "step": 1850 }, { "epoch": 1.4133738601823709, "grad_norm": 0.65625, "learning_rate": 1.8186046511627908e-05, "loss": 0.0686, "step": 1860 }, { "epoch": 1.4209726443768997, "grad_norm": 0.7421875, "learning_rate": 1.772093023255814e-05, "loss": 0.0844, "step": 1870 }, { "epoch": 1.4285714285714286, "grad_norm": 0.70703125, "learning_rate": 1.7255813953488374e-05, "loss": 0.0662, "step": 1880 }, { "epoch": 1.4361702127659575, "grad_norm": 0.578125, "learning_rate": 1.6790697674418607e-05, "loss": 0.0768, "step": 1890 }, { "epoch": 1.4437689969604863, "grad_norm": 0.462890625, "learning_rate": 1.6325581395348837e-05, "loss": 0.0686, "step": 1900 }, { "epoch": 1.4513677811550152, "grad_norm": 0.494140625, "learning_rate": 1.5860465116279073e-05, "loss": 0.0604, "step": 1910 }, { "epoch": 1.458966565349544, "grad_norm": 0.51171875, "learning_rate": 1.5395348837209303e-05, "loss": 0.0613, "step": 1920 }, { "epoch": 1.466565349544073, "grad_norm": 0.76171875, "learning_rate": 1.4930232558139537e-05, "loss": 0.0807, "step": 1930 }, { "epoch": 1.4741641337386018, "grad_norm": 0.796875, "learning_rate": 1.4465116279069768e-05, "loss": 0.0923, "step": 1940 }, { "epoch": 1.4817629179331306, "grad_norm": 1.015625, "learning_rate": 1.4000000000000001e-05, "loss": 0.0629, "step": 1950 }, { "epoch": 1.4893617021276595, "grad_norm": 0.80078125, "learning_rate": 1.3534883720930233e-05, "loss": 0.0709, "step": 1960 }, { "epoch": 1.4969604863221884, "grad_norm": 0.69140625, "learning_rate": 1.3069767441860467e-05, "loss": 0.0765, "step": 1970 }, { "epoch": 1.5045592705167175, "grad_norm": 0.765625, "learning_rate": 1.2604651162790699e-05, "loss": 0.07, "step": 1980 }, { "epoch": 1.512158054711246, "grad_norm": 0.546875, "learning_rate": 1.213953488372093e-05, "loss": 0.0614, "step": 1990 }, { "epoch": 1.5197568389057752, "grad_norm": 0.60546875, "learning_rate": 1.1674418604651163e-05, "loss": 0.0509, "step": 2000 } ], "logging_steps": 10, "max_steps": 2250, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.361342517248e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }