{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.7838700591493757,
  "eval_steps": 5000,
  "global_step": 19000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.009388789784996713,
      "grad_norm": 72.80598449707031,
      "learning_rate": 9.103707179727828e-07,
      "loss": 16.0755,
      "step": 100
    },
    {
      "epoch": 0.018777579569993427,
      "grad_norm": 72.60921478271484,
      "learning_rate": 1.8301267010793056e-06,
      "loss": 13.0643,
      "step": 200
    },
    {
      "epoch": 0.02816636935499014,
      "grad_norm": 10.017908096313477,
      "learning_rate": 2.7592679493195683e-06,
      "loss": 9.3474,
      "step": 300
    },
    {
      "epoch": 0.03755515913998685,
      "grad_norm": 5.694988250732422,
      "learning_rate": 3.6977944626935713e-06,
      "loss": 8.2606,
      "step": 400
    },
    {
      "epoch": 0.04694394892498357,
      "grad_norm": 4.844100475311279,
      "learning_rate": 4.6363209760675744e-06,
      "loss": 8.084,
      "step": 500
    },
    {
      "epoch": 0.05633273870998028,
      "grad_norm": 3.6125142574310303,
      "learning_rate": 5.574847489441577e-06,
      "loss": 8.0581,
      "step": 600
    },
    {
      "epoch": 0.06572152849497699,
      "grad_norm": 3.166046380996704,
      "learning_rate": 6.51337400281558e-06,
      "loss": 8.0175,
      "step": 700
    },
    {
      "epoch": 0.0751103182799737,
      "grad_norm": 2.600433111190796,
      "learning_rate": 7.451900516189583e-06,
      "loss": 8.0285,
      "step": 800
    },
    {
      "epoch": 0.08449910806497042,
      "grad_norm": 2.3805315494537354,
      "learning_rate": 8.390427029563585e-06,
      "loss": 8.0024,
      "step": 900
    },
    {
      "epoch": 0.09388789784996714,
      "grad_norm": 12.382240295410156,
      "learning_rate": 9.328953542937589e-06,
      "loss": 8.0161,
      "step": 1000
    },
    {
      "epoch": 0.10327668763496385,
      "grad_norm": 2.7355728149414062,
      "learning_rate": 1.0267480056311592e-05,
      "loss": 7.9941,
      "step": 1100
    },
    {
      "epoch": 0.11266547741996057,
      "grad_norm": 2.0243470668792725,
      "learning_rate": 1.1206006569685594e-05,
      "loss": 8.0233,
      "step": 1200
    },
    {
      "epoch": 0.12205426720495728,
      "grad_norm": 1.9162158966064453,
      "learning_rate": 1.2144533083059597e-05,
      "loss": 8.0141,
      "step": 1300
    },
    {
      "epoch": 0.13144305698995398,
      "grad_norm": 11.409939765930176,
      "learning_rate": 1.3083059596433601e-05,
      "loss": 7.9644,
      "step": 1400
    },
    {
      "epoch": 0.1408318467749507,
      "grad_norm": 1.712424635887146,
      "learning_rate": 1.4021586109807603e-05,
      "loss": 8.0311,
      "step": 1500
    },
    {
      "epoch": 0.1502206365599474,
      "grad_norm": 2.4589834213256836,
      "learning_rate": 1.4960112623181606e-05,
      "loss": 8.0306,
      "step": 1600
    },
    {
      "epoch": 0.15960942634494413,
      "grad_norm": 1.7343533039093018,
      "learning_rate": 1.589863913655561e-05,
      "loss": 7.989,
      "step": 1700
    },
    {
      "epoch": 0.16899821612994084,
      "grad_norm": 2.0726826190948486,
      "learning_rate": 1.6837165649929613e-05,
      "loss": 8.0034,
      "step": 1800
    },
    {
      "epoch": 0.17838700591493756,
      "grad_norm": 1.7758458852767944,
      "learning_rate": 1.7775692163303613e-05,
      "loss": 8.0107,
      "step": 1900
    },
    {
      "epoch": 0.18777579569993427,
      "grad_norm": 3.3816475868225098,
      "learning_rate": 1.8714218676677617e-05,
      "loss": 7.9737,
      "step": 2000
    },
    {
      "epoch": 0.197164585484931,
      "grad_norm": 1.8136950731277466,
      "learning_rate": 1.965274519005162e-05,
      "loss": 7.9827,
      "step": 2100
    },
    {
      "epoch": 0.2065533752699277,
      "grad_norm": 1.7819303274154663,
      "learning_rate": 1.9934275728965626e-05,
      "loss": 8.0389,
      "step": 2200
    },
    {
      "epoch": 0.21594216505492442,
      "grad_norm": 2.269160509109497,
      "learning_rate": 1.9829951489228525e-05,
      "loss": 7.973,
      "step": 2300
    },
    {
      "epoch": 0.22533095483992113,
      "grad_norm": 3.3508036136627197,
      "learning_rate": 1.972562724949142e-05,
      "loss": 7.9669,
      "step": 2400
    },
    {
      "epoch": 0.23471974462491785,
      "grad_norm": 1.674142599105835,
      "learning_rate": 1.962130300975432e-05,
      "loss": 8.0296,
      "step": 2500
    },
    {
      "epoch": 0.24410853440991456,
      "grad_norm": 1.454300880432129,
      "learning_rate": 1.9516978770017215e-05,
      "loss": 7.9984,
      "step": 2600
    },
    {
      "epoch": 0.2534973241949113,
      "grad_norm": 2.2951695919036865,
      "learning_rate": 1.9412654530280113e-05,
      "loss": 7.9772,
      "step": 2700
    },
    {
      "epoch": 0.26288611397990796,
      "grad_norm": 6.295051574707031,
      "learning_rate": 1.930833029054301e-05,
      "loss": 7.9838,
      "step": 2800
    },
    {
      "epoch": 0.2722749037649047,
      "grad_norm": 1.8874555826187134,
      "learning_rate": 1.9204006050805904e-05,
      "loss": 7.9816,
      "step": 2900
    },
    {
      "epoch": 0.2816636935499014,
      "grad_norm": 20.835277557373047,
      "learning_rate": 1.9099681811068803e-05,
      "loss": 8.0021,
      "step": 3000
    },
    {
      "epoch": 0.29105248333489814,
      "grad_norm": 2.1683876514434814,
      "learning_rate": 1.8995357571331702e-05,
      "loss": 7.9715,
      "step": 3100
    },
    {
      "epoch": 0.3004412731198948,
      "grad_norm": 1.6533387899398804,
      "learning_rate": 1.8891033331594598e-05,
      "loss": 7.9809,
      "step": 3200
    },
    {
      "epoch": 0.30983006290489157,
      "grad_norm": 4.595189094543457,
      "learning_rate": 1.8786709091857496e-05,
      "loss": 7.9849,
      "step": 3300
    },
    {
      "epoch": 0.31921885268988826,
      "grad_norm": 1.994147539138794,
      "learning_rate": 1.8682384852120392e-05,
      "loss": 7.9463,
      "step": 3400
    },
    {
      "epoch": 0.328607642474885,
      "grad_norm": 1.961474895477295,
      "learning_rate": 1.8578060612383287e-05,
      "loss": 8.0067,
      "step": 3500
    },
    {
      "epoch": 0.3379964322598817,
      "grad_norm": 24.005535125732422,
      "learning_rate": 1.8473736372646186e-05,
      "loss": 7.9431,
      "step": 3600
    },
    {
      "epoch": 0.34738522204487843,
      "grad_norm": 1.9433845281600952,
      "learning_rate": 1.8369412132909085e-05,
      "loss": 7.9877,
      "step": 3700
    },
    {
      "epoch": 0.3567740118298751,
      "grad_norm": 10.296500205993652,
      "learning_rate": 1.826508789317198e-05,
      "loss": 7.9494,
      "step": 3800
    },
    {
      "epoch": 0.36616280161487186,
      "grad_norm": 2.194976568222046,
      "learning_rate": 1.8160763653434876e-05,
      "loss": 7.9466,
      "step": 3900
    },
    {
      "epoch": 0.37555159139986855,
      "grad_norm": 1.5201098918914795,
      "learning_rate": 1.8056439413697775e-05,
      "loss": 7.9708,
      "step": 4000
    },
    {
      "epoch": 0.3849403811848653,
      "grad_norm": 2.9077212810516357,
      "learning_rate": 1.795211517396067e-05,
      "loss": 7.9525,
      "step": 4100
    },
    {
      "epoch": 0.394329170969862,
      "grad_norm": 2.041530132293701,
      "learning_rate": 1.784779093422357e-05,
      "loss": 7.9322,
      "step": 4200
    },
    {
      "epoch": 0.4037179607548587,
      "grad_norm": 2.0275838375091553,
      "learning_rate": 1.7743466694486468e-05,
      "loss": 7.9415,
      "step": 4300
    },
    {
      "epoch": 0.4131067505398554,
      "grad_norm": 2.000778913497925,
      "learning_rate": 1.7639142454749364e-05,
      "loss": 7.9932,
      "step": 4400
    },
    {
      "epoch": 0.42249554032485215,
      "grad_norm": 2.2984609603881836,
      "learning_rate": 1.753481821501226e-05,
      "loss": 7.9481,
      "step": 4500
    },
    {
      "epoch": 0.43188433010984884,
      "grad_norm": 1.0885875225067139,
      "learning_rate": 1.7430493975275155e-05,
      "loss": 7.976,
      "step": 4600
    },
    {
      "epoch": 0.4412731198948456,
      "grad_norm": 4.441020488739014,
      "learning_rate": 1.7326169735538053e-05,
      "loss": 7.971,
      "step": 4700
    },
    {
      "epoch": 0.45066190967984227,
      "grad_norm": 2.2056221961975098,
      "learning_rate": 1.7221845495800952e-05,
      "loss": 7.9647,
      "step": 4800
    },
    {
      "epoch": 0.460050699464839,
      "grad_norm": 2.1192028522491455,
      "learning_rate": 1.7117521256063848e-05,
      "loss": 7.9217,
      "step": 4900
    },
    {
      "epoch": 0.4694394892498357,
      "grad_norm": 14.011516571044922,
      "learning_rate": 1.7013197016326747e-05,
      "loss": 7.9374,
      "step": 5000
    },
    {
      "epoch": 0.4694394892498357,
      "eval_loss": 7.951793193817139,
      "eval_runtime": 900.8057,
      "eval_samples_per_second": 378.358,
      "eval_steps_per_second": 2.956,
      "step": 5000
    },
    {
      "epoch": 0.4788282790348324,
      "grad_norm": 39.19038009643555,
      "learning_rate": 1.6908872776589642e-05,
      "loss": 7.9026,
      "step": 5100
    },
    {
      "epoch": 0.4882170688198291,
      "grad_norm": 1.812458872795105,
      "learning_rate": 1.6804548536852537e-05,
      "loss": 7.9304,
      "step": 5200
    },
    {
      "epoch": 0.4976058586048258,
      "grad_norm": 70.96247863769531,
      "learning_rate": 1.6700224297115436e-05,
      "loss": 7.9148,
      "step": 5300
    },
    {
      "epoch": 0.5069946483898226,
      "grad_norm": 1.6605011224746704,
      "learning_rate": 1.6595900057378335e-05,
      "loss": 7.9538,
      "step": 5400
    },
    {
      "epoch": 0.5163834381748192,
      "grad_norm": 2.0463483333587646,
      "learning_rate": 1.649157581764123e-05,
      "loss": 8.0002,
      "step": 5500
    },
    {
      "epoch": 0.5257722279598159,
      "grad_norm": 3.8601722717285156,
      "learning_rate": 1.6387251577904126e-05,
      "loss": 7.9571,
      "step": 5600
    },
    {
      "epoch": 0.5351610177448127,
      "grad_norm": 2.184122323989868,
      "learning_rate": 1.6282927338167025e-05,
      "loss": 7.932,
      "step": 5700
    },
    {
      "epoch": 0.5445498075298094,
      "grad_norm": 2.165367603302002,
      "learning_rate": 1.617860309842992e-05,
      "loss": 7.9047,
      "step": 5800
    },
    {
      "epoch": 0.5539385973148061,
      "grad_norm": 1.5312166213989258,
      "learning_rate": 1.607427885869282e-05,
      "loss": 7.9353,
      "step": 5900
    },
    {
      "epoch": 0.5633273870998028,
      "grad_norm": 21.75490379333496,
      "learning_rate": 1.5969954618955715e-05,
      "loss": 7.9203,
      "step": 6000
    },
    {
      "epoch": 0.5727161768847996,
      "grad_norm": 1.8674250841140747,
      "learning_rate": 1.5865630379218614e-05,
      "loss": 7.8967,
      "step": 6100
    },
    {
      "epoch": 0.5821049666697963,
      "grad_norm": 49.87809371948242,
      "learning_rate": 1.576130613948151e-05,
      "loss": 7.9414,
      "step": 6200
    },
    {
      "epoch": 0.591493756454793,
      "grad_norm": 54.42366409301758,
      "learning_rate": 1.5658025142141778e-05,
      "loss": 7.9631,
      "step": 6300
    },
    {
      "epoch": 0.6008825462397896,
      "grad_norm": 37.58320236206055,
      "learning_rate": 1.5554744144802047e-05,
      "loss": 7.9606,
      "step": 6400
    },
    {
      "epoch": 0.6102713360247864,
      "grad_norm": 3.1502482891082764,
      "learning_rate": 1.5450419905064945e-05,
      "loss": 7.9377,
      "step": 6500
    },
    {
      "epoch": 0.6196601258097831,
      "grad_norm": 2.5369224548339844,
      "learning_rate": 1.534609566532784e-05,
      "loss": 7.9108,
      "step": 6600
    },
    {
      "epoch": 0.6290489155947798,
      "grad_norm": 2.5891005992889404,
      "learning_rate": 1.5241771425590736e-05,
      "loss": 7.9225,
      "step": 6700
    },
    {
      "epoch": 0.6384377053797765,
      "grad_norm": 1.7794080972671509,
      "learning_rate": 1.5137447185853635e-05,
      "loss": 7.9154,
      "step": 6800
    },
    {
      "epoch": 0.6478264951647733,
      "grad_norm": 2.324805974960327,
      "learning_rate": 1.5033122946116532e-05,
      "loss": 7.9191,
      "step": 6900
    },
    {
      "epoch": 0.65721528494977,
      "grad_norm": 2.601715564727783,
      "learning_rate": 1.4928798706379428e-05,
      "loss": 7.8903,
      "step": 7000
    },
    {
      "epoch": 0.6666040747347667,
      "grad_norm": 2.4438092708587646,
      "learning_rate": 1.4824474466642325e-05,
      "loss": 7.9213,
      "step": 7100
    },
    {
      "epoch": 0.6759928645197634,
      "grad_norm": 8.118125915527344,
      "learning_rate": 1.4720150226905224e-05,
      "loss": 7.9202,
      "step": 7200
    },
    {
      "epoch": 0.6853816543047602,
      "grad_norm": 3.908555746078491,
      "learning_rate": 1.461582598716812e-05,
      "loss": 7.8998,
      "step": 7300
    },
    {
      "epoch": 0.6947704440897569,
      "grad_norm": 2.72293758392334,
      "learning_rate": 1.4511501747431017e-05,
      "loss": 7.9153,
      "step": 7400
    },
    {
      "epoch": 0.7041592338747535,
      "grad_norm": 3.108797073364258,
      "learning_rate": 1.4407177507693915e-05,
      "loss": 7.9037,
      "step": 7500
    },
    {
      "epoch": 0.7135480236597502,
      "grad_norm": 2.6256439685821533,
      "learning_rate": 1.430285326795681e-05,
      "loss": 7.9146,
      "step": 7600
    },
    {
      "epoch": 0.7229368134447469,
      "grad_norm": 3.5525624752044678,
      "learning_rate": 1.4198529028219708e-05,
      "loss": 7.8972,
      "step": 7700
    },
    {
      "epoch": 0.7323256032297437,
      "grad_norm": 2.6983673572540283,
      "learning_rate": 1.4094204788482607e-05,
      "loss": 7.9374,
      "step": 7800
    },
    {
      "epoch": 0.7417143930147404,
      "grad_norm": 1.545486569404602,
      "learning_rate": 1.3989880548745502e-05,
      "loss": 7.8647,
      "step": 7900
    },
    {
      "epoch": 0.7511031827997371,
      "grad_norm": 2.5116941928863525,
      "learning_rate": 1.38855563090084e-05,
      "loss": 7.8915,
      "step": 8000
    },
    {
      "epoch": 0.7604919725847338,
      "grad_norm": 1.8576518297195435,
      "learning_rate": 1.3781232069271295e-05,
      "loss": 7.8846,
      "step": 8100
    },
    {
      "epoch": 0.7698807623697306,
      "grad_norm": 3.3226571083068848,
      "learning_rate": 1.3676907829534194e-05,
      "loss": 7.8988,
      "step": 8200
    },
    {
      "epoch": 0.7792695521547273,
      "grad_norm": 2.946324586868286,
      "learning_rate": 1.3572583589797091e-05,
      "loss": 7.8702,
      "step": 8300
    },
    {
      "epoch": 0.788658341939724,
      "grad_norm": 2.5089969635009766,
      "learning_rate": 1.3468259350059986e-05,
      "loss": 7.923,
      "step": 8400
    },
    {
      "epoch": 0.7980471317247206,
      "grad_norm": 2.2807912826538086,
      "learning_rate": 1.3363935110322885e-05,
      "loss": 7.891,
      "step": 8500
    },
    {
      "epoch": 0.8074359215097174,
      "grad_norm": 2.5889735221862793,
      "learning_rate": 1.3259610870585782e-05,
      "loss": 7.8832,
      "step": 8600
    },
    {
      "epoch": 0.8168247112947141,
      "grad_norm": 2.8306784629821777,
      "learning_rate": 1.3155286630848678e-05,
      "loss": 7.8726,
      "step": 8700
    },
    {
      "epoch": 0.8262135010797108,
      "grad_norm": 2.626786231994629,
      "learning_rate": 1.3050962391111577e-05,
      "loss": 7.8813,
      "step": 8800
    },
    {
      "epoch": 0.8356022908647075,
      "grad_norm": 3.195319414138794,
      "learning_rate": 1.2946638151374474e-05,
      "loss": 7.8986,
      "step": 8900
    },
    {
      "epoch": 0.8449910806497043,
      "grad_norm": 5.254043102264404,
      "learning_rate": 1.284231391163737e-05,
      "loss": 7.8743,
      "step": 9000
    },
    {
      "epoch": 0.854379870434701,
      "grad_norm": 2.9493279457092285,
      "learning_rate": 1.2737989671900267e-05,
      "loss": 7.8791,
      "step": 9100
    },
    {
      "epoch": 0.8637686602196977,
      "grad_norm": 3.130415439605713,
      "learning_rate": 1.2633665432163165e-05,
      "loss": 7.8783,
      "step": 9200
    },
    {
      "epoch": 0.8731574500046944,
      "grad_norm": 4.030152797698975,
      "learning_rate": 1.2529341192426061e-05,
      "loss": 7.8528,
      "step": 9300
    },
    {
      "epoch": 0.8825462397896912,
      "grad_norm": 2.9882099628448486,
      "learning_rate": 1.2425016952688958e-05,
      "loss": 7.8864,
      "step": 9400
    },
    {
      "epoch": 0.8919350295746878,
      "grad_norm": 3.802172899246216,
      "learning_rate": 1.2320692712951855e-05,
      "loss": 7.8989,
      "step": 9500
    },
    {
      "epoch": 0.9013238193596845,
      "grad_norm": 2.724433183670044,
      "learning_rate": 1.2216368473214752e-05,
      "loss": 7.8617,
      "step": 9600
    },
    {
      "epoch": 0.9107126091446812,
      "grad_norm": 2.459376573562622,
      "learning_rate": 1.211204423347765e-05,
      "loss": 7.8371,
      "step": 9700
    },
    {
      "epoch": 0.920101398929678,
      "grad_norm": 4.715926647186279,
      "learning_rate": 1.2007719993740547e-05,
      "loss": 7.8566,
      "step": 9800
    },
    {
      "epoch": 0.9294901887146747,
      "grad_norm": 2.6845057010650635,
      "learning_rate": 1.1903395754003444e-05,
      "loss": 7.8776,
      "step": 9900
    },
    {
      "epoch": 0.9388789784996714,
      "grad_norm": 2.62907075881958,
      "learning_rate": 1.1799071514266341e-05,
      "loss": 7.8558,
      "step": 10000
    },
    {
      "epoch": 0.9388789784996714,
      "eval_loss": 7.849188327789307,
      "eval_runtime": 1155.7489,
      "eval_samples_per_second": 294.897,
      "eval_steps_per_second": 2.304,
      "step": 10000
    },
    {
      "epoch": 0.9482677682846681,
      "grad_norm": 4.570381164550781,
      "learning_rate": 1.1694747274529237e-05,
      "loss": 7.848,
      "step": 10100
    },
    {
      "epoch": 0.9576565580696648,
      "grad_norm": 21.764062881469727,
      "learning_rate": 1.1590423034792135e-05,
      "loss": 7.8227,
      "step": 10200
    },
    {
      "epoch": 0.9670453478546616,
      "grad_norm": 18.442140579223633,
      "learning_rate": 1.1486098795055033e-05,
      "loss": 7.8311,
      "step": 10300
    },
    {
      "epoch": 0.9764341376396583,
      "grad_norm": 4.737902641296387,
      "learning_rate": 1.1381774555317928e-05,
      "loss": 7.8437,
      "step": 10400
    },
    {
      "epoch": 0.9858229274246549,
      "grad_norm": 3.0295650959014893,
      "learning_rate": 1.1277450315580827e-05,
      "loss": 7.8454,
      "step": 10500
    },
    {
      "epoch": 0.9952117172096516,
      "grad_norm": 3.0269651412963867,
      "learning_rate": 1.1173126075843724e-05,
      "loss": 7.8362,
      "step": 10600
    },
    {
      "epoch": 1.0046005069946484,
      "grad_norm": 4.033662796020508,
      "learning_rate": 1.1069845078503991e-05,
      "loss": 7.8681,
      "step": 10700
    },
    {
      "epoch": 1.013989296779645,
      "grad_norm": 3.5319488048553467,
      "learning_rate": 1.0965520838766888e-05,
      "loss": 7.8745,
      "step": 10800
    },
    {
      "epoch": 1.0233780865646418,
      "grad_norm": 2.7731130123138428,
      "learning_rate": 1.0861196599029787e-05,
      "loss": 7.8339,
      "step": 10900
    },
    {
      "epoch": 1.0327668763496385,
      "grad_norm": 4.000971794128418,
      "learning_rate": 1.0756872359292683e-05,
      "loss": 7.8458,
      "step": 11000
    },
    {
      "epoch": 1.0421556661346352,
      "grad_norm": 15.05604362487793,
      "learning_rate": 1.065254811955558e-05,
      "loss": 7.8493,
      "step": 11100
    },
    {
      "epoch": 1.0515444559196319,
      "grad_norm": 4.498584747314453,
      "learning_rate": 1.0548223879818477e-05,
      "loss": 7.8317,
      "step": 11200
    },
    {
      "epoch": 1.0609332457046288,
      "grad_norm": 2.8218085765838623,
      "learning_rate": 1.0443899640081374e-05,
      "loss": 7.841,
      "step": 11300
    },
    {
      "epoch": 1.0703220354896255,
      "grad_norm": 3.627685785293579,
      "learning_rate": 1.0339575400344271e-05,
      "loss": 7.8292,
      "step": 11400
    },
    {
      "epoch": 1.0797108252746221,
      "grad_norm": 4.804520606994629,
      "learning_rate": 1.0235251160607167e-05,
      "loss": 7.8121,
      "step": 11500
    },
    {
      "epoch": 1.0890996150596188,
      "grad_norm": 15.256156921386719,
      "learning_rate": 1.0130926920870066e-05,
      "loss": 7.8165,
      "step": 11600
    },
    {
      "epoch": 1.0984884048446155,
      "grad_norm": 3.684401273727417,
      "learning_rate": 1.0026602681132963e-05,
      "loss": 7.8259,
      "step": 11700
    },
    {
      "epoch": 1.1078771946296122,
      "grad_norm": 3.7146763801574707,
      "learning_rate": 9.92227844139586e-06,
      "loss": 7.8303,
      "step": 11800
    },
    {
      "epoch": 1.117265984414609,
      "grad_norm": 3.4437708854675293,
      "learning_rate": 9.817954201658755e-06,
      "loss": 7.809,
      "step": 11900
    },
    {
      "epoch": 1.1266547741996056,
      "grad_norm": 4.232120990753174,
      "learning_rate": 9.713629961921654e-06,
      "loss": 7.818,
      "step": 12000
    },
    {
      "epoch": 1.1360435639846025,
      "grad_norm": 3.4967739582061768,
      "learning_rate": 9.60930572218455e-06,
      "loss": 7.8071,
      "step": 12100
    },
    {
      "epoch": 1.1454323537695992,
      "grad_norm": 10.542444229125977,
      "learning_rate": 9.504981482447447e-06,
      "loss": 7.801,
      "step": 12200
    },
    {
      "epoch": 1.1548211435545959,
      "grad_norm": 3.744981527328491,
      "learning_rate": 9.400657242710344e-06,
      "loss": 7.8123,
      "step": 12300
    },
    {
      "epoch": 1.1642099333395926,
      "grad_norm": 3.3549323081970215,
      "learning_rate": 9.296333002973241e-06,
      "loss": 7.8203,
      "step": 12400
    },
    {
      "epoch": 1.1735987231245892,
      "grad_norm": 5.337845325469971,
      "learning_rate": 9.192008763236138e-06,
      "loss": 7.8609,
      "step": 12500
    },
    {
      "epoch": 1.182987512909586,
      "grad_norm": 3.204465627670288,
      "learning_rate": 9.087684523499036e-06,
      "loss": 7.7782,
      "step": 12600
    },
    {
      "epoch": 1.1923763026945826,
      "grad_norm": 4.669897079467773,
      "learning_rate": 8.983360283761933e-06,
      "loss": 7.8092,
      "step": 12700
    },
    {
      "epoch": 1.2017650924795793,
      "grad_norm": 3.1824800968170166,
      "learning_rate": 8.87903604402483e-06,
      "loss": 7.815,
      "step": 12800
    },
    {
      "epoch": 1.211153882264576,
      "grad_norm": 3.6459527015686035,
      "learning_rate": 8.774711804287727e-06,
      "loss": 7.8196,
      "step": 12900
    },
    {
      "epoch": 1.220542672049573,
      "grad_norm": 3.732983112335205,
      "learning_rate": 8.670387564550624e-06,
      "loss": 7.8206,
      "step": 13000
    },
    {
      "epoch": 1.2299314618345696,
      "grad_norm": 4.478656768798828,
      "learning_rate": 8.566063324813521e-06,
      "loss": 7.8022,
      "step": 13100
    },
    {
      "epoch": 1.2393202516195663,
      "grad_norm": 3.7781801223754883,
      "learning_rate": 8.461739085076418e-06,
      "loss": 7.8043,
      "step": 13200
    },
    {
      "epoch": 1.248709041404563,
      "grad_norm": 5.932605743408203,
      "learning_rate": 8.357414845339316e-06,
      "loss": 7.7823,
      "step": 13300
    },
    {
      "epoch": 1.2580978311895596,
      "grad_norm": 3.8288068771362305,
      "learning_rate": 8.254133847999584e-06,
      "loss": 7.8061,
      "step": 13400
    },
    {
      "epoch": 1.2674866209745563,
      "grad_norm": 4.60470724105835,
      "learning_rate": 8.14980960826248e-06,
      "loss": 7.8016,
      "step": 13500
    },
    {
      "epoch": 1.276875410759553,
      "grad_norm": 5.450839996337891,
      "learning_rate": 8.045485368525377e-06,
      "loss": 7.8076,
      "step": 13600
    },
    {
      "epoch": 1.28626420054455,
      "grad_norm": 7.866298198699951,
      "learning_rate": 7.941161128788276e-06,
      "loss": 7.7996,
      "step": 13700
    },
    {
      "epoch": 1.2956529903295464,
      "grad_norm": 3.059967041015625,
      "learning_rate": 7.836836889051171e-06,
      "loss": 7.8035,
      "step": 13800
    },
    {
      "epoch": 1.3050417801145433,
      "grad_norm": 3.5380911827087402,
      "learning_rate": 7.732512649314069e-06,
      "loss": 7.8092,
      "step": 13900
    },
    {
      "epoch": 1.31443056989954,
      "grad_norm": 4.589097499847412,
      "learning_rate": 7.628188409576966e-06,
      "loss": 7.7902,
      "step": 14000
    },
    {
      "epoch": 1.3238193596845367,
      "grad_norm": 6.932407855987549,
      "learning_rate": 7.523864169839863e-06,
      "loss": 7.8114,
      "step": 14100
    },
    {
      "epoch": 1.3332081494695334,
      "grad_norm": 3.5786869525909424,
      "learning_rate": 7.41953993010276e-06,
      "loss": 7.8112,
      "step": 14200
    },
    {
      "epoch": 1.34259693925453,
      "grad_norm": 4.283187389373779,
      "learning_rate": 7.315215690365657e-06,
      "loss": 7.8036,
      "step": 14300
    },
    {
      "epoch": 1.3519857290395267,
      "grad_norm": 14.625285148620605,
      "learning_rate": 7.210891450628554e-06,
      "loss": 7.8178,
      "step": 14400
    },
    {
      "epoch": 1.3613745188245234,
      "grad_norm": 3.5072567462921143,
      "learning_rate": 7.106567210891451e-06,
      "loss": 7.8391,
      "step": 14500
    },
    {
      "epoch": 1.3707633086095203,
      "grad_norm": 4.140475749969482,
      "learning_rate": 7.002242971154349e-06,
      "loss": 7.8151,
      "step": 14600
    },
    {
      "epoch": 1.380152098394517,
      "grad_norm": 6.985396385192871,
      "learning_rate": 6.897918731417246e-06,
      "loss": 7.7957,
      "step": 14700
    },
    {
      "epoch": 1.3895408881795137,
      "grad_norm": 3.8024065494537354,
      "learning_rate": 6.793594491680142e-06,
      "loss": 7.7833,
      "step": 14800
    },
    {
      "epoch": 1.3989296779645104,
      "grad_norm": 4.183823585510254,
      "learning_rate": 6.689270251943039e-06,
      "loss": 7.8049,
      "step": 14900
    },
    {
      "epoch": 1.408318467749507,
      "grad_norm": 3.431105375289917,
      "learning_rate": 6.5849460122059365e-06,
      "loss": 7.8163,
      "step": 15000
    },
    {
      "epoch": 1.408318467749507,
      "eval_loss": 7.807833671569824,
      "eval_runtime": 335.7694,
      "eval_samples_per_second": 1015.063,
      "eval_steps_per_second": 7.931,
      "step": 15000
    },
    {
      "epoch": 1.4177072575345038,
      "grad_norm": 8.183846473693848,
      "learning_rate": 6.480621772468834e-06,
      "loss": 7.7864,
      "step": 15100
    },
    {
      "epoch": 1.4270960473195005,
      "grad_norm": 11.66592788696289,
      "learning_rate": 6.376297532731731e-06,
      "loss": 7.8241,
      "step": 15200
    },
    {
      "epoch": 1.4364848371044971,
      "grad_norm": 9.620813369750977,
      "learning_rate": 6.271973292994628e-06,
      "loss": 7.7694,
      "step": 15300
    },
    {
      "epoch": 1.4458736268894938,
      "grad_norm": 4.250575065612793,
      "learning_rate": 6.167649053257525e-06,
      "loss": 7.7784,
      "step": 15400
    },
    {
      "epoch": 1.4552624166744907,
      "grad_norm": 3.8679049015045166,
      "learning_rate": 6.0633248135204214e-06,
      "loss": 7.7628,
      "step": 15500
    },
    {
      "epoch": 1.4646512064594874,
      "grad_norm": 4.648382186889648,
      "learning_rate": 5.959000573783319e-06,
      "loss": 7.8044,
      "step": 15600
    },
    {
      "epoch": 1.4740399962444841,
      "grad_norm": 4.5424113273620605,
      "learning_rate": 5.854676334046217e-06,
      "loss": 7.7871,
      "step": 15700
    },
    {
      "epoch": 1.4834287860294808,
      "grad_norm": 4.026553630828857,
      "learning_rate": 5.750352094309113e-06,
      "loss": 7.809,
      "step": 15800
    },
    {
      "epoch": 1.4928175758144775,
      "grad_norm": 6.175102233886719,
      "learning_rate": 5.647071096969381e-06,
      "loss": 7.7955,
      "step": 15900
    },
    {
      "epoch": 1.5022063655994742,
      "grad_norm": 4.672608375549316,
      "learning_rate": 5.542746857232279e-06,
      "loss": 7.8056,
      "step": 16000
    },
    {
      "epoch": 1.5115951553844709,
      "grad_norm": 7.012312412261963,
      "learning_rate": 5.438422617495176e-06,
      "loss": 7.774,
      "step": 16100
    },
    {
      "epoch": 1.5209839451694678,
      "grad_norm": 5.2042131423950195,
      "learning_rate": 5.334098377758072e-06,
      "loss": 7.7874,
      "step": 16200
    },
    {
      "epoch": 1.5303727349544642,
      "grad_norm": 3.745805501937866,
      "learning_rate": 5.22977413802097e-06,
      "loss": 7.7918,
      "step": 16300
    },
    {
      "epoch": 1.5397615247394612,
      "grad_norm": 4.060446262359619,
      "learning_rate": 5.125449898283867e-06,
      "loss": 7.7787,
      "step": 16400
    },
    {
      "epoch": 1.5491503145244578,
      "grad_norm": 21.851919174194336,
      "learning_rate": 5.021125658546764e-06,
      "loss": 7.7881,
      "step": 16500
    },
    {
      "epoch": 1.5585391043094545,
      "grad_norm": 4.261013507843018,
      "learning_rate": 4.916801418809661e-06,
      "loss": 7.7723,
      "step": 16600
    },
    {
      "epoch": 1.5679278940944512,
      "grad_norm": 3.9473931789398193,
      "learning_rate": 4.812477179072558e-06,
      "loss": 7.7809,
      "step": 16700
    },
    {
      "epoch": 1.577316683879448,
      "grad_norm": 6.088964939117432,
      "learning_rate": 4.709196181732826e-06,
      "loss": 7.8096,
      "step": 16800
    },
    {
      "epoch": 1.5867054736644448,
      "grad_norm": 7.912614822387695,
      "learning_rate": 4.604871941995723e-06,
      "loss": 7.7559,
      "step": 16900
    },
    {
      "epoch": 1.5960942634494413,
      "grad_norm": 7.268245697021484,
      "learning_rate": 4.50054770225862e-06,
      "loss": 7.8063,
      "step": 17000
    },
    {
      "epoch": 1.6054830532344382,
      "grad_norm": 15.331354141235352,
      "learning_rate": 4.3962234625215175e-06,
      "loss": 7.8137,
      "step": 17100
    },
    {
      "epoch": 1.6148718430194346,
      "grad_norm": 5.976458549499512,
      "learning_rate": 4.291899222784414e-06,
      "loss": 7.761,
      "step": 17200
    },
    {
      "epoch": 1.6242606328044316,
      "grad_norm": 4.332185745239258,
      "learning_rate": 4.187574983047311e-06,
      "loss": 7.7672,
      "step": 17300
    },
    {
      "epoch": 1.6336494225894282,
      "grad_norm": 5.834400653839111,
      "learning_rate": 4.083250743310209e-06,
      "loss": 7.7939,
      "step": 17400
    },
    {
      "epoch": 1.643038212374425,
      "grad_norm": 17.61608123779297,
      "learning_rate": 3.978926503573105e-06,
      "loss": 7.8052,
      "step": 17500
    },
    {
      "epoch": 1.6524270021594216,
      "grad_norm": 4.811455249786377,
      "learning_rate": 3.8746022638360024e-06,
      "loss": 7.7519,
      "step": 17600
    },
    {
      "epoch": 1.6618157919444183,
      "grad_norm": 4.777581691741943,
      "learning_rate": 3.7702780240988996e-06,
      "loss": 7.7643,
      "step": 17700
    },
    {
      "epoch": 1.6712045817294152,
      "grad_norm": 3.996382474899292,
      "learning_rate": 3.6659537843617963e-06,
      "loss": 7.7823,
      "step": 17800
    },
    {
      "epoch": 1.6805933715144117,
      "grad_norm": 13.98630142211914,
      "learning_rate": 3.561629544624694e-06,
      "loss": 7.7507,
      "step": 17900
    },
    {
      "epoch": 1.6899821612994086,
      "grad_norm": 5.8219499588012695,
      "learning_rate": 3.457305304887591e-06,
      "loss": 7.777,
      "step": 18000
    },
    {
      "epoch": 1.699370951084405,
      "grad_norm": 6.051421165466309,
      "learning_rate": 3.352981065150488e-06,
      "loss": 7.786,
      "step": 18100
    },
    {
      "epoch": 1.708759740869402,
      "grad_norm": 7.037374019622803,
      "learning_rate": 3.248656825413385e-06,
      "loss": 7.8097,
      "step": 18200
    },
    {
      "epoch": 1.7181485306543987,
      "grad_norm": 4.905298709869385,
      "learning_rate": 3.1443325856762826e-06,
      "loss": 7.7749,
      "step": 18300
    },
    {
      "epoch": 1.7275373204393953,
      "grad_norm": 10.381073951721191,
      "learning_rate": 3.0400083459391793e-06,
      "loss": 7.7626,
      "step": 18400
    },
    {
      "epoch": 1.736926110224392,
      "grad_norm": 4.788754463195801,
      "learning_rate": 2.9356841062020765e-06,
      "loss": 7.7783,
      "step": 18500
    },
    {
      "epoch": 1.7463149000093887,
      "grad_norm": 7.451583385467529,
      "learning_rate": 2.8313598664649732e-06,
      "loss": 7.7552,
      "step": 18600
    },
    {
      "epoch": 1.7557036897943856,
      "grad_norm": 4.68840217590332,
      "learning_rate": 2.7270356267278704e-06,
      "loss": 7.7837,
      "step": 18700
    },
    {
      "epoch": 1.765092479579382,
      "grad_norm": 5.026941776275635,
      "learning_rate": 2.6227113869907676e-06,
      "loss": 7.7583,
      "step": 18800
    },
    {
      "epoch": 1.774481269364379,
      "grad_norm": 5.614253520965576,
      "learning_rate": 2.5183871472536647e-06,
      "loss": 7.7617,
      "step": 18900
    },
    {
      "epoch": 1.7838700591493757,
      "grad_norm": 67.81696319580078,
      "learning_rate": 2.414062907516562e-06,
      "loss": 7.7649,
      "step": 19000
    }
  ],
  "logging_steps": 100,
  "max_steps": 21302,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 128,
  "trial_name": null,
  "trial_params": null
}