{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.0,
"eval_steps": 3683,
"global_step": 110472,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.027156202476645665,
"grad_norm": 1.423619031906128,
"learning_rate": 0.00019927699054107928,
"loss": 0.804,
"step": 500
},
{
"epoch": 0.05431240495329133,
"grad_norm": 2.0592901706695557,
"learning_rate": 0.00019837096365020115,
"loss": 0.7258,
"step": 1000
},
{
"epoch": 0.081468607429937,
"grad_norm": 1.7999378442764282,
"learning_rate": 0.00019746493675932303,
"loss": 0.698,
"step": 1500
},
{
"epoch": 0.10862480990658266,
"grad_norm": 1.6404426097869873,
"learning_rate": 0.0001965589098684449,
"loss": 0.6863,
"step": 2000
},
{
"epoch": 0.13578101238322832,
"grad_norm": 2.424567222595215,
"learning_rate": 0.00019565469503134855,
"loss": 0.6638,
"step": 2500
},
{
"epoch": 0.162937214859874,
"grad_norm": 2.102727174758911,
"learning_rate": 0.00019475048019425216,
"loss": 0.6554,
"step": 3000
},
{
"epoch": 0.19009341733651966,
"grad_norm": 1.732860803604126,
"learning_rate": 0.00019384445330337407,
"loss": 0.6364,
"step": 3500
},
{
"epoch": 0.20003258744297198,
"eval_loss": 0.6357121467590332,
"eval_runtime": 25.1143,
"eval_samples_per_second": 14.693,
"eval_steps_per_second": 7.366,
"step": 3683
},
{
"epoch": 0.21724961981316532,
"grad_norm": 1.5829988718032837,
"learning_rate": 0.00019293842641249595,
"loss": 0.6576,
"step": 4000
},
{
"epoch": 0.244405822289811,
"grad_norm": 0.9155117869377136,
"learning_rate": 0.0001920323995216178,
"loss": 0.6241,
"step": 4500
},
{
"epoch": 0.27156202476645663,
"grad_norm": 1.6804828643798828,
"learning_rate": 0.0001911263726307397,
"loss": 0.6369,
"step": 5000
},
{
"epoch": 0.2987182272431023,
"grad_norm": 1.337156891822815,
"learning_rate": 0.00019022034573986155,
"loss": 0.6196,
"step": 5500
},
{
"epoch": 0.325874429719748,
"grad_norm": 1.680059790611267,
"learning_rate": 0.00018931431884898345,
"loss": 0.6121,
"step": 6000
},
{
"epoch": 0.35303063219639363,
"grad_norm": 2.251309871673584,
"learning_rate": 0.00018840829195810533,
"loss": 0.5984,
"step": 6500
},
{
"epoch": 0.3801868346730393,
"grad_norm": 1.7084137201309204,
"learning_rate": 0.0001875022650672272,
"loss": 0.5857,
"step": 7000
},
{
"epoch": 0.40006517488594395,
"eval_loss": 0.5826721787452698,
"eval_runtime": 24.2431,
"eval_samples_per_second": 15.221,
"eval_steps_per_second": 7.631,
"step": 7366
},
{
"epoch": 0.407343037149685,
"grad_norm": 1.7896497249603271,
"learning_rate": 0.00018659623817634909,
"loss": 0.5777,
"step": 7500
},
{
"epoch": 0.43449923962633064,
"grad_norm": 1.6899892091751099,
"learning_rate": 0.00018569021128547096,
"loss": 0.584,
"step": 8000
},
{
"epoch": 0.4616554421029763,
"grad_norm": 2.2623794078826904,
"learning_rate": 0.00018478418439459284,
"loss": 0.5731,
"step": 8500
},
{
"epoch": 0.488811644579622,
"grad_norm": 2.9157674312591553,
"learning_rate": 0.00018387996955749646,
"loss": 0.5719,
"step": 9000
},
{
"epoch": 0.5159678470562676,
"grad_norm": 1.841354489326477,
"learning_rate": 0.00018297394266661836,
"loss": 0.5765,
"step": 9500
},
{
"epoch": 0.5431240495329133,
"grad_norm": 2.550917625427246,
"learning_rate": 0.00018206791577574024,
"loss": 0.552,
"step": 10000
},
{
"epoch": 0.570280252009559,
"grad_norm": 1.5018529891967773,
"learning_rate": 0.00018116188888486211,
"loss": 0.5489,
"step": 10500
},
{
"epoch": 0.5974364544862046,
"grad_norm": 3.648230791091919,
"learning_rate": 0.00018025767404776576,
"loss": 0.5682,
"step": 11000
},
{
"epoch": 0.6000977623289159,
"eval_loss": 0.5515537858009338,
"eval_runtime": 25.2059,
"eval_samples_per_second": 14.639,
"eval_steps_per_second": 7.34,
"step": 11049
},
{
"epoch": 0.6245926569628503,
"grad_norm": 3.894047498703003,
"learning_rate": 0.0001793516471568876,
"loss": 0.5662,
"step": 11500
},
{
"epoch": 0.651748859439496,
"grad_norm": 2.1958436965942383,
"learning_rate": 0.0001784456202660095,
"loss": 0.557,
"step": 12000
},
{
"epoch": 0.6789050619161416,
"grad_norm": 1.6268092393875122,
"learning_rate": 0.00017754140542891312,
"loss": 0.5472,
"step": 12500
},
{
"epoch": 0.7060612643927873,
"grad_norm": 1.6947818994522095,
"learning_rate": 0.00017663537853803503,
"loss": 0.5487,
"step": 13000
},
{
"epoch": 0.733217466869433,
"grad_norm": 1.740544080734253,
"learning_rate": 0.00017572935164715688,
"loss": 0.5473,
"step": 13500
},
{
"epoch": 0.7603736693460786,
"grad_norm": 2.6229496002197266,
"learning_rate": 0.00017482332475627878,
"loss": 0.5306,
"step": 14000
},
{
"epoch": 0.7875298718227243,
"grad_norm": 1.760733723640442,
"learning_rate": 0.00017391729786540066,
"loss": 0.5421,
"step": 14500
},
{
"epoch": 0.8001303497718879,
"eval_loss": 0.5292674899101257,
"eval_runtime": 23.3828,
"eval_samples_per_second": 15.781,
"eval_steps_per_second": 7.912,
"step": 14732
},
{
"epoch": 0.81468607429937,
"grad_norm": 2.7095932960510254,
"learning_rate": 0.00017301308302830428,
"loss": 0.5225,
"step": 15000
},
{
"epoch": 0.8418422767760156,
"grad_norm": 3.7730860710144043,
"learning_rate": 0.00017210705613742618,
"loss": 0.536,
"step": 15500
},
{
"epoch": 0.8689984792526613,
"grad_norm": 1.8944693803787231,
"learning_rate": 0.00017120102924654803,
"loss": 0.5123,
"step": 16000
},
{
"epoch": 0.896154681729307,
"grad_norm": 2.137572765350342,
"learning_rate": 0.00017029500235566993,
"loss": 0.5411,
"step": 16500
},
{
"epoch": 0.9233108842059526,
"grad_norm": 4.163636207580566,
"learning_rate": 0.0001693889754647918,
"loss": 0.5218,
"step": 17000
},
{
"epoch": 0.9504670866825983,
"grad_norm": 2.492893934249878,
"learning_rate": 0.0001684829485739137,
"loss": 0.5171,
"step": 17500
},
{
"epoch": 0.977623289159244,
"grad_norm": 1.2668529748916626,
"learning_rate": 0.00016757692168303557,
"loss": 0.5142,
"step": 18000
},
{
"epoch": 1.00016293721486,
"eval_loss": 0.517742395401001,
"eval_runtime": 24.0099,
"eval_samples_per_second": 15.369,
"eval_steps_per_second": 7.705,
"step": 18415
},
{
"epoch": 1.0047794916358896,
"grad_norm": 4.145332336425781,
"learning_rate": 0.00016667089479215742,
"loss": 0.5248,
"step": 18500
},
{
"epoch": 1.0319356941125353,
"grad_norm": 3.0422215461730957,
"learning_rate": 0.00016576667995506109,
"loss": 0.4571,
"step": 19000
},
{
"epoch": 1.059091896589181,
"grad_norm": 2.034750461578369,
"learning_rate": 0.00016486065306418294,
"loss": 0.4635,
"step": 19500
},
{
"epoch": 1.0862480990658265,
"grad_norm": 2.047473907470703,
"learning_rate": 0.0001639564382270866,
"loss": 0.47,
"step": 20000
},
{
"epoch": 1.1134043015424724,
"grad_norm": 2.424201011657715,
"learning_rate": 0.00016305041133620845,
"loss": 0.4742,
"step": 20500
},
{
"epoch": 1.140560504019118,
"grad_norm": 2.1113667488098145,
"learning_rate": 0.00016214438444533036,
"loss": 0.4628,
"step": 21000
},
{
"epoch": 1.1677167064957636,
"grad_norm": 2.0212793350219727,
"learning_rate": 0.00016123835755445224,
"loss": 0.4636,
"step": 21500
},
{
"epoch": 1.1948729089724093,
"grad_norm": 4.672229290008545,
"learning_rate": 0.00016033233066357409,
"loss": 0.4674,
"step": 22000
},
{
"epoch": 1.2001955246578317,
"eval_loss": 0.5015310645103455,
"eval_runtime": 25.0548,
"eval_samples_per_second": 14.728,
"eval_steps_per_second": 7.384,
"step": 22098
},
{
"epoch": 1.222029111449055,
"grad_norm": 2.172687292098999,
"learning_rate": 0.000159426303772696,
"loss": 0.4583,
"step": 22500
},
{
"epoch": 1.2491853139257005,
"grad_norm": 2.1510438919067383,
"learning_rate": 0.00015852027688181784,
"loss": 0.4558,
"step": 23000
},
{
"epoch": 1.2763415164023462,
"grad_norm": 1.1689780950546265,
"learning_rate": 0.00015761424999093975,
"loss": 0.4502,
"step": 23500
},
{
"epoch": 1.303497718878992,
"grad_norm": 2.7791380882263184,
"learning_rate": 0.00015670822310006162,
"loss": 0.4451,
"step": 24000
},
{
"epoch": 1.3306539213556376,
"grad_norm": 2.7756049633026123,
"learning_rate": 0.0001558021962091835,
"loss": 0.4509,
"step": 24500
},
{
"epoch": 1.3578101238322833,
"grad_norm": 2.263340950012207,
"learning_rate": 0.00015489616931830538,
"loss": 0.4472,
"step": 25000
},
{
"epoch": 1.384966326308929,
"grad_norm": 3.0343711376190186,
"learning_rate": 0.000153991954481209,
"loss": 0.4615,
"step": 25500
},
{
"epoch": 1.4002281121008038,
"eval_loss": 0.500033438205719,
"eval_runtime": 24.7233,
"eval_samples_per_second": 14.925,
"eval_steps_per_second": 7.483,
"step": 25781
},
{
"epoch": 1.4121225287855745,
"grad_norm": 3.176940441131592,
"learning_rate": 0.0001530859275903309,
"loss": 0.4561,
"step": 26000
},
{
"epoch": 1.4392787312622204,
"grad_norm": 4.111068248748779,
"learning_rate": 0.00015217990069945275,
"loss": 0.491,
"step": 26500
},
{
"epoch": 1.466434933738866,
"grad_norm": 5.199289321899414,
"learning_rate": 0.00015127387380857465,
"loss": 0.4721,
"step": 27000
},
{
"epoch": 1.4935911362155116,
"grad_norm": 1.901997447013855,
"learning_rate": 0.00015036965897147827,
"loss": 0.4458,
"step": 27500
},
{
"epoch": 1.5207473386921573,
"grad_norm": 3.2669544219970703,
"learning_rate": 0.00014946363208060017,
"loss": 0.4313,
"step": 28000
},
{
"epoch": 1.547903541168803,
"grad_norm": 1.151863694190979,
"learning_rate": 0.00014855760518972205,
"loss": 0.4698,
"step": 28500
},
{
"epoch": 1.5750597436454488,
"grad_norm": 2.112612724304199,
"learning_rate": 0.0001476515782988439,
"loss": 0.453,
"step": 29000
},
{
"epoch": 1.6002606995437758,
"eval_loss": 0.47696688771247864,
"eval_runtime": 22.9354,
"eval_samples_per_second": 16.089,
"eval_steps_per_second": 8.066,
"step": 29464
},
{
"epoch": 1.6022159461220942,
"grad_norm": 2.558180570602417,
"learning_rate": 0.0001467455514079658,
"loss": 0.4575,
"step": 29500
},
{
"epoch": 1.62937214859874,
"grad_norm": 2.6507065296173096,
"learning_rate": 0.00014583952451708768,
"loss": 0.459,
"step": 30000
},
{
"epoch": 1.6565283510753857,
"grad_norm": 1.5638259649276733,
"learning_rate": 0.00014493349762620956,
"loss": 0.4288,
"step": 30500
},
{
"epoch": 1.6836845535520313,
"grad_norm": 3.8055946826934814,
"learning_rate": 0.00014402747073533143,
"loss": 0.4514,
"step": 31000
},
{
"epoch": 1.710840756028677,
"grad_norm": 3.0687201023101807,
"learning_rate": 0.00014312325589823508,
"loss": 0.4471,
"step": 31500
},
{
"epoch": 1.7379969585053225,
"grad_norm": 2.26448655128479,
"learning_rate": 0.0001422190410611387,
"loss": 0.4447,
"step": 32000
},
{
"epoch": 1.7651531609819684,
"grad_norm": 1.4060781002044678,
"learning_rate": 0.00014131301417026057,
"loss": 0.4361,
"step": 32500
},
{
"epoch": 1.7923093634586138,
"grad_norm": 2.3706018924713135,
"learning_rate": 0.00014040698727938247,
"loss": 0.4506,
"step": 33000
},
{
"epoch": 1.8002932869867476,
"eval_loss": 0.470061331987381,
"eval_runtime": 27.1848,
"eval_samples_per_second": 13.574,
"eval_steps_per_second": 6.805,
"step": 33147
},
{
"epoch": 1.8194655659352597,
"grad_norm": 2.880718946456909,
"learning_rate": 0.00013950096038850432,
"loss": 0.4439,
"step": 33500
},
{
"epoch": 1.8466217684119053,
"grad_norm": 1.4225813150405884,
"learning_rate": 0.000138596745551408,
"loss": 0.4434,
"step": 34000
},
{
"epoch": 1.873777970888551,
"grad_norm": 0.7051529884338379,
"learning_rate": 0.00013769071866052984,
"loss": 0.449,
"step": 34500
},
{
"epoch": 1.9009341733651968,
"grad_norm": 4.5070648193359375,
"learning_rate": 0.00013678469176965174,
"loss": 0.4356,
"step": 35000
},
{
"epoch": 1.9280903758418422,
"grad_norm": 1.354962944984436,
"learning_rate": 0.00013587866487877362,
"loss": 0.4509,
"step": 35500
},
{
"epoch": 1.955246578318488,
"grad_norm": 2.919261932373047,
"learning_rate": 0.00013497263798789547,
"loss": 0.4399,
"step": 36000
},
{
"epoch": 1.9824027807951337,
"grad_norm": 1.7376036643981934,
"learning_rate": 0.00013406661109701738,
"loss": 0.4309,
"step": 36500
},
{
"epoch": 2.00032587442972,
"eval_loss": 0.4645754098892212,
"eval_runtime": 26.5674,
"eval_samples_per_second": 13.889,
"eval_steps_per_second": 6.963,
"step": 36830
},
{
"epoch": 2.0095589832717793,
"grad_norm": 2.448807954788208,
"learning_rate": 0.000133162396259921,
"loss": 0.4074,
"step": 37000
},
{
"epoch": 2.036715185748425,
"grad_norm": 4.4545369148254395,
"learning_rate": 0.0001322563693690429,
"loss": 0.3784,
"step": 37500
},
{
"epoch": 2.0638713882250705,
"grad_norm": 1.4585567712783813,
"learning_rate": 0.0001313521545319465,
"loss": 0.3802,
"step": 38000
},
{
"epoch": 2.0910275907017164,
"grad_norm": 4.981091499328613,
"learning_rate": 0.0001304461276410684,
"loss": 0.3887,
"step": 38500
},
{
"epoch": 2.118183793178362,
"grad_norm": 1.345459222793579,
"learning_rate": 0.00012954010075019026,
"loss": 0.3791,
"step": 39000
},
{
"epoch": 2.1453399956550077,
"grad_norm": 2.339366912841797,
"learning_rate": 0.00012863407385931214,
"loss": 0.3895,
"step": 39500
},
{
"epoch": 2.172496198131653,
"grad_norm": 2.1575145721435547,
"learning_rate": 0.00012772804696843405,
"loss": 0.4046,
"step": 40000
},
{
"epoch": 2.199652400608299,
"grad_norm": 3.028726100921631,
"learning_rate": 0.0001268220200775559,
"loss": 0.3829,
"step": 40500
},
{
"epoch": 2.2003584618726917,
"eval_loss": 0.46665239334106445,
"eval_runtime": 22.994,
"eval_samples_per_second": 16.048,
"eval_steps_per_second": 8.046,
"step": 40513
},
{
"epoch": 2.2268086030849448,
"grad_norm": 2.198944330215454,
"learning_rate": 0.0001259159931866778,
"loss": 0.404,
"step": 41000
},
{
"epoch": 2.25396480556159,
"grad_norm": 1.4934983253479004,
"learning_rate": 0.00012500996629579965,
"loss": 0.3758,
"step": 41500
},
{
"epoch": 2.281121008038236,
"grad_norm": 3.0350615978240967,
"learning_rate": 0.0001241057514587033,
"loss": 0.3851,
"step": 42000
},
{
"epoch": 2.3082772105148814,
"grad_norm": 2.0013248920440674,
"learning_rate": 0.00012319972456782517,
"loss": 0.3874,
"step": 42500
},
{
"epoch": 2.3354334129915273,
"grad_norm": 3.0805039405822754,
"learning_rate": 0.00012229369767694705,
"loss": 0.383,
"step": 43000
},
{
"epoch": 2.362589615468173,
"grad_norm": 2.340902328491211,
"learning_rate": 0.00012138767078606894,
"loss": 0.3795,
"step": 43500
},
{
"epoch": 2.3897458179448186,
"grad_norm": 1.4872759580612183,
"learning_rate": 0.00012048164389519082,
"loss": 0.3925,
"step": 44000
},
{
"epoch": 2.4003910493156635,
"eval_loss": 0.45948517322540283,
"eval_runtime": 23.0689,
"eval_samples_per_second": 15.996,
"eval_steps_per_second": 8.019,
"step": 44196
},
{
"epoch": 2.4169020204214644,
"grad_norm": 2.0199170112609863,
"learning_rate": 0.00011957742905809446,
"loss": 0.387,
"step": 44500
},
{
"epoch": 2.44405822289811,
"grad_norm": 2.0993518829345703,
"learning_rate": 0.00011867140216721633,
"loss": 0.3858,
"step": 45000
},
{
"epoch": 2.4712144253747557,
"grad_norm": 2.5431594848632812,
"learning_rate": 0.0001177653752763382,
"loss": 0.3731,
"step": 45500
},
{
"epoch": 2.498370627851401,
"grad_norm": 2.0377984046936035,
"learning_rate": 0.00011685934838546009,
"loss": 0.383,
"step": 46000
},
{
"epoch": 2.525526830328047,
"grad_norm": 2.3051955699920654,
"learning_rate": 0.00011595332149458195,
"loss": 0.3865,
"step": 46500
},
{
"epoch": 2.5526830328046923,
"grad_norm": 3.8095552921295166,
"learning_rate": 0.00011504729460370384,
"loss": 0.3726,
"step": 47000
},
{
"epoch": 2.579839235281338,
"grad_norm": 2.2560086250305176,
"learning_rate": 0.00011414307976660747,
"loss": 0.3858,
"step": 47500
},
{
"epoch": 2.6004236367586357,
"eval_loss": 0.45664411783218384,
"eval_runtime": 22.9971,
"eval_samples_per_second": 16.045,
"eval_steps_per_second": 8.044,
"step": 47879
},
{
"epoch": 2.606995437757984,
"grad_norm": 2.8991200923919678,
"learning_rate": 0.00011323705287572936,
"loss": 0.383,
"step": 48000
},
{
"epoch": 2.6341516402346294,
"grad_norm": 4.307155132293701,
"learning_rate": 0.00011233102598485124,
"loss": 0.3941,
"step": 48500
},
{
"epoch": 2.6613078427112753,
"grad_norm": 3.7580649852752686,
"learning_rate": 0.0001114249990939731,
"loss": 0.385,
"step": 49000
},
{
"epoch": 2.6884640451879207,
"grad_norm": 2.604210615158081,
"learning_rate": 0.000110518972203095,
"loss": 0.3847,
"step": 49500
},
{
"epoch": 2.7156202476645666,
"grad_norm": 1.8067151308059692,
"learning_rate": 0.00010961475736599862,
"loss": 0.3775,
"step": 50000
},
{
"epoch": 2.7427764501412124,
"grad_norm": 2.4924516677856445,
"learning_rate": 0.00010870873047512051,
"loss": 0.392,
"step": 50500
},
{
"epoch": 2.769932652617858,
"grad_norm": 2.7145466804504395,
"learning_rate": 0.00010780270358424238,
"loss": 0.3817,
"step": 51000
},
{
"epoch": 2.7970888550945037,
"grad_norm": 3.6621336936950684,
"learning_rate": 0.00010689667669336427,
"loss": 0.3879,
"step": 51500
},
{
"epoch": 2.8004562242016076,
"eval_loss": 0.4439272880554199,
"eval_runtime": 22.9231,
"eval_samples_per_second": 16.097,
"eval_steps_per_second": 8.07,
"step": 51562
},
{
"epoch": 2.824245057571149,
"grad_norm": 3.2784557342529297,
"learning_rate": 0.00010599064980248614,
"loss": 0.3775,
"step": 52000
},
{
"epoch": 2.851401260047795,
"grad_norm": 2.4789769649505615,
"learning_rate": 0.00010508643496538977,
"loss": 0.3828,
"step": 52500
},
{
"epoch": 2.878557462524441,
"grad_norm": 4.17576789855957,
"learning_rate": 0.00010418040807451166,
"loss": 0.3922,
"step": 53000
},
{
"epoch": 2.905713665001086,
"grad_norm": 2.2692151069641113,
"learning_rate": 0.00010327438118363353,
"loss": 0.3684,
"step": 53500
},
{
"epoch": 2.932869867477732,
"grad_norm": 3.434340238571167,
"learning_rate": 0.00010236835429275542,
"loss": 0.3703,
"step": 54000
},
{
"epoch": 2.9600260699543774,
"grad_norm": 2.867629289627075,
"learning_rate": 0.00010146232740187728,
"loss": 0.3769,
"step": 54500
},
{
"epoch": 2.9871822724310233,
"grad_norm": 2.588996171951294,
"learning_rate": 0.00010055630051099917,
"loss": 0.3764,
"step": 55000
},
{
"epoch": 3.00048881164458,
"eval_loss": 0.43786150217056274,
"eval_runtime": 23.1256,
"eval_samples_per_second": 15.956,
"eval_steps_per_second": 8.0,
"step": 55245
},
{
"epoch": 3.0143384749076687,
"grad_norm": 2.558405876159668,
"learning_rate": 9.96520856739028e-05,
"loss": 0.3535,
"step": 55500
},
{
"epoch": 3.0414946773843146,
"grad_norm": 2.3702216148376465,
"learning_rate": 9.874605878302469e-05,
"loss": 0.3299,
"step": 56000
},
{
"epoch": 3.0686508798609604,
"grad_norm": 2.283313274383545,
"learning_rate": 9.784003189214657e-05,
"loss": 0.3366,
"step": 56500
},
{
"epoch": 3.095807082337606,
"grad_norm": 2.421048641204834,
"learning_rate": 9.693400500126845e-05,
"loss": 0.3261,
"step": 57000
},
{
"epoch": 3.1229632848142517,
"grad_norm": 2.0642685890197754,
"learning_rate": 9.602979016417207e-05,
"loss": 0.3335,
"step": 57500
},
{
"epoch": 3.150119487290897,
"grad_norm": 3.4360289573669434,
"learning_rate": 9.512376327329395e-05,
"loss": 0.3287,
"step": 58000
},
{
"epoch": 3.177275689767543,
"grad_norm": 3.9619264602661133,
"learning_rate": 9.421773638241583e-05,
"loss": 0.3267,
"step": 58500
},
{
"epoch": 3.2005213990875516,
"eval_loss": 0.4501725733280182,
"eval_runtime": 23.0376,
"eval_samples_per_second": 16.017,
"eval_steps_per_second": 8.03,
"step": 58928
},
{
"epoch": 3.2044318922441883,
"grad_norm": 2.5098698139190674,
"learning_rate": 9.331170949153772e-05,
"loss": 0.3365,
"step": 59000
},
{
"epoch": 3.231588094720834,
"grad_norm": 2.2651731967926025,
"learning_rate": 9.24056826006596e-05,
"loss": 0.3285,
"step": 59500
},
{
"epoch": 3.25874429719748,
"grad_norm": 2.573915958404541,
"learning_rate": 9.150146776356322e-05,
"loss": 0.3421,
"step": 60000
},
{
"epoch": 3.2859004996741255,
"grad_norm": 3.5748302936553955,
"learning_rate": 9.059544087268512e-05,
"loss": 0.3267,
"step": 60500
},
{
"epoch": 3.3130567021507713,
"grad_norm": 2.8185431957244873,
"learning_rate": 8.968941398180698e-05,
"loss": 0.3225,
"step": 61000
},
{
"epoch": 3.3402129046274167,
"grad_norm": 6.555810451507568,
"learning_rate": 8.878338709092886e-05,
"loss": 0.3174,
"step": 61500
},
{
"epoch": 3.3673691071040626,
"grad_norm": 3.8243870735168457,
"learning_rate": 8.787736020005073e-05,
"loss": 0.3249,
"step": 62000
},
{
"epoch": 3.3945253095807084,
"grad_norm": 1.514364242553711,
"learning_rate": 8.697314536295438e-05,
"loss": 0.3346,
"step": 62500
},
{
"epoch": 3.4005539865305234,
"eval_loss": 0.4442519247531891,
"eval_runtime": 22.857,
"eval_samples_per_second": 16.144,
"eval_steps_per_second": 8.094,
"step": 62611
},
{
"epoch": 3.421681512057354,
"grad_norm": 2.1374149322509766,
"learning_rate": 8.606711847207625e-05,
"loss": 0.3231,
"step": 63000
},
{
"epoch": 3.4488377145339997,
"grad_norm": 2.8971145153045654,
"learning_rate": 8.516109158119814e-05,
"loss": 0.3376,
"step": 63500
},
{
"epoch": 3.475993917010645,
"grad_norm": 2.860117197036743,
"learning_rate": 8.425506469032002e-05,
"loss": 0.3295,
"step": 64000
},
{
"epoch": 3.503150119487291,
"grad_norm": 1.976477026939392,
"learning_rate": 8.335084985322365e-05,
"loss": 0.3236,
"step": 64500
},
{
"epoch": 3.530306321963937,
"grad_norm": 2.6291637420654297,
"learning_rate": 8.244482296234553e-05,
"loss": 0.3201,
"step": 65000
},
{
"epoch": 3.557462524440582,
"grad_norm": 2.5785484313964844,
"learning_rate": 8.15387960714674e-05,
"loss": 0.3354,
"step": 65500
},
{
"epoch": 3.584618726917228,
"grad_norm": 2.3802502155303955,
"learning_rate": 8.063276918058928e-05,
"loss": 0.3363,
"step": 66000
},
{
"epoch": 3.6005865739734957,
"eval_loss": 0.43394023180007935,
"eval_runtime": 23.107,
"eval_samples_per_second": 15.969,
"eval_steps_per_second": 8.006,
"step": 66294
},
{
"epoch": 3.6117749293938735,
"grad_norm": 3.012232542037964,
"learning_rate": 7.972674228971116e-05,
"loss": 0.323,
"step": 66500
},
{
"epoch": 3.6389311318705193,
"grad_norm": 2.5260913372039795,
"learning_rate": 7.88225274526148e-05,
"loss": 0.3316,
"step": 67000
},
{
"epoch": 3.666087334347165,
"grad_norm": 3.0673775672912598,
"learning_rate": 7.791650056173668e-05,
"loss": 0.3194,
"step": 67500
},
{
"epoch": 3.6932435368238106,
"grad_norm": 1.782955527305603,
"learning_rate": 7.701047367085855e-05,
"loss": 0.3268,
"step": 68000
},
{
"epoch": 3.720399739300456,
"grad_norm": 3.0327773094177246,
"learning_rate": 7.610444677998043e-05,
"loss": 0.327,
"step": 68500
},
{
"epoch": 3.747555941777102,
"grad_norm": 4.625910758972168,
"learning_rate": 7.520023194288407e-05,
"loss": 0.3231,
"step": 69000
},
{
"epoch": 3.7747121442537477,
"grad_norm": 2.987931966781616,
"learning_rate": 7.429420505200595e-05,
"loss": 0.3321,
"step": 69500
},
{
"epoch": 3.8006191614164675,
"eval_loss": 0.43500107526779175,
"eval_runtime": 22.946,
"eval_samples_per_second": 16.081,
"eval_steps_per_second": 8.062,
"step": 69977
},
{
"epoch": 3.801868346730393,
"grad_norm": 3.8928215503692627,
"learning_rate": 7.338817816112783e-05,
"loss": 0.3387,
"step": 70000
},
{
"epoch": 3.829024549207039,
"grad_norm": 2.32753586769104,
"learning_rate": 7.24821512702497e-05,
"loss": 0.3327,
"step": 70500
},
{
"epoch": 3.8561807516836843,
"grad_norm": 2.5396571159362793,
"learning_rate": 7.157793643315333e-05,
"loss": 0.3251,
"step": 71000
},
{
"epoch": 3.88333695416033,
"grad_norm": 2.509148597717285,
"learning_rate": 7.067190954227521e-05,
"loss": 0.3225,
"step": 71500
},
{
"epoch": 3.910493156636976,
"grad_norm": 1.7930841445922852,
"learning_rate": 6.97658826513971e-05,
"loss": 0.3392,
"step": 72000
},
{
"epoch": 3.9376493591136215,
"grad_norm": 2.579759120941162,
"learning_rate": 6.885985576051898e-05,
"loss": 0.3415,
"step": 72500
},
{
"epoch": 3.9648055615902673,
"grad_norm": 4.053764820098877,
"learning_rate": 6.795564092342262e-05,
"loss": 0.3373,
"step": 73000
},
{
"epoch": 3.9919617640669127,
"grad_norm": 2.3885462284088135,
"learning_rate": 6.70496140325445e-05,
"loss": 0.3423,
"step": 73500
},
{
"epoch": 4.00065174885944,
"eval_loss": 0.42881426215171814,
"eval_runtime": 23.0588,
"eval_samples_per_second": 16.003,
"eval_steps_per_second": 8.023,
"step": 73660
},
{
"epoch": 4.019117966543559,
"grad_norm": 1.8718838691711426,
"learning_rate": 6.614358714166636e-05,
"loss": 0.2902,
"step": 74000
},
{
"epoch": 4.046274169020204,
"grad_norm": 3.1479783058166504,
"learning_rate": 6.523756025078824e-05,
"loss": 0.2817,
"step": 74500
},
{
"epoch": 4.07343037149685,
"grad_norm": 2.8043808937072754,
"learning_rate": 6.433153335991013e-05,
"loss": 0.28,
"step": 75000
},
{
"epoch": 4.100586573973495,
"grad_norm": 0.6163878440856934,
"learning_rate": 6.342550646903201e-05,
"loss": 0.283,
"step": 75500
},
{
"epoch": 4.127742776450141,
"grad_norm": 1.6441878080368042,
"learning_rate": 6.252129163193563e-05,
"loss": 0.2731,
"step": 76000
},
{
"epoch": 4.154898978926787,
"grad_norm": 3.012065887451172,
"learning_rate": 6.161526474105753e-05,
"loss": 0.2757,
"step": 76500
},
{
"epoch": 4.182055181403433,
"grad_norm": 2.1326332092285156,
"learning_rate": 6.07092378501794e-05,
"loss": 0.2789,
"step": 77000
},
{
"epoch": 4.200684336302412,
"eval_loss": 0.44576430320739746,
"eval_runtime": 23.0355,
"eval_samples_per_second": 16.019,
"eval_steps_per_second": 8.031,
"step": 77343
},
{
"epoch": 4.209211383880078,
"grad_norm": 3.3734445571899414,
"learning_rate": 5.9803210959301273e-05,
"loss": 0.2729,
"step": 77500
},
{
"epoch": 4.236367586356724,
"grad_norm": 2.7482869625091553,
"learning_rate": 5.889718406842315e-05,
"loss": 0.2924,
"step": 78000
},
{
"epoch": 4.2635237888333695,
"grad_norm": 2.5796825885772705,
"learning_rate": 5.799115717754503e-05,
"loss": 0.2843,
"step": 78500
},
{
"epoch": 4.290679991310015,
"grad_norm": 3.74029541015625,
"learning_rate": 5.708513028666691e-05,
"loss": 0.2889,
"step": 79000
},
{
"epoch": 4.317836193786661,
"grad_norm": 3.763978958129883,
"learning_rate": 5.617910339578879e-05,
"loss": 0.2812,
"step": 79500
},
{
"epoch": 4.344992396263306,
"grad_norm": 2.851184844970703,
"learning_rate": 5.527488855869243e-05,
"loss": 0.283,
"step": 80000
},
{
"epoch": 4.372148598739952,
"grad_norm": 3.071202278137207,
"learning_rate": 5.436886166781431e-05,
"loss": 0.2911,
"step": 80500
},
{
"epoch": 4.399304801216598,
"grad_norm": 3.962803602218628,
"learning_rate": 5.3464646830717936e-05,
"loss": 0.2928,
"step": 81000
},
{
"epoch": 4.400716923745383,
"eval_loss": 0.4378789961338043,
"eval_runtime": 22.9566,
"eval_samples_per_second": 16.074,
"eval_steps_per_second": 8.059,
"step": 81026
},
{
"epoch": 4.426461003693244,
"grad_norm": 2.5465190410614014,
"learning_rate": 5.2558619939839814e-05,
"loss": 0.269,
"step": 81500
},
{
"epoch": 4.4536172061698895,
"grad_norm": 3.322237491607666,
"learning_rate": 5.16525930489617e-05,
"loss": 0.2883,
"step": 82000
},
{
"epoch": 4.4807734086465345,
"grad_norm": 1.5292987823486328,
"learning_rate": 5.0746566158083575e-05,
"loss": 0.2796,
"step": 82500
},
{
"epoch": 4.50792961112318,
"grad_norm": 2.0258724689483643,
"learning_rate": 4.984053926720545e-05,
"loss": 0.2766,
"step": 83000
},
{
"epoch": 4.535085813599826,
"grad_norm": 2.583266019821167,
"learning_rate": 4.893451237632733e-05,
"loss": 0.2975,
"step": 83500
},
{
"epoch": 4.562242016076472,
"grad_norm": 2.7614002227783203,
"learning_rate": 4.802848548544921e-05,
"loss": 0.2846,
"step": 84000
},
{
"epoch": 4.589398218553118,
"grad_norm": 4.259634971618652,
"learning_rate": 4.712245859457109e-05,
"loss": 0.2963,
"step": 84500
},
{
"epoch": 4.600749511188355,
"eval_loss": 0.43254056572914124,
"eval_runtime": 22.8989,
"eval_samples_per_second": 16.114,
"eval_steps_per_second": 8.079,
"step": 84709
},
{
"epoch": 4.616554421029763,
"grad_norm": 1.8035340309143066,
"learning_rate": 4.621643170369297e-05,
"loss": 0.2854,
"step": 85000
},
{
"epoch": 4.643710623506409,
"grad_norm": 3.2322275638580322,
"learning_rate": 4.53122168665966e-05,
"loss": 0.287,
"step": 85500
},
{
"epoch": 4.670866825983055,
"grad_norm": 7.430004119873047,
"learning_rate": 4.440618997571848e-05,
"loss": 0.2805,
"step": 86000
},
{
"epoch": 4.6980230284597,
"grad_norm": 2.2691986560821533,
"learning_rate": 4.3500163084840364e-05,
"loss": 0.2874,
"step": 86500
},
{
"epoch": 4.725179230936346,
"grad_norm": 2.7627906799316406,
"learning_rate": 4.2594136193962235e-05,
"loss": 0.2818,
"step": 87000
},
{
"epoch": 4.752335433412991,
"grad_norm": 3.7362864017486572,
"learning_rate": 4.1689921356865876e-05,
"loss": 0.2827,
"step": 87500
},
{
"epoch": 4.779491635889637,
"grad_norm": 4.409236907958984,
"learning_rate": 4.0783894465987754e-05,
"loss": 0.2887,
"step": 88000
},
{
"epoch": 4.800782098631327,
"eval_loss": 0.42746320366859436,
"eval_runtime": 23.0563,
"eval_samples_per_second": 16.004,
"eval_steps_per_second": 8.024,
"step": 88392
},
{
"epoch": 4.806647838366283,
"grad_norm": 4.065585136413574,
"learning_rate": 3.987786757510963e-05,
"loss": 0.2905,
"step": 88500
},
{
"epoch": 4.833804040842929,
"grad_norm": 3.655996799468994,
"learning_rate": 3.897184068423151e-05,
"loss": 0.2716,
"step": 89000
},
{
"epoch": 4.860960243319575,
"grad_norm": 4.297955513000488,
"learning_rate": 3.806762584713515e-05,
"loss": 0.29,
"step": 89500
},
{
"epoch": 4.88811644579622,
"grad_norm": 3.1703717708587646,
"learning_rate": 3.716159895625702e-05,
"loss": 0.2754,
"step": 90000
},
{
"epoch": 4.9152726482728655,
"grad_norm": 3.771336078643799,
"learning_rate": 3.62555720653789e-05,
"loss": 0.2839,
"step": 90500
},
{
"epoch": 4.942428850749511,
"grad_norm": 3.908500909805298,
"learning_rate": 3.534954517450078e-05,
"loss": 0.2744,
"step": 91000
},
{
"epoch": 4.969585053226157,
"grad_norm": 3.199415445327759,
"learning_rate": 3.444351828362266e-05,
"loss": 0.2834,
"step": 91500
},
{
"epoch": 4.996741255702802,
"grad_norm": 3.1083319187164307,
"learning_rate": 3.3539303446526294e-05,
"loss": 0.2949,
"step": 92000
},
{
"epoch": 5.0008146860743,
"eval_loss": 0.4291832447052002,
"eval_runtime": 23.525,
"eval_samples_per_second": 15.685,
"eval_steps_per_second": 7.864,
"step": 92075
},
{
"epoch": 5.023897458179448,
"grad_norm": 6.121253490447998,
"learning_rate": 3.263327655564817e-05,
"loss": 0.2289,
"step": 92500
},
{
"epoch": 5.051053660656094,
"grad_norm": 2.5016486644744873,
"learning_rate": 3.1727249664770055e-05,
"loss": 0.248,
"step": 93000
},
{
"epoch": 5.07820986313274,
"grad_norm": 2.344914197921753,
"learning_rate": 3.0821222773891926e-05,
"loss": 0.2315,
"step": 93500
},
{
"epoch": 5.1053660656093856,
"grad_norm": 3.519299268722534,
"learning_rate": 2.9917007936795567e-05,
"loss": 0.2516,
"step": 94000
},
{
"epoch": 5.1325222680860305,
"grad_norm": 3.192281484603882,
"learning_rate": 2.9010981045917445e-05,
"loss": 0.2368,
"step": 94500
},
{
"epoch": 5.159678470562676,
"grad_norm": 3.7645487785339355,
"learning_rate": 2.8104954155039322e-05,
"loss": 0.2573,
"step": 95000
},
{
"epoch": 5.186834673039322,
"grad_norm": 4.5175275802612305,
"learning_rate": 2.71989272641612e-05,
"loss": 0.2437,
"step": 95500
},
{
"epoch": 5.2008472735172715,
"eval_loss": 0.4366357922554016,
"eval_runtime": 23.1107,
"eval_samples_per_second": 15.967,
"eval_steps_per_second": 8.005,
"step": 95758
},
{
"epoch": 5.213990875515968,
"grad_norm": 4.234988212585449,
"learning_rate": 2.629290037328308e-05,
"loss": 0.2439,
"step": 96000
},
{
"epoch": 5.241147077992614,
"grad_norm": 3.174309492111206,
"learning_rate": 2.538687348240496e-05,
"loss": 0.2523,
"step": 96500
},
{
"epoch": 5.268303280469259,
"grad_norm": 3.7519733905792236,
"learning_rate": 2.4480846591526838e-05,
"loss": 0.2463,
"step": 97000
},
{
"epoch": 5.295459482945905,
"grad_norm": 2.9701130390167236,
"learning_rate": 2.357481970064872e-05,
"loss": 0.2519,
"step": 97500
},
{
"epoch": 5.322615685422551,
"grad_norm": 5.130082130432129,
"learning_rate": 2.2672416917334107e-05,
"loss": 0.2486,
"step": 98000
},
{
"epoch": 5.349771887899196,
"grad_norm": 3.390826463699341,
"learning_rate": 2.1766390026455985e-05,
"loss": 0.2478,
"step": 98500
},
{
"epoch": 5.376928090375841,
"grad_norm": 2.6151483058929443,
"learning_rate": 2.0860363135577865e-05,
"loss": 0.2424,
"step": 99000
},
{
"epoch": 5.400879860960243,
"eval_loss": 0.43580135703086853,
"eval_runtime": 23.7346,
"eval_samples_per_second": 15.547,
"eval_steps_per_second": 7.795,
"step": 99441
},
{
"epoch": 5.404084292852487,
"grad_norm": 3.701735496520996,
"learning_rate": 1.9954336244699743e-05,
"loss": 0.2443,
"step": 99500
},
{
"epoch": 5.431240495329133,
"grad_norm": 3.8400754928588867,
"learning_rate": 1.9048309353821623e-05,
"loss": 0.2276,
"step": 100000
},
{
"epoch": 5.458396697805779,
"grad_norm": 2.5460264682769775,
"learning_rate": 1.81422824629435e-05,
"loss": 0.2313,
"step": 100500
},
{
"epoch": 5.485552900282425,
"grad_norm": 5.040457725524902,
"learning_rate": 1.7236255572065378e-05,
"loss": 0.238,
"step": 101000
},
{
"epoch": 5.51270910275907,
"grad_norm": 4.061932563781738,
"learning_rate": 1.633022868118726e-05,
"loss": 0.2558,
"step": 101500
},
{
"epoch": 5.539865305235716,
"grad_norm": 4.28571081161499,
"learning_rate": 1.5424201790309136e-05,
"loss": 0.2531,
"step": 102000
},
{
"epoch": 5.5670215077123615,
"grad_norm": 4.26746129989624,
"learning_rate": 1.4519986953212772e-05,
"loss": 0.2487,
"step": 102500
},
{
"epoch": 5.594177710189007,
"grad_norm": 1.4005869626998901,
"learning_rate": 1.3613960062334651e-05,
"loss": 0.2528,
"step": 103000
},
{
"epoch": 5.600912448403215,
"eval_loss": 0.4331228733062744,
"eval_runtime": 25.1727,
"eval_samples_per_second": 14.659,
"eval_steps_per_second": 7.349,
"step": 103124
},
{
"epoch": 5.621333912665653,
"grad_norm": 3.8620026111602783,
"learning_rate": 1.2707933171456529e-05,
"loss": 0.248,
"step": 103500
},
{
"epoch": 5.648490115142298,
"grad_norm": 4.398037433624268,
"learning_rate": 1.1803718334360163e-05,
"loss": 0.2394,
"step": 104000
},
{
"epoch": 5.675646317618944,
"grad_norm": 2.4203145503997803,
"learning_rate": 1.0897691443482042e-05,
"loss": 0.2344,
"step": 104500
},
{
"epoch": 5.70280252009559,
"grad_norm": 3.2735469341278076,
"learning_rate": 9.991664552603922e-06,
"loss": 0.2391,
"step": 105000
},
{
"epoch": 5.729958722572236,
"grad_norm": 3.202352523803711,
"learning_rate": 9.0856376617258e-06,
"loss": 0.2503,
"step": 105500
},
{
"epoch": 5.757114925048882,
"grad_norm": 2.457843065261841,
"learning_rate": 8.17961077084768e-06,
"loss": 0.233,
"step": 106000
},
{
"epoch": 5.7842711275255265,
"grad_norm": 2.1440610885620117,
"learning_rate": 7.273583879969558e-06,
"loss": 0.2477,
"step": 106500
},
{
"epoch": 5.800945035846187,
"eval_loss": 0.43289270997047424,
"eval_runtime": 25.7135,
"eval_samples_per_second": 14.35,
"eval_steps_per_second": 7.195,
"step": 106807
},
{
"epoch": 5.811427330002172,
"grad_norm": 2.6855876445770264,
"learning_rate": 6.367556989091436e-06,
"loss": 0.231,
"step": 107000
},
{
"epoch": 5.838583532478818,
"grad_norm": 5.511388778686523,
"learning_rate": 5.461530098213316e-06,
"loss": 0.2399,
"step": 107500
},
{
"epoch": 5.865739734955464,
"grad_norm": 2.992866277694702,
"learning_rate": 4.555503207335194e-06,
"loss": 0.2367,
"step": 108000
},
{
"epoch": 5.89289593743211,
"grad_norm": 2.2536861896514893,
"learning_rate": 3.651288370238829e-06,
"loss": 0.2545,
"step": 108500
},
{
"epoch": 5.920052139908755,
"grad_norm": 3.6174511909484863,
"learning_rate": 2.745261479360707e-06,
"loss": 0.2576,
"step": 109000
},
{
"epoch": 5.947208342385401,
"grad_norm": 2.4859135150909424,
"learning_rate": 1.8392345884825864e-06,
"loss": 0.2448,
"step": 109500
},
{
"epoch": 5.974364544862047,
"grad_norm": 1.783007025718689,
"learning_rate": 9.350197513862211e-07,
"loss": 0.2347,
"step": 110000
},
{
"epoch": 6.0,
"step": 110472,
"total_flos": 7.299634402197504e+17,
"train_loss": 0.3804842073002838,
"train_runtime": 59722.3514,
"train_samples_per_second": 3.699,
"train_steps_per_second": 1.85
}
],
"logging_steps": 500,
"max_steps": 110472,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.299634402197504e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}