{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 563148,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0026635982015384943,
"grad_norm": 0.7201167941093445,
"learning_rate": 0.0001996,
"loss": 9.4233,
"step": 500
},
{
"epoch": 0.005327196403076989,
"grad_norm": 0.15531601011753082,
"learning_rate": 0.0003996,
"loss": 7.4925,
"step": 1000
},
{
"epoch": 0.007990794604615483,
"grad_norm": 0.2483946532011032,
"learning_rate": 0.0005996,
"loss": 7.4229,
"step": 1500
},
{
"epoch": 0.010654392806153977,
"grad_norm": 0.5883714556694031,
"learning_rate": 0.0007996,
"loss": 7.2323,
"step": 2000
},
{
"epoch": 0.013317991007692471,
"grad_norm": 0.7867951989173889,
"learning_rate": 0.0009996,
"loss": 7.0605,
"step": 2500
},
{
"epoch": 0.015981589209230967,
"grad_norm": 0.8444465398788452,
"learning_rate": 0.0009991117421269675,
"loss": 6.9306,
"step": 3000
},
{
"epoch": 0.01864518741076946,
"grad_norm": 0.6867188215255737,
"learning_rate": 0.00099821991695324,
"loss": 6.8518,
"step": 3500
},
{
"epoch": 0.021308785612307955,
"grad_norm": 0.5377506017684937,
"learning_rate": 0.0009973280917795124,
"loss": 6.7872,
"step": 4000
},
{
"epoch": 0.02397238381384645,
"grad_norm": 0.6717762351036072,
"learning_rate": 0.0009964362666057848,
"loss": 6.7506,
"step": 4500
},
{
"epoch": 0.026635982015384942,
"grad_norm": 1.001440167427063,
"learning_rate": 0.0009955444414320573,
"loss": 6.7159,
"step": 5000
},
{
"epoch": 0.029299580216923436,
"grad_norm": 0.5917439460754395,
"learning_rate": 0.0009946526162583297,
"loss": 6.6817,
"step": 5500
},
{
"epoch": 0.031963178418461934,
"grad_norm": 0.6403864026069641,
"learning_rate": 0.0009937607910846021,
"loss": 6.6561,
"step": 6000
},
{
"epoch": 0.034626776620000424,
"grad_norm": 0.6477270126342773,
"learning_rate": 0.0009928689659108746,
"loss": 6.626,
"step": 6500
},
{
"epoch": 0.03729037482153892,
"grad_norm": 0.8317912817001343,
"learning_rate": 0.0009919789243874944,
"loss": 6.6155,
"step": 7000
},
{
"epoch": 0.03995397302307741,
"grad_norm": 0.81658536195755,
"learning_rate": 0.0009910870992137668,
"loss": 6.5983,
"step": 7500
},
{
"epoch": 0.04261757122461591,
"grad_norm": 0.8080710768699646,
"learning_rate": 0.0009901952740400395,
"loss": 6.5712,
"step": 8000
},
{
"epoch": 0.045281169426154406,
"grad_norm": 0.7330273985862732,
"learning_rate": 0.000989303448866312,
"loss": 6.5671,
"step": 8500
},
{
"epoch": 0.0479447676276929,
"grad_norm": 0.5048246383666992,
"learning_rate": 0.0009884134073429318,
"loss": 6.5566,
"step": 9000
},
{
"epoch": 0.050608365829231394,
"grad_norm": 0.60006183385849,
"learning_rate": 0.0009875215821692042,
"loss": 6.5299,
"step": 9500
},
{
"epoch": 0.053271964030769885,
"grad_norm": 0.7553561329841614,
"learning_rate": 0.0009866297569954767,
"loss": 6.4984,
"step": 10000
},
{
"epoch": 0.05593556223230838,
"grad_norm": 0.6969451904296875,
"learning_rate": 0.000985737931821749,
"loss": 6.4697,
"step": 10500
},
{
"epoch": 0.05859916043384687,
"grad_norm": 0.8137800097465515,
"learning_rate": 0.0009848461066480215,
"loss": 6.4535,
"step": 11000
},
{
"epoch": 0.06126275863538537,
"grad_norm": 0.6285300850868225,
"learning_rate": 0.000983954281474294,
"loss": 6.4259,
"step": 11500
},
{
"epoch": 0.06392635683692387,
"grad_norm": 0.6301620006561279,
"learning_rate": 0.0009830624563005664,
"loss": 6.4174,
"step": 12000
},
{
"epoch": 0.06658995503846236,
"grad_norm": 0.49541255831718445,
"learning_rate": 0.0009821706311268388,
"loss": 6.4134,
"step": 12500
},
{
"epoch": 0.06925355324000085,
"grad_norm": 0.8492177128791809,
"learning_rate": 0.000981280589603459,
"loss": 6.394,
"step": 13000
},
{
"epoch": 0.07191715144153935,
"grad_norm": 0.6284229755401611,
"learning_rate": 0.0009803887644297313,
"loss": 6.3861,
"step": 13500
},
{
"epoch": 0.07458074964307784,
"grad_norm": 0.7854110598564148,
"learning_rate": 0.0009794969392560038,
"loss": 6.3795,
"step": 14000
},
{
"epoch": 0.07724434784461634,
"grad_norm": 0.6952440738677979,
"learning_rate": 0.0009786051140822762,
"loss": 6.3679,
"step": 14500
},
{
"epoch": 0.07990794604615482,
"grad_norm": null,
"learning_rate": 0.0009777132889085486,
"loss": 6.363,
"step": 15000
},
{
"epoch": 0.08257154424769332,
"grad_norm": 0.6554950475692749,
"learning_rate": 0.0009768232473851685,
"loss": 6.3597,
"step": 15500
},
{
"epoch": 0.08523514244923182,
"grad_norm": 0.6918802261352539,
"learning_rate": 0.000975931422211441,
"loss": 6.3536,
"step": 16000
},
{
"epoch": 0.08789874065077032,
"grad_norm": 0.749622642993927,
"learning_rate": 0.0009750395970377135,
"loss": 6.3438,
"step": 16500
},
{
"epoch": 0.09056233885230881,
"grad_norm": 0.7492349743843079,
"learning_rate": 0.000974147771863986,
"loss": 6.3332,
"step": 17000
},
{
"epoch": 0.0932259370538473,
"grad_norm": 0.6446586847305298,
"learning_rate": 0.000973257730340606,
"loss": 6.3241,
"step": 17500
},
{
"epoch": 0.0958895352553858,
"grad_norm": 0.8464730978012085,
"learning_rate": 0.0009723659051668784,
"loss": 6.3194,
"step": 18000
},
{
"epoch": 0.09855313345692429,
"grad_norm": 0.6281186938285828,
"learning_rate": 0.0009714740799931508,
"loss": 6.309,
"step": 18500
},
{
"epoch": 0.10121673165846279,
"grad_norm": 0.8605656027793884,
"learning_rate": 0.0009705822548194233,
"loss": 6.2991,
"step": 19000
},
{
"epoch": 0.10388032986000127,
"grad_norm": 0.7788176536560059,
"learning_rate": 0.0009696922132960431,
"loss": 6.3005,
"step": 19500
},
{
"epoch": 0.10654392806153977,
"grad_norm": 0.6075990200042725,
"learning_rate": 0.0009688003881223157,
"loss": 6.2843,
"step": 20000
},
{
"epoch": 0.10920752626307827,
"grad_norm": 0.7577124238014221,
"learning_rate": 0.0009679085629485881,
"loss": 6.2759,
"step": 20500
},
{
"epoch": 0.11187112446461676,
"grad_norm": 0.8228011727333069,
"learning_rate": 0.0009670167377748605,
"loss": 6.2599,
"step": 21000
},
{
"epoch": 0.11453472266615526,
"grad_norm": 0.7447388172149658,
"learning_rate": 0.0009661266962514804,
"loss": 6.2513,
"step": 21500
},
{
"epoch": 0.11719832086769374,
"grad_norm": 0.9003899097442627,
"learning_rate": 0.0009652348710777528,
"loss": 6.2279,
"step": 22000
},
{
"epoch": 0.11986191906923224,
"grad_norm": 1.0574650764465332,
"learning_rate": 0.0009643430459040254,
"loss": 6.2027,
"step": 22500
},
{
"epoch": 0.12252551727077074,
"grad_norm": 0.9610631465911865,
"learning_rate": 0.0009634512207302978,
"loss": 6.1742,
"step": 23000
},
{
"epoch": 0.12518911547230924,
"grad_norm": 1.1535989046096802,
"learning_rate": 0.0009625611792069178,
"loss": 6.1294,
"step": 23500
},
{
"epoch": 0.12785271367384773,
"grad_norm": 1.1773658990859985,
"learning_rate": 0.0009616711376835376,
"loss": 6.097,
"step": 24000
},
{
"epoch": 0.13051631187538623,
"grad_norm": 1.2815760374069214,
"learning_rate": 0.0009607793125098101,
"loss": 6.0634,
"step": 24500
},
{
"epoch": 0.13317991007692473,
"grad_norm": 1.4569323062896729,
"learning_rate": 0.0009598874873360826,
"loss": 6.0457,
"step": 25000
},
{
"epoch": 0.1358435082784632,
"grad_norm": 1.506204605102539,
"learning_rate": 0.000958995662162355,
"loss": 6.0186,
"step": 25500
},
{
"epoch": 0.1385071064800017,
"grad_norm": 1.3472563028335571,
"learning_rate": 0.0009581056206389749,
"loss": 6.0086,
"step": 26000
},
{
"epoch": 0.1411707046815402,
"grad_norm": 1.4809520244598389,
"learning_rate": 0.0009572137954652473,
"loss": 5.9898,
"step": 26500
},
{
"epoch": 0.1438343028830787,
"grad_norm": 1.5233690738677979,
"learning_rate": 0.0009563219702915198,
"loss": 5.9781,
"step": 27000
},
{
"epoch": 0.1464979010846172,
"grad_norm": 1.5101710557937622,
"learning_rate": 0.0009554301451177923,
"loss": 5.9561,
"step": 27500
},
{
"epoch": 0.14916149928615569,
"grad_norm": 1.612731695175171,
"learning_rate": 0.0009545401035944123,
"loss": 5.9526,
"step": 28000
},
{
"epoch": 0.15182509748769418,
"grad_norm": 1.7018260955810547,
"learning_rate": 0.0009536482784206847,
"loss": 5.9338,
"step": 28500
},
{
"epoch": 0.15448869568923268,
"grad_norm": 1.7604913711547852,
"learning_rate": 0.0009527564532469571,
"loss": 5.9321,
"step": 29000
},
{
"epoch": 0.15715229389077118,
"grad_norm": 1.721969485282898,
"learning_rate": 0.0009518646280732296,
"loss": 5.9175,
"step": 29500
},
{
"epoch": 0.15981589209230965,
"grad_norm": 1.5823644399642944,
"learning_rate": 0.0009509745865498494,
"loss": 5.9153,
"step": 30000
},
{
"epoch": 0.16247949029384814,
"grad_norm": 1.7854641675949097,
"learning_rate": 0.000950082761376122,
"loss": 5.9072,
"step": 30500
},
{
"epoch": 0.16514308849538664,
"grad_norm": 1.7369080781936646,
"learning_rate": 0.0009491909362023944,
"loss": 5.9029,
"step": 31000
},
{
"epoch": 0.16780668669692514,
"grad_norm": 1.674492597579956,
"learning_rate": 0.0009482991110286668,
"loss": 5.8841,
"step": 31500
},
{
"epoch": 0.17047028489846364,
"grad_norm": 1.7058457136154175,
"learning_rate": 0.0009474072858549393,
"loss": 5.8883,
"step": 32000
},
{
"epoch": 0.17313388310000213,
"grad_norm": 1.5853819847106934,
"learning_rate": 0.0009465172443315591,
"loss": 5.8775,
"step": 32500
},
{
"epoch": 0.17579748130154063,
"grad_norm": 1.7525198459625244,
"learning_rate": 0.0009456254191578317,
"loss": 5.8717,
"step": 33000
},
{
"epoch": 0.17846107950307913,
"grad_norm": 1.9233468770980835,
"learning_rate": 0.0009447335939841041,
"loss": 5.8608,
"step": 33500
},
{
"epoch": 0.18112467770461763,
"grad_norm": 1.637522578239441,
"learning_rate": 0.0009438417688103765,
"loss": 5.8658,
"step": 34000
},
{
"epoch": 0.1837882759061561,
"grad_norm": 1.8892813920974731,
"learning_rate": 0.000942949943636649,
"loss": 5.8523,
"step": 34500
},
{
"epoch": 0.1864518741076946,
"grad_norm": 1.9510762691497803,
"learning_rate": 0.0009420599021132689,
"loss": 5.8404,
"step": 35000
},
{
"epoch": 0.1891154723092331,
"grad_norm": 1.7907196283340454,
"learning_rate": 0.0009411680769395415,
"loss": 5.8396,
"step": 35500
},
{
"epoch": 0.1917790705107716,
"grad_norm": 1.8805279731750488,
"learning_rate": 0.0009402762517658139,
"loss": 5.8293,
"step": 36000
},
{
"epoch": 0.19444266871231008,
"grad_norm": 1.7272233963012695,
"learning_rate": 0.0009393844265920863,
"loss": 5.8268,
"step": 36500
},
{
"epoch": 0.19710626691384858,
"grad_norm": 2.035203695297241,
"learning_rate": 0.0009384926014183588,
"loss": 5.8209,
"step": 37000
},
{
"epoch": 0.19976986511538708,
"grad_norm": 1.8728936910629272,
"learning_rate": 0.0009376007762446312,
"loss": 5.8165,
"step": 37500
},
{
"epoch": 0.20243346331692558,
"grad_norm": 1.9231390953063965,
"learning_rate": 0.0009367089510709037,
"loss": 5.8149,
"step": 38000
},
{
"epoch": 0.20509706151846407,
"grad_norm": 1.7793642282485962,
"learning_rate": 0.0009358171258971762,
"loss": 5.8132,
"step": 38500
},
{
"epoch": 0.20776065972000254,
"grad_norm": 1.7759062051773071,
"learning_rate": 0.000934927084373796,
"loss": 5.8065,
"step": 39000
},
{
"epoch": 0.21042425792154104,
"grad_norm": 1.7528033256530762,
"learning_rate": 0.0009340352592000685,
"loss": 5.8023,
"step": 39500
},
{
"epoch": 0.21308785612307954,
"grad_norm": 1.8702290058135986,
"learning_rate": 0.0009331434340263409,
"loss": 5.7909,
"step": 40000
},
{
"epoch": 0.21575145432461804,
"grad_norm": 1.9332852363586426,
"learning_rate": 0.000932253392502961,
"loss": 5.7937,
"step": 40500
},
{
"epoch": 0.21841505252615653,
"grad_norm": 1.8513240814208984,
"learning_rate": 0.0009313615673292334,
"loss": 5.7865,
"step": 41000
},
{
"epoch": 0.22107865072769503,
"grad_norm": 1.8357592821121216,
"learning_rate": 0.0009304697421555058,
"loss": 5.7859,
"step": 41500
},
{
"epoch": 0.22374224892923353,
"grad_norm": 1.7558057308197021,
"learning_rate": 0.0009295779169817783,
"loss": 5.7781,
"step": 42000
},
{
"epoch": 0.22640584713077203,
"grad_norm": 1.7014683485031128,
"learning_rate": 0.0009286860918080507,
"loss": 5.7703,
"step": 42500
},
{
"epoch": 0.22906944533231052,
"grad_norm": 1.8377306461334229,
"learning_rate": 0.0009277942666343233,
"loss": 5.7775,
"step": 43000
},
{
"epoch": 0.231733043533849,
"grad_norm": 1.7670570611953735,
"learning_rate": 0.0009269024414605957,
"loss": 5.7606,
"step": 43500
},
{
"epoch": 0.2343966417353875,
"grad_norm": 1.907322883605957,
"learning_rate": 0.0009260106162868681,
"loss": 5.7595,
"step": 44000
},
{
"epoch": 0.237060239936926,
"grad_norm": 1.9192357063293457,
"learning_rate": 0.000925120574763488,
"loss": 5.7574,
"step": 44500
},
{
"epoch": 0.23972383813846448,
"grad_norm": 1.801256775856018,
"learning_rate": 0.0009242287495897604,
"loss": 5.7623,
"step": 45000
},
{
"epoch": 0.24238743634000298,
"grad_norm": 1.7864599227905273,
"learning_rate": 0.000923336924416033,
"loss": 5.7464,
"step": 45500
},
{
"epoch": 0.24505103454154148,
"grad_norm": 2.0881760120391846,
"learning_rate": 0.0009224450992423054,
"loss": 5.7492,
"step": 46000
},
{
"epoch": 0.24771463274307998,
"grad_norm": 2.0729496479034424,
"learning_rate": 0.0009215550577189252,
"loss": 5.7464,
"step": 46500
},
{
"epoch": 0.2503782309446185,
"grad_norm": 1.807739496231079,
"learning_rate": 0.0009206632325451977,
"loss": 5.7391,
"step": 47000
},
{
"epoch": 0.25304182914615697,
"grad_norm": 1.7898356914520264,
"learning_rate": 0.0009197731910218176,
"loss": 5.7399,
"step": 47500
},
{
"epoch": 0.25570542734769547,
"grad_norm": 1.6668163537979126,
"learning_rate": 0.0009188813658480901,
"loss": 5.7316,
"step": 48000
},
{
"epoch": 0.25836902554923397,
"grad_norm": 1.743788242340088,
"learning_rate": 0.0009179895406743626,
"loss": 5.7251,
"step": 48500
},
{
"epoch": 0.26103262375077246,
"grad_norm": 1.7427009344100952,
"learning_rate": 0.000917097715500635,
"loss": 5.7231,
"step": 49000
},
{
"epoch": 0.26369622195231096,
"grad_norm": 1.8911422491073608,
"learning_rate": 0.0009162058903269075,
"loss": 5.7272,
"step": 49500
},
{
"epoch": 0.26635982015384946,
"grad_norm": 1.7783831357955933,
"learning_rate": 0.0009153140651531799,
"loss": 5.7193,
"step": 50000
},
{
"epoch": 0.2690234183553879,
"grad_norm": 1.75882089138031,
"learning_rate": 0.0009144222399794523,
"loss": 5.7233,
"step": 50500
},
{
"epoch": 0.2716870165569264,
"grad_norm": 1.8454984426498413,
"learning_rate": 0.0009135304148057249,
"loss": 5.7163,
"step": 51000
},
{
"epoch": 0.2743506147584649,
"grad_norm": 1.8908592462539673,
"learning_rate": 0.0009126403732823447,
"loss": 5.7175,
"step": 51500
},
{
"epoch": 0.2770142129600034,
"grad_norm": 1.6938859224319458,
"learning_rate": 0.0009117485481086172,
"loss": 5.7113,
"step": 52000
},
{
"epoch": 0.2796778111615419,
"grad_norm": 1.8087745904922485,
"learning_rate": 0.0009108567229348896,
"loss": 5.7104,
"step": 52500
},
{
"epoch": 0.2823414093630804,
"grad_norm": 1.9441509246826172,
"learning_rate": 0.000909964897761162,
"loss": 5.7006,
"step": 53000
},
{
"epoch": 0.2850050075646189,
"grad_norm": 2.016289710998535,
"learning_rate": 0.000909074856237782,
"loss": 5.7084,
"step": 53500
},
{
"epoch": 0.2876686057661574,
"grad_norm": 1.7924542427062988,
"learning_rate": 0.0009081830310640544,
"loss": 5.6967,
"step": 54000
},
{
"epoch": 0.2903322039676959,
"grad_norm": 1.8578925132751465,
"learning_rate": 0.0009072912058903269,
"loss": 5.7058,
"step": 54500
},
{
"epoch": 0.2929958021692344,
"grad_norm": 1.8592642545700073,
"learning_rate": 0.0009063993807165993,
"loss": 5.699,
"step": 55000
},
{
"epoch": 0.2956594003707729,
"grad_norm": 1.726891040802002,
"learning_rate": 0.0009055075555428717,
"loss": 5.6873,
"step": 55500
},
{
"epoch": 0.29832299857231137,
"grad_norm": 1.8885732889175415,
"learning_rate": 0.0009046175140194918,
"loss": 5.6859,
"step": 56000
},
{
"epoch": 0.30098659677384987,
"grad_norm": 1.6777235269546509,
"learning_rate": 0.0009037256888457643,
"loss": 5.6843,
"step": 56500
},
{
"epoch": 0.30365019497538837,
"grad_norm": 1.824777364730835,
"learning_rate": 0.0009028338636720367,
"loss": 5.6865,
"step": 57000
},
{
"epoch": 0.30631379317692686,
"grad_norm": 1.6151602268218994,
"learning_rate": 0.0009019420384983091,
"loss": 5.6864,
"step": 57500
},
{
"epoch": 0.30897739137846536,
"grad_norm": 1.7518750429153442,
"learning_rate": 0.0009010502133245816,
"loss": 5.6835,
"step": 58000
},
{
"epoch": 0.31164098958000386,
"grad_norm": 1.9652341604232788,
"learning_rate": 0.0009001583881508541,
"loss": 5.6778,
"step": 58500
},
{
"epoch": 0.31430458778154235,
"grad_norm": 1.8396164178848267,
"learning_rate": 0.0008992665629771265,
"loss": 5.6805,
"step": 59000
},
{
"epoch": 0.3169681859830808,
"grad_norm": 1.7397726774215698,
"learning_rate": 0.000898374737803399,
"loss": 5.6809,
"step": 59500
},
{
"epoch": 0.3196317841846193,
"grad_norm": 1.6550874710083008,
"learning_rate": 0.0008974846962800188,
"loss": 5.6713,
"step": 60000
},
{
"epoch": 0.3222953823861578,
"grad_norm": 1.7428010702133179,
"learning_rate": 0.0008965928711062913,
"loss": 5.6777,
"step": 60500
},
{
"epoch": 0.3249589805876963,
"grad_norm": 1.7465174198150635,
"learning_rate": 0.0008957028295829112,
"loss": 5.6668,
"step": 61000
},
{
"epoch": 0.3276225787892348,
"grad_norm": 1.719190239906311,
"learning_rate": 0.0008948110044091838,
"loss": 5.6736,
"step": 61500
},
{
"epoch": 0.3302861769907733,
"grad_norm": 1.6879175901412964,
"learning_rate": 0.0008939191792354562,
"loss": 5.6585,
"step": 62000
},
{
"epoch": 0.3329497751923118,
"grad_norm": 1.6741931438446045,
"learning_rate": 0.0008930273540617286,
"loss": 5.6584,
"step": 62500
},
{
"epoch": 0.3356133733938503,
"grad_norm": 1.8733186721801758,
"learning_rate": 0.0008921355288880011,
"loss": 5.6655,
"step": 63000
},
{
"epoch": 0.3382769715953888,
"grad_norm": 1.8366929292678833,
"learning_rate": 0.0008912454873646209,
"loss": 5.6551,
"step": 63500
},
{
"epoch": 0.3409405697969273,
"grad_norm": 1.7783548831939697,
"learning_rate": 0.0008903536621908935,
"loss": 5.6598,
"step": 64000
},
{
"epoch": 0.34360416799846577,
"grad_norm": 1.739394187927246,
"learning_rate": 0.0008894618370171659,
"loss": 5.6568,
"step": 64500
},
{
"epoch": 0.34626776620000427,
"grad_norm": 1.706986427307129,
"learning_rate": 0.0008885700118434383,
"loss": 5.6577,
"step": 65000
},
{
"epoch": 0.34893136440154277,
"grad_norm": 1.7595592737197876,
"learning_rate": 0.0008876781866697108,
"loss": 5.6504,
"step": 65500
},
{
"epoch": 0.35159496260308126,
"grad_norm": 1.7445604801177979,
"learning_rate": 0.0008867863614959832,
"loss": 5.6457,
"step": 66000
},
{
"epoch": 0.35425856080461976,
"grad_norm": 1.7039164304733276,
"learning_rate": 0.0008858945363222557,
"loss": 5.652,
"step": 66500
},
{
"epoch": 0.35692215900615826,
"grad_norm": 1.7117230892181396,
"learning_rate": 0.0008850027111485282,
"loss": 5.6456,
"step": 67000
},
{
"epoch": 0.35958575720769675,
"grad_norm": 1.8759076595306396,
"learning_rate": 0.000884112669625148,
"loss": 5.6504,
"step": 67500
},
{
"epoch": 0.36224935540923525,
"grad_norm": 1.5524253845214844,
"learning_rate": 0.0008832208444514205,
"loss": 5.6426,
"step": 68000
},
{
"epoch": 0.36491295361077375,
"grad_norm": 1.648575782775879,
"learning_rate": 0.0008823290192776929,
"loss": 5.6401,
"step": 68500
},
{
"epoch": 0.3675765518123122,
"grad_norm": 1.6062759160995483,
"learning_rate": 0.0008814371941039654,
"loss": 5.6466,
"step": 69000
},
{
"epoch": 0.3702401500138507,
"grad_norm": 1.5237386226654053,
"learning_rate": 0.0008805471525805854,
"loss": 5.6381,
"step": 69500
},
{
"epoch": 0.3729037482153892,
"grad_norm": 1.7291427850723267,
"learning_rate": 0.0008796553274068578,
"loss": 5.6337,
"step": 70000
},
{
"epoch": 0.3755673464169277,
"grad_norm": 1.875213623046875,
"learning_rate": 0.0008787635022331303,
"loss": 5.6356,
"step": 70500
},
{
"epoch": 0.3782309446184662,
"grad_norm": 1.8453514575958252,
"learning_rate": 0.0008778716770594027,
"loss": 5.6348,
"step": 71000
},
{
"epoch": 0.3808945428200047,
"grad_norm": 1.725234866142273,
"learning_rate": 0.0008769816355360227,
"loss": 5.6318,
"step": 71500
},
{
"epoch": 0.3835581410215432,
"grad_norm": 1.7739455699920654,
"learning_rate": 0.0008760898103622951,
"loss": 5.6296,
"step": 72000
},
{
"epoch": 0.3862217392230817,
"grad_norm": 1.683827519416809,
"learning_rate": 0.0008751979851885675,
"loss": 5.6357,
"step": 72500
},
{
"epoch": 0.38888533742462017,
"grad_norm": 1.5576590299606323,
"learning_rate": 0.00087430616001484,
"loss": 5.63,
"step": 73000
},
{
"epoch": 0.39154893562615867,
"grad_norm": 1.666030764579773,
"learning_rate": 0.0008734161184914598,
"loss": 5.6178,
"step": 73500
},
{
"epoch": 0.39421253382769716,
"grad_norm": 1.618916392326355,
"learning_rate": 0.0008725242933177324,
"loss": 5.6273,
"step": 74000
},
{
"epoch": 0.39687613202923566,
"grad_norm": 1.69428551197052,
"learning_rate": 0.0008716324681440048,
"loss": 5.6188,
"step": 74500
},
{
"epoch": 0.39953973023077416,
"grad_norm": 1.8516380786895752,
"learning_rate": 0.0008707406429702772,
"loss": 5.6235,
"step": 75000
},
{
"epoch": 0.40220332843231266,
"grad_norm": 1.505953311920166,
"learning_rate": 0.0008698506014468972,
"loss": 5.6175,
"step": 75500
},
{
"epoch": 0.40486692663385115,
"grad_norm": 1.5639010667800903,
"learning_rate": 0.0008689587762731696,
"loss": 5.6213,
"step": 76000
},
{
"epoch": 0.40753052483538965,
"grad_norm": 1.7431727647781372,
"learning_rate": 0.0008680669510994421,
"loss": 5.6198,
"step": 76500
},
{
"epoch": 0.41019412303692815,
"grad_norm": 1.676757574081421,
"learning_rate": 0.0008671751259257146,
"loss": 5.6252,
"step": 77000
},
{
"epoch": 0.41285772123846665,
"grad_norm": 1.6216061115264893,
"learning_rate": 0.0008662850844023345,
"loss": 5.6211,
"step": 77500
},
{
"epoch": 0.4155213194400051,
"grad_norm": 1.6766453981399536,
"learning_rate": 0.0008653932592286069,
"loss": 5.62,
"step": 78000
},
{
"epoch": 0.4181849176415436,
"grad_norm": 1.6790215969085693,
"learning_rate": 0.0008645014340548793,
"loss": 5.6093,
"step": 78500
},
{
"epoch": 0.4208485158430821,
"grad_norm": 1.8037434816360474,
"learning_rate": 0.0008636096088811518,
"loss": 5.6085,
"step": 79000
},
{
"epoch": 0.4235121140446206,
"grad_norm": 1.6324502229690552,
"learning_rate": 0.0008627195673577717,
"loss": 5.6031,
"step": 79500
},
{
"epoch": 0.4261757122461591,
"grad_norm": 1.6987981796264648,
"learning_rate": 0.0008618277421840443,
"loss": 5.6116,
"step": 80000
},
{
"epoch": 0.4288393104476976,
"grad_norm": 1.6692321300506592,
"learning_rate": 0.0008609359170103167,
"loss": 5.6062,
"step": 80500
},
{
"epoch": 0.43150290864923607,
"grad_norm": 1.6387773752212524,
"learning_rate": 0.0008600440918365891,
"loss": 5.6087,
"step": 81000
},
{
"epoch": 0.43416650685077457,
"grad_norm": 1.792861819267273,
"learning_rate": 0.000859154050313209,
"loss": 5.608,
"step": 81500
},
{
"epoch": 0.43683010505231307,
"grad_norm": 1.676076889038086,
"learning_rate": 0.0008582622251394815,
"loss": 5.6056,
"step": 82000
},
{
"epoch": 0.43949370325385156,
"grad_norm": 1.772159218788147,
"learning_rate": 0.000857370399965754,
"loss": 5.6015,
"step": 82500
},
{
"epoch": 0.44215730145539006,
"grad_norm": 1.7022145986557007,
"learning_rate": 0.0008564785747920264,
"loss": 5.6056,
"step": 83000
},
{
"epoch": 0.44482089965692856,
"grad_norm": 1.6428086757659912,
"learning_rate": 0.0008555885332686463,
"loss": 5.596,
"step": 83500
},
{
"epoch": 0.44748449785846706,
"grad_norm": 1.6144286394119263,
"learning_rate": 0.0008546967080949187,
"loss": 5.5974,
"step": 84000
},
{
"epoch": 0.45014809606000555,
"grad_norm": 1.5918573141098022,
"learning_rate": 0.0008538048829211912,
"loss": 5.604,
"step": 84500
},
{
"epoch": 0.45281169426154405,
"grad_norm": 1.7871578931808472,
"learning_rate": 0.0008529130577474637,
"loss": 5.5951,
"step": 85000
},
{
"epoch": 0.45547529246308255,
"grad_norm": 1.6631501913070679,
"learning_rate": 0.0008520230162240836,
"loss": 5.6014,
"step": 85500
},
{
"epoch": 0.45813889066462105,
"grad_norm": 1.6243520975112915,
"learning_rate": 0.0008511311910503561,
"loss": 5.5942,
"step": 86000
},
{
"epoch": 0.46080248886615954,
"grad_norm": 1.5686520338058472,
"learning_rate": 0.0008502393658766285,
"loss": 5.5981,
"step": 86500
},
{
"epoch": 0.463466087067698,
"grad_norm": 1.7691351175308228,
"learning_rate": 0.0008493475407029009,
"loss": 5.5984,
"step": 87000
},
{
"epoch": 0.4661296852692365,
"grad_norm": 1.6885465383529663,
"learning_rate": 0.0008484574991795209,
"loss": 5.5851,
"step": 87500
},
{
"epoch": 0.468793283470775,
"grad_norm": 1.6488664150238037,
"learning_rate": 0.0008475656740057933,
"loss": 5.5831,
"step": 88000
},
{
"epoch": 0.4714568816723135,
"grad_norm": 1.5736653804779053,
"learning_rate": 0.0008466738488320658,
"loss": 5.582,
"step": 88500
},
{
"epoch": 0.474120479873852,
"grad_norm": 1.7857962846755981,
"learning_rate": 0.0008457820236583382,
"loss": 5.5901,
"step": 89000
},
{
"epoch": 0.47678407807539047,
"grad_norm": 1.7936720848083496,
"learning_rate": 0.0008448919821349581,
"loss": 5.5822,
"step": 89500
},
{
"epoch": 0.47944767627692897,
"grad_norm": 1.546919345855713,
"learning_rate": 0.0008440001569612306,
"loss": 5.581,
"step": 90000
},
{
"epoch": 0.48211127447846747,
"grad_norm": 1.778827428817749,
"learning_rate": 0.000843108331787503,
"loss": 5.5922,
"step": 90500
},
{
"epoch": 0.48477487268000596,
"grad_norm": 1.495205044746399,
"learning_rate": 0.0008422165066137755,
"loss": 5.5821,
"step": 91000
},
{
"epoch": 0.48743847088154446,
"grad_norm": 1.6151823997497559,
"learning_rate": 0.0008413264650903954,
"loss": 5.5801,
"step": 91500
},
{
"epoch": 0.49010206908308296,
"grad_norm": 1.7652384042739868,
"learning_rate": 0.0008404346399166679,
"loss": 5.5785,
"step": 92000
},
{
"epoch": 0.49276566728462146,
"grad_norm": 1.7062280178070068,
"learning_rate": 0.0008395428147429404,
"loss": 5.5784,
"step": 92500
},
{
"epoch": 0.49542926548615995,
"grad_norm": 1.5986762046813965,
"learning_rate": 0.0008386509895692128,
"loss": 5.5814,
"step": 93000
},
{
"epoch": 0.49809286368769845,
"grad_norm": 1.672861933708191,
"learning_rate": 0.0008377609480458327,
"loss": 5.5743,
"step": 93500
},
{
"epoch": 0.500756461889237,
"grad_norm": 1.8104331493377686,
"learning_rate": 0.0008368691228721051,
"loss": 5.5709,
"step": 94000
},
{
"epoch": 0.5034200600907754,
"grad_norm": 1.8253047466278076,
"learning_rate": 0.0008359772976983776,
"loss": 5.5642,
"step": 94500
},
{
"epoch": 0.5060836582923139,
"grad_norm": 1.604465126991272,
"learning_rate": 0.0008350854725246501,
"loss": 5.5691,
"step": 95000
},
{
"epoch": 0.5087472564938524,
"grad_norm": 1.7985742092132568,
"learning_rate": 0.00083419543100127,
"loss": 5.5611,
"step": 95500
},
{
"epoch": 0.5114108546953909,
"grad_norm": 1.652733325958252,
"learning_rate": 0.0008333036058275424,
"loss": 5.5577,
"step": 96000
},
{
"epoch": 0.5140744528969294,
"grad_norm": 1.8247016668319702,
"learning_rate": 0.0008324117806538148,
"loss": 5.5557,
"step": 96500
},
{
"epoch": 0.5167380510984679,
"grad_norm": 1.784303069114685,
"learning_rate": 0.0008315199554800873,
"loss": 5.5554,
"step": 97000
},
{
"epoch": 0.5194016493000064,
"grad_norm": 1.705725073814392,
"learning_rate": 0.0008306299139567072,
"loss": 5.5545,
"step": 97500
},
{
"epoch": 0.5220652475015449,
"grad_norm": 1.8760724067687988,
"learning_rate": 0.0008297380887829798,
"loss": 5.5512,
"step": 98000
},
{
"epoch": 0.5247288457030834,
"grad_norm": 1.7412986755371094,
"learning_rate": 0.0008288462636092522,
"loss": 5.5522,
"step": 98500
},
{
"epoch": 0.5273924439046219,
"grad_norm": 2.0051610469818115,
"learning_rate": 0.0008279544384355246,
"loss": 5.5403,
"step": 99000
},
{
"epoch": 0.5300560421061604,
"grad_norm": 1.6867221593856812,
"learning_rate": 0.0008270643969121445,
"loss": 5.544,
"step": 99500
},
{
"epoch": 0.5327196403076989,
"grad_norm": 1.838189721107483,
"learning_rate": 0.0008261725717384169,
"loss": 5.5396,
"step": 100000
},
{
"epoch": 0.5353832385092374,
"grad_norm": 1.655271291732788,
"learning_rate": 0.0008252807465646895,
"loss": 5.5358,
"step": 100500
},
{
"epoch": 0.5380468367107758,
"grad_norm": 1.8378669023513794,
"learning_rate": 0.0008243889213909619,
"loss": 5.5419,
"step": 101000
},
{
"epoch": 0.5407104349123143,
"grad_norm": 1.7509022951126099,
"learning_rate": 0.0008234988798675818,
"loss": 5.523,
"step": 101500
},
{
"epoch": 0.5433740331138528,
"grad_norm": 1.9558390378952026,
"learning_rate": 0.0008226070546938542,
"loss": 5.5322,
"step": 102000
},
{
"epoch": 0.5460376313153913,
"grad_norm": 2.0113561153411865,
"learning_rate": 0.0008217152295201266,
"loss": 5.5303,
"step": 102500
},
{
"epoch": 0.5487012295169298,
"grad_norm": 1.989725112915039,
"learning_rate": 0.0008208234043463993,
"loss": 5.5257,
"step": 103000
},
{
"epoch": 0.5513648277184683,
"grad_norm": 1.702812671661377,
"learning_rate": 0.0008199315791726717,
"loss": 5.5327,
"step": 103500
},
{
"epoch": 0.5540284259200068,
"grad_norm": 1.8519411087036133,
"learning_rate": 0.0008190397539989441,
"loss": 5.5272,
"step": 104000
},
{
"epoch": 0.5566920241215453,
"grad_norm": 1.856350064277649,
"learning_rate": 0.0008181479288252166,
"loss": 5.5211,
"step": 104500
},
{
"epoch": 0.5593556223230838,
"grad_norm": 1.7010074853897095,
"learning_rate": 0.000817256103651489,
"loss": 5.5287,
"step": 105000
},
{
"epoch": 0.5620192205246223,
"grad_norm": 1.6479413509368896,
"learning_rate": 0.000816366062128109,
"loss": 5.5279,
"step": 105500
},
{
"epoch": 0.5646828187261608,
"grad_norm": 1.9108966588974,
"learning_rate": 0.0008154742369543814,
"loss": 5.5203,
"step": 106000
},
{
"epoch": 0.5673464169276993,
"grad_norm": 1.9142667055130005,
"learning_rate": 0.0008145824117806538,
"loss": 5.5189,
"step": 106500
},
{
"epoch": 0.5700100151292378,
"grad_norm": 1.8495519161224365,
"learning_rate": 0.0008136905866069263,
"loss": 5.5196,
"step": 107000
},
{
"epoch": 0.5726736133307763,
"grad_norm": 2.063087224960327,
"learning_rate": 0.0008128005450835461,
"loss": 5.5132,
"step": 107500
},
{
"epoch": 0.5753372115323148,
"grad_norm": 2.0009357929229736,
"learning_rate": 0.0008119087199098186,
"loss": 5.5177,
"step": 108000
},
{
"epoch": 0.5780008097338533,
"grad_norm": 2.0125739574432373,
"learning_rate": 0.0008110168947360911,
"loss": 5.5112,
"step": 108500
},
{
"epoch": 0.5806644079353918,
"grad_norm": 1.8415509462356567,
"learning_rate": 0.0008101250695623635,
"loss": 5.509,
"step": 109000
},
{
"epoch": 0.5833280061369303,
"grad_norm": 1.7688753604888916,
"learning_rate": 0.0008092350280389835,
"loss": 5.5032,
"step": 109500
},
{
"epoch": 0.5859916043384688,
"grad_norm": 1.8354215621948242,
"learning_rate": 0.000808343202865256,
"loss": 5.5129,
"step": 110000
},
{
"epoch": 0.5886552025400072,
"grad_norm": 2.036357879638672,
"learning_rate": 0.0008074513776915284,
"loss": 5.5043,
"step": 110500
},
{
"epoch": 0.5913188007415457,
"grad_norm": 1.8382165431976318,
"learning_rate": 0.0008065595525178009,
"loss": 5.5065,
"step": 111000
},
{
"epoch": 0.5939823989430842,
"grad_norm": 2.001885175704956,
"learning_rate": 0.0008056695109944208,
"loss": 5.507,
"step": 111500
},
{
"epoch": 0.5966459971446227,
"grad_norm": 1.872819423675537,
"learning_rate": 0.0008047776858206932,
"loss": 5.5081,
"step": 112000
},
{
"epoch": 0.5993095953461612,
"grad_norm": 1.8629109859466553,
"learning_rate": 0.0008038858606469656,
"loss": 5.5078,
"step": 112500
},
{
"epoch": 0.6019731935476997,
"grad_norm": 2.0044994354248047,
"learning_rate": 0.0008029940354732381,
"loss": 5.498,
"step": 113000
},
{
"epoch": 0.6046367917492382,
"grad_norm": 1.9607182741165161,
"learning_rate": 0.000802103993949858,
"loss": 5.5092,
"step": 113500
},
{
"epoch": 0.6073003899507767,
"grad_norm": 1.9605486392974854,
"learning_rate": 0.0008012121687761305,
"loss": 5.5013,
"step": 114000
},
{
"epoch": 0.6099639881523152,
"grad_norm": 1.999872088432312,
"learning_rate": 0.0008003203436024029,
"loss": 5.497,
"step": 114500
},
{
"epoch": 0.6126275863538537,
"grad_norm": 1.7834984064102173,
"learning_rate": 0.0007994285184286753,
"loss": 5.5001,
"step": 115000
},
{
"epoch": 0.6152911845553922,
"grad_norm": 1.9666252136230469,
"learning_rate": 0.0007985384769052953,
"loss": 5.5004,
"step": 115500
},
{
"epoch": 0.6179547827569307,
"grad_norm": 1.810936450958252,
"learning_rate": 0.0007976484353819152,
"loss": 5.4934,
"step": 116000
},
{
"epoch": 0.6206183809584692,
"grad_norm": 1.8183609247207642,
"learning_rate": 0.0007967566102081877,
"loss": 5.4999,
"step": 116500
},
{
"epoch": 0.6232819791600077,
"grad_norm": 2.1452646255493164,
"learning_rate": 0.0007958647850344601,
"loss": 5.4937,
"step": 117000
},
{
"epoch": 0.6259455773615462,
"grad_norm": 1.984305739402771,
"learning_rate": 0.0007949729598607326,
"loss": 5.494,
"step": 117500
},
{
"epoch": 0.6286091755630847,
"grad_norm": 2.1507790088653564,
"learning_rate": 0.000794081134687005,
"loss": 5.4915,
"step": 118000
},
{
"epoch": 0.6312727737646232,
"grad_norm": 1.821390151977539,
"learning_rate": 0.0007931910931636249,
"loss": 5.4948,
"step": 118500
},
{
"epoch": 0.6339363719661616,
"grad_norm": 1.901696801185608,
"learning_rate": 0.0007922992679898974,
"loss": 5.4944,
"step": 119000
},
{
"epoch": 0.6365999701677001,
"grad_norm": 2.214447259902954,
"learning_rate": 0.0007914074428161698,
"loss": 5.4901,
"step": 119500
},
{
"epoch": 0.6392635683692386,
"grad_norm": 1.8764078617095947,
"learning_rate": 0.0007905156176424423,
"loss": 5.4837,
"step": 120000
},
{
"epoch": 0.6419271665707771,
"grad_norm": 1.9411547183990479,
"learning_rate": 0.0007896237924687147,
"loss": 5.4889,
"step": 120500
},
{
"epoch": 0.6445907647723156,
"grad_norm": 1.8323979377746582,
"learning_rate": 0.0007887319672949871,
"loss": 5.49,
"step": 121000
},
{
"epoch": 0.6472543629738541,
"grad_norm": 1.8666421175003052,
"learning_rate": 0.0007878401421212597,
"loss": 5.4911,
"step": 121500
},
{
"epoch": 0.6499179611753926,
"grad_norm": 2.0501484870910645,
"learning_rate": 0.0007869483169475321,
"loss": 5.4894,
"step": 122000
},
{
"epoch": 0.6525815593769311,
"grad_norm": 1.8784074783325195,
"learning_rate": 0.0007860600590744995,
"loss": 5.4911,
"step": 122500
},
{
"epoch": 0.6552451575784696,
"grad_norm": 1.9021259546279907,
"learning_rate": 0.000785168233900772,
"loss": 5.4844,
"step": 123000
},
{
"epoch": 0.6579087557800081,
"grad_norm": 2.053755283355713,
"learning_rate": 0.0007842764087270444,
"loss": 5.4884,
"step": 123500
},
{
"epoch": 0.6605723539815466,
"grad_norm": 1.9320204257965088,
"learning_rate": 0.0007833845835533169,
"loss": 5.4822,
"step": 124000
},
{
"epoch": 0.6632359521830851,
"grad_norm": 1.793219804763794,
"learning_rate": 0.0007824945420299368,
"loss": 5.4834,
"step": 124500
},
{
"epoch": 0.6658995503846236,
"grad_norm": 2.0100185871124268,
"learning_rate": 0.0007816027168562092,
"loss": 5.4872,
"step": 125000
},
{
"epoch": 0.6685631485861621,
"grad_norm": 2.0543274879455566,
"learning_rate": 0.0007807108916824816,
"loss": 5.4826,
"step": 125500
},
{
"epoch": 0.6712267467877006,
"grad_norm": 1.9622262716293335,
"learning_rate": 0.0007798190665087542,
"loss": 5.4809,
"step": 126000
},
{
"epoch": 0.673890344989239,
"grad_norm": 1.918966293334961,
"learning_rate": 0.0007789272413350267,
"loss": 5.4823,
"step": 126500
},
{
"epoch": 0.6765539431907776,
"grad_norm": 1.8516751527786255,
"learning_rate": 0.0007780354161612992,
"loss": 5.4786,
"step": 127000
},
{
"epoch": 0.679217541392316,
"grad_norm": 1.8985280990600586,
"learning_rate": 0.000777145374637919,
"loss": 5.4762,
"step": 127500
},
{
"epoch": 0.6818811395938545,
"grad_norm": 2.030210018157959,
"learning_rate": 0.0007762535494641915,
"loss": 5.4786,
"step": 128000
},
{
"epoch": 0.684544737795393,
"grad_norm": 1.9270013570785522,
"learning_rate": 0.0007753617242904639,
"loss": 5.4801,
"step": 128500
},
{
"epoch": 0.6872083359969315,
"grad_norm": 1.7799612283706665,
"learning_rate": 0.0007744698991167364,
"loss": 5.4715,
"step": 129000
},
{
"epoch": 0.68987193419847,
"grad_norm": 2.1841835975646973,
"learning_rate": 0.0007735780739430089,
"loss": 5.4726,
"step": 129500
},
{
"epoch": 0.6925355324000085,
"grad_norm": 1.970680594444275,
"learning_rate": 0.0007726862487692813,
"loss": 5.4751,
"step": 130000
},
{
"epoch": 0.695199130601547,
"grad_norm": 2.1457014083862305,
"learning_rate": 0.0007717944235955537,
"loss": 5.4754,
"step": 130500
},
{
"epoch": 0.6978627288030855,
"grad_norm": 1.8095160722732544,
"learning_rate": 0.0007709025984218262,
"loss": 5.4723,
"step": 131000
},
{
"epoch": 0.700526327004624,
"grad_norm": 1.8374313116073608,
"learning_rate": 0.000770012556898446,
"loss": 5.4774,
"step": 131500
},
{
"epoch": 0.7031899252061625,
"grad_norm": 1.8603581190109253,
"learning_rate": 0.0007691207317247186,
"loss": 5.477,
"step": 132000
},
{
"epoch": 0.705853523407701,
"grad_norm": 1.9838221073150635,
"learning_rate": 0.0007682306902013385,
"loss": 5.4732,
"step": 132500
},
{
"epoch": 0.7085171216092395,
"grad_norm": 1.9500114917755127,
"learning_rate": 0.000767338865027611,
"loss": 5.4742,
"step": 133000
},
{
"epoch": 0.711180719810778,
"grad_norm": 1.9748975038528442,
"learning_rate": 0.0007664470398538834,
"loss": 5.4675,
"step": 133500
},
{
"epoch": 0.7138443180123165,
"grad_norm": 1.7860807180404663,
"learning_rate": 0.0007655552146801558,
"loss": 5.4711,
"step": 134000
},
{
"epoch": 0.716507916213855,
"grad_norm": 2.076504945755005,
"learning_rate": 0.0007646633895064284,
"loss": 5.4691,
"step": 134500
},
{
"epoch": 0.7191715144153935,
"grad_norm": 2.1392953395843506,
"learning_rate": 0.0007637715643327008,
"loss": 5.4763,
"step": 135000
},
{
"epoch": 0.721835112616932,
"grad_norm": 1.7750567197799683,
"learning_rate": 0.0007628797391589732,
"loss": 5.4624,
"step": 135500
},
{
"epoch": 0.7244987108184705,
"grad_norm": 2.1746318340301514,
"learning_rate": 0.0007619879139852457,
"loss": 5.4632,
"step": 136000
},
{
"epoch": 0.727162309020009,
"grad_norm": 1.9568692445755005,
"learning_rate": 0.0007610978724618655,
"loss": 5.4702,
"step": 136500
},
{
"epoch": 0.7298259072215475,
"grad_norm": 1.940618634223938,
"learning_rate": 0.0007602060472881381,
"loss": 5.4682,
"step": 137000
},
{
"epoch": 0.7324895054230859,
"grad_norm": 2.0432674884796143,
"learning_rate": 0.0007593142221144105,
"loss": 5.4661,
"step": 137500
},
{
"epoch": 0.7351531036246244,
"grad_norm": 1.989637017250061,
"learning_rate": 0.0007584223969406829,
"loss": 5.4643,
"step": 138000
},
{
"epoch": 0.7378167018261629,
"grad_norm": 1.7842735052108765,
"learning_rate": 0.0007575305717669554,
"loss": 5.4633,
"step": 138500
},
{
"epoch": 0.7404803000277014,
"grad_norm": 2.000488519668579,
"learning_rate": 0.0007566405302435752,
"loss": 5.4645,
"step": 139000
},
{
"epoch": 0.7431438982292399,
"grad_norm": 1.9219857454299927,
"learning_rate": 0.0007557487050698478,
"loss": 5.4587,
"step": 139500
},
{
"epoch": 0.7458074964307784,
"grad_norm": 1.8964563608169556,
"learning_rate": 0.0007548568798961202,
"loss": 5.4594,
"step": 140000
},
{
"epoch": 0.7484710946323169,
"grad_norm": 2.0744431018829346,
"learning_rate": 0.0007539650547223926,
"loss": 5.4677,
"step": 140500
},
{
"epoch": 0.7511346928338554,
"grad_norm": 2.0807344913482666,
"learning_rate": 0.0007530732295486651,
"loss": 5.4594,
"step": 141000
},
{
"epoch": 0.7537982910353939,
"grad_norm": 1.9063740968704224,
"learning_rate": 0.0007521814043749375,
"loss": 5.4614,
"step": 141500
},
{
"epoch": 0.7564618892369324,
"grad_norm": 1.8823788166046143,
"learning_rate": 0.0007512913628515576,
"loss": 5.4612,
"step": 142000
},
{
"epoch": 0.7591254874384709,
"grad_norm": 2.027939558029175,
"learning_rate": 0.00075039953767783,
"loss": 5.457,
"step": 142500
},
{
"epoch": 0.7617890856400094,
"grad_norm": 1.956814169883728,
"learning_rate": 0.0007495077125041024,
"loss": 5.4561,
"step": 143000
},
{
"epoch": 0.7644526838415479,
"grad_norm": 1.8203577995300293,
"learning_rate": 0.0007486158873303749,
"loss": 5.4612,
"step": 143500
},
{
"epoch": 0.7671162820430864,
"grad_norm": 2.0049407482147217,
"learning_rate": 0.0007477240621566473,
"loss": 5.4572,
"step": 144000
},
{
"epoch": 0.7697798802446248,
"grad_norm": 2.0092926025390625,
"learning_rate": 0.0007468322369829198,
"loss": 5.4566,
"step": 144500
},
{
"epoch": 0.7724434784461633,
"grad_norm": 1.9448853731155396,
"learning_rate": 0.0007459421954595397,
"loss": 5.4567,
"step": 145000
},
{
"epoch": 0.7751070766477018,
"grad_norm": 1.9080660343170166,
"learning_rate": 0.0007450503702858121,
"loss": 5.4529,
"step": 145500
},
{
"epoch": 0.7777706748492403,
"grad_norm": 2.0922887325286865,
"learning_rate": 0.0007441585451120846,
"loss": 5.4594,
"step": 146000
},
{
"epoch": 0.7804342730507788,
"grad_norm": 2.102870464324951,
"learning_rate": 0.000743266719938357,
"loss": 5.4533,
"step": 146500
},
{
"epoch": 0.7830978712523173,
"grad_norm": 1.8905880451202393,
"learning_rate": 0.0007423748947646295,
"loss": 5.4512,
"step": 147000
},
{
"epoch": 0.7857614694538558,
"grad_norm": 1.937587857246399,
"learning_rate": 0.000741483069590902,
"loss": 5.4577,
"step": 147500
},
{
"epoch": 0.7884250676553943,
"grad_norm": 2.2599427700042725,
"learning_rate": 0.0007405912444171744,
"loss": 5.4545,
"step": 148000
},
{
"epoch": 0.7910886658569328,
"grad_norm": 2.1247055530548096,
"learning_rate": 0.0007396994192434468,
"loss": 5.4552,
"step": 148500
},
{
"epoch": 0.7937522640584713,
"grad_norm": 1.8920656442642212,
"learning_rate": 0.0007388093777200668,
"loss": 5.4551,
"step": 149000
},
{
"epoch": 0.7964158622600098,
"grad_norm": 2.05411696434021,
"learning_rate": 0.0007379175525463393,
"loss": 5.4581,
"step": 149500
},
{
"epoch": 0.7990794604615483,
"grad_norm": 2.1096110343933105,
"learning_rate": 0.0007370257273726118,
"loss": 5.4553,
"step": 150000
},
{
"epoch": 0.8017430586630868,
"grad_norm": 2.060760736465454,
"learning_rate": 0.0007361339021988842,
"loss": 5.4557,
"step": 150500
},
{
"epoch": 0.8044066568646253,
"grad_norm": 1.7533081769943237,
"learning_rate": 0.0007352438606755041,
"loss": 5.4596,
"step": 151000
},
{
"epoch": 0.8070702550661638,
"grad_norm": 1.948110580444336,
"learning_rate": 0.0007343520355017765,
"loss": 5.4581,
"step": 151500
},
{
"epoch": 0.8097338532677023,
"grad_norm": 2.0876693725585938,
"learning_rate": 0.000733460210328049,
"loss": 5.4517,
"step": 152000
},
{
"epoch": 0.8123974514692408,
"grad_norm": 1.8972123861312866,
"learning_rate": 0.0007325701688046689,
"loss": 5.4529,
"step": 152500
},
{
"epoch": 0.8150610496707793,
"grad_norm": 2.0049657821655273,
"learning_rate": 0.0007316783436309413,
"loss": 5.4506,
"step": 153000
},
{
"epoch": 0.8177246478723178,
"grad_norm": 1.9599244594573975,
"learning_rate": 0.0007307865184572138,
"loss": 5.4503,
"step": 153500
},
{
"epoch": 0.8203882460738563,
"grad_norm": 2.090162992477417,
"learning_rate": 0.0007298946932834862,
"loss": 5.4487,
"step": 154000
},
{
"epoch": 0.8230518442753948,
"grad_norm": 1.9685425758361816,
"learning_rate": 0.0007290028681097586,
"loss": 5.4459,
"step": 154500
},
{
"epoch": 0.8257154424769333,
"grad_norm": 2.0231292247772217,
"learning_rate": 0.0007281110429360312,
"loss": 5.4519,
"step": 155000
},
{
"epoch": 0.8283790406784717,
"grad_norm": 1.824242353439331,
"learning_rate": 0.0007272192177623036,
"loss": 5.4495,
"step": 155500
},
{
"epoch": 0.8310426388800102,
"grad_norm": 1.8740367889404297,
"learning_rate": 0.000726327392588576,
"loss": 5.4514,
"step": 156000
},
{
"epoch": 0.8337062370815487,
"grad_norm": 1.898790955543518,
"learning_rate": 0.000725437351065196,
"loss": 5.4442,
"step": 156500
},
{
"epoch": 0.8363698352830872,
"grad_norm": 1.9713107347488403,
"learning_rate": 0.0007245455258914684,
"loss": 5.4481,
"step": 157000
},
{
"epoch": 0.8390334334846257,
"grad_norm": 1.892471432685852,
"learning_rate": 0.000723653700717741,
"loss": 5.4514,
"step": 157500
},
{
"epoch": 0.8416970316861642,
"grad_norm": 2.0477683544158936,
"learning_rate": 0.0007227618755440134,
"loss": 5.4402,
"step": 158000
},
{
"epoch": 0.8443606298877027,
"grad_norm": 1.9651503562927246,
"learning_rate": 0.0007218736176709807,
"loss": 5.439,
"step": 158500
},
{
"epoch": 0.8470242280892412,
"grad_norm": 1.9664440155029297,
"learning_rate": 0.0007209817924972531,
"loss": 5.4512,
"step": 159000
},
{
"epoch": 0.8496878262907797,
"grad_norm": 1.9268772602081299,
"learning_rate": 0.0007200899673235256,
"loss": 5.4445,
"step": 159500
},
{
"epoch": 0.8523514244923182,
"grad_norm": 2.0761542320251465,
"learning_rate": 0.0007191981421497981,
"loss": 5.4476,
"step": 160000
},
{
"epoch": 0.8550150226938567,
"grad_norm": 2.080336570739746,
"learning_rate": 0.0007183063169760705,
"loss": 5.4472,
"step": 160500
},
{
"epoch": 0.8576786208953951,
"grad_norm": 1.8157365322113037,
"learning_rate": 0.000717414491802343,
"loss": 5.4471,
"step": 161000
},
{
"epoch": 0.8603422190969336,
"grad_norm": 1.7620859146118164,
"learning_rate": 0.0007165226666286154,
"loss": 5.4486,
"step": 161500
},
{
"epoch": 0.8630058172984721,
"grad_norm": 1.8530540466308594,
"learning_rate": 0.0007156326251052354,
"loss": 5.4403,
"step": 162000
},
{
"epoch": 0.8656694155000106,
"grad_norm": 1.91478431224823,
"learning_rate": 0.0007147407999315079,
"loss": 5.4453,
"step": 162500
},
{
"epoch": 0.8683330137015491,
"grad_norm": 1.944806456565857,
"learning_rate": 0.0007138489747577804,
"loss": 5.4438,
"step": 163000
},
{
"epoch": 0.8709966119030876,
"grad_norm": 1.941565752029419,
"learning_rate": 0.0007129571495840528,
"loss": 5.4403,
"step": 163500
},
{
"epoch": 0.8736602101046261,
"grad_norm": 1.8101640939712524,
"learning_rate": 0.0007120653244103252,
"loss": 5.4352,
"step": 164000
},
{
"epoch": 0.8763238083061646,
"grad_norm": 2.391594171524048,
"learning_rate": 0.0007111752828869451,
"loss": 5.4379,
"step": 164500
},
{
"epoch": 0.8789874065077031,
"grad_norm": 1.946295142173767,
"learning_rate": 0.0007102834577132175,
"loss": 5.4385,
"step": 165000
},
{
"epoch": 0.8816510047092416,
"grad_norm": 2.1615066528320312,
"learning_rate": 0.00070939163253949,
"loss": 5.4439,
"step": 165500
},
{
"epoch": 0.8843146029107801,
"grad_norm": 2.0320687294006348,
"learning_rate": 0.0007084998073657625,
"loss": 5.4434,
"step": 166000
},
{
"epoch": 0.8869782011123186,
"grad_norm": 1.8692481517791748,
"learning_rate": 0.0007076079821920349,
"loss": 5.437,
"step": 166500
},
{
"epoch": 0.8896417993138571,
"grad_norm": 2.007511854171753,
"learning_rate": 0.0007067161570183073,
"loss": 5.4327,
"step": 167000
},
{
"epoch": 0.8923053975153956,
"grad_norm": 2.02004337310791,
"learning_rate": 0.0007058243318445799,
"loss": 5.4393,
"step": 167500
},
{
"epoch": 0.8949689957169341,
"grad_norm": 1.7644096612930298,
"learning_rate": 0.0007049325066708523,
"loss": 5.4304,
"step": 168000
},
{
"epoch": 0.8976325939184726,
"grad_norm": 2.0698578357696533,
"learning_rate": 0.0007040424651474723,
"loss": 5.4301,
"step": 168500
},
{
"epoch": 0.9002961921200111,
"grad_norm": 1.881465196609497,
"learning_rate": 0.0007031506399737447,
"loss": 5.4399,
"step": 169000
},
{
"epoch": 0.9029597903215496,
"grad_norm": 2.0607750415802,
"learning_rate": 0.0007022588148000172,
"loss": 5.4311,
"step": 169500
},
{
"epoch": 0.9056233885230881,
"grad_norm": 2.1066737174987793,
"learning_rate": 0.0007013669896262897,
"loss": 5.4348,
"step": 170000
},
{
"epoch": 0.9082869867246266,
"grad_norm": 2.0234835147857666,
"learning_rate": 0.0007004769481029096,
"loss": 5.4337,
"step": 170500
},
{
"epoch": 0.9109505849261651,
"grad_norm": 1.8877592086791992,
"learning_rate": 0.000699585122929182,
"loss": 5.4389,
"step": 171000
},
{
"epoch": 0.9136141831277036,
"grad_norm": 2.117302417755127,
"learning_rate": 0.0006986932977554544,
"loss": 5.4333,
"step": 171500
},
{
"epoch": 0.9162777813292421,
"grad_norm": 2.073172092437744,
"learning_rate": 0.0006978014725817269,
"loss": 5.4318,
"step": 172000
},
{
"epoch": 0.9189413795307806,
"grad_norm": 2.064408540725708,
"learning_rate": 0.0006969114310583467,
"loss": 5.431,
"step": 172500
},
{
"epoch": 0.9216049777323191,
"grad_norm": 1.9481194019317627,
"learning_rate": 0.0006960196058846193,
"loss": 5.4321,
"step": 173000
},
{
"epoch": 0.9242685759338576,
"grad_norm": 2.010923147201538,
"learning_rate": 0.0006951277807108917,
"loss": 5.4342,
"step": 173500
},
{
"epoch": 0.926932174135396,
"grad_norm": 1.9323519468307495,
"learning_rate": 0.0006942359555371641,
"loss": 5.4303,
"step": 174000
},
{
"epoch": 0.9295957723369345,
"grad_norm": 2.2859385013580322,
"learning_rate": 0.0006933459140137841,
"loss": 5.4352,
"step": 174500
},
{
"epoch": 0.932259370538473,
"grad_norm": 2.055107593536377,
"learning_rate": 0.000692455872490404,
"loss": 5.4352,
"step": 175000
},
{
"epoch": 0.9349229687400115,
"grad_norm": 1.9875715970993042,
"learning_rate": 0.0006915640473166765,
"loss": 5.4392,
"step": 175500
},
{
"epoch": 0.93758656694155,
"grad_norm": 2.097477912902832,
"learning_rate": 0.0006906722221429489,
"loss": 5.4291,
"step": 176000
},
{
"epoch": 0.9402501651430885,
"grad_norm": 1.8664289712905884,
"learning_rate": 0.0006897803969692214,
"loss": 5.423,
"step": 176500
},
{
"epoch": 0.942913763344627,
"grad_norm": 2.0907797813415527,
"learning_rate": 0.0006888885717954938,
"loss": 5.4322,
"step": 177000
},
{
"epoch": 0.9455773615461655,
"grad_norm": 1.9234920740127563,
"learning_rate": 0.0006879967466217662,
"loss": 5.4303,
"step": 177500
},
{
"epoch": 0.948240959747704,
"grad_norm": 2.0696797370910645,
"learning_rate": 0.0006871049214480388,
"loss": 5.4251,
"step": 178000
},
{
"epoch": 0.9509045579492424,
"grad_norm": 2.0838043689727783,
"learning_rate": 0.0006862130962743112,
"loss": 5.4244,
"step": 178500
},
{
"epoch": 0.9535681561507809,
"grad_norm": 2.1029279232025146,
"learning_rate": 0.0006853230547509311,
"loss": 5.4323,
"step": 179000
},
{
"epoch": 0.9562317543523194,
"grad_norm": 2.1586649417877197,
"learning_rate": 0.000684433013227551,
"loss": 5.4329,
"step": 179500
},
{
"epoch": 0.9588953525538579,
"grad_norm": 1.8636375665664673,
"learning_rate": 0.0006835411880538235,
"loss": 5.43,
"step": 180000
},
{
"epoch": 0.9615589507553964,
"grad_norm": 1.9289181232452393,
"learning_rate": 0.0006826493628800959,
"loss": 5.4193,
"step": 180500
},
{
"epoch": 0.9642225489569349,
"grad_norm": 1.9578914642333984,
"learning_rate": 0.0006817575377063684,
"loss": 5.4298,
"step": 181000
},
{
"epoch": 0.9668861471584734,
"grad_norm": 2.0745270252227783,
"learning_rate": 0.0006808657125326409,
"loss": 5.4315,
"step": 181500
},
{
"epoch": 0.9695497453600119,
"grad_norm": 1.9545907974243164,
"learning_rate": 0.0006799738873589133,
"loss": 5.425,
"step": 182000
},
{
"epoch": 0.9722133435615504,
"grad_norm": 1.9709100723266602,
"learning_rate": 0.0006790820621851857,
"loss": 5.425,
"step": 182500
},
{
"epoch": 0.9748769417630889,
"grad_norm": 1.8214976787567139,
"learning_rate": 0.0006781902370114582,
"loss": 5.4307,
"step": 183000
},
{
"epoch": 0.9775405399646274,
"grad_norm": 1.8456212282180786,
"learning_rate": 0.0006773001954880781,
"loss": 5.4277,
"step": 183500
},
{
"epoch": 0.9802041381661659,
"grad_norm": 2.0278677940368652,
"learning_rate": 0.0006764083703143506,
"loss": 5.425,
"step": 184000
},
{
"epoch": 0.9828677363677044,
"grad_norm": 1.8401942253112793,
"learning_rate": 0.000675516545140623,
"loss": 5.4228,
"step": 184500
},
{
"epoch": 0.9855313345692429,
"grad_norm": 2.0018155574798584,
"learning_rate": 0.0006746247199668954,
"loss": 5.4272,
"step": 185000
},
{
"epoch": 0.9881949327707814,
"grad_norm": 1.9544193744659424,
"learning_rate": 0.0006737346784435153,
"loss": 5.4297,
"step": 185500
},
{
"epoch": 0.9908585309723199,
"grad_norm": 1.8701244592666626,
"learning_rate": 0.0006728428532697878,
"loss": 5.4305,
"step": 186000
},
{
"epoch": 0.9935221291738584,
"grad_norm": 1.9702414274215698,
"learning_rate": 0.0006719510280960603,
"loss": 5.4272,
"step": 186500
},
{
"epoch": 0.9961857273753969,
"grad_norm": 2.005018472671509,
"learning_rate": 0.0006710592029223327,
"loss": 5.4259,
"step": 187000
},
{
"epoch": 0.9988493255769354,
"grad_norm": 1.9745688438415527,
"learning_rate": 0.0006701691613989527,
"loss": 5.4255,
"step": 187500
},
{
"epoch": 1.001512923778474,
"grad_norm": 2.119936466217041,
"learning_rate": 0.0006692773362252251,
"loss": 5.4282,
"step": 188000
},
{
"epoch": 1.0041765219800123,
"grad_norm": 1.8192147016525269,
"learning_rate": 0.0006683855110514976,
"loss": 5.4272,
"step": 188500
},
{
"epoch": 1.006840120181551,
"grad_norm": 2.0825536251068115,
"learning_rate": 0.0006674936858777701,
"loss": 5.4191,
"step": 189000
},
{
"epoch": 1.0095037183830893,
"grad_norm": 2.034301519393921,
"learning_rate": 0.0006666036443543899,
"loss": 5.4212,
"step": 189500
},
{
"epoch": 1.0121673165846279,
"grad_norm": 2.013160467147827,
"learning_rate": 0.0006657118191806624,
"loss": 5.4216,
"step": 190000
},
{
"epoch": 1.0148309147861663,
"grad_norm": 1.9328818321228027,
"learning_rate": 0.0006648199940069348,
"loss": 5.4286,
"step": 190500
},
{
"epoch": 1.0174945129877049,
"grad_norm": 2.011674642562866,
"learning_rate": 0.0006639281688332073,
"loss": 5.426,
"step": 191000
},
{
"epoch": 1.0201581111892433,
"grad_norm": 2.1039912700653076,
"learning_rate": 0.0006630381273098273,
"loss": 5.4261,
"step": 191500
},
{
"epoch": 1.0228217093907819,
"grad_norm": 1.8038475513458252,
"learning_rate": 0.0006621480857864472,
"loss": 5.4201,
"step": 192000
},
{
"epoch": 1.0254853075923203,
"grad_norm": 1.8866719007492065,
"learning_rate": 0.0006612562606127196,
"loss": 5.4156,
"step": 192500
},
{
"epoch": 1.0281489057938589,
"grad_norm": 1.9180611371994019,
"learning_rate": 0.000660364435438992,
"loss": 5.4219,
"step": 193000
},
{
"epoch": 1.0308125039953973,
"grad_norm": 1.83159339427948,
"learning_rate": 0.0006594726102652645,
"loss": 5.4158,
"step": 193500
},
{
"epoch": 1.0334761021969359,
"grad_norm": 1.8638277053833008,
"learning_rate": 0.000658580785091537,
"loss": 5.4196,
"step": 194000
},
{
"epoch": 1.0361397003984743,
"grad_norm": 1.8679394721984863,
"learning_rate": 0.0006576889599178094,
"loss": 5.4221,
"step": 194500
},
{
"epoch": 1.0388032986000129,
"grad_norm": 1.8080953359603882,
"learning_rate": 0.0006567971347440819,
"loss": 5.4168,
"step": 195000
},
{
"epoch": 1.0414668968015512,
"grad_norm": 2.044064521789551,
"learning_rate": 0.0006559053095703543,
"loss": 5.4152,
"step": 195500
},
{
"epoch": 1.0441304950030899,
"grad_norm": 2.067416191101074,
"learning_rate": 0.0006550152680469742,
"loss": 5.4197,
"step": 196000
},
{
"epoch": 1.0467940932046282,
"grad_norm": 1.8547744750976562,
"learning_rate": 0.0006541234428732467,
"loss": 5.416,
"step": 196500
},
{
"epoch": 1.0494576914061668,
"grad_norm": 2.1002390384674072,
"learning_rate": 0.0006532316176995191,
"loss": 5.414,
"step": 197000
},
{
"epoch": 1.0521212896077052,
"grad_norm": 1.8542534112930298,
"learning_rate": 0.0006523397925257916,
"loss": 5.4176,
"step": 197500
},
{
"epoch": 1.0547848878092438,
"grad_norm": 1.8873697519302368,
"learning_rate": 0.000651447967352064,
"loss": 5.4155,
"step": 198000
},
{
"epoch": 1.0574484860107822,
"grad_norm": 2.0172159671783447,
"learning_rate": 0.0006505561421783364,
"loss": 5.4234,
"step": 198500
},
{
"epoch": 1.0601120842123208,
"grad_norm": 1.9374735355377197,
"learning_rate": 0.000649664317004609,
"loss": 5.4131,
"step": 199000
},
{
"epoch": 1.0627756824138592,
"grad_norm": 2.141655921936035,
"learning_rate": 0.0006487724918308814,
"loss": 5.4134,
"step": 199500
},
{
"epoch": 1.0654392806153978,
"grad_norm": 1.9056235551834106,
"learning_rate": 0.0006478824503075014,
"loss": 5.4173,
"step": 200000
},
{
"epoch": 1.0681028788169362,
"grad_norm": 2.3003177642822266,
"learning_rate": 0.0006469906251337738,
"loss": 5.4049,
"step": 200500
},
{
"epoch": 1.0707664770184748,
"grad_norm": 2.1843066215515137,
"learning_rate": 0.0006460987999600462,
"loss": 5.411,
"step": 201000
},
{
"epoch": 1.0734300752200132,
"grad_norm": 2.0827953815460205,
"learning_rate": 0.0006452069747863188,
"loss": 5.4175,
"step": 201500
},
{
"epoch": 1.0760936734215516,
"grad_norm": 2.02587890625,
"learning_rate": 0.0006443169332629386,
"loss": 5.4183,
"step": 202000
},
{
"epoch": 1.0787572716230902,
"grad_norm": 1.8049343824386597,
"learning_rate": 0.0006434251080892111,
"loss": 5.4142,
"step": 202500
},
{
"epoch": 1.0814208698246286,
"grad_norm": 2.1238086223602295,
"learning_rate": 0.0006425332829154835,
"loss": 5.4155,
"step": 203000
},
{
"epoch": 1.0840844680261672,
"grad_norm": 1.9311139583587646,
"learning_rate": 0.0006416414577417559,
"loss": 5.4132,
"step": 203500
},
{
"epoch": 1.0867480662277056,
"grad_norm": 1.970428228378296,
"learning_rate": 0.0006407514162183758,
"loss": 5.4073,
"step": 204000
},
{
"epoch": 1.0894116644292442,
"grad_norm": 1.7967313528060913,
"learning_rate": 0.0006398595910446483,
"loss": 5.4113,
"step": 204500
},
{
"epoch": 1.0920752626307826,
"grad_norm": 1.7493606805801392,
"learning_rate": 0.0006389677658709208,
"loss": 5.4106,
"step": 205000
},
{
"epoch": 1.0947388608323212,
"grad_norm": 1.868148922920227,
"learning_rate": 0.0006380777243475407,
"loss": 5.4125,
"step": 205500
},
{
"epoch": 1.0974024590338596,
"grad_norm": 2.0261473655700684,
"learning_rate": 0.0006371858991738132,
"loss": 5.4119,
"step": 206000
},
{
"epoch": 1.1000660572353982,
"grad_norm": 1.8863203525543213,
"learning_rate": 0.0006362940740000856,
"loss": 5.4085,
"step": 206500
},
{
"epoch": 1.1027296554369366,
"grad_norm": 1.97225821018219,
"learning_rate": 0.0006354022488263581,
"loss": 5.4106,
"step": 207000
},
{
"epoch": 1.1053932536384752,
"grad_norm": 2.2650508880615234,
"learning_rate": 0.0006345104236526306,
"loss": 5.4128,
"step": 207500
},
{
"epoch": 1.1080568518400136,
"grad_norm": 1.9305511713027954,
"learning_rate": 0.000633618598478903,
"loss": 5.4084,
"step": 208000
},
{
"epoch": 1.1107204500415522,
"grad_norm": 2.110548973083496,
"learning_rate": 0.0006327285569555229,
"loss": 5.4078,
"step": 208500
},
{
"epoch": 1.1133840482430906,
"grad_norm": 2.0234880447387695,
"learning_rate": 0.0006318367317817953,
"loss": 5.4125,
"step": 209000
},
{
"epoch": 1.1160476464446292,
"grad_norm": 1.8949861526489258,
"learning_rate": 0.0006309449066080678,
"loss": 5.4077,
"step": 209500
},
{
"epoch": 1.1187112446461676,
"grad_norm": 1.9646226167678833,
"learning_rate": 0.0006300530814343403,
"loss": 5.4112,
"step": 210000
},
{
"epoch": 1.1213748428477062,
"grad_norm": 1.9960238933563232,
"learning_rate": 0.0006291612562606127,
"loss": 5.4062,
"step": 210500
},
{
"epoch": 1.1240384410492446,
"grad_norm": 2.0510716438293457,
"learning_rate": 0.0006282694310868851,
"loss": 5.4094,
"step": 211000
},
{
"epoch": 1.1267020392507832,
"grad_norm": 1.969011664390564,
"learning_rate": 0.0006273776059131576,
"loss": 5.4123,
"step": 211500
},
{
"epoch": 1.1293656374523215,
"grad_norm": 2.0459535121917725,
"learning_rate": 0.0006264857807394301,
"loss": 5.4077,
"step": 212000
},
{
"epoch": 1.1320292356538602,
"grad_norm": 2.093336343765259,
"learning_rate": 0.0006255957392160501,
"loss": 5.4107,
"step": 212500
},
{
"epoch": 1.1346928338553985,
"grad_norm": 1.8615410327911377,
"learning_rate": 0.0006247056976926699,
"loss": 5.4078,
"step": 213000
},
{
"epoch": 1.1373564320569371,
"grad_norm": 1.9422777891159058,
"learning_rate": 0.0006238138725189424,
"loss": 5.4115,
"step": 213500
},
{
"epoch": 1.1400200302584755,
"grad_norm": 1.9412380456924438,
"learning_rate": 0.0006229220473452148,
"loss": 5.4013,
"step": 214000
},
{
"epoch": 1.1426836284600141,
"grad_norm": 2.2532691955566406,
"learning_rate": 0.0006220302221714873,
"loss": 5.4061,
"step": 214500
},
{
"epoch": 1.1453472266615525,
"grad_norm": 1.7372703552246094,
"learning_rate": 0.0006211383969977598,
"loss": 5.41,
"step": 215000
},
{
"epoch": 1.1480108248630911,
"grad_norm": 1.9771249294281006,
"learning_rate": 0.0006202465718240322,
"loss": 5.4032,
"step": 215500
},
{
"epoch": 1.1506744230646295,
"grad_norm": 1.802037000656128,
"learning_rate": 0.0006193547466503046,
"loss": 5.4026,
"step": 216000
},
{
"epoch": 1.1533380212661681,
"grad_norm": 1.958177924156189,
"learning_rate": 0.0006184629214765771,
"loss": 5.4043,
"step": 216500
},
{
"epoch": 1.1560016194677065,
"grad_norm": 1.9318652153015137,
"learning_rate": 0.000617572879953197,
"loss": 5.4044,
"step": 217000
},
{
"epoch": 1.158665217669245,
"grad_norm": 1.917920470237732,
"learning_rate": 0.0006166810547794695,
"loss": 5.4051,
"step": 217500
},
{
"epoch": 1.1613288158707835,
"grad_norm": 1.9815441370010376,
"learning_rate": 0.0006157892296057419,
"loss": 5.4036,
"step": 218000
},
{
"epoch": 1.1639924140723221,
"grad_norm": 2.0141518115997314,
"learning_rate": 0.0006148974044320143,
"loss": 5.4093,
"step": 218500
},
{
"epoch": 1.1666560122738605,
"grad_norm": 2.0144686698913574,
"learning_rate": 0.0006140073629086343,
"loss": 5.3992,
"step": 219000
},
{
"epoch": 1.169319610475399,
"grad_norm": 1.848953127861023,
"learning_rate": 0.0006131155377349069,
"loss": 5.4069,
"step": 219500
},
{
"epoch": 1.1719832086769375,
"grad_norm": 1.8711676597595215,
"learning_rate": 0.0006122237125611793,
"loss": 5.4058,
"step": 220000
},
{
"epoch": 1.1746468068784761,
"grad_norm": 2.1549181938171387,
"learning_rate": 0.0006113318873874517,
"loss": 5.4057,
"step": 220500
},
{
"epoch": 1.1773104050800145,
"grad_norm": 2.136955738067627,
"learning_rate": 0.0006104418458640716,
"loss": 5.4047,
"step": 221000
},
{
"epoch": 1.1799740032815529,
"grad_norm": 1.984183430671692,
"learning_rate": 0.000609550020690344,
"loss": 5.397,
"step": 221500
},
{
"epoch": 1.1826376014830915,
"grad_norm": 2.173187732696533,
"learning_rate": 0.0006086581955166164,
"loss": 5.3996,
"step": 222000
},
{
"epoch": 1.1853011996846299,
"grad_norm": 2.0700299739837646,
"learning_rate": 0.000607766370342889,
"loss": 5.3976,
"step": 222500
},
{
"epoch": 1.1879647978861685,
"grad_norm": 2.1351547241210938,
"learning_rate": 0.0006068763288195088,
"loss": 5.4113,
"step": 223000
},
{
"epoch": 1.1906283960877069,
"grad_norm": 1.9995781183242798,
"learning_rate": 0.0006059845036457813,
"loss": 5.4012,
"step": 223500
},
{
"epoch": 1.1932919942892455,
"grad_norm": 2.2745988368988037,
"learning_rate": 0.0006050926784720537,
"loss": 5.4093,
"step": 224000
},
{
"epoch": 1.1959555924907839,
"grad_norm": 2.5383615493774414,
"learning_rate": 0.0006042026369486737,
"loss": 5.3934,
"step": 224500
},
{
"epoch": 1.1986191906923225,
"grad_norm": 2.132570266723633,
"learning_rate": 0.0006033108117749462,
"loss": 5.4143,
"step": 225000
},
{
"epoch": 1.2012827888938609,
"grad_norm": 1.9985568523406982,
"learning_rate": 0.0006024189866012187,
"loss": 5.3987,
"step": 225500
},
{
"epoch": 1.2039463870953995,
"grad_norm": 1.9169471263885498,
"learning_rate": 0.0006015271614274911,
"loss": 5.4005,
"step": 226000
},
{
"epoch": 1.2066099852969379,
"grad_norm": 1.9423543214797974,
"learning_rate": 0.0006006353362537635,
"loss": 5.4016,
"step": 226500
},
{
"epoch": 1.2092735834984765,
"grad_norm": 2.0575485229492188,
"learning_rate": 0.000599743511080036,
"loss": 5.393,
"step": 227000
},
{
"epoch": 1.2119371817000149,
"grad_norm": 2.034454584121704,
"learning_rate": 0.0005988516859063085,
"loss": 5.3946,
"step": 227500
},
{
"epoch": 1.2146007799015535,
"grad_norm": 1.9063221216201782,
"learning_rate": 0.0005979598607325809,
"loss": 5.4005,
"step": 228000
},
{
"epoch": 1.2172643781030918,
"grad_norm": 2.094717025756836,
"learning_rate": 0.0005970698192092008,
"loss": 5.3943,
"step": 228500
},
{
"epoch": 1.2199279763046305,
"grad_norm": 1.9740791320800781,
"learning_rate": 0.0005961779940354732,
"loss": 5.399,
"step": 229000
},
{
"epoch": 1.2225915745061688,
"grad_norm": 1.95699143409729,
"learning_rate": 0.0005952861688617457,
"loss": 5.3971,
"step": 229500
},
{
"epoch": 1.2252551727077075,
"grad_norm": 1.9305535554885864,
"learning_rate": 0.0005943943436880182,
"loss": 5.399,
"step": 230000
},
{
"epoch": 1.2279187709092458,
"grad_norm": 1.8926870822906494,
"learning_rate": 0.000593504302164638,
"loss": 5.3967,
"step": 230500
},
{
"epoch": 1.2305823691107844,
"grad_norm": 1.91937255859375,
"learning_rate": 0.0005926124769909105,
"loss": 5.3966,
"step": 231000
},
{
"epoch": 1.2332459673123228,
"grad_norm": 1.9494017362594604,
"learning_rate": 0.0005917224354675305,
"loss": 5.3988,
"step": 231500
},
{
"epoch": 1.2359095655138614,
"grad_norm": 1.7676622867584229,
"learning_rate": 0.0005908306102938029,
"loss": 5.3954,
"step": 232000
},
{
"epoch": 1.2385731637153998,
"grad_norm": 1.9707027673721313,
"learning_rate": 0.0005899387851200753,
"loss": 5.3987,
"step": 232500
},
{
"epoch": 1.2412367619169384,
"grad_norm": 1.8651105165481567,
"learning_rate": 0.0005890469599463479,
"loss": 5.3913,
"step": 233000
},
{
"epoch": 1.2439003601184768,
"grad_norm": 2.2256948947906494,
"learning_rate": 0.0005881551347726203,
"loss": 5.4022,
"step": 233500
},
{
"epoch": 1.2465639583200154,
"grad_norm": 2.0236611366271973,
"learning_rate": 0.0005872633095988927,
"loss": 5.3928,
"step": 234000
},
{
"epoch": 1.2492275565215538,
"grad_norm": 2.07328724861145,
"learning_rate": 0.0005863714844251652,
"loss": 5.3964,
"step": 234500
},
{
"epoch": 1.2518911547230922,
"grad_norm": 2.011497974395752,
"learning_rate": 0.000585481442901785,
"loss": 5.4,
"step": 235000
},
{
"epoch": 1.2545547529246308,
"grad_norm": 1.891579270362854,
"learning_rate": 0.0005845896177280576,
"loss": 5.3931,
"step": 235500
},
{
"epoch": 1.2572183511261694,
"grad_norm": 1.8369475603103638,
"learning_rate": 0.00058369779255433,
"loss": 5.388,
"step": 236000
},
{
"epoch": 1.2598819493277078,
"grad_norm": 2.316582441329956,
"learning_rate": 0.0005828059673806024,
"loss": 5.3878,
"step": 236500
},
{
"epoch": 1.2625455475292462,
"grad_norm": 1.8466497659683228,
"learning_rate": 0.0005819141422068749,
"loss": 5.3942,
"step": 237000
},
{
"epoch": 1.2652091457307848,
"grad_norm": 1.9420734643936157,
"learning_rate": 0.0005810223170331473,
"loss": 5.3907,
"step": 237500
},
{
"epoch": 1.2678727439323234,
"grad_norm": 1.9229456186294556,
"learning_rate": 0.0005801304918594198,
"loss": 5.394,
"step": 238000
},
{
"epoch": 1.2705363421338618,
"grad_norm": 2.126213788986206,
"learning_rate": 0.0005792386666856923,
"loss": 5.3875,
"step": 238500
},
{
"epoch": 1.2731999403354002,
"grad_norm": 1.9714566469192505,
"learning_rate": 0.0005783486251623122,
"loss": 5.3938,
"step": 239000
},
{
"epoch": 1.2758635385369388,
"grad_norm": 2.244844436645508,
"learning_rate": 0.0005774567999885847,
"loss": 5.3974,
"step": 239500
},
{
"epoch": 1.2785271367384774,
"grad_norm": 2.083517551422119,
"learning_rate": 0.0005765649748148571,
"loss": 5.3827,
"step": 240000
},
{
"epoch": 1.2811907349400158,
"grad_norm": 2.1155362129211426,
"learning_rate": 0.0005756749332914771,
"loss": 5.3908,
"step": 240500
},
{
"epoch": 1.2838543331415542,
"grad_norm": 2.0415351390838623,
"learning_rate": 0.0005747831081177495,
"loss": 5.3904,
"step": 241000
},
{
"epoch": 1.2865179313430928,
"grad_norm": 2.4744224548339844,
"learning_rate": 0.0005738912829440219,
"loss": 5.3825,
"step": 241500
},
{
"epoch": 1.2891815295446314,
"grad_norm": 1.9680261611938477,
"learning_rate": 0.0005729994577702944,
"loss": 5.3915,
"step": 242000
},
{
"epoch": 1.2918451277461698,
"grad_norm": 2.4636471271514893,
"learning_rate": 0.0005721076325965668,
"loss": 5.3946,
"step": 242500
},
{
"epoch": 1.2945087259477082,
"grad_norm": 1.8884419202804565,
"learning_rate": 0.0005712158074228393,
"loss": 5.3905,
"step": 243000
},
{
"epoch": 1.2971723241492468,
"grad_norm": 2.192204236984253,
"learning_rate": 0.0005703257658994592,
"loss": 5.3891,
"step": 243500
},
{
"epoch": 1.2998359223507852,
"grad_norm": 1.963740587234497,
"learning_rate": 0.0005694339407257316,
"loss": 5.389,
"step": 244000
},
{
"epoch": 1.3024995205523238,
"grad_norm": 2.2511630058288574,
"learning_rate": 0.0005685421155520041,
"loss": 5.3988,
"step": 244500
},
{
"epoch": 1.3051631187538622,
"grad_norm": 1.8933221101760864,
"learning_rate": 0.0005676502903782765,
"loss": 5.39,
"step": 245000
},
{
"epoch": 1.3078267169554008,
"grad_norm": 1.813040852546692,
"learning_rate": 0.000566758465204549,
"loss": 5.3884,
"step": 245500
},
{
"epoch": 1.3104903151569391,
"grad_norm": 2.3987181186676025,
"learning_rate": 0.0005658666400308215,
"loss": 5.3888,
"step": 246000
},
{
"epoch": 1.3131539133584778,
"grad_norm": 2.0762851238250732,
"learning_rate": 0.0005649748148570939,
"loss": 5.3881,
"step": 246500
},
{
"epoch": 1.3158175115600161,
"grad_norm": 2.3197662830352783,
"learning_rate": 0.0005640829896833663,
"loss": 5.3876,
"step": 247000
},
{
"epoch": 1.3184811097615547,
"grad_norm": 1.9953910112380981,
"learning_rate": 0.0005631929481599863,
"loss": 5.3892,
"step": 247500
},
{
"epoch": 1.3211447079630931,
"grad_norm": 2.20346999168396,
"learning_rate": 0.0005623011229862588,
"loss": 5.3844,
"step": 248000
},
{
"epoch": 1.3238083061646317,
"grad_norm": 1.9688447713851929,
"learning_rate": 0.0005614092978125313,
"loss": 5.3924,
"step": 248500
},
{
"epoch": 1.3264719043661701,
"grad_norm": 1.950621485710144,
"learning_rate": 0.0005605174726388037,
"loss": 5.382,
"step": 249000
},
{
"epoch": 1.3291355025677087,
"grad_norm": 2.0261106491088867,
"learning_rate": 0.0005596274311154236,
"loss": 5.3889,
"step": 249500
},
{
"epoch": 1.3317991007692471,
"grad_norm": 1.819598913192749,
"learning_rate": 0.000558735605941696,
"loss": 5.3879,
"step": 250000
},
{
"epoch": 1.3344626989707857,
"grad_norm": 2.092658042907715,
"learning_rate": 0.0005578437807679685,
"loss": 5.3897,
"step": 250500
},
{
"epoch": 1.3371262971723241,
"grad_norm": 1.8927563428878784,
"learning_rate": 0.000556951955594241,
"loss": 5.3888,
"step": 251000
},
{
"epoch": 1.3397898953738627,
"grad_norm": 1.91410493850708,
"learning_rate": 0.0005560619140708608,
"loss": 5.3865,
"step": 251500
},
{
"epoch": 1.3424534935754011,
"grad_norm": 1.923710584640503,
"learning_rate": 0.0005551700888971333,
"loss": 5.3831,
"step": 252000
},
{
"epoch": 1.3451170917769395,
"grad_norm": 2.011301279067993,
"learning_rate": 0.0005542782637234058,
"loss": 5.3832,
"step": 252500
},
{
"epoch": 1.347780689978478,
"grad_norm": 1.8271079063415527,
"learning_rate": 0.0005533864385496783,
"loss": 5.3843,
"step": 253000
},
{
"epoch": 1.3504442881800167,
"grad_norm": 2.0028188228607178,
"learning_rate": 0.0005524963970262982,
"loss": 5.383,
"step": 253500
},
{
"epoch": 1.353107886381555,
"grad_norm": 1.8386844396591187,
"learning_rate": 0.0005516045718525706,
"loss": 5.3873,
"step": 254000
},
{
"epoch": 1.3557714845830935,
"grad_norm": 1.8750890493392944,
"learning_rate": 0.0005507127466788431,
"loss": 5.3794,
"step": 254500
},
{
"epoch": 1.358435082784632,
"grad_norm": 1.9305578470230103,
"learning_rate": 0.0005498209215051155,
"loss": 5.3863,
"step": 255000
},
{
"epoch": 1.3610986809861707,
"grad_norm": 2.1922383308410645,
"learning_rate": 0.0005489308799817354,
"loss": 5.3889,
"step": 255500
},
{
"epoch": 1.363762279187709,
"grad_norm": 2.006162405014038,
"learning_rate": 0.0005480390548080079,
"loss": 5.3793,
"step": 256000
},
{
"epoch": 1.3664258773892475,
"grad_norm": 2.1891300678253174,
"learning_rate": 0.0005471472296342803,
"loss": 5.3805,
"step": 256500
},
{
"epoch": 1.369089475590786,
"grad_norm": 2.036553144454956,
"learning_rate": 0.0005462554044605528,
"loss": 5.3809,
"step": 257000
},
{
"epoch": 1.3717530737923247,
"grad_norm": 1.9189977645874023,
"learning_rate": 0.0005453653629371727,
"loss": 5.3766,
"step": 257500
},
{
"epoch": 1.374416671993863,
"grad_norm": 1.98636794090271,
"learning_rate": 0.0005444735377634452,
"loss": 5.39,
"step": 258000
},
{
"epoch": 1.3770802701954015,
"grad_norm": 1.897522211074829,
"learning_rate": 0.0005435834962400651,
"loss": 5.3839,
"step": 258500
},
{
"epoch": 1.37974386839694,
"grad_norm": 2.0826635360717773,
"learning_rate": 0.0005426916710663376,
"loss": 5.383,
"step": 259000
},
{
"epoch": 1.3824074665984787,
"grad_norm": 1.8267229795455933,
"learning_rate": 0.00054179984589261,
"loss": 5.3866,
"step": 259500
},
{
"epoch": 1.385071064800017,
"grad_norm": 2.1117184162139893,
"learning_rate": 0.0005409080207188824,
"loss": 5.3787,
"step": 260000
},
{
"epoch": 1.3877346630015555,
"grad_norm": 1.9132159948349,
"learning_rate": 0.0005400161955451549,
"loss": 5.3812,
"step": 260500
},
{
"epoch": 1.390398261203094,
"grad_norm": 1.9600298404693604,
"learning_rate": 0.0005391243703714274,
"loss": 5.381,
"step": 261000
},
{
"epoch": 1.3930618594046325,
"grad_norm": 2.000422716140747,
"learning_rate": 0.0005382325451976998,
"loss": 5.3823,
"step": 261500
},
{
"epoch": 1.395725457606171,
"grad_norm": 2.2225003242492676,
"learning_rate": 0.0005373407200239723,
"loss": 5.3776,
"step": 262000
},
{
"epoch": 1.3983890558077094,
"grad_norm": 2.084779977798462,
"learning_rate": 0.0005364506785005921,
"loss": 5.3781,
"step": 262500
},
{
"epoch": 1.401052654009248,
"grad_norm": 2.126775026321411,
"learning_rate": 0.0005355588533268646,
"loss": 5.3832,
"step": 263000
},
{
"epoch": 1.4037162522107864,
"grad_norm": 1.9713746309280396,
"learning_rate": 0.0005346670281531371,
"loss": 5.3792,
"step": 263500
},
{
"epoch": 1.406379850412325,
"grad_norm": 2.0785419940948486,
"learning_rate": 0.0005337752029794095,
"loss": 5.3825,
"step": 264000
},
{
"epoch": 1.4090434486138634,
"grad_norm": 2.3811593055725098,
"learning_rate": 0.0005328851614560295,
"loss": 5.3826,
"step": 264500
},
{
"epoch": 1.411707046815402,
"grad_norm": 2.1196324825286865,
"learning_rate": 0.0005319933362823019,
"loss": 5.3785,
"step": 265000
},
{
"epoch": 1.4143706450169404,
"grad_norm": 2.06736421585083,
"learning_rate": 0.0005311015111085744,
"loss": 5.3796,
"step": 265500
},
{
"epoch": 1.417034243218479,
"grad_norm": 2.1438751220703125,
"learning_rate": 0.0005302096859348468,
"loss": 5.3747,
"step": 266000
},
{
"epoch": 1.4196978414200174,
"grad_norm": 2.0328142642974854,
"learning_rate": 0.0005293196444114668,
"loss": 5.3726,
"step": 266500
},
{
"epoch": 1.422361439621556,
"grad_norm": 1.9709652662277222,
"learning_rate": 0.0005284278192377392,
"loss": 5.3835,
"step": 267000
},
{
"epoch": 1.4250250378230944,
"grad_norm": 2.0982072353363037,
"learning_rate": 0.0005275359940640116,
"loss": 5.3719,
"step": 267500
},
{
"epoch": 1.427688636024633,
"grad_norm": 2.3335447311401367,
"learning_rate": 0.0005266441688902841,
"loss": 5.3824,
"step": 268000
},
{
"epoch": 1.4303522342261714,
"grad_norm": 1.9240329265594482,
"learning_rate": 0.0005257541273669039,
"loss": 5.3754,
"step": 268500
},
{
"epoch": 1.43301583242771,
"grad_norm": 2.0762813091278076,
"learning_rate": 0.0005248623021931765,
"loss": 5.3754,
"step": 269000
},
{
"epoch": 1.4356794306292484,
"grad_norm": 1.9223084449768066,
"learning_rate": 0.0005239704770194489,
"loss": 5.3751,
"step": 269500
},
{
"epoch": 1.4383430288307868,
"grad_norm": 1.9600517749786377,
"learning_rate": 0.0005230786518457213,
"loss": 5.3726,
"step": 270000
},
{
"epoch": 1.4410066270323254,
"grad_norm": 2.0275826454162598,
"learning_rate": 0.0005221886103223413,
"loss": 5.3755,
"step": 270500
},
{
"epoch": 1.443670225233864,
"grad_norm": 2.0879909992218018,
"learning_rate": 0.0005212967851486137,
"loss": 5.371,
"step": 271000
},
{
"epoch": 1.4463338234354024,
"grad_norm": 2.2107584476470947,
"learning_rate": 0.0005204049599748863,
"loss": 5.3775,
"step": 271500
},
{
"epoch": 1.4489974216369408,
"grad_norm": 1.9889525175094604,
"learning_rate": 0.0005195131348011587,
"loss": 5.369,
"step": 272000
},
{
"epoch": 1.4516610198384794,
"grad_norm": 1.8878706693649292,
"learning_rate": 0.0005186230932777786,
"loss": 5.3762,
"step": 272500
},
{
"epoch": 1.454324618040018,
"grad_norm": 2.0804665088653564,
"learning_rate": 0.000517731268104051,
"loss": 5.3731,
"step": 273000
},
{
"epoch": 1.4569882162415564,
"grad_norm": 2.3155815601348877,
"learning_rate": 0.0005168394429303234,
"loss": 5.3696,
"step": 273500
},
{
"epoch": 1.4596518144430948,
"grad_norm": 2.2707676887512207,
"learning_rate": 0.000515947617756596,
"loss": 5.3763,
"step": 274000
},
{
"epoch": 1.4623154126446334,
"grad_norm": 1.947204828262329,
"learning_rate": 0.0005150575762332158,
"loss": 5.3689,
"step": 274500
},
{
"epoch": 1.464979010846172,
"grad_norm": 1.9428602457046509,
"learning_rate": 0.0005141657510594883,
"loss": 5.3797,
"step": 275000
},
{
"epoch": 1.4676426090477104,
"grad_norm": 2.4003546237945557,
"learning_rate": 0.0005132739258857608,
"loss": 5.3672,
"step": 275500
},
{
"epoch": 1.4703062072492488,
"grad_norm": 2.047048330307007,
"learning_rate": 0.0005123821007120333,
"loss": 5.3761,
"step": 276000
},
{
"epoch": 1.4729698054507874,
"grad_norm": 2.0965404510498047,
"learning_rate": 0.0005114920591886531,
"loss": 5.3645,
"step": 276500
},
{
"epoch": 1.475633403652326,
"grad_norm": 1.9648233652114868,
"learning_rate": 0.0005106002340149257,
"loss": 5.37,
"step": 277000
},
{
"epoch": 1.4782970018538644,
"grad_norm": 1.8992446660995483,
"learning_rate": 0.0005097084088411981,
"loss": 5.3679,
"step": 277500
},
{
"epoch": 1.4809606000554028,
"grad_norm": 2.125126838684082,
"learning_rate": 0.0005088165836674705,
"loss": 5.3702,
"step": 278000
},
{
"epoch": 1.4836241982569414,
"grad_norm": 2.030409574508667,
"learning_rate": 0.0005079265421440904,
"loss": 5.3691,
"step": 278500
},
{
"epoch": 1.4862877964584797,
"grad_norm": 1.9816679954528809,
"learning_rate": 0.0005070347169703628,
"loss": 5.3723,
"step": 279000
},
{
"epoch": 1.4889513946600184,
"grad_norm": 2.032564401626587,
"learning_rate": 0.0005061428917966354,
"loss": 5.3695,
"step": 279500
},
{
"epoch": 1.4916149928615567,
"grad_norm": 2.0342843532562256,
"learning_rate": 0.0005052510666229078,
"loss": 5.3681,
"step": 280000
},
{
"epoch": 1.4942785910630954,
"grad_norm": 1.9113322496414185,
"learning_rate": 0.0005043610250995278,
"loss": 5.3713,
"step": 280500
},
{
"epoch": 1.4969421892646337,
"grad_norm": 2.1201562881469727,
"learning_rate": 0.0005034691999258002,
"loss": 5.375,
"step": 281000
},
{
"epoch": 1.4996057874661723,
"grad_norm": 2.1695244312286377,
"learning_rate": 0.0005025773747520726,
"loss": 5.3666,
"step": 281500
},
{
"epoch": 1.5022693856677107,
"grad_norm": 2.2736222743988037,
"learning_rate": 0.0005016873332286925,
"loss": 5.3728,
"step": 282000
},
{
"epoch": 1.5049329838692493,
"grad_norm": 1.9306550025939941,
"learning_rate": 0.000500795508054965,
"loss": 5.3607,
"step": 282500
},
{
"epoch": 1.507596582070788,
"grad_norm": 1.970550537109375,
"learning_rate": 0.0004999036828812375,
"loss": 5.372,
"step": 283000
},
{
"epoch": 1.5102601802723261,
"grad_norm": 1.7387876510620117,
"learning_rate": 0.0004990118577075099,
"loss": 5.3728,
"step": 283500
},
{
"epoch": 1.5129237784738647,
"grad_norm": 2.364816188812256,
"learning_rate": 0.0004981200325337823,
"loss": 5.3667,
"step": 284000
},
{
"epoch": 1.5155873766754033,
"grad_norm": 1.959367036819458,
"learning_rate": 0.0004972282073600549,
"loss": 5.3672,
"step": 284500
},
{
"epoch": 1.5182509748769417,
"grad_norm": 2.4462456703186035,
"learning_rate": 0.0004963363821863273,
"loss": 5.3669,
"step": 285000
},
{
"epoch": 1.52091457307848,
"grad_norm": 1.949645757675171,
"learning_rate": 0.0004954445570125997,
"loss": 5.3669,
"step": 285500
},
{
"epoch": 1.5235781712800187,
"grad_norm": 2.0255677700042725,
"learning_rate": 0.0004945545154892197,
"loss": 5.3689,
"step": 286000
},
{
"epoch": 1.5262417694815573,
"grad_norm": 2.0761642456054688,
"learning_rate": 0.0004936644739658396,
"loss": 5.3633,
"step": 286500
},
{
"epoch": 1.5289053676830957,
"grad_norm": 2.1219048500061035,
"learning_rate": 0.000492772648792112,
"loss": 5.3617,
"step": 287000
},
{
"epoch": 1.531568965884634,
"grad_norm": 1.83650803565979,
"learning_rate": 0.0004918808236183844,
"loss": 5.3735,
"step": 287500
},
{
"epoch": 1.5342325640861727,
"grad_norm": 2.0275492668151855,
"learning_rate": 0.0004909889984446568,
"loss": 5.3636,
"step": 288000
},
{
"epoch": 1.5368961622877113,
"grad_norm": 1.9854780435562134,
"learning_rate": 0.0004900971732709294,
"loss": 5.3595,
"step": 288500
},
{
"epoch": 1.5395597604892497,
"grad_norm": 2.282017707824707,
"learning_rate": 0.0004892053480972018,
"loss": 5.3673,
"step": 289000
},
{
"epoch": 1.542223358690788,
"grad_norm": 2.0435492992401123,
"learning_rate": 0.0004883135229234743,
"loss": 5.3771,
"step": 289500
},
{
"epoch": 1.5448869568923267,
"grad_norm": 2.4702582359313965,
"learning_rate": 0.0004874216977497467,
"loss": 5.3592,
"step": 290000
},
{
"epoch": 1.5475505550938653,
"grad_norm": 2.032315731048584,
"learning_rate": 0.00048653165622636666,
"loss": 5.3688,
"step": 290500
},
{
"epoch": 1.5502141532954037,
"grad_norm": 2.13460636138916,
"learning_rate": 0.0004856398310526391,
"loss": 5.3624,
"step": 291000
},
{
"epoch": 1.552877751496942,
"grad_norm": 1.9628610610961914,
"learning_rate": 0.0004847480058789115,
"loss": 5.3647,
"step": 291500
},
{
"epoch": 1.5555413496984807,
"grad_norm": 1.8896455764770508,
"learning_rate": 0.000483856180705184,
"loss": 5.3693,
"step": 292000
},
{
"epoch": 1.5582049479000193,
"grad_norm": 1.92352294921875,
"learning_rate": 0.0004829661391818039,
"loss": 5.3551,
"step": 292500
},
{
"epoch": 1.5608685461015577,
"grad_norm": 2.061492919921875,
"learning_rate": 0.0004820743140080764,
"loss": 5.3618,
"step": 293000
},
{
"epoch": 1.563532144303096,
"grad_norm": 2.0767364501953125,
"learning_rate": 0.0004811842724846963,
"loss": 5.3596,
"step": 293500
},
{
"epoch": 1.5661957425046347,
"grad_norm": 2.103719472885132,
"learning_rate": 0.00048029244731096876,
"loss": 5.3547,
"step": 294000
},
{
"epoch": 1.5688593407061733,
"grad_norm": 2.096832275390625,
"learning_rate": 0.00047940062213724124,
"loss": 5.3635,
"step": 294500
},
{
"epoch": 1.5715229389077117,
"grad_norm": 2.053567409515381,
"learning_rate": 0.0004785087969635137,
"loss": 5.3683,
"step": 295000
},
{
"epoch": 1.57418653710925,
"grad_norm": 2.040846586227417,
"learning_rate": 0.00047761697178978616,
"loss": 5.3623,
"step": 295500
},
{
"epoch": 1.5768501353107887,
"grad_norm": 2.0361154079437256,
"learning_rate": 0.0004767251466160586,
"loss": 5.3572,
"step": 296000
},
{
"epoch": 1.5795137335123273,
"grad_norm": 2.006989002227783,
"learning_rate": 0.00047583332144233103,
"loss": 5.3702,
"step": 296500
},
{
"epoch": 1.5821773317138657,
"grad_norm": 2.0891811847686768,
"learning_rate": 0.0004749414962686035,
"loss": 5.3664,
"step": 297000
},
{
"epoch": 1.584840929915404,
"grad_norm": 2.023730754852295,
"learning_rate": 0.0004740514547452234,
"loss": 5.3668,
"step": 297500
},
{
"epoch": 1.5875045281169426,
"grad_norm": 1.8560234308242798,
"learning_rate": 0.0004731596295714958,
"loss": 5.3688,
"step": 298000
},
{
"epoch": 1.5901681263184813,
"grad_norm": 1.84561288356781,
"learning_rate": 0.0004722678043977683,
"loss": 5.3595,
"step": 298500
},
{
"epoch": 1.5928317245200196,
"grad_norm": 2.0453810691833496,
"learning_rate": 0.0004713759792240407,
"loss": 5.3612,
"step": 299000
},
{
"epoch": 1.595495322721558,
"grad_norm": 2.03952956199646,
"learning_rate": 0.0004704859377006607,
"loss": 5.3595,
"step": 299500
},
{
"epoch": 1.5981589209230966,
"grad_norm": 2.175218343734741,
"learning_rate": 0.00046959411252693313,
"loss": 5.3599,
"step": 300000
},
{
"epoch": 1.6008225191246352,
"grad_norm": 1.9432867765426636,
"learning_rate": 0.00046870228735320556,
"loss": 5.3579,
"step": 300500
},
{
"epoch": 1.6034861173261736,
"grad_norm": 2.0046420097351074,
"learning_rate": 0.00046781046217947805,
"loss": 5.3506,
"step": 301000
},
{
"epoch": 1.606149715527712,
"grad_norm": 1.9781187772750854,
"learning_rate": 0.00046692042065609796,
"loss": 5.3585,
"step": 301500
},
{
"epoch": 1.6088133137292506,
"grad_norm": 2.0884523391723633,
"learning_rate": 0.0004660285954823704,
"loss": 5.36,
"step": 302000
},
{
"epoch": 1.611476911930789,
"grad_norm": 2.0299806594848633,
"learning_rate": 0.0004651367703086429,
"loss": 5.3609,
"step": 302500
},
{
"epoch": 1.6141405101323274,
"grad_norm": 2.0034475326538086,
"learning_rate": 0.0004642449451349153,
"loss": 5.3621,
"step": 303000
},
{
"epoch": 1.616804108333866,
"grad_norm": 2.027804136276245,
"learning_rate": 0.00046335490361153523,
"loss": 5.3617,
"step": 303500
},
{
"epoch": 1.6194677065354046,
"grad_norm": 2.2879958152770996,
"learning_rate": 0.0004624630784378077,
"loss": 5.3597,
"step": 304000
},
{
"epoch": 1.622131304736943,
"grad_norm": 2.0821385383605957,
"learning_rate": 0.00046157125326408015,
"loss": 5.3539,
"step": 304500
},
{
"epoch": 1.6247949029384814,
"grad_norm": 2.0150811672210693,
"learning_rate": 0.00046067942809035263,
"loss": 5.3568,
"step": 305000
},
{
"epoch": 1.62745850114002,
"grad_norm": 1.944470763206482,
"learning_rate": 0.0004597893865669725,
"loss": 5.3618,
"step": 305500
},
{
"epoch": 1.6301220993415586,
"grad_norm": 1.8767342567443848,
"learning_rate": 0.000458897561393245,
"loss": 5.3572,
"step": 306000
},
{
"epoch": 1.632785697543097,
"grad_norm": 2.100074291229248,
"learning_rate": 0.0004580057362195174,
"loss": 5.3557,
"step": 306500
},
{
"epoch": 1.6354492957446354,
"grad_norm": 1.8953720331192017,
"learning_rate": 0.00045711569469613733,
"loss": 5.3603,
"step": 307000
},
{
"epoch": 1.638112893946174,
"grad_norm": 2.099968433380127,
"learning_rate": 0.0004562238695224098,
"loss": 5.3459,
"step": 307500
},
{
"epoch": 1.6407764921477126,
"grad_norm": 2.21608567237854,
"learning_rate": 0.00045533204434868225,
"loss": 5.3602,
"step": 308000
},
{
"epoch": 1.643440090349251,
"grad_norm": 2.0884177684783936,
"learning_rate": 0.0004544402191749547,
"loss": 5.3538,
"step": 308500
},
{
"epoch": 1.6461036885507894,
"grad_norm": 2.0560896396636963,
"learning_rate": 0.00045354839400122717,
"loss": 5.3618,
"step": 309000
},
{
"epoch": 1.648767286752328,
"grad_norm": 2.3166544437408447,
"learning_rate": 0.0004526565688274996,
"loss": 5.3446,
"step": 309500
},
{
"epoch": 1.6514308849538666,
"grad_norm": 1.9376626014709473,
"learning_rate": 0.0004517647436537721,
"loss": 5.3565,
"step": 310000
},
{
"epoch": 1.654094483155405,
"grad_norm": 1.8356984853744507,
"learning_rate": 0.0004508729184800445,
"loss": 5.3585,
"step": 310500
},
{
"epoch": 1.6567580813569434,
"grad_norm": 2.0316951274871826,
"learning_rate": 0.00044998287695666443,
"loss": 5.3615,
"step": 311000
},
{
"epoch": 1.659421679558482,
"grad_norm": 2.1165359020233154,
"learning_rate": 0.00044909283543328435,
"loss": 5.357,
"step": 311500
},
{
"epoch": 1.6620852777600206,
"grad_norm": 2.1769607067108154,
"learning_rate": 0.0004482010102595568,
"loss": 5.3567,
"step": 312000
},
{
"epoch": 1.664748875961559,
"grad_norm": 2.0454256534576416,
"learning_rate": 0.0004473091850858292,
"loss": 5.3573,
"step": 312500
},
{
"epoch": 1.6674124741630973,
"grad_norm": 2.1431968212127686,
"learning_rate": 0.0004464173599121017,
"loss": 5.3509,
"step": 313000
},
{
"epoch": 1.670076072364636,
"grad_norm": 2.0397841930389404,
"learning_rate": 0.00044552553473837413,
"loss": 5.3532,
"step": 313500
},
{
"epoch": 1.6727396705661746,
"grad_norm": 2.080476999282837,
"learning_rate": 0.0004446337095646467,
"loss": 5.3558,
"step": 314000
},
{
"epoch": 1.675403268767713,
"grad_norm": 1.9653671979904175,
"learning_rate": 0.0004437418843909191,
"loss": 5.3481,
"step": 314500
},
{
"epoch": 1.6780668669692513,
"grad_norm": 2.2119712829589844,
"learning_rate": 0.0004428500592171916,
"loss": 5.3555,
"step": 315000
},
{
"epoch": 1.68073046517079,
"grad_norm": 1.990404486656189,
"learning_rate": 0.00044196001769381145,
"loss": 5.3567,
"step": 315500
},
{
"epoch": 1.6833940633723286,
"grad_norm": 2.0500054359436035,
"learning_rate": 0.0004410681925200839,
"loss": 5.3503,
"step": 316000
},
{
"epoch": 1.686057661573867,
"grad_norm": 2.205277919769287,
"learning_rate": 0.00044017636734635637,
"loss": 5.3553,
"step": 316500
},
{
"epoch": 1.6887212597754053,
"grad_norm": 1.9659850597381592,
"learning_rate": 0.0004392845421726288,
"loss": 5.3456,
"step": 317000
},
{
"epoch": 1.691384857976944,
"grad_norm": 2.029604196548462,
"learning_rate": 0.0004383927169989013,
"loss": 5.3554,
"step": 317500
},
{
"epoch": 1.6940484561784825,
"grad_norm": 2.041193723678589,
"learning_rate": 0.0004375008918251737,
"loss": 5.3534,
"step": 318000
},
{
"epoch": 1.696712054380021,
"grad_norm": 2.068268299102783,
"learning_rate": 0.00043661085030179364,
"loss": 5.3564,
"step": 318500
},
{
"epoch": 1.6993756525815593,
"grad_norm": 2.0078883171081543,
"learning_rate": 0.0004357190251280661,
"loss": 5.3518,
"step": 319000
},
{
"epoch": 1.702039250783098,
"grad_norm": 1.9186288118362427,
"learning_rate": 0.00043482719995433856,
"loss": 5.3471,
"step": 319500
},
{
"epoch": 1.7047028489846365,
"grad_norm": 2.0289323329925537,
"learning_rate": 0.000433935374780611,
"loss": 5.3513,
"step": 320000
},
{
"epoch": 1.7073664471861747,
"grad_norm": 1.69050133228302,
"learning_rate": 0.0004330435496068835,
"loss": 5.3513,
"step": 320500
},
{
"epoch": 1.7100300453877133,
"grad_norm": 2.0047898292541504,
"learning_rate": 0.0004321517244331559,
"loss": 5.3531,
"step": 321000
},
{
"epoch": 1.712693643589252,
"grad_norm": 2.1100831031799316,
"learning_rate": 0.0004312616829097759,
"loss": 5.3494,
"step": 321500
},
{
"epoch": 1.7153572417907903,
"grad_norm": 2.053802013397217,
"learning_rate": 0.0004303698577360483,
"loss": 5.3573,
"step": 322000
},
{
"epoch": 1.7180208399923287,
"grad_norm": 1.9370436668395996,
"learning_rate": 0.00042947803256232074,
"loss": 5.3457,
"step": 322500
},
{
"epoch": 1.7206844381938673,
"grad_norm": 2.062244176864624,
"learning_rate": 0.00042858620738859323,
"loss": 5.3532,
"step": 323000
},
{
"epoch": 1.723348036395406,
"grad_norm": 2.129863739013672,
"learning_rate": 0.00042769438221486566,
"loss": 5.3469,
"step": 323500
},
{
"epoch": 1.7260116345969443,
"grad_norm": 2.1496474742889404,
"learning_rate": 0.0004268043406914855,
"loss": 5.3494,
"step": 324000
},
{
"epoch": 1.7286752327984827,
"grad_norm": 2.0887863636016846,
"learning_rate": 0.00042591251551775806,
"loss": 5.3483,
"step": 324500
},
{
"epoch": 1.7313388310000213,
"grad_norm": 2.4094293117523193,
"learning_rate": 0.0004250206903440305,
"loss": 5.3485,
"step": 325000
},
{
"epoch": 1.73400242920156,
"grad_norm": 2.046931266784668,
"learning_rate": 0.000424128865170303,
"loss": 5.345,
"step": 325500
},
{
"epoch": 1.7366660274030983,
"grad_norm": 2.1520516872406006,
"learning_rate": 0.0004232370399965754,
"loss": 5.351,
"step": 326000
},
{
"epoch": 1.7393296256046367,
"grad_norm": 2.006589651107788,
"learning_rate": 0.0004223469984731953,
"loss": 5.3511,
"step": 326500
},
{
"epoch": 1.7419932238061753,
"grad_norm": 1.9035310745239258,
"learning_rate": 0.00042145517329946776,
"loss": 5.3457,
"step": 327000
},
{
"epoch": 1.7446568220077139,
"grad_norm": 2.0777719020843506,
"learning_rate": 0.0004205633481257402,
"loss": 5.3519,
"step": 327500
},
{
"epoch": 1.7473204202092523,
"grad_norm": 2.2958412170410156,
"learning_rate": 0.0004196715229520127,
"loss": 5.3455,
"step": 328000
},
{
"epoch": 1.7499840184107907,
"grad_norm": 2.3482723236083984,
"learning_rate": 0.0004187796977782851,
"loss": 5.3513,
"step": 328500
},
{
"epoch": 1.7526476166123293,
"grad_norm": 2.4552931785583496,
"learning_rate": 0.00041788787260455755,
"loss": 5.3496,
"step": 329000
},
{
"epoch": 1.7553112148138679,
"grad_norm": 2.0816726684570312,
"learning_rate": 0.00041699604743083003,
"loss": 5.3434,
"step": 329500
},
{
"epoch": 1.7579748130154063,
"grad_norm": 1.869194746017456,
"learning_rate": 0.00041610600590744995,
"loss": 5.349,
"step": 330000
},
{
"epoch": 1.7606384112169446,
"grad_norm": 2.020172595977783,
"learning_rate": 0.00041521418073372243,
"loss": 5.3489,
"step": 330500
},
{
"epoch": 1.7633020094184833,
"grad_norm": 2.1260483264923096,
"learning_rate": 0.00041432235555999487,
"loss": 5.3523,
"step": 331000
},
{
"epoch": 1.7659656076200219,
"grad_norm": 2.1546857357025146,
"learning_rate": 0.0004134305303862673,
"loss": 5.3414,
"step": 331500
},
{
"epoch": 1.7686292058215602,
"grad_norm": 2.2955052852630615,
"learning_rate": 0.0004125387052125398,
"loss": 5.3489,
"step": 332000
},
{
"epoch": 1.7712928040230986,
"grad_norm": 2.0505149364471436,
"learning_rate": 0.0004116468800388122,
"loss": 5.3543,
"step": 332500
},
{
"epoch": 1.7739564022246372,
"grad_norm": 1.9976879358291626,
"learning_rate": 0.0004107550548650847,
"loss": 5.3455,
"step": 333000
},
{
"epoch": 1.7766200004261758,
"grad_norm": 2.1872785091400146,
"learning_rate": 0.00040986322969135714,
"loss": 5.345,
"step": 333500
},
{
"epoch": 1.7792835986277142,
"grad_norm": 2.025681257247925,
"learning_rate": 0.00040897318816797705,
"loss": 5.3559,
"step": 334000
},
{
"epoch": 1.7819471968292526,
"grad_norm": 2.051701307296753,
"learning_rate": 0.00040808136299424954,
"loss": 5.3424,
"step": 334500
},
{
"epoch": 1.7846107950307912,
"grad_norm": 2.161292314529419,
"learning_rate": 0.00040718953782052197,
"loss": 5.3418,
"step": 335000
},
{
"epoch": 1.7872743932323298,
"grad_norm": 2.1306283473968506,
"learning_rate": 0.00040629771264679446,
"loss": 5.352,
"step": 335500
},
{
"epoch": 1.7899379914338682,
"grad_norm": 2.1994986534118652,
"learning_rate": 0.00040540767112341437,
"loss": 5.348,
"step": 336000
},
{
"epoch": 1.7926015896354066,
"grad_norm": 2.3227968215942383,
"learning_rate": 0.00040451762960003423,
"loss": 5.3444,
"step": 336500
},
{
"epoch": 1.7952651878369452,
"grad_norm": 2.1397862434387207,
"learning_rate": 0.0004036258044263067,
"loss": 5.3556,
"step": 337000
},
{
"epoch": 1.7979287860384838,
"grad_norm": 2.0676870346069336,
"learning_rate": 0.00040273397925257915,
"loss": 5.3471,
"step": 337500
},
{
"epoch": 1.8005923842400222,
"grad_norm": 2.2523062229156494,
"learning_rate": 0.0004018421540788516,
"loss": 5.3431,
"step": 338000
},
{
"epoch": 1.8032559824415606,
"grad_norm": 2.1115000247955322,
"learning_rate": 0.00040095211255547155,
"loss": 5.3467,
"step": 338500
},
{
"epoch": 1.8059195806430992,
"grad_norm": 2.0157132148742676,
"learning_rate": 0.000400060287381744,
"loss": 5.3462,
"step": 339000
},
{
"epoch": 1.8085831788446376,
"grad_norm": 2.1384365558624268,
"learning_rate": 0.0003991684622080165,
"loss": 5.3381,
"step": 339500
},
{
"epoch": 1.811246777046176,
"grad_norm": 2.016707420349121,
"learning_rate": 0.0003982766370342889,
"loss": 5.3424,
"step": 340000
},
{
"epoch": 1.8139103752477146,
"grad_norm": 1.9890104532241821,
"learning_rate": 0.00039738481186056134,
"loss": 5.3459,
"step": 340500
},
{
"epoch": 1.8165739734492532,
"grad_norm": 1.997981309890747,
"learning_rate": 0.0003964947703371813,
"loss": 5.3415,
"step": 341000
},
{
"epoch": 1.8192375716507916,
"grad_norm": 2.077340602874756,
"learning_rate": 0.00039560294516345374,
"loss": 5.3401,
"step": 341500
},
{
"epoch": 1.82190116985233,
"grad_norm": 1.9495571851730347,
"learning_rate": 0.00039471111998972617,
"loss": 5.3461,
"step": 342000
},
{
"epoch": 1.8245647680538686,
"grad_norm": 2.086167097091675,
"learning_rate": 0.00039381929481599866,
"loss": 5.3457,
"step": 342500
},
{
"epoch": 1.8272283662554072,
"grad_norm": 1.9157156944274902,
"learning_rate": 0.0003929274696422711,
"loss": 5.3374,
"step": 343000
},
{
"epoch": 1.8298919644569456,
"grad_norm": 2.2283830642700195,
"learning_rate": 0.0003920356444685436,
"loss": 5.3403,
"step": 343500
},
{
"epoch": 1.832555562658484,
"grad_norm": 2.155780553817749,
"learning_rate": 0.00039114560294516344,
"loss": 5.3403,
"step": 344000
},
{
"epoch": 1.8352191608600226,
"grad_norm": 2.0122015476226807,
"learning_rate": 0.00039025377777143587,
"loss": 5.3485,
"step": 344500
},
{
"epoch": 1.8378827590615612,
"grad_norm": 2.1252944469451904,
"learning_rate": 0.00038936195259770836,
"loss": 5.3534,
"step": 345000
},
{
"epoch": 1.8405463572630996,
"grad_norm": 2.16573166847229,
"learning_rate": 0.00038847012742398084,
"loss": 5.3407,
"step": 345500
},
{
"epoch": 1.843209955464638,
"grad_norm": 2.043785810470581,
"learning_rate": 0.0003875800859006007,
"loss": 5.3441,
"step": 346000
},
{
"epoch": 1.8458735536661766,
"grad_norm": 2.0578818321228027,
"learning_rate": 0.0003866882607268732,
"loss": 5.344,
"step": 346500
},
{
"epoch": 1.8485371518677152,
"grad_norm": 2.344649076461792,
"learning_rate": 0.0003857964355531456,
"loss": 5.3401,
"step": 347000
},
{
"epoch": 1.8512007500692536,
"grad_norm": 2.2246205806732178,
"learning_rate": 0.0003849046103794181,
"loss": 5.3474,
"step": 347500
},
{
"epoch": 1.853864348270792,
"grad_norm": 2.3041775226593018,
"learning_rate": 0.00038401278520569054,
"loss": 5.3403,
"step": 348000
},
{
"epoch": 1.8565279464723305,
"grad_norm": 2.0579144954681396,
"learning_rate": 0.00038312096003196303,
"loss": 5.3388,
"step": 348500
},
{
"epoch": 1.8591915446738692,
"grad_norm": 2.1944098472595215,
"learning_rate": 0.00038223091850858294,
"loss": 5.3412,
"step": 349000
},
{
"epoch": 1.8618551428754075,
"grad_norm": 2.0834217071533203,
"learning_rate": 0.0003813390933348554,
"loss": 5.3465,
"step": 349500
},
{
"epoch": 1.864518741076946,
"grad_norm": 1.9777040481567383,
"learning_rate": 0.00038044726816112786,
"loss": 5.3394,
"step": 350000
},
{
"epoch": 1.8671823392784845,
"grad_norm": 2.341625690460205,
"learning_rate": 0.0003795554429874003,
"loss": 5.3414,
"step": 350500
},
{
"epoch": 1.8698459374800231,
"grad_norm": 1.9645224809646606,
"learning_rate": 0.0003786636178136728,
"loss": 5.3429,
"step": 351000
},
{
"epoch": 1.8725095356815615,
"grad_norm": 2.217845916748047,
"learning_rate": 0.0003777717926399452,
"loss": 5.3485,
"step": 351500
},
{
"epoch": 1.8751731338831,
"grad_norm": 2.2836930751800537,
"learning_rate": 0.00037687996746621765,
"loss": 5.3369,
"step": 352000
},
{
"epoch": 1.8778367320846385,
"grad_norm": 2.1809890270233154,
"learning_rate": 0.00037598814229249013,
"loss": 5.3375,
"step": 352500
},
{
"epoch": 1.8805003302861771,
"grad_norm": 2.4111125469207764,
"learning_rate": 0.00037509810076911005,
"loss": 5.3453,
"step": 353000
},
{
"epoch": 1.8831639284877155,
"grad_norm": 2.264157295227051,
"learning_rate": 0.0003742062755953825,
"loss": 5.3412,
"step": 353500
},
{
"epoch": 1.885827526689254,
"grad_norm": 2.232529878616333,
"learning_rate": 0.00037331445042165497,
"loss": 5.3481,
"step": 354000
},
{
"epoch": 1.8884911248907925,
"grad_norm": 2.0301549434661865,
"learning_rate": 0.00037242440889827483,
"loss": 5.3351,
"step": 354500
},
{
"epoch": 1.8911547230923311,
"grad_norm": 2.040621757507324,
"learning_rate": 0.0003715325837245473,
"loss": 5.3442,
"step": 355000
},
{
"epoch": 1.8938183212938695,
"grad_norm": 2.085535764694214,
"learning_rate": 0.0003706407585508198,
"loss": 5.3302,
"step": 355500
},
{
"epoch": 1.896481919495408,
"grad_norm": 2.1077394485473633,
"learning_rate": 0.00036974893337709223,
"loss": 5.3383,
"step": 356000
},
{
"epoch": 1.8991455176969465,
"grad_norm": 2.242241621017456,
"learning_rate": 0.0003688571082033647,
"loss": 5.3315,
"step": 356500
},
{
"epoch": 1.901809115898485,
"grad_norm": 2.2890877723693848,
"learning_rate": 0.00036796528302963715,
"loss": 5.3378,
"step": 357000
},
{
"epoch": 1.9044727141000233,
"grad_norm": 2.3517234325408936,
"learning_rate": 0.000367075241506257,
"loss": 5.3369,
"step": 357500
},
{
"epoch": 1.9071363123015619,
"grad_norm": 2.3767483234405518,
"learning_rate": 0.0003661834163325295,
"loss": 5.3365,
"step": 358000
},
{
"epoch": 1.9097999105031005,
"grad_norm": 2.2238335609436035,
"learning_rate": 0.00036529159115880193,
"loss": 5.3353,
"step": 358500
},
{
"epoch": 1.9124635087046389,
"grad_norm": 2.0594356060028076,
"learning_rate": 0.0003643997659850744,
"loss": 5.3346,
"step": 359000
},
{
"epoch": 1.9151271069061773,
"grad_norm": 2.1106550693511963,
"learning_rate": 0.00036350794081134685,
"loss": 5.3317,
"step": 359500
},
{
"epoch": 1.9177907051077159,
"grad_norm": 2.0819623470306396,
"learning_rate": 0.00036261611563761934,
"loss": 5.332,
"step": 360000
},
{
"epoch": 1.9204543033092545,
"grad_norm": 1.9421486854553223,
"learning_rate": 0.00036172607411423925,
"loss": 5.3425,
"step": 360500
},
{
"epoch": 1.9231179015107929,
"grad_norm": 2.304370641708374,
"learning_rate": 0.0003608342489405117,
"loss": 5.3278,
"step": 361000
},
{
"epoch": 1.9257814997123313,
"grad_norm": 1.9409058094024658,
"learning_rate": 0.00035994242376678417,
"loss": 5.3364,
"step": 361500
},
{
"epoch": 1.9284450979138699,
"grad_norm": 2.199068307876587,
"learning_rate": 0.0003590505985930566,
"loss": 5.3375,
"step": 362000
},
{
"epoch": 1.9311086961154085,
"grad_norm": 2.4809699058532715,
"learning_rate": 0.0003581587734193291,
"loss": 5.3304,
"step": 362500
},
{
"epoch": 1.9337722943169469,
"grad_norm": 1.8762375116348267,
"learning_rate": 0.000357268731895949,
"loss": 5.3396,
"step": 363000
},
{
"epoch": 1.9364358925184852,
"grad_norm": 2.14876651763916,
"learning_rate": 0.00035637690672222144,
"loss": 5.3295,
"step": 363500
},
{
"epoch": 1.9390994907200239,
"grad_norm": 2.0710737705230713,
"learning_rate": 0.0003554850815484939,
"loss": 5.3319,
"step": 364000
},
{
"epoch": 1.9417630889215625,
"grad_norm": 2.1879022121429443,
"learning_rate": 0.00035459325637476636,
"loss": 5.3353,
"step": 364500
},
{
"epoch": 1.9444266871231008,
"grad_norm": 2.2101471424102783,
"learning_rate": 0.0003537014312010388,
"loss": 5.3365,
"step": 365000
},
{
"epoch": 1.9470902853246392,
"grad_norm": 2.1538619995117188,
"learning_rate": 0.0003528113896776587,
"loss": 5.3345,
"step": 365500
},
{
"epoch": 1.9497538835261778,
"grad_norm": 2.3958141803741455,
"learning_rate": 0.0003519195645039312,
"loss": 5.3298,
"step": 366000
},
{
"epoch": 1.9524174817277165,
"grad_norm": 2.2059667110443115,
"learning_rate": 0.0003510277393302037,
"loss": 5.3228,
"step": 366500
},
{
"epoch": 1.9550810799292548,
"grad_norm": 2.0048577785491943,
"learning_rate": 0.0003501359141564761,
"loss": 5.3336,
"step": 367000
},
{
"epoch": 1.9577446781307932,
"grad_norm": 2.0165789127349854,
"learning_rate": 0.00034924408898274854,
"loss": 5.3342,
"step": 367500
},
{
"epoch": 1.9604082763323318,
"grad_norm": 2.2053885459899902,
"learning_rate": 0.00034835226380902103,
"loss": 5.3359,
"step": 368000
},
{
"epoch": 1.9630718745338704,
"grad_norm": 2.316288948059082,
"learning_rate": 0.0003474622222856409,
"loss": 5.3344,
"step": 368500
},
{
"epoch": 1.9657354727354088,
"grad_norm": 2.385871410369873,
"learning_rate": 0.0003465703971119133,
"loss": 5.3364,
"step": 369000
},
{
"epoch": 1.9683990709369472,
"grad_norm": 2.3206396102905273,
"learning_rate": 0.0003456785719381858,
"loss": 5.3309,
"step": 369500
},
{
"epoch": 1.9710626691384858,
"grad_norm": 2.172229766845703,
"learning_rate": 0.00034478674676445824,
"loss": 5.3338,
"step": 370000
},
{
"epoch": 1.9737262673400244,
"grad_norm": 2.3812954425811768,
"learning_rate": 0.0003438967052410782,
"loss": 5.3306,
"step": 370500
},
{
"epoch": 1.9763898655415628,
"grad_norm": 2.1423757076263428,
"learning_rate": 0.00034300488006735064,
"loss": 5.3406,
"step": 371000
},
{
"epoch": 1.9790534637431012,
"grad_norm": 2.2044973373413086,
"learning_rate": 0.0003421130548936231,
"loss": 5.3371,
"step": 371500
},
{
"epoch": 1.9817170619446398,
"grad_norm": 1.944014549255371,
"learning_rate": 0.00034122122971989556,
"loss": 5.3348,
"step": 372000
},
{
"epoch": 1.9843806601461784,
"grad_norm": 2.3091371059417725,
"learning_rate": 0.000340329404546168,
"loss": 5.3283,
"step": 372500
},
{
"epoch": 1.9870442583477168,
"grad_norm": 2.600417137145996,
"learning_rate": 0.0003394375793724405,
"loss": 5.3292,
"step": 373000
},
{
"epoch": 1.9897078565492552,
"grad_norm": 2.0236728191375732,
"learning_rate": 0.0003385457541987129,
"loss": 5.3353,
"step": 373500
},
{
"epoch": 1.9923714547507938,
"grad_norm": 2.298342227935791,
"learning_rate": 0.00033765392902498535,
"loss": 5.3355,
"step": 374000
},
{
"epoch": 1.9950350529523324,
"grad_norm": 1.945620059967041,
"learning_rate": 0.0003367638875016053,
"loss": 5.3302,
"step": 374500
},
{
"epoch": 1.9976986511538706,
"grad_norm": 2.1642651557922363,
"learning_rate": 0.0003358738459782252,
"loss": 5.3259,
"step": 375000
},
{
"epoch": 2.000362249355409,
"grad_norm": 2.149771213531494,
"learning_rate": 0.0003349820208044976,
"loss": 5.3347,
"step": 375500
},
{
"epoch": 2.003025847556948,
"grad_norm": 2.2164316177368164,
"learning_rate": 0.0003340901956307701,
"loss": 5.3308,
"step": 376000
},
{
"epoch": 2.0056894457584864,
"grad_norm": 2.2055323123931885,
"learning_rate": 0.0003331983704570426,
"loss": 5.332,
"step": 376500
},
{
"epoch": 2.0083530439600246,
"grad_norm": 2.1814560890197754,
"learning_rate": 0.00033230654528331507,
"loss": 5.3239,
"step": 377000
},
{
"epoch": 2.011016642161563,
"grad_norm": 2.1237363815307617,
"learning_rate": 0.0003314147201095875,
"loss": 5.3364,
"step": 377500
},
{
"epoch": 2.013680240363102,
"grad_norm": 2.1073851585388184,
"learning_rate": 0.00033052467858620736,
"loss": 5.3209,
"step": 378000
},
{
"epoch": 2.0163438385646404,
"grad_norm": 1.9759477376937866,
"learning_rate": 0.00032963285341247985,
"loss": 5.3272,
"step": 378500
},
{
"epoch": 2.0190074367661786,
"grad_norm": 2.100966691970825,
"learning_rate": 0.0003287410282387523,
"loss": 5.3226,
"step": 379000
},
{
"epoch": 2.021671034967717,
"grad_norm": 2.141537666320801,
"learning_rate": 0.00032784920306502477,
"loss": 5.3305,
"step": 379500
},
{
"epoch": 2.0243346331692558,
"grad_norm": 2.2714550495147705,
"learning_rate": 0.0003269573778912972,
"loss": 5.3335,
"step": 380000
},
{
"epoch": 2.0269982313707944,
"grad_norm": 2.1945018768310547,
"learning_rate": 0.0003260673363679171,
"loss": 5.3267,
"step": 380500
},
{
"epoch": 2.0296618295723325,
"grad_norm": 2.269015312194824,
"learning_rate": 0.0003251755111941896,
"loss": 5.3346,
"step": 381000
},
{
"epoch": 2.032325427773871,
"grad_norm": 2.194460391998291,
"learning_rate": 0.00032428368602046203,
"loss": 5.3216,
"step": 381500
},
{
"epoch": 2.0349890259754098,
"grad_norm": 2.1248984336853027,
"learning_rate": 0.0003233918608467345,
"loss": 5.3294,
"step": 382000
},
{
"epoch": 2.0376526241769484,
"grad_norm": 2.213801622390747,
"learning_rate": 0.00032250003567300695,
"loss": 5.3282,
"step": 382500
},
{
"epoch": 2.0403162223784865,
"grad_norm": 2.0801334381103516,
"learning_rate": 0.0003216082104992794,
"loss": 5.3293,
"step": 383000
},
{
"epoch": 2.042979820580025,
"grad_norm": 2.191882371902466,
"learning_rate": 0.00032071816897589935,
"loss": 5.3297,
"step": 383500
},
{
"epoch": 2.0456434187815637,
"grad_norm": 2.238471031188965,
"learning_rate": 0.0003198263438021718,
"loss": 5.3274,
"step": 384000
},
{
"epoch": 2.0483070169831024,
"grad_norm": 2.0454585552215576,
"learning_rate": 0.0003189345186284443,
"loss": 5.3335,
"step": 384500
},
{
"epoch": 2.0509706151846405,
"grad_norm": 2.449857473373413,
"learning_rate": 0.0003180426934547167,
"loss": 5.3243,
"step": 385000
},
{
"epoch": 2.053634213386179,
"grad_norm": 2.182969331741333,
"learning_rate": 0.00031715265193133657,
"loss": 5.3239,
"step": 385500
},
{
"epoch": 2.0562978115877177,
"grad_norm": 2.3800108432769775,
"learning_rate": 0.00031626082675760905,
"loss": 5.3263,
"step": 386000
},
{
"epoch": 2.058961409789256,
"grad_norm": 2.4917428493499756,
"learning_rate": 0.0003153690015838815,
"loss": 5.3252,
"step": 386500
},
{
"epoch": 2.0616250079907945,
"grad_norm": 2.25253963470459,
"learning_rate": 0.00031447717641015397,
"loss": 5.3323,
"step": 387000
},
{
"epoch": 2.064288606192333,
"grad_norm": 2.1959807872772217,
"learning_rate": 0.00031358535123642646,
"loss": 5.3257,
"step": 387500
},
{
"epoch": 2.0669522043938717,
"grad_norm": 2.202449321746826,
"learning_rate": 0.0003126935260626989,
"loss": 5.3256,
"step": 388000
},
{
"epoch": 2.06961580259541,
"grad_norm": 2.093303918838501,
"learning_rate": 0.0003118017008889714,
"loss": 5.3302,
"step": 388500
},
{
"epoch": 2.0722794007969485,
"grad_norm": 2.139282464981079,
"learning_rate": 0.0003109098757152438,
"loss": 5.3298,
"step": 389000
},
{
"epoch": 2.074942998998487,
"grad_norm": 2.004852533340454,
"learning_rate": 0.00031001983419186367,
"loss": 5.3329,
"step": 389500
},
{
"epoch": 2.0776065972000257,
"grad_norm": 2.385274648666382,
"learning_rate": 0.00030912800901813616,
"loss": 5.3266,
"step": 390000
},
{
"epoch": 2.080270195401564,
"grad_norm": 2.218735456466675,
"learning_rate": 0.0003082361838444086,
"loss": 5.329,
"step": 390500
},
{
"epoch": 2.0829337936031025,
"grad_norm": 2.271380662918091,
"learning_rate": 0.0003073443586706811,
"loss": 5.3239,
"step": 391000
},
{
"epoch": 2.085597391804641,
"grad_norm": 2.526583433151245,
"learning_rate": 0.000306454317147301,
"loss": 5.3287,
"step": 391500
},
{
"epoch": 2.0882609900061797,
"grad_norm": 2.1075544357299805,
"learning_rate": 0.0003055624919735734,
"loss": 5.3264,
"step": 392000
},
{
"epoch": 2.090924588207718,
"grad_norm": 2.0297112464904785,
"learning_rate": 0.0003046706667998459,
"loss": 5.3279,
"step": 392500
},
{
"epoch": 2.0935881864092565,
"grad_norm": 2.0166475772857666,
"learning_rate": 0.00030377884162611834,
"loss": 5.3279,
"step": 393000
},
{
"epoch": 2.096251784610795,
"grad_norm": 2.398573398590088,
"learning_rate": 0.00030288880010273826,
"loss": 5.325,
"step": 393500
},
{
"epoch": 2.0989153828123337,
"grad_norm": 2.2096564769744873,
"learning_rate": 0.00030199697492901075,
"loss": 5.3241,
"step": 394000
},
{
"epoch": 2.101578981013872,
"grad_norm": 2.2474560737609863,
"learning_rate": 0.0003011051497552832,
"loss": 5.3232,
"step": 394500
},
{
"epoch": 2.1042425792154105,
"grad_norm": 2.2487635612487793,
"learning_rate": 0.00030021332458155566,
"loss": 5.3191,
"step": 395000
},
{
"epoch": 2.106906177416949,
"grad_norm": 2.094921112060547,
"learning_rate": 0.0002993214994078281,
"loss": 5.3354,
"step": 395500
},
{
"epoch": 2.1095697756184877,
"grad_norm": 2.2288858890533447,
"learning_rate": 0.00029843145788444796,
"loss": 5.3254,
"step": 396000
},
{
"epoch": 2.112233373820026,
"grad_norm": 2.166731595993042,
"learning_rate": 0.00029753963271072044,
"loss": 5.3239,
"step": 396500
},
{
"epoch": 2.1148969720215645,
"grad_norm": 2.05653715133667,
"learning_rate": 0.00029664780753699293,
"loss": 5.3305,
"step": 397000
},
{
"epoch": 2.117560570223103,
"grad_norm": 2.08963942527771,
"learning_rate": 0.0002957559823632654,
"loss": 5.3255,
"step": 397500
},
{
"epoch": 2.1202241684246417,
"grad_norm": 2.268559217453003,
"learning_rate": 0.0002948659408398853,
"loss": 5.3238,
"step": 398000
},
{
"epoch": 2.12288776662618,
"grad_norm": 2.9195141792297363,
"learning_rate": 0.0002939741156661577,
"loss": 5.3211,
"step": 398500
},
{
"epoch": 2.1255513648277184,
"grad_norm": 2.2552900314331055,
"learning_rate": 0.0002930822904924302,
"loss": 5.3251,
"step": 399000
},
{
"epoch": 2.128214963029257,
"grad_norm": 2.294832706451416,
"learning_rate": 0.00029219046531870263,
"loss": 5.32,
"step": 399500
},
{
"epoch": 2.1308785612307957,
"grad_norm": 2.3486320972442627,
"learning_rate": 0.0002912986401449751,
"loss": 5.3197,
"step": 400000
},
{
"epoch": 2.133542159432334,
"grad_norm": 2.497387647628784,
"learning_rate": 0.00029040681497124755,
"loss": 5.3235,
"step": 400500
},
{
"epoch": 2.1362057576338724,
"grad_norm": 2.3829433917999268,
"learning_rate": 0.00028951498979752,
"loss": 5.3145,
"step": 401000
},
{
"epoch": 2.138869355835411,
"grad_norm": 2.064811944961548,
"learning_rate": 0.00028862316462379247,
"loss": 5.3168,
"step": 401500
},
{
"epoch": 2.1415329540369497,
"grad_norm": 2.194028377532959,
"learning_rate": 0.0002877331231004124,
"loss": 5.3221,
"step": 402000
},
{
"epoch": 2.144196552238488,
"grad_norm": 2.1182937622070312,
"learning_rate": 0.0002868412979266848,
"loss": 5.321,
"step": 402500
},
{
"epoch": 2.1468601504400264,
"grad_norm": 2.3992223739624023,
"learning_rate": 0.0002859494727529573,
"loss": 5.3237,
"step": 403000
},
{
"epoch": 2.149523748641565,
"grad_norm": 2.256955623626709,
"learning_rate": 0.00028505764757922973,
"loss": 5.3144,
"step": 403500
},
{
"epoch": 2.152187346843103,
"grad_norm": 2.3727059364318848,
"learning_rate": 0.0002841658224055022,
"loss": 5.3238,
"step": 404000
},
{
"epoch": 2.154850945044642,
"grad_norm": 2.1184160709381104,
"learning_rate": 0.00028327399723177465,
"loss": 5.3196,
"step": 404500
},
{
"epoch": 2.1575145432461804,
"grad_norm": 2.1502108573913574,
"learning_rate": 0.00028238217205804714,
"loss": 5.3141,
"step": 405000
},
{
"epoch": 2.160178141447719,
"grad_norm": 2.176964521408081,
"learning_rate": 0.00028149034688431957,
"loss": 5.3187,
"step": 405500
},
{
"epoch": 2.162841739649257,
"grad_norm": 2.144890069961548,
"learning_rate": 0.0002806003053609395,
"loss": 5.3199,
"step": 406000
},
{
"epoch": 2.165505337850796,
"grad_norm": 2.17976975440979,
"learning_rate": 0.000279708480187212,
"loss": 5.318,
"step": 406500
},
{
"epoch": 2.1681689360523344,
"grad_norm": 2.181568145751953,
"learning_rate": 0.0002788166550134844,
"loss": 5.3214,
"step": 407000
},
{
"epoch": 2.170832534253873,
"grad_norm": 2.299090623855591,
"learning_rate": 0.0002779248298397569,
"loss": 5.3225,
"step": 407500
},
{
"epoch": 2.173496132455411,
"grad_norm": 2.189419746398926,
"learning_rate": 0.0002770347883163768,
"loss": 5.3193,
"step": 408000
},
{
"epoch": 2.17615973065695,
"grad_norm": 2.274648904800415,
"learning_rate": 0.00027614296314264924,
"loss": 5.3218,
"step": 408500
},
{
"epoch": 2.1788233288584884,
"grad_norm": 2.1534972190856934,
"learning_rate": 0.0002752511379689217,
"loss": 5.3173,
"step": 409000
},
{
"epoch": 2.181486927060027,
"grad_norm": 2.3284084796905518,
"learning_rate": 0.00027435931279519416,
"loss": 5.3126,
"step": 409500
},
{
"epoch": 2.184150525261565,
"grad_norm": 2.286384344100952,
"learning_rate": 0.0002734674876214666,
"loss": 5.3232,
"step": 410000
},
{
"epoch": 2.1868141234631038,
"grad_norm": 2.111091375350952,
"learning_rate": 0.0002725774460980865,
"loss": 5.3163,
"step": 410500
},
{
"epoch": 2.1894777216646424,
"grad_norm": 2.361741304397583,
"learning_rate": 0.00027168562092435894,
"loss": 5.3212,
"step": 411000
},
{
"epoch": 2.192141319866181,
"grad_norm": 2.497840642929077,
"learning_rate": 0.0002707937957506314,
"loss": 5.3238,
"step": 411500
},
{
"epoch": 2.194804918067719,
"grad_norm": 2.227203607559204,
"learning_rate": 0.00026990197057690386,
"loss": 5.323,
"step": 412000
},
{
"epoch": 2.1974685162692578,
"grad_norm": 2.2768001556396484,
"learning_rate": 0.00026901192905352377,
"loss": 5.3182,
"step": 412500
},
{
"epoch": 2.2001321144707964,
"grad_norm": 2.157787799835205,
"learning_rate": 0.00026812010387979626,
"loss": 5.3246,
"step": 413000
},
{
"epoch": 2.202795712672335,
"grad_norm": 2.3759965896606445,
"learning_rate": 0.0002672282787060687,
"loss": 5.3207,
"step": 413500
},
{
"epoch": 2.205459310873873,
"grad_norm": 2.210963487625122,
"learning_rate": 0.0002663364535323411,
"loss": 5.3155,
"step": 414000
},
{
"epoch": 2.2081229090754118,
"grad_norm": 2.265197277069092,
"learning_rate": 0.0002654446283586136,
"loss": 5.3194,
"step": 414500
},
{
"epoch": 2.2107865072769504,
"grad_norm": 2.110173225402832,
"learning_rate": 0.0002645545868352335,
"loss": 5.3144,
"step": 415000
},
{
"epoch": 2.213450105478489,
"grad_norm": 2.235196590423584,
"learning_rate": 0.000263662761661506,
"loss": 5.323,
"step": 415500
},
{
"epoch": 2.216113703680027,
"grad_norm": 2.305601119995117,
"learning_rate": 0.00026277093648777844,
"loss": 5.3187,
"step": 416000
},
{
"epoch": 2.2187773018815657,
"grad_norm": 2.401959180831909,
"learning_rate": 0.0002618791113140509,
"loss": 5.3175,
"step": 416500
},
{
"epoch": 2.2214409000831044,
"grad_norm": 2.163121223449707,
"learning_rate": 0.0002609890697906708,
"loss": 5.3169,
"step": 417000
},
{
"epoch": 2.224104498284643,
"grad_norm": 2.265998363494873,
"learning_rate": 0.0002600972446169432,
"loss": 5.3173,
"step": 417500
},
{
"epoch": 2.226768096486181,
"grad_norm": 2.236154317855835,
"learning_rate": 0.00025920541944321577,
"loss": 5.3167,
"step": 418000
},
{
"epoch": 2.2294316946877197,
"grad_norm": 2.1707651615142822,
"learning_rate": 0.0002583135942694882,
"loss": 5.3184,
"step": 418500
},
{
"epoch": 2.2320952928892583,
"grad_norm": 2.121073007583618,
"learning_rate": 0.00025742355274610806,
"loss": 5.3171,
"step": 419000
},
{
"epoch": 2.234758891090797,
"grad_norm": 2.2292840480804443,
"learning_rate": 0.00025653172757238055,
"loss": 5.3185,
"step": 419500
},
{
"epoch": 2.237422489292335,
"grad_norm": 2.2376914024353027,
"learning_rate": 0.000255639902398653,
"loss": 5.3143,
"step": 420000
},
{
"epoch": 2.2400860874938737,
"grad_norm": 2.2844974994659424,
"learning_rate": 0.0002547480772249254,
"loss": 5.3039,
"step": 420500
},
{
"epoch": 2.2427496856954123,
"grad_norm": 2.278136968612671,
"learning_rate": 0.0002538562520511979,
"loss": 5.3159,
"step": 421000
},
{
"epoch": 2.2454132838969505,
"grad_norm": 2.3182220458984375,
"learning_rate": 0.00025296442687747033,
"loss": 5.319,
"step": 421500
},
{
"epoch": 2.248076882098489,
"grad_norm": 2.5095927715301514,
"learning_rate": 0.0002520743853540903,
"loss": 5.3174,
"step": 422000
},
{
"epoch": 2.2507404803000277,
"grad_norm": 2.3167264461517334,
"learning_rate": 0.00025118256018036273,
"loss": 5.3131,
"step": 422500
},
{
"epoch": 2.2534040785015663,
"grad_norm": 2.211766481399536,
"learning_rate": 0.00025029073500663516,
"loss": 5.325,
"step": 423000
},
{
"epoch": 2.256067676703105,
"grad_norm": 2.1502010822296143,
"learning_rate": 0.00024939890983290765,
"loss": 5.3139,
"step": 423500
},
{
"epoch": 2.258731274904643,
"grad_norm": 2.1429567337036133,
"learning_rate": 0.00024850886830952756,
"loss": 5.3147,
"step": 424000
},
{
"epoch": 2.2613948731061817,
"grad_norm": 2.272367238998413,
"learning_rate": 0.0002476170431358,
"loss": 5.3128,
"step": 424500
},
{
"epoch": 2.2640584713077203,
"grad_norm": 2.6372079849243164,
"learning_rate": 0.00024672521796207243,
"loss": 5.3134,
"step": 425000
},
{
"epoch": 2.2667220695092585,
"grad_norm": 2.4213263988494873,
"learning_rate": 0.0002458333927883449,
"loss": 5.3142,
"step": 425500
},
{
"epoch": 2.269385667710797,
"grad_norm": 2.2919113636016846,
"learning_rate": 0.0002449415676146174,
"loss": 5.3199,
"step": 426000
},
{
"epoch": 2.2720492659123357,
"grad_norm": 2.1887030601501465,
"learning_rate": 0.0002440515260912373,
"loss": 5.3168,
"step": 426500
},
{
"epoch": 2.2747128641138743,
"grad_norm": 2.2401158809661865,
"learning_rate": 0.00024315970091750975,
"loss": 5.3142,
"step": 427000
},
{
"epoch": 2.2773764623154125,
"grad_norm": 2.264155864715576,
"learning_rate": 0.0002422678757437822,
"loss": 5.3063,
"step": 427500
},
{
"epoch": 2.280040060516951,
"grad_norm": 2.372823476791382,
"learning_rate": 0.00024137605057005467,
"loss": 5.3146,
"step": 428000
},
{
"epoch": 2.2827036587184897,
"grad_norm": 2.5441572666168213,
"learning_rate": 0.00024048600904667456,
"loss": 5.3129,
"step": 428500
},
{
"epoch": 2.2853672569200283,
"grad_norm": 2.107741594314575,
"learning_rate": 0.00023959418387294702,
"loss": 5.3112,
"step": 429000
},
{
"epoch": 2.2880308551215665,
"grad_norm": 2.1812095642089844,
"learning_rate": 0.00023870235869921948,
"loss": 5.3144,
"step": 429500
},
{
"epoch": 2.290694453323105,
"grad_norm": 2.3959500789642334,
"learning_rate": 0.00023781053352549194,
"loss": 5.3108,
"step": 430000
},
{
"epoch": 2.2933580515246437,
"grad_norm": 2.3315865993499756,
"learning_rate": 0.00023691870835176437,
"loss": 5.3093,
"step": 430500
},
{
"epoch": 2.2960216497261823,
"grad_norm": 2.0199296474456787,
"learning_rate": 0.00023602688317803685,
"loss": 5.3209,
"step": 431000
},
{
"epoch": 2.2986852479277204,
"grad_norm": 2.2393200397491455,
"learning_rate": 0.00023513684165465677,
"loss": 5.3074,
"step": 431500
},
{
"epoch": 2.301348846129259,
"grad_norm": 2.4474637508392334,
"learning_rate": 0.00023424501648092923,
"loss": 5.3158,
"step": 432000
},
{
"epoch": 2.3040124443307977,
"grad_norm": 2.3248863220214844,
"learning_rate": 0.00023335319130720166,
"loss": 5.3157,
"step": 432500
},
{
"epoch": 2.3066760425323363,
"grad_norm": 2.4158935546875,
"learning_rate": 0.00023246136613347412,
"loss": 5.3092,
"step": 433000
},
{
"epoch": 2.3093396407338744,
"grad_norm": 2.084850549697876,
"learning_rate": 0.00023156954095974658,
"loss": 5.3178,
"step": 433500
},
{
"epoch": 2.312003238935413,
"grad_norm": 2.319776773452759,
"learning_rate": 0.0002306794994363665,
"loss": 5.3074,
"step": 434000
},
{
"epoch": 2.3146668371369516,
"grad_norm": 2.2137837409973145,
"learning_rate": 0.00022978767426263893,
"loss": 5.3073,
"step": 434500
},
{
"epoch": 2.31733043533849,
"grad_norm": 2.4062960147857666,
"learning_rate": 0.0002288958490889114,
"loss": 5.3112,
"step": 435000
},
{
"epoch": 2.3199940335400284,
"grad_norm": 2.27229380607605,
"learning_rate": 0.00022800402391518385,
"loss": 5.3114,
"step": 435500
},
{
"epoch": 2.322657631741567,
"grad_norm": 2.499032974243164,
"learning_rate": 0.00022711219874145633,
"loss": 5.3132,
"step": 436000
},
{
"epoch": 2.3253212299431056,
"grad_norm": 2.071829080581665,
"learning_rate": 0.00022622215721807625,
"loss": 5.3181,
"step": 436500
},
{
"epoch": 2.3279848281446442,
"grad_norm": 2.4178686141967773,
"learning_rate": 0.00022533033204434868,
"loss": 5.3079,
"step": 437000
},
{
"epoch": 2.3306484263461824,
"grad_norm": 2.431913375854492,
"learning_rate": 0.00022443850687062114,
"loss": 5.311,
"step": 437500
},
{
"epoch": 2.333312024547721,
"grad_norm": 2.3519508838653564,
"learning_rate": 0.0002235466816968936,
"loss": 5.3149,
"step": 438000
},
{
"epoch": 2.3359756227492596,
"grad_norm": 2.286878824234009,
"learning_rate": 0.00022265485652316606,
"loss": 5.312,
"step": 438500
},
{
"epoch": 2.338639220950798,
"grad_norm": 2.3200433254241943,
"learning_rate": 0.00022176481499978595,
"loss": 5.2989,
"step": 439000
},
{
"epoch": 2.3413028191523364,
"grad_norm": 2.165735960006714,
"learning_rate": 0.0002208729898260584,
"loss": 5.3169,
"step": 439500
},
{
"epoch": 2.343966417353875,
"grad_norm": 2.0269339084625244,
"learning_rate": 0.00021998116465233087,
"loss": 5.3088,
"step": 440000
},
{
"epoch": 2.3466300155554136,
"grad_norm": 2.2074029445648193,
"learning_rate": 0.00021908933947860333,
"loss": 5.3096,
"step": 440500
},
{
"epoch": 2.3492936137569522,
"grad_norm": 2.7109835147857666,
"learning_rate": 0.0002181975143048758,
"loss": 5.3089,
"step": 441000
},
{
"epoch": 2.3519572119584904,
"grad_norm": 2.2240071296691895,
"learning_rate": 0.0002173074727814957,
"loss": 5.3148,
"step": 441500
},
{
"epoch": 2.354620810160029,
"grad_norm": 2.26788330078125,
"learning_rate": 0.00021641564760776816,
"loss": 5.3113,
"step": 442000
},
{
"epoch": 2.3572844083615676,
"grad_norm": 2.389122486114502,
"learning_rate": 0.00021552382243404062,
"loss": 5.3133,
"step": 442500
},
{
"epoch": 2.3599480065631058,
"grad_norm": 2.382267475128174,
"learning_rate": 0.00021463199726031308,
"loss": 5.3129,
"step": 443000
},
{
"epoch": 2.3626116047646444,
"grad_norm": 2.411574363708496,
"learning_rate": 0.00021374195573693297,
"loss": 5.3022,
"step": 443500
},
{
"epoch": 2.365275202966183,
"grad_norm": 2.348522424697876,
"learning_rate": 0.00021285013056320543,
"loss": 5.3137,
"step": 444000
},
{
"epoch": 2.3679388011677216,
"grad_norm": 2.3230319023132324,
"learning_rate": 0.00021195830538947789,
"loss": 5.3059,
"step": 444500
},
{
"epoch": 2.3706023993692598,
"grad_norm": 2.2816174030303955,
"learning_rate": 0.00021106648021575035,
"loss": 5.3117,
"step": 445000
},
{
"epoch": 2.3732659975707984,
"grad_norm": 2.400097370147705,
"learning_rate": 0.0002101746550420228,
"loss": 5.3095,
"step": 445500
},
{
"epoch": 2.375929595772337,
"grad_norm": 2.470815896987915,
"learning_rate": 0.00020928461351864272,
"loss": 5.3027,
"step": 446000
},
{
"epoch": 2.3785931939738756,
"grad_norm": 2.1947262287139893,
"learning_rate": 0.00020839278834491518,
"loss": 5.3031,
"step": 446500
},
{
"epoch": 2.3812567921754138,
"grad_norm": 2.3549935817718506,
"learning_rate": 0.00020750096317118764,
"loss": 5.3083,
"step": 447000
},
{
"epoch": 2.3839203903769524,
"grad_norm": 2.457932949066162,
"learning_rate": 0.0002066091379974601,
"loss": 5.3052,
"step": 447500
},
{
"epoch": 2.386583988578491,
"grad_norm": 2.2867889404296875,
"learning_rate": 0.00020571909647407999,
"loss": 5.3155,
"step": 448000
},
{
"epoch": 2.3892475867800296,
"grad_norm": 2.061497688293457,
"learning_rate": 0.00020482727130035245,
"loss": 5.3087,
"step": 448500
},
{
"epoch": 2.3919111849815677,
"grad_norm": 2.2757697105407715,
"learning_rate": 0.0002039354461266249,
"loss": 5.3095,
"step": 449000
},
{
"epoch": 2.3945747831831063,
"grad_norm": 2.4835853576660156,
"learning_rate": 0.00020304362095289736,
"loss": 5.3091,
"step": 449500
},
{
"epoch": 2.397238381384645,
"grad_norm": 2.2896037101745605,
"learning_rate": 0.00020215357942951728,
"loss": 5.3124,
"step": 450000
},
{
"epoch": 2.3999019795861836,
"grad_norm": 2.31545090675354,
"learning_rate": 0.00020126175425578974,
"loss": 5.31,
"step": 450500
},
{
"epoch": 2.4025655777877217,
"grad_norm": 2.296827554702759,
"learning_rate": 0.0002003699290820622,
"loss": 5.3027,
"step": 451000
},
{
"epoch": 2.4052291759892603,
"grad_norm": 2.60396671295166,
"learning_rate": 0.00019947810390833466,
"loss": 5.312,
"step": 451500
},
{
"epoch": 2.407892774190799,
"grad_norm": 2.500142812728882,
"learning_rate": 0.00019858627873460712,
"loss": 5.2995,
"step": 452000
},
{
"epoch": 2.4105563723923376,
"grad_norm": 2.179241180419922,
"learning_rate": 0.000197696237211227,
"loss": 5.3034,
"step": 452500
},
{
"epoch": 2.4132199705938757,
"grad_norm": 2.5400588512420654,
"learning_rate": 0.00019680441203749947,
"loss": 5.3074,
"step": 453000
},
{
"epoch": 2.4158835687954143,
"grad_norm": 2.4482738971710205,
"learning_rate": 0.00019591258686377192,
"loss": 5.301,
"step": 453500
},
{
"epoch": 2.418547166996953,
"grad_norm": 2.3452165126800537,
"learning_rate": 0.00019502076169004438,
"loss": 5.311,
"step": 454000
},
{
"epoch": 2.4212107651984915,
"grad_norm": 2.1771457195281982,
"learning_rate": 0.0001941307201666643,
"loss": 5.3035,
"step": 454500
},
{
"epoch": 2.4238743634000297,
"grad_norm": 2.195034980773926,
"learning_rate": 0.00019323889499293676,
"loss": 5.3069,
"step": 455000
},
{
"epoch": 2.4265379616015683,
"grad_norm": 2.3099453449249268,
"learning_rate": 0.00019234706981920922,
"loss": 5.3075,
"step": 455500
},
{
"epoch": 2.429201559803107,
"grad_norm": 2.5112428665161133,
"learning_rate": 0.00019145524464548168,
"loss": 5.3093,
"step": 456000
},
{
"epoch": 2.431865158004645,
"grad_norm": 2.470879316329956,
"learning_rate": 0.00019056520312210157,
"loss": 5.3021,
"step": 456500
},
{
"epoch": 2.4345287562061837,
"grad_norm": 2.381201982498169,
"learning_rate": 0.00018967337794837403,
"loss": 5.304,
"step": 457000
},
{
"epoch": 2.4371923544077223,
"grad_norm": 2.30584454536438,
"learning_rate": 0.00018878155277464648,
"loss": 5.3063,
"step": 457500
},
{
"epoch": 2.439855952609261,
"grad_norm": 2.1264095306396484,
"learning_rate": 0.00018788972760091894,
"loss": 5.303,
"step": 458000
},
{
"epoch": 2.4425195508107995,
"grad_norm": 2.5097908973693848,
"learning_rate": 0.0001869979024271914,
"loss": 5.3028,
"step": 458500
},
{
"epoch": 2.4451831490123377,
"grad_norm": 2.1753334999084473,
"learning_rate": 0.00018610786090381132,
"loss": 5.303,
"step": 459000
},
{
"epoch": 2.4478467472138763,
"grad_norm": 2.393508195877075,
"learning_rate": 0.00018521603573008378,
"loss": 5.3065,
"step": 459500
},
{
"epoch": 2.450510345415415,
"grad_norm": 2.4845023155212402,
"learning_rate": 0.00018432421055635624,
"loss": 5.3055,
"step": 460000
},
{
"epoch": 2.453173943616953,
"grad_norm": 2.286433458328247,
"learning_rate": 0.0001834323853826287,
"loss": 5.3093,
"step": 460500
},
{
"epoch": 2.4558375418184917,
"grad_norm": 2.3205184936523438,
"learning_rate": 0.00018254056020890113,
"loss": 5.3046,
"step": 461000
},
{
"epoch": 2.4585011400200303,
"grad_norm": 2.2458608150482178,
"learning_rate": 0.00018165051868552104,
"loss": 5.3034,
"step": 461500
},
{
"epoch": 2.461164738221569,
"grad_norm": 2.4838719367980957,
"learning_rate": 0.0001807586935117935,
"loss": 5.3067,
"step": 462000
},
{
"epoch": 2.463828336423107,
"grad_norm": 2.363417148590088,
"learning_rate": 0.00017986686833806596,
"loss": 5.3075,
"step": 462500
},
{
"epoch": 2.4664919346246457,
"grad_norm": 2.1464176177978516,
"learning_rate": 0.0001789750431643384,
"loss": 5.2936,
"step": 463000
},
{
"epoch": 2.4691555328261843,
"grad_norm": 2.1444778442382812,
"learning_rate": 0.00017808321799061086,
"loss": 5.3012,
"step": 463500
},
{
"epoch": 2.471819131027723,
"grad_norm": 2.1136202812194824,
"learning_rate": 0.0001771931764672308,
"loss": 5.2991,
"step": 464000
},
{
"epoch": 2.474482729229261,
"grad_norm": 2.325840950012207,
"learning_rate": 0.00017630135129350326,
"loss": 5.3005,
"step": 464500
},
{
"epoch": 2.4771463274307997,
"grad_norm": 2.1854569911956787,
"learning_rate": 0.00017540952611977572,
"loss": 5.3041,
"step": 465000
},
{
"epoch": 2.4798099256323383,
"grad_norm": 2.247187614440918,
"learning_rate": 0.00017451770094604815,
"loss": 5.3038,
"step": 465500
},
{
"epoch": 2.482473523833877,
"grad_norm": 2.3324661254882812,
"learning_rate": 0.0001736258757723206,
"loss": 5.2999,
"step": 466000
},
{
"epoch": 2.485137122035415,
"grad_norm": 2.3304693698883057,
"learning_rate": 0.00017273583424894052,
"loss": 5.3022,
"step": 466500
},
{
"epoch": 2.4878007202369536,
"grad_norm": 2.5459063053131104,
"learning_rate": 0.00017184400907521298,
"loss": 5.3082,
"step": 467000
},
{
"epoch": 2.4904643184384923,
"grad_norm": 2.280992031097412,
"learning_rate": 0.00017095218390148542,
"loss": 5.3027,
"step": 467500
},
{
"epoch": 2.493127916640031,
"grad_norm": 2.204409599304199,
"learning_rate": 0.00017006035872775787,
"loss": 5.3056,
"step": 468000
},
{
"epoch": 2.495791514841569,
"grad_norm": 2.7257113456726074,
"learning_rate": 0.0001691703172043778,
"loss": 5.3043,
"step": 468500
},
{
"epoch": 2.4984551130431076,
"grad_norm": 2.262225866317749,
"learning_rate": 0.00016827849203065025,
"loss": 5.3022,
"step": 469000
},
{
"epoch": 2.5011187112446462,
"grad_norm": 2.167947769165039,
"learning_rate": 0.0001673866668569227,
"loss": 5.2977,
"step": 469500
},
{
"epoch": 2.5037823094461844,
"grad_norm": 2.434269428253174,
"learning_rate": 0.00016649484168319517,
"loss": 5.3003,
"step": 470000
},
{
"epoch": 2.506445907647723,
"grad_norm": 2.2088136672973633,
"learning_rate": 0.00016560480015981508,
"loss": 5.3048,
"step": 470500
},
{
"epoch": 2.5091095058492616,
"grad_norm": 2.268261194229126,
"learning_rate": 0.00016471297498608754,
"loss": 5.3048,
"step": 471000
},
{
"epoch": 2.5117731040508002,
"grad_norm": 2.462432384490967,
"learning_rate": 0.00016382114981235998,
"loss": 5.305,
"step": 471500
},
{
"epoch": 2.514436702252339,
"grad_norm": 2.6072680950164795,
"learning_rate": 0.00016292932463863243,
"loss": 5.2986,
"step": 472000
},
{
"epoch": 2.517100300453877,
"grad_norm": 2.600860118865967,
"learning_rate": 0.0001620374994649049,
"loss": 5.299,
"step": 472500
},
{
"epoch": 2.5197638986554156,
"grad_norm": 2.3521888256073,
"learning_rate": 0.00016114567429117735,
"loss": 5.2936,
"step": 473000
},
{
"epoch": 2.522427496856954,
"grad_norm": 2.712414026260376,
"learning_rate": 0.00016025563276779724,
"loss": 5.303,
"step": 473500
},
{
"epoch": 2.5250910950584924,
"grad_norm": 2.267749071121216,
"learning_rate": 0.0001593638075940697,
"loss": 5.3026,
"step": 474000
},
{
"epoch": 2.527754693260031,
"grad_norm": 2.206207275390625,
"learning_rate": 0.0001584719824203422,
"loss": 5.2967,
"step": 474500
},
{
"epoch": 2.5304182914615696,
"grad_norm": 2.3536181449890137,
"learning_rate": 0.00015758015724661465,
"loss": 5.2971,
"step": 475000
},
{
"epoch": 2.533081889663108,
"grad_norm": 2.229966163635254,
"learning_rate": 0.00015669011572323456,
"loss": 5.3002,
"step": 475500
},
{
"epoch": 2.535745487864647,
"grad_norm": 2.391902208328247,
"learning_rate": 0.000155798290549507,
"loss": 5.3046,
"step": 476000
},
{
"epoch": 2.538409086066185,
"grad_norm": 2.367274522781372,
"learning_rate": 0.00015490646537577945,
"loss": 5.3014,
"step": 476500
},
{
"epoch": 2.5410726842677236,
"grad_norm": 2.398796319961548,
"learning_rate": 0.00015401464020205191,
"loss": 5.3012,
"step": 477000
},
{
"epoch": 2.543736282469262,
"grad_norm": 2.2506918907165527,
"learning_rate": 0.00015312281502832437,
"loss": 5.3034,
"step": 477500
},
{
"epoch": 2.5463998806708004,
"grad_norm": 2.4038991928100586,
"learning_rate": 0.00015223277350494426,
"loss": 5.298,
"step": 478000
},
{
"epoch": 2.549063478872339,
"grad_norm": 2.2355668544769287,
"learning_rate": 0.00015134094833121672,
"loss": 5.2999,
"step": 478500
},
{
"epoch": 2.5517270770738776,
"grad_norm": 2.312537908554077,
"learning_rate": 0.00015044912315748918,
"loss": 5.2987,
"step": 479000
},
{
"epoch": 2.554390675275416,
"grad_norm": 2.4338889122009277,
"learning_rate": 0.00014955729798376164,
"loss": 5.2936,
"step": 479500
},
{
"epoch": 2.557054273476955,
"grad_norm": 2.303349018096924,
"learning_rate": 0.00014866725646038155,
"loss": 5.2941,
"step": 480000
},
{
"epoch": 2.559717871678493,
"grad_norm": 2.27744197845459,
"learning_rate": 0.00014777543128665401,
"loss": 5.2961,
"step": 480500
},
{
"epoch": 2.5623814698800316,
"grad_norm": 2.364135265350342,
"learning_rate": 0.00014688360611292647,
"loss": 5.2982,
"step": 481000
},
{
"epoch": 2.56504506808157,
"grad_norm": 2.652825355529785,
"learning_rate": 0.00014599178093919893,
"loss": 5.3072,
"step": 481500
},
{
"epoch": 2.5677086662831083,
"grad_norm": 2.2864181995391846,
"learning_rate": 0.00014510173941581882,
"loss": 5.3027,
"step": 482000
},
{
"epoch": 2.570372264484647,
"grad_norm": 2.1780378818511963,
"learning_rate": 0.00014420991424209128,
"loss": 5.2988,
"step": 482500
},
{
"epoch": 2.5730358626861856,
"grad_norm": 2.4762122631073,
"learning_rate": 0.00014331808906836374,
"loss": 5.2963,
"step": 483000
},
{
"epoch": 2.575699460887724,
"grad_norm": 2.3064920902252197,
"learning_rate": 0.0001424262638946362,
"loss": 5.304,
"step": 483500
},
{
"epoch": 2.5783630590892628,
"grad_norm": 2.17753529548645,
"learning_rate": 0.00014153443872090866,
"loss": 5.2909,
"step": 484000
},
{
"epoch": 2.581026657290801,
"grad_norm": 2.442643404006958,
"learning_rate": 0.00014064439719752857,
"loss": 5.3033,
"step": 484500
},
{
"epoch": 2.5836902554923395,
"grad_norm": 2.5781943798065186,
"learning_rate": 0.00013975257202380103,
"loss": 5.2969,
"step": 485000
},
{
"epoch": 2.586353853693878,
"grad_norm": 2.1409718990325928,
"learning_rate": 0.0001388607468500735,
"loss": 5.2987,
"step": 485500
},
{
"epoch": 2.5890174518954163,
"grad_norm": 2.23543381690979,
"learning_rate": 0.00013796892167634595,
"loss": 5.2989,
"step": 486000
},
{
"epoch": 2.591681050096955,
"grad_norm": 2.418957233428955,
"learning_rate": 0.00013707888015296584,
"loss": 5.2972,
"step": 486500
},
{
"epoch": 2.5943446482984935,
"grad_norm": 2.292370080947876,
"learning_rate": 0.0001361870549792383,
"loss": 5.3005,
"step": 487000
},
{
"epoch": 2.5970082465000317,
"grad_norm": 2.360339403152466,
"learning_rate": 0.00013529522980551076,
"loss": 5.2974,
"step": 487500
},
{
"epoch": 2.5996718447015703,
"grad_norm": 2.2026000022888184,
"learning_rate": 0.00013440340463178322,
"loss": 5.3012,
"step": 488000
},
{
"epoch": 2.602335442903109,
"grad_norm": 2.273235559463501,
"learning_rate": 0.00013351336310840313,
"loss": 5.2955,
"step": 488500
},
{
"epoch": 2.6049990411046475,
"grad_norm": 2.349081516265869,
"learning_rate": 0.0001326215379346756,
"loss": 5.3026,
"step": 489000
},
{
"epoch": 2.607662639306186,
"grad_norm": 2.4691007137298584,
"learning_rate": 0.00013172971276094805,
"loss": 5.2999,
"step": 489500
},
{
"epoch": 2.6103262375077243,
"grad_norm": 2.3375978469848633,
"learning_rate": 0.0001308378875872205,
"loss": 5.2849,
"step": 490000
},
{
"epoch": 2.612989835709263,
"grad_norm": 2.3784444332122803,
"learning_rate": 0.0001299478460638404,
"loss": 5.2937,
"step": 490500
},
{
"epoch": 2.6156534339108015,
"grad_norm": 2.4842257499694824,
"learning_rate": 0.00012905602089011286,
"loss": 5.2919,
"step": 491000
},
{
"epoch": 2.6183170321123397,
"grad_norm": 2.2826011180877686,
"learning_rate": 0.00012816419571638532,
"loss": 5.2902,
"step": 491500
},
{
"epoch": 2.6209806303138783,
"grad_norm": 2.300616979598999,
"learning_rate": 0.00012727237054265778,
"loss": 5.3024,
"step": 492000
},
{
"epoch": 2.623644228515417,
"grad_norm": 2.4524025917053223,
"learning_rate": 0.00012638232901927772,
"loss": 5.2908,
"step": 492500
},
{
"epoch": 2.6263078267169555,
"grad_norm": 2.3518335819244385,
"learning_rate": 0.00012549050384555015,
"loss": 5.2977,
"step": 493000
},
{
"epoch": 2.628971424918494,
"grad_norm": 2.5559749603271484,
"learning_rate": 0.0001245986786718226,
"loss": 5.2933,
"step": 493500
},
{
"epoch": 2.6316350231200323,
"grad_norm": 2.32487416267395,
"learning_rate": 0.00012370685349809507,
"loss": 5.2941,
"step": 494000
},
{
"epoch": 2.634298621321571,
"grad_norm": 2.384162187576294,
"learning_rate": 0.000122816811974715,
"loss": 5.2978,
"step": 494500
},
{
"epoch": 2.6369622195231095,
"grad_norm": 2.7350683212280273,
"learning_rate": 0.00012192498680098743,
"loss": 5.3015,
"step": 495000
},
{
"epoch": 2.6396258177246477,
"grad_norm": 2.5397427082061768,
"learning_rate": 0.00012103316162725988,
"loss": 5.2924,
"step": 495500
},
{
"epoch": 2.6422894159261863,
"grad_norm": 2.4719595909118652,
"learning_rate": 0.00012014133645353234,
"loss": 5.2982,
"step": 496000
},
{
"epoch": 2.644953014127725,
"grad_norm": 2.7110893726348877,
"learning_rate": 0.0001192495112798048,
"loss": 5.2908,
"step": 496500
},
{
"epoch": 2.6476166123292635,
"grad_norm": 2.5090041160583496,
"learning_rate": 0.00011835946975642471,
"loss": 5.2939,
"step": 497000
},
{
"epoch": 2.650280210530802,
"grad_norm": 2.5113580226898193,
"learning_rate": 0.00011746764458269717,
"loss": 5.2935,
"step": 497500
},
{
"epoch": 2.6529438087323403,
"grad_norm": 2.4266409873962402,
"learning_rate": 0.00011657581940896962,
"loss": 5.2931,
"step": 498000
},
{
"epoch": 2.655607406933879,
"grad_norm": 2.4426701068878174,
"learning_rate": 0.00011568399423524208,
"loss": 5.2909,
"step": 498500
},
{
"epoch": 2.6582710051354175,
"grad_norm": 2.5790412425994873,
"learning_rate": 0.00011479216906151452,
"loss": 5.2919,
"step": 499000
},
{
"epoch": 2.6609346033369556,
"grad_norm": 2.309144973754883,
"learning_rate": 0.00011390212753813445,
"loss": 5.2967,
"step": 499500
},
{
"epoch": 2.6635982015384942,
"grad_norm": 2.297360420227051,
"learning_rate": 0.0001130103023644069,
"loss": 5.2918,
"step": 500000
},
{
"epoch": 2.666261799740033,
"grad_norm": 2.539792776107788,
"learning_rate": 0.00011211847719067936,
"loss": 5.2914,
"step": 500500
},
{
"epoch": 2.6689253979415715,
"grad_norm": 2.246025800704956,
"learning_rate": 0.00011122665201695182,
"loss": 5.2968,
"step": 501000
},
{
"epoch": 2.67158899614311,
"grad_norm": 2.34342885017395,
"learning_rate": 0.00011033661049357173,
"loss": 5.2885,
"step": 501500
},
{
"epoch": 2.6742525943446482,
"grad_norm": 2.4776382446289062,
"learning_rate": 0.00010944478531984418,
"loss": 5.2944,
"step": 502000
},
{
"epoch": 2.676916192546187,
"grad_norm": 2.583674907684326,
"learning_rate": 0.00010855296014611664,
"loss": 5.2907,
"step": 502500
},
{
"epoch": 2.6795797907477255,
"grad_norm": 2.3661584854125977,
"learning_rate": 0.0001076611349723891,
"loss": 5.2969,
"step": 503000
},
{
"epoch": 2.6822433889492636,
"grad_norm": 2.3716771602630615,
"learning_rate": 0.00010677109344900901,
"loss": 5.2993,
"step": 503500
},
{
"epoch": 2.6849069871508022,
"grad_norm": 2.3315460681915283,
"learning_rate": 0.00010587926827528146,
"loss": 5.2914,
"step": 504000
},
{
"epoch": 2.687570585352341,
"grad_norm": 2.2361655235290527,
"learning_rate": 0.00010498744310155392,
"loss": 5.288,
"step": 504500
},
{
"epoch": 2.690234183553879,
"grad_norm": 2.3718972206115723,
"learning_rate": 0.00010409561792782638,
"loss": 5.2933,
"step": 505000
},
{
"epoch": 2.6928977817554176,
"grad_norm": 2.414783477783203,
"learning_rate": 0.0001032055764044463,
"loss": 5.2905,
"step": 505500
},
{
"epoch": 2.695561379956956,
"grad_norm": 2.5909764766693115,
"learning_rate": 0.00010231375123071875,
"loss": 5.2889,
"step": 506000
},
{
"epoch": 2.698224978158495,
"grad_norm": 2.2361748218536377,
"learning_rate": 0.0001014219260569912,
"loss": 5.2884,
"step": 506500
},
{
"epoch": 2.7008885763600334,
"grad_norm": 2.3554787635803223,
"learning_rate": 0.00010053010088326366,
"loss": 5.283,
"step": 507000
},
{
"epoch": 2.7035521745615716,
"grad_norm": 2.4235968589782715,
"learning_rate": 9.96382757095361e-05,
"loss": 5.2991,
"step": 507500
},
{
"epoch": 2.70621577276311,
"grad_norm": 2.334272861480713,
"learning_rate": 9.874645053580856e-05,
"loss": 5.2921,
"step": 508000
},
{
"epoch": 2.708879370964649,
"grad_norm": 2.443535566329956,
"learning_rate": 9.785640901242848e-05,
"loss": 5.2934,
"step": 508500
},
{
"epoch": 2.711542969166187,
"grad_norm": 2.4466655254364014,
"learning_rate": 9.696458383870094e-05,
"loss": 5.2915,
"step": 509000
},
{
"epoch": 2.7142065673677256,
"grad_norm": 2.1013219356536865,
"learning_rate": 9.60727586649734e-05,
"loss": 5.2942,
"step": 509500
},
{
"epoch": 2.716870165569264,
"grad_norm": 2.486953020095825,
"learning_rate": 9.518093349124584e-05,
"loss": 5.2948,
"step": 510000
},
{
"epoch": 2.719533763770803,
"grad_norm": 2.246967077255249,
"learning_rate": 9.429089196786576e-05,
"loss": 5.288,
"step": 510500
},
{
"epoch": 2.7221973619723414,
"grad_norm": 2.308177947998047,
"learning_rate": 9.339906679413822e-05,
"loss": 5.2925,
"step": 511000
},
{
"epoch": 2.7248609601738796,
"grad_norm": 2.3832600116729736,
"learning_rate": 9.250724162041068e-05,
"loss": 5.2925,
"step": 511500
},
{
"epoch": 2.727524558375418,
"grad_norm": 2.2219245433807373,
"learning_rate": 9.161541644668312e-05,
"loss": 5.294,
"step": 512000
},
{
"epoch": 2.730188156576957,
"grad_norm": 2.4265191555023193,
"learning_rate": 9.072537492330303e-05,
"loss": 5.2875,
"step": 512500
},
{
"epoch": 2.732851754778495,
"grad_norm": 2.553427219390869,
"learning_rate": 8.98335497495755e-05,
"loss": 5.2984,
"step": 513000
},
{
"epoch": 2.7355153529800336,
"grad_norm": 2.3475024700164795,
"learning_rate": 8.894172457584796e-05,
"loss": 5.2827,
"step": 513500
},
{
"epoch": 2.738178951181572,
"grad_norm": 2.5305187702178955,
"learning_rate": 8.80498994021204e-05,
"loss": 5.2937,
"step": 514000
},
{
"epoch": 2.740842549383111,
"grad_norm": 2.4398436546325684,
"learning_rate": 8.71598578787403e-05,
"loss": 5.2948,
"step": 514500
},
{
"epoch": 2.7435061475846494,
"grad_norm": 2.4077444076538086,
"learning_rate": 8.626803270501276e-05,
"loss": 5.2882,
"step": 515000
},
{
"epoch": 2.7461697457861876,
"grad_norm": 2.346778392791748,
"learning_rate": 8.537620753128524e-05,
"loss": 5.2875,
"step": 515500
},
{
"epoch": 2.748833343987726,
"grad_norm": 2.4900453090667725,
"learning_rate": 8.448438235755768e-05,
"loss": 5.2835,
"step": 516000
},
{
"epoch": 2.7514969421892648,
"grad_norm": 2.4355154037475586,
"learning_rate": 8.359255718383014e-05,
"loss": 5.29,
"step": 516500
},
{
"epoch": 2.754160540390803,
"grad_norm": 2.18061900138855,
"learning_rate": 8.270251566045004e-05,
"loss": 5.288,
"step": 517000
},
{
"epoch": 2.7568241385923415,
"grad_norm": 2.3646693229675293,
"learning_rate": 8.18106904867225e-05,
"loss": 5.2789,
"step": 517500
},
{
"epoch": 2.75948773679388,
"grad_norm": 2.369717836380005,
"learning_rate": 8.091886531299498e-05,
"loss": 5.2901,
"step": 518000
},
{
"epoch": 2.7621513349954188,
"grad_norm": 2.4666647911071777,
"learning_rate": 8.002704013926742e-05,
"loss": 5.2858,
"step": 518500
},
{
"epoch": 2.7648149331969574,
"grad_norm": 2.3375349044799805,
"learning_rate": 7.913699861588732e-05,
"loss": 5.2854,
"step": 519000
},
{
"epoch": 2.7674785313984955,
"grad_norm": 2.2538347244262695,
"learning_rate": 7.824517344215978e-05,
"loss": 5.2899,
"step": 519500
},
{
"epoch": 2.770142129600034,
"grad_norm": 2.5232772827148438,
"learning_rate": 7.735334826843224e-05,
"loss": 5.2948,
"step": 520000
},
{
"epoch": 2.7728057278015728,
"grad_norm": 2.3963685035705566,
"learning_rate": 7.646152309470469e-05,
"loss": 5.2919,
"step": 520500
},
{
"epoch": 2.775469326003111,
"grad_norm": 2.0667736530303955,
"learning_rate": 7.55714815713246e-05,
"loss": 5.2825,
"step": 521000
},
{
"epoch": 2.7781329242046495,
"grad_norm": 2.421602725982666,
"learning_rate": 7.467965639759706e-05,
"loss": 5.2949,
"step": 521500
},
{
"epoch": 2.780796522406188,
"grad_norm": 2.3447656631469727,
"learning_rate": 7.378783122386952e-05,
"loss": 5.2871,
"step": 522000
},
{
"epoch": 2.7834601206077263,
"grad_norm": 2.1411802768707275,
"learning_rate": 7.289600605014197e-05,
"loss": 5.2861,
"step": 522500
},
{
"epoch": 2.786123718809265,
"grad_norm": 2.5163323879241943,
"learning_rate": 7.200418087641443e-05,
"loss": 5.286,
"step": 523000
},
{
"epoch": 2.7887873170108035,
"grad_norm": 2.482067108154297,
"learning_rate": 7.111413935303434e-05,
"loss": 5.2863,
"step": 523500
},
{
"epoch": 2.791450915212342,
"grad_norm": 2.3614418506622314,
"learning_rate": 7.02223141793068e-05,
"loss": 5.2799,
"step": 524000
},
{
"epoch": 2.7941145134138807,
"grad_norm": 2.333521842956543,
"learning_rate": 6.933048900557925e-05,
"loss": 5.2873,
"step": 524500
},
{
"epoch": 2.796778111615419,
"grad_norm": 2.2536137104034424,
"learning_rate": 6.843866383185171e-05,
"loss": 5.2909,
"step": 525000
},
{
"epoch": 2.7994417098169575,
"grad_norm": 2.516286849975586,
"learning_rate": 6.754862230847162e-05,
"loss": 5.2944,
"step": 525500
},
{
"epoch": 2.802105308018496,
"grad_norm": 2.361598253250122,
"learning_rate": 6.665679713474408e-05,
"loss": 5.2872,
"step": 526000
},
{
"epoch": 2.8047689062200343,
"grad_norm": 2.387085199356079,
"learning_rate": 6.576497196101654e-05,
"loss": 5.291,
"step": 526500
},
{
"epoch": 2.807432504421573,
"grad_norm": 2.2874443531036377,
"learning_rate": 6.487314678728899e-05,
"loss": 5.29,
"step": 527000
},
{
"epoch": 2.8100961026231115,
"grad_norm": 2.4107890129089355,
"learning_rate": 6.39831052639089e-05,
"loss": 5.2781,
"step": 527500
},
{
"epoch": 2.81275970082465,
"grad_norm": 2.3214197158813477,
"learning_rate": 6.309128009018136e-05,
"loss": 5.2851,
"step": 528000
},
{
"epoch": 2.8154232990261887,
"grad_norm": 2.3806910514831543,
"learning_rate": 6.219945491645382e-05,
"loss": 5.2824,
"step": 528500
},
{
"epoch": 2.818086897227727,
"grad_norm": 2.4679012298583984,
"learning_rate": 6.130762974272627e-05,
"loss": 5.291,
"step": 529000
},
{
"epoch": 2.8207504954292655,
"grad_norm": 2.30574631690979,
"learning_rate": 6.041758821934619e-05,
"loss": 5.2901,
"step": 529500
},
{
"epoch": 2.823414093630804,
"grad_norm": 2.309056043624878,
"learning_rate": 5.9525763045618644e-05,
"loss": 5.2778,
"step": 530000
},
{
"epoch": 2.8260776918323423,
"grad_norm": 2.378755569458008,
"learning_rate": 5.8633937871891097e-05,
"loss": 5.2815,
"step": 530500
},
{
"epoch": 2.828741290033881,
"grad_norm": 2.6057322025299072,
"learning_rate": 5.7742112698163556e-05,
"loss": 5.2866,
"step": 531000
},
{
"epoch": 2.8314048882354195,
"grad_norm": 2.3079919815063477,
"learning_rate": 5.685028752443601e-05,
"loss": 5.2791,
"step": 531500
},
{
"epoch": 2.834068486436958,
"grad_norm": 2.2242472171783447,
"learning_rate": 5.5960246001055924e-05,
"loss": 5.2865,
"step": 532000
},
{
"epoch": 2.8367320846384967,
"grad_norm": 2.3489010334014893,
"learning_rate": 5.5068420827328383e-05,
"loss": 5.2872,
"step": 532500
},
{
"epoch": 2.839395682840035,
"grad_norm": 2.9294140338897705,
"learning_rate": 5.4176595653600836e-05,
"loss": 5.2796,
"step": 533000
},
{
"epoch": 2.8420592810415735,
"grad_norm": 2.325824499130249,
"learning_rate": 5.328477047987329e-05,
"loss": 5.2878,
"step": 533500
},
{
"epoch": 2.844722879243112,
"grad_norm": 2.3206863403320312,
"learning_rate": 5.23947289564932e-05,
"loss": 5.2827,
"step": 534000
},
{
"epoch": 2.8473864774446502,
"grad_norm": 2.241338014602661,
"learning_rate": 5.150290378276566e-05,
"loss": 5.2862,
"step": 534500
},
{
"epoch": 2.850050075646189,
"grad_norm": 2.3662049770355225,
"learning_rate": 5.0611078609038116e-05,
"loss": 5.2868,
"step": 535000
},
{
"epoch": 2.8527136738477274,
"grad_norm": 2.0729544162750244,
"learning_rate": 4.971925343531057e-05,
"loss": 5.2851,
"step": 535500
},
{
"epoch": 2.855377272049266,
"grad_norm": 2.1059601306915283,
"learning_rate": 4.8829211911930484e-05,
"loss": 5.2809,
"step": 536000
},
{
"epoch": 2.8580408702508047,
"grad_norm": 2.70766282081604,
"learning_rate": 4.793738673820294e-05,
"loss": 5.2896,
"step": 536500
},
{
"epoch": 2.860704468452343,
"grad_norm": 2.526292562484741,
"learning_rate": 4.704556156447539e-05,
"loss": 5.2828,
"step": 537000
},
{
"epoch": 2.8633680666538814,
"grad_norm": 2.246443510055542,
"learning_rate": 4.6153736390747856e-05,
"loss": 5.2847,
"step": 537500
},
{
"epoch": 2.86603166485542,
"grad_norm": 2.5226643085479736,
"learning_rate": 4.5263694867367764e-05,
"loss": 5.2871,
"step": 538000
},
{
"epoch": 2.868695263056958,
"grad_norm": 2.416816473007202,
"learning_rate": 4.437186969364022e-05,
"loss": 5.2825,
"step": 538500
},
{
"epoch": 2.871358861258497,
"grad_norm": 2.5631511211395264,
"learning_rate": 4.348004451991267e-05,
"loss": 5.2815,
"step": 539000
},
{
"epoch": 2.8740224594600354,
"grad_norm": 2.2883377075195312,
"learning_rate": 4.258821934618513e-05,
"loss": 5.2824,
"step": 539500
},
{
"epoch": 2.8766860576615736,
"grad_norm": 2.4545071125030518,
"learning_rate": 4.1698177822805044e-05,
"loss": 5.278,
"step": 540000
},
{
"epoch": 2.879349655863112,
"grad_norm": 2.2015092372894287,
"learning_rate": 4.08063526490775e-05,
"loss": 5.2806,
"step": 540500
},
{
"epoch": 2.882013254064651,
"grad_norm": 2.7558255195617676,
"learning_rate": 3.9914527475349956e-05,
"loss": 5.2857,
"step": 541000
},
{
"epoch": 2.8846768522661894,
"grad_norm": 2.376549005508423,
"learning_rate": 3.902270230162241e-05,
"loss": 5.2792,
"step": 541500
},
{
"epoch": 2.887340450467728,
"grad_norm": 2.3727259635925293,
"learning_rate": 3.813266077824232e-05,
"loss": 5.2843,
"step": 542000
},
{
"epoch": 2.890004048669266,
"grad_norm": 2.3833839893341064,
"learning_rate": 3.724083560451478e-05,
"loss": 5.2785,
"step": 542500
},
{
"epoch": 2.892667646870805,
"grad_norm": 2.4702396392822266,
"learning_rate": 3.6349010430787236e-05,
"loss": 5.2785,
"step": 543000
},
{
"epoch": 2.8953312450723434,
"grad_norm": 2.54264497756958,
"learning_rate": 3.545718525705969e-05,
"loss": 5.2813,
"step": 543500
},
{
"epoch": 2.8979948432738816,
"grad_norm": 2.356501579284668,
"learning_rate": 3.456536008333214e-05,
"loss": 5.2886,
"step": 544000
},
{
"epoch": 2.90065844147542,
"grad_norm": 2.546325445175171,
"learning_rate": 3.367531855995206e-05,
"loss": 5.2778,
"step": 544500
},
{
"epoch": 2.903322039676959,
"grad_norm": 2.3812687397003174,
"learning_rate": 3.2783493386224516e-05,
"loss": 5.284,
"step": 545000
},
{
"epoch": 2.9059856378784974,
"grad_norm": 2.3538711071014404,
"learning_rate": 3.189166821249697e-05,
"loss": 5.2755,
"step": 545500
},
{
"epoch": 2.908649236080036,
"grad_norm": 2.2477262020111084,
"learning_rate": 3.099984303876943e-05,
"loss": 5.2876,
"step": 546000
},
{
"epoch": 2.911312834281574,
"grad_norm": 2.2652475833892822,
"learning_rate": 3.0109801515389333e-05,
"loss": 5.2777,
"step": 546500
},
{
"epoch": 2.9139764324831128,
"grad_norm": 2.468841791152954,
"learning_rate": 2.9217976341661793e-05,
"loss": 5.2779,
"step": 547000
},
{
"epoch": 2.9166400306846514,
"grad_norm": 2.151130437850952,
"learning_rate": 2.832615116793425e-05,
"loss": 5.2883,
"step": 547500
},
{
"epoch": 2.9193036288861895,
"grad_norm": 2.464799404144287,
"learning_rate": 2.74343259942067e-05,
"loss": 5.2843,
"step": 548000
},
{
"epoch": 2.921967227087728,
"grad_norm": 2.6122734546661377,
"learning_rate": 2.6544284470826617e-05,
"loss": 5.2854,
"step": 548500
},
{
"epoch": 2.9246308252892668,
"grad_norm": 2.257554769515991,
"learning_rate": 2.565245929709907e-05,
"loss": 5.277,
"step": 549000
},
{
"epoch": 2.9272944234908054,
"grad_norm": 2.2422280311584473,
"learning_rate": 2.476063412337153e-05,
"loss": 5.2804,
"step": 549500
},
{
"epoch": 2.929958021692344,
"grad_norm": 2.4912326335906982,
"learning_rate": 2.3868808949643985e-05,
"loss": 5.2758,
"step": 550000
},
{
"epoch": 2.932621619893882,
"grad_norm": 2.305392265319824,
"learning_rate": 2.2978767426263897e-05,
"loss": 5.2831,
"step": 550500
},
{
"epoch": 2.9352852180954208,
"grad_norm": 2.699528217315674,
"learning_rate": 2.2086942252536353e-05,
"loss": 5.2841,
"step": 551000
},
{
"epoch": 2.9379488162969594,
"grad_norm": 2.3196749687194824,
"learning_rate": 2.1195117078808806e-05,
"loss": 5.2792,
"step": 551500
},
{
"epoch": 2.9406124144984975,
"grad_norm": 2.134294033050537,
"learning_rate": 2.0303291905081265e-05,
"loss": 5.2845,
"step": 552000
},
{
"epoch": 2.943276012700036,
"grad_norm": 2.25675892829895,
"learning_rate": 1.941146673135372e-05,
"loss": 5.2778,
"step": 552500
},
{
"epoch": 2.9459396109015747,
"grad_norm": 2.141127824783325,
"learning_rate": 1.852142520797363e-05,
"loss": 5.2738,
"step": 553000
},
{
"epoch": 2.9486032091031134,
"grad_norm": 2.3503618240356445,
"learning_rate": 1.762960003424609e-05,
"loss": 5.277,
"step": 553500
},
{
"epoch": 2.951266807304652,
"grad_norm": 2.2987284660339355,
"learning_rate": 1.673777486051854e-05,
"loss": 5.2864,
"step": 554000
},
{
"epoch": 2.95393040550619,
"grad_norm": 2.384070873260498,
"learning_rate": 1.5845949686791e-05,
"loss": 5.2798,
"step": 554500
},
{
"epoch": 2.9565940037077287,
"grad_norm": 2.272744655609131,
"learning_rate": 1.4954124513063455e-05,
"loss": 5.2806,
"step": 555000
},
{
"epoch": 2.9592576019092673,
"grad_norm": 2.2945611476898193,
"learning_rate": 1.4064082989683367e-05,
"loss": 5.2852,
"step": 555500
},
{
"epoch": 2.9619212001108055,
"grad_norm": 2.5340495109558105,
"learning_rate": 1.3172257815955822e-05,
"loss": 5.2764,
"step": 556000
},
{
"epoch": 2.964584798312344,
"grad_norm": 2.3637685775756836,
"learning_rate": 1.228043264222828e-05,
"loss": 5.28,
"step": 556500
},
{
"epoch": 2.9672483965138827,
"grad_norm": 2.401252031326294,
"learning_rate": 1.1388607468500735e-05,
"loss": 5.2809,
"step": 557000
},
{
"epoch": 2.9699119947154213,
"grad_norm": 2.256577253341675,
"learning_rate": 1.0498565945120647e-05,
"loss": 5.2798,
"step": 557500
},
{
"epoch": 2.9725755929169595,
"grad_norm": 2.1444365978240967,
"learning_rate": 9.606740771393103e-06,
"loss": 5.2761,
"step": 558000
},
{
"epoch": 2.975239191118498,
"grad_norm": 2.325979471206665,
"learning_rate": 8.714915597665558e-06,
"loss": 5.2804,
"step": 558500
},
{
"epoch": 2.9779027893200367,
"grad_norm": 2.1250107288360596,
"learning_rate": 7.823090423938014e-06,
"loss": 5.2767,
"step": 559000
},
{
"epoch": 2.9805663875215753,
"grad_norm": 2.4525716304779053,
"learning_rate": 6.933048900557926e-06,
"loss": 5.2805,
"step": 559500
},
{
"epoch": 2.9832299857231135,
"grad_norm": 2.176084041595459,
"learning_rate": 6.0412237268303826e-06,
"loss": 5.278,
"step": 560000
},
{
"epoch": 2.985893583924652,
"grad_norm": 2.607921600341797,
"learning_rate": 5.149398553102839e-06,
"loss": 5.2778,
"step": 560500
},
{
"epoch": 2.9885571821261907,
"grad_norm": 2.287775993347168,
"learning_rate": 4.257573379375294e-06,
"loss": 5.2721,
"step": 561000
},
{
"epoch": 2.991220780327729,
"grad_norm": 2.258080005645752,
"learning_rate": 3.3657482056477507e-06,
"loss": 5.2754,
"step": 561500
},
{
"epoch": 2.9938843785292675,
"grad_norm": 2.214787244796753,
"learning_rate": 2.475706682267662e-06,
"loss": 5.2853,
"step": 562000
},
{
"epoch": 2.996547976730806,
"grad_norm": 2.4470176696777344,
"learning_rate": 1.583881508540118e-06,
"loss": 5.2732,
"step": 562500
},
{
"epoch": 2.9992115749323447,
"grad_norm": 2.2027597427368164,
"learning_rate": 6.920563348125741e-07,
"loss": 5.2723,
"step": 563000
},
{
"epoch": 3.0,
"step": 563148,
"total_flos": 1.7947279651188326e+17,
"train_loss": 5.458770358526144,
"train_runtime": 36904.8634,
"train_samples_per_second": 976.604,
"train_steps_per_second": 15.259
}
],
"logging_steps": 500,
"max_steps": 563148,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.7947279651188326e+17,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}